原创 

Python Selenium:使用 debuggerAddress 模式连接自动打开的固定端口浏览器,并为其设置代理

分类:python,爬虫    442人阅读    IT小君  2022-09-30 21:35
import json
import os
import threading
import time
from urllib import parse

from browsermobproxy import Server
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait

from mailService import mailService
from model import course, ses, catalog, courseDetail


class lagouSpider:
    """Crawl purchased courses from kaiwu.lagou.com through a recording proxy.

    A BrowserMob Proxy instance records the traffic of a Chrome process that
    is started with a fixed remote-debugging port; Selenium attaches to that
    already-running Chrome via the ``debuggerAddress`` option.  The JSON API
    responses are then read back from the proxy's HAR log and persisted
    through the ``ses`` session and the ``course`` / ``catalog`` /
    ``courseDetail`` models.
    """

    # Options handed to every ``new_har`` call so response bodies are recorded.
    # (The original literal repeated the same key twice; one entry is equivalent.)
    _HAR_OPTIONS = {'captureContent': True}

    def __init__(self):
        """Start the proxy server, launch Chrome in a thread and attach Selenium."""
        browsermobproxy_location = r"F:\browsermob-proxy-2.1.4\bin\browsermob-proxy.bat"
        # Linux alternative: r"/opt/browsermob-proxy-2.1.4/bin/browsermob-proxy"
        server_options = {'port': 9528}
        self.server = Server(path=browsermobproxy_location, options=server_options)
        self.server.start()
        self.proxy = self.server.create_proxy()

        # ``self.proxy.proxy`` is already a "host:port" string.  Passing it
        # through ``urlparse(...).path`` is fragile: on Python 3.9+ such a
        # string is split into scheme and path, leaving only the port.
        proxy_address = self.proxy.proxy
        print(proxy_address)
        sub = threading.Thread(target=self.cmd_process, args=(proxy_address,))
        sub.start()

        chrome_options = webdriver.ChromeOptions()
        # Copy first: DesiredCapabilities.CHROME is a shared class-level dict
        # and must not be mutated in place.
        capabilities = DesiredCapabilities.CHROME.copy()
        capabilities["pageLoadStrategy"] = "none"
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('lang=zh_CN.UTF-8')
        # The proxy is configured on the Chrome command line in cmd_process,
        # so no --proxy-server argument is added here.
        chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9527")
        self.browser = webdriver.Chrome(executable_path=r'F:\Program Files (x86)\webdriver\chromedriver.exe',
                                        chrome_options=chrome_options, desired_capabilities=capabilities)
        self.wait = WebDriverWait(self.browser, 60)
        self.fileCount = 0
        self.sender = mailService()
        self.h = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}

    def cmd_process(self, url):
        """Launch Chrome with remote debugging on port 9527, proxied via *url*.

        ``os.system`` waits for the command to return, which is why
        ``__init__`` runs this method in its own thread.

        Args:
            url: Value passed to Chrome's ``--proxy-server`` switch.
        """
        # Raw string: the backslashes are path separators, not escapes.
        # NOTE: os.chdir changes the working directory of the whole process.
        os.chdir(r'C:\Program Files (x86)\Google\Chrome\Application')
        # Chrome's command-line switches must be set here.
        cmd_ = '''
            chrome.exe --remote-debugging-port=9527 --user-data-dir="D:/test"  --ignore-certificate-errors --proxy-server={0}'''.format(
            url)
        os.system(cmd_)

    def login(self):
        """Open the purchased-courses page and wait 50 seconds."""
        self.browser.get("https://kaiwu.lagou.com/hasBuy/special")
        # presumably leaves time for a manual login — TODO confirm
        time.sleep(50)

    @staticmethod
    def _now():
        """Return the local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    @staticmethod
    def _json_responses(entries, url_marker):
        """Yield the decoded JSON body of each HAR entry matching *url_marker*.

        Every request URL is printed.  Entries without a ``request`` key,
        entries whose URL does not contain *url_marker*, and responses whose
        content has no ``text`` field are skipped.

        Args:
            entries: The ``log.entries`` list of a HAR document.
            url_marker: Substring the request URL must contain.
        """
        for entry in entries:
            if 'request' not in entry:
                continue
            request_url = entry['request']['url']
            print(request_url)
            if url_marker not in request_url:
                continue
            content = entry['response']['content']
            if 'text' in content:
                yield json.loads(content['text'])

    def _capture(self, page_url, wait_seconds):
        """Load *page_url* under a fresh HAR and return the recorded entries.

        Args:
            page_url: Page to open in the attached browser.
            wait_seconds: Fixed delay that lets the page issue its requests
                before the HAR is read.
        """
        self.proxy.new_har("dataStore", options=dict(self._HAR_OPTIONS))
        self.browser.get(page_url)
        time.sleep(wait_seconds)
        return self.proxy.har['log']["entries"]

    def getCourse(self):
        """Store every purchased course that is not yet in the database."""
        entries = self._capture("https://kaiwu.lagou.com/hasBuy/special", 10)
        for payload in self._json_responses(entries, "/getAllCoursePurchasedRecordForPC"):
            records = payload['content']['allCoursePurchasedRecord'][1]['courseRecordList']
            for record in records:
                # Skip courses that were already stored by an earlier run.
                if ses.query(course).filter_by(origId=record['id']).all():
                    continue
                cou = course()
                cou.title = record['name']
                cou.cover = record['image']
                cou.description = ''
                cou.origId = record['id']
                cou.status = "ready"
                cou.createAt = self._now()
                ses.add(cou)
                ses.commit()

    def getCatalog(self):
        """Store the sections and lessons of every course still marked ``ready``."""
        courses = ses.query(course).filter_by(status='ready').all()
        for item in courses:
            # str(): consistent with getCourseDetail, and safe if origId is
            # stored as an integer.
            entries = self._capture(
                "https://kaiwu.lagou.com/course/courseInfo.htm?courseId=" + str(item.origId), 10)
            for payload in self._json_responses(entries, "/getCourseLessons"):
                sections = payload['content']['courseSectionList']
                for section in sections:
                    if ses.query(catalog).filter_by(origId=section['id']).all():
                        continue
                    cou = catalog()
                    cou.title = section['sectionName']
                    cou.origId = section['id']
                    cou.status = "ready"
                    cou.courseId = item.id
                    cou.origCourseId = item.origId
                    cou.createAt = self._now()
                    ses.add(cou)
                    # Commit before creating lessons, which reference cou.id.
                    ses.commit()
                    for lesson in section['courseLessons']:
                        detail = courseDetail()
                        detail.Title = lesson['theme']
                        detail.origId = lesson['id']
                        detail.status = 'ready'
                        detail.createAt = self._now()
                        detail.catalogId = cou.id
                        detail.origCourseId = lesson['courseId']
                        detail.courseId = item.id
                        ses.add(detail)
                        ses.commit()
                    cou.status = "done"
                # Mark the course done only once all of its sections were
                # processed, and (as before) only if it had any section.
                if sections:
                    item.status = 'done'
                ses.commit()

    def getCourseDetail(self):
        """Fetch and store the text content of every lesson still marked ``ready``."""
        lessons = ses.query(courseDetail).filter_by(status='ready').all()
        for item in lessons:
            entries = self._capture(
                "https://kaiwu.lagou.com/course/courseInfo.htm?courseId=" + str(
                    item.origCourseId) + "#/detail/pc?id=" + str(item.origId), 4)
            for payload in self._json_responses(entries, "/getCourseLessonDetail"):
                item.Content = payload['content']['textContent']
                item.status = "done"
                ses.commit()


if __name__ == '__main__':
    # Run the crawl stages in order: courses first, then their catalogs,
    # then the content of each lesson.
    crawler = lagouSpider()
    for stage in (crawler.getCourse, crawler.getCatalog, crawler.getCourseDetail):
        stage()
    print('跑完了')

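说明:

1. 使用 browsermob-proxy 启动本地代理服务,并创建一个代理。

2. 开启新线程,通过 cmd_process 方法以固定的远程调试端口(9527)启动 Chrome,并通过 --proxy-server 参数让它走上述代理。

3. Selenium 通过 debuggerAddress(127.0.0.1:9527)连接这个已经打开的浏览器,再从代理的 HAR 记录中读取接口返回的数据。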

 

点击广告,支持我们为你提供更好的服务

HTML5 Canvas竖直流动线条背景动画特效

html5 svg夜空中星星流星动画场景特效

响应式太阳能能源公司网站模板

中小型创意设计服务公司网站模板

html5 canvas彩色碎片组合球形旋转动画特效

html5图标下拉搜索框自动匹配代码

小众时尚单品在线电子商务网站模板

css+js实现的颜色渐变数字时钟动画特效

canvas炫酷鼠标移动文字粒子特效

现代时尚家具公司网站模板

响应式咖啡饮品宣传网站模板

jQuery右端悬浮带返回顶部特效

网页设计开发公司网站模板

响应式时尚单品在线商城网站模板

js+css3抽奖转盘旋转点餐代码

html5 canvas进度条圆环图表统计动画特效

HTML5数字产品服务公司网站模板

css鼠标跟随文字模糊特效

有机水果蔬菜HTML5网站模板

HTML5现代家居装潢公司网站模板

点击广告,支持我们为你提供更好的服务
 工具推荐 更多»
点击广告,支持我们为你提供更好的服务