原创  python selenium debuggerAddress 模式自动化打开固定端口浏览器并设置代理

分类:python,爬虫    136人阅读    IT小君  2022-09-30 21:35
import json
import os
import threading
import time
from urllib import parse

from browsermobproxy import Server
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait

from mailService import mailService
from model import course, ses, catalog, courseDetail


class lagouSpider:
    """Scrape purchased Lagou courses through a proxied, pre-launched Chrome.

    Construction starts a browsermob-proxy server, launches Chrome in a
    background thread with a fixed remote-debugging port and the proxy
    configured, then attaches Selenium to that Chrome via ``debuggerAddress``.
    The ``get*`` methods load pages, read the responses recorded by the proxy
    (HAR) and persist them through the ``ses`` session from ``model``
    (presumably a SQLAlchemy session -- confirm against ``model``).

    The class attributes below hold the machine-specific paths and ports;
    override them in a subclass to run on a different machine.
    """

    # Path of the browsermob-proxy launcher
    # (on Linux e.g. /opt/browsermob-proxy-2.1.4/bin/browsermob-proxy).
    BROWSERMOB_PATH = r"F:\browsermob-proxy-2.1.4\bin\browsermob-proxy.bat"
    CHROMEDRIVER_PATH = r'F:\Program Files (x86)\webdriver\chromedriver.exe'
    # Directory that contains chrome.exe.
    CHROME_DIR = r'C:\Program Files (x86)\Google\Chrome\Application'
    CHROME_USER_DATA_DIR = "D:/test"
    # Port the browsermob-proxy server is started with.
    PROXY_SERVER_PORT = 9528
    # Remote-debugging port: used both when launching Chrome and when
    # attaching Selenium, so the two can never drift apart.
    DEBUG_PORT = 9527
    # Options passed to every new HAR so that response bodies are recorded.
    HAR_OPTIONS = {'captureContent': True}
    TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"

    def __init__(self):
        """Start the proxy, launch Chrome and attach a Selenium driver to it."""
        server_options = {'port': self.PROXY_SERVER_PORT}
        self.server = Server(path=self.BROWSERMOB_PATH, options=server_options)
        self.server.start()
        self.proxy = self.server.create_proxy()

        # Hand the proxy address to Chrome unchanged. Running it through
        # urlparse() is unsafe: on Python 3.9+ a schemeless "host:port" string
        # is split into scheme="host" / path="port", which loses the host.
        proxy_address = self.proxy.proxy
        print(proxy_address)
        # The launcher call blocks until Chrome exits, so it runs in its own
        # thread.
        sub = threading.Thread(target=self.cmd_process, args=(proxy_address,))
        sub.start()

        chrome_options = webdriver.ChromeOptions()
        # Copy first: DesiredCapabilities.CHROME is a shared class-level dict
        # and must not be modified in place.
        capabilities = DesiredCapabilities.CHROME.copy()
        capabilities["pageLoadStrategy"] = "none"
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('lang=zh_CN.UTF-8')
        # Attach to the Chrome started by cmd_process instead of spawning one.
        chrome_options.add_experimental_option(
            "debuggerAddress", "127.0.0.1:%d" % self.DEBUG_PORT)
        # NOTE(review): these keyword names follow the Selenium API the script
        # was written against; confirm they match the installed version.
        self.browser = webdriver.Chrome(executable_path=self.CHROMEDRIVER_PATH,
                                        chrome_options=chrome_options,
                                        desired_capabilities=capabilities)
        self.wait = WebDriverWait(self.browser, 60)
        self.fileCount = 0
        self.sender = mailService()
        self.h = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}

    def cmd_process(self, url):
        """Launch Chrome with remote debugging enabled and ``url`` as its proxy.

        Blocks until the Chrome process exits.

        :param url: proxy address handed to ``--proxy-server`` (``host:port``).
        """
        # Local import so this method does not depend on a file-level import.
        import subprocess

        chrome_exe = os.path.join(self.CHROME_DIR, 'chrome.exe')
        # Chrome command-line switches belong here.
        args = [
            chrome_exe,
            '--remote-debugging-port=%d' % self.DEBUG_PORT,
            '--user-data-dir=%s' % self.CHROME_USER_DATA_DIR,
            '--ignore-certificate-errors',
            '--proxy-server={0}'.format(url),
        ]
        # An argument list with ``cwd`` avoids building a shell string and
        # avoids os.chdir(), which would change the working directory of the
        # whole process from this worker thread.
        subprocess.call(args, cwd=self.CHROME_DIR)

    def login(self):
        """Open the purchased-courses page and wait 50 seconds.

        NOTE(review): the pause presumably leaves time for a manual login --
        confirm.
        """
        self.browser.get("https://kaiwu.lagou.com/hasBuy/special")
        time.sleep(50)

    @classmethod
    def _timestamp(cls):
        """Return the current local time formatted with TIMESTAMP_FORMAT."""
        return time.strftime(cls.TIMESTAMP_FORMAT, time.localtime())

    @staticmethod
    def _matching_payloads(entries, url_fragment):
        """Yield the parsed JSON body of each HAR entry whose URL matches.

        Every request URL is printed. Entries without a ``request`` key, with
        a URL that does not contain ``url_fragment``, or without a recorded
        ``text`` body are skipped.

        :param entries: HAR ``log.entries`` list.
        :param url_fragment: substring the request URL must contain.
        :raises ValueError: if a matching body is not valid JSON.
        """
        for entry in entries:
            if 'request' not in entry:
                continue
            request_url = entry['request']['url']
            print(request_url)
            if url_fragment not in request_url:
                continue
            content = entry['response']['content']
            if 'text' in content:
                yield json.loads(content['text'])

    def _capture_payloads(self, page_url, url_fragment, wait_seconds):
        """Load ``page_url`` and yield the matching JSON responses it caused.

        A fresh HAR is started, the page is opened, and after
        ``wait_seconds`` the recorded entries are filtered with
        ``_matching_payloads``.
        """
        self.proxy.new_har("dataStore", options=dict(self.HAR_OPTIONS))
        self.browser.get(page_url)
        # pageLoadStrategy is "none", so get() does not wait for the page;
        # the fixed pause gives the page's requests time to be recorded.
        time.sleep(wait_seconds)
        entries = self.proxy.har['log']["entries"]
        for payload in self._matching_payloads(entries, url_fragment):
            yield payload

    def getCourse(self):
        """Store every purchased course that is not in the database yet.

        Reads the ``getAllCoursePurchasedRecordForPC`` response and inserts
        one ``course`` row with status ``ready`` per unseen course id.
        """
        payloads = self._capture_payloads(
            "https://kaiwu.lagou.com/hasBuy/special",
            "/getAllCoursePurchasedRecordForPC", 10)
        for payload in payloads:
            records = payload['content']['allCoursePurchasedRecord'][1][
                'courseRecordList']
            for record in records:
                if ses.query(course).filter_by(origId=record['id']).all():
                    continue  # already stored
                cou = course()
                cou.title = record['name']
                cou.cover = record['image']
                cou.description = ''
                cou.origId = record['id']
                cou.status = "ready"
                cou.createAt = self._timestamp()
                ses.add(cou)
                ses.commit()

    def getCatalog(self):
        """Store the sections and lessons of every course with status ``ready``.

        For each such course the ``getCourseLessons`` response is read; unseen
        sections become ``catalog`` rows and their lessons become
        ``courseDetail`` rows with status ``ready``. The course is marked
        ``done`` once a response that lists at least one section was handled.
        """
        courses = ses.query(course).filter_by(status='ready').all()
        for item in courses:
            # str(): the id may be numeric; the detail URL in getCourseDetail
            # is built the same way.
            page_url = ("https://kaiwu.lagou.com/course/courseInfo.htm?courseId="
                        + str(item.origId))
            for payload in self._capture_payloads(page_url, "/getCourseLessons", 10):
                sections = payload['content']['courseSectionList']
                for section in sections:
                    if not ses.query(catalog).filter_by(origId=section['id']).all():
                        self._store_section(item, section)
                if sections:
                    item.status = 'done'
                ses.commit()

    def _store_section(self, item, section):
        """Insert one catalog row for ``section`` plus a row per lesson in it."""
        cou = catalog()
        cou.title = section['sectionName']
        cou.origId = section['id']
        cou.status = "ready"
        cou.courseId = item.id
        cou.origCourseId = item.origId
        cou.createAt = self._timestamp()
        ses.add(cou)
        # Commit before the lessons are created because they reference cou.id.
        ses.commit()
        for lesson in section['courseLessons']:
            detail = courseDetail()
            detail.Title = lesson['theme']
            detail.origId = lesson['id']
            detail.status = 'ready'
            detail.createAt = self._timestamp()
            detail.catalogId = cou.id
            detail.origCourseId = lesson['courseId']
            detail.courseId = item.id
            ses.add(detail)
            ses.commit()
        cou.status = "done"

    def getCourseDetail(self):
        """Fetch and store the text content of every lesson with status ``ready``.

        Each lesson page is opened, the ``getCourseLessonDetail`` response is
        read, and its ``textContent`` is saved on the lesson, which is then
        marked ``done``.
        """
        lessons = ses.query(courseDetail).filter_by(status='ready').all()
        for item in lessons:
            page_url = ("https://kaiwu.lagou.com/course/courseInfo.htm?courseId="
                        + str(item.origCourseId)
                        + "#/detail/pc?id=" + str(item.origId))
            for payload in self._capture_payloads(page_url, "/getCourseLessonDetail", 4):
                item.Content = payload['content']['textContent']
                item.status = "done"
                ses.commit()


if __name__ == '__main__':
    # Run the scraping stages in order: courses first, then their
    # sections/lessons, then the text of each lesson.
    crawler = lagouSpider()
    for stage in (crawler.getCourse, crawler.getCatalog, crawler.getCourseDetail):
        stage()
    print('跑完了')

 说明:

使用browsermob-proxy生成代理

开启新线程,通过 cmd_process 方法以固定的调试端口打开浏览器

使用debuggerAddress连接打开的浏览器

 

服务器费用不足...

CSS3机械工业风齿轮转动特效

时尚的社交网站前端界面HTML模板 - Cirkle

区块链数字货币管理系统网页模板 - Cryptio

物品租赁买卖业务平台HTML5模板 - Doremi

bootstrap框架web UI工具包后台模板 - MegaDin

JavaScript模拟网页星际旅行特效

Vue轻量级后台管理系统基础框架模板--精

UFO适合404页面的jQuery特效

Vue 3、Vite和TailwindCss开发的管理面板

bootstrap风格后台界面管理系统模板 - Voler

一个alert网页小部件

程序员向妹子表白专用代码

JS+CSS3卡通汽车行驶特效

HTML5大气导航栏鼠标悬停特效

Hotspot Map - 强大的图片热点注释和提示工具

滚动下拉图片切斜HTML5特效

给乌龟喂食卡通HTML5特效

黑客帝国文字雨矩阵动画特效

建筑工程机械设备租赁网站HTML模板 - Antek

时尚和轻量设计Bootstrap4管理系统模板 - Sunny

服务器费用不足...
 工具推荐 更多»