原创 

python selenium debuggerAddress 模式自动化打开固定端口浏览器并设置代理

分类:python,爬虫    532人阅读    IT小君  2022-09-30 21:35
import json
import os
import threading
import time
from urllib import parse

from browsermobproxy import Server
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait

from mailService import mailService
from model import course, ses, catalog, courseDetail


class lagouSpider:
    """Scrape purchased course content from kaiwu.lagou.com.

    Strategy: start a browsermob-proxy capture proxy, launch a real Chrome
    with a fixed remote-debugging port routed through that proxy (in a
    background thread, via ``cmd_process``), then attach selenium to the
    running browser through ``debuggerAddress``.  Page XHR responses are
    read back out of the proxy's HAR log and persisted with the project's
    SQLAlchemy session (``ses``) into ``course`` / ``catalog`` /
    ``courseDetail`` rows.
    """

    def __init__(self):
        # Windows launcher for browsermob-proxy; a Linux path is kept below
        # for reference.
        browsermobproxy_location = r"F:\browsermob-proxy-2.1.4\bin\browsermob-proxy.bat"
        # browsermobproxy_location = r"/opt/browsermob-proxy-2.1.4/bin/browsermob-proxy"

        # Renamed from `dict` — the original shadowed the builtin.
        server_options = {'port': 9528}
        self.server = Server(path=browsermobproxy_location, options=server_options)
        self.server.start()
        self.proxy = self.server.create_proxy()

        # self.proxy.proxy is "host:port" (no scheme), so urlparse places it
        # in .path — that bare address is what Chrome's --proxy-server wants.
        proxy_address = parse.urlparse(self.proxy.proxy).path
        print(proxy_address)
        # os.system in cmd_process blocks until Chrome exits, so run it in a
        # separate thread and let __init__ continue.
        sub = threading.Thread(target=self.cmd_process, args=(proxy_address,))
        sub.start()

        chrome_options = webdriver.ChromeOptions()
        capabilities = DesiredCapabilities.CHROME
        # "none": don't wait for full page load — the code polls the HAR
        # after a fixed sleep instead.
        capabilities["pageLoadStrategy"] = "none"
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('lang=zh_CN.UTF-8')
        # chrome_options.add_argument('--proxy-server=%s' % self.proxy.proxy)
        # Attach to the Chrome started by cmd_process instead of spawning a
        # new one; the port must match --remote-debugging-port there.
        chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9527")
        self.browser = webdriver.Chrome(executable_path=r'F:\Program Files (x86)\webdriver\chromedriver.exe',
                                        chrome_options=chrome_options, desired_capabilities=capabilities)
        self.wait = WebDriverWait(self.browser, 60)
        self.fileCount = 0
        self.sender = mailService()
        # self.log = Logger('./logs/all.log', level='debug')
        self.h = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}

    def cmd_process(self, url):
        """Launch Chrome on debugging port 9527, proxied through *url*.

        Blocks until the Chrome process exits, so callers run this in a
        thread (see __init__).
        """
        # Raw string: the original non-raw literal relied on invalid escape
        # sequences (\P, \G, ...) being passed through.
        os.chdir(r'C:\Program Files (x86)\Google\Chrome\Application')
        # you should set these options in this place
        cmd_ = '''
            chrome.exe --remote-debugging-port=9527 --user-data-dir="D:/test"  --ignore-certificate-errors --proxy-server={0}'''.format(
            url)
        os.system(cmd_)

    def login(self):
        """Open the purchased-courses page and wait for a manual login."""
        self.browser.get("https://kaiwu.lagou.com/hasBuy/special")
        time.sleep(50)

    def getCourse(self):
        """Harvest the purchased-course list into `course` rows.

        Captures the /getAllCoursePurchasedRecordForPC XHR from the HAR and
        inserts any course not already present (matched on origId) with
        status 'ready' for the later stages.
        """
        # Original passed 'captureContent' twice; a dict keeps only one.
        self.proxy.new_har("dataStore", options={'captureContent': True})
        self.browser.get("https://kaiwu.lagou.com/hasBuy/special")
        time.sleep(10)  # crude wait for the XHRs to land in the HAR
        entries = self.proxy.har['log']["entries"]
        for entry in entries:
            if 'request' in entry.keys():
                ur = entry['request']['url']
                print(ur)
                if "/getAllCoursePurchasedRecordForPC" in ur:
                    _response = entry['response']
                    if 'text' in _response['content']:
                        # Index [1] picks one fixed purchase group — assumes
                        # the account's target courses live there; confirm
                        # against the live payload.
                        datas = json.loads(_response['content']['text'])['content']['allCoursePurchasedRecord'][1][
                            'courseRecordList']
                        for item in datas:
                            exists = ses.query(course).filter_by(origId=item['id']).all()
                            if len(exists) <= 0:
                                cou = course()
                                cou.title = item['name']
                                cou.cover = item['image']
                                cou.description = ''
                                cou.origId = item['id']
                                cou.status = "ready"
                                cou.createAt = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                                ses.add(cou)
                                ses.commit()

    def getCatalog(self):
        """For each 'ready' course, harvest its section/lesson catalog.

        Opens the course page, captures the /getCourseLessons XHR, inserts
        `catalog` rows per section and `courseDetail` rows per lesson, then
        marks the section and course 'done'.
        """
        courses = ses.query(course).filter_by(status='ready').all()
        for item in courses:
            self.proxy.new_har("dataStore", options={'captureContent': True})
            # str(): origId comes from JSON as an int; getCourseDetail
            # already casts — the original would raise TypeError here.
            self.browser.get("https://kaiwu.lagou.com/course/courseInfo.htm?courseId=" + str(item.origId))
            time.sleep(10)
            entries = self.proxy.har['log']["entries"]
            for entry in entries:
                if 'request' in entry.keys():
                    ur = entry['request']['url']
                    print(ur)
                    if "/getCourseLessons" in ur:
                        _response = entry['response']
                        if 'text' in _response['content']:
                            datas = json.loads(_response['content']['text'])['content']['courseSectionList']
                            for ite in datas:
                                exists = ses.query(catalog).filter_by(origId=ite['id']).all()
                                if len(exists) <= 0:
                                    cou = catalog()
                                    cou.title = ite['sectionName']
                                    cou.origId = ite['id']
                                    cou.status = "ready"
                                    cou.courseId = item.id
                                    cou.origCourseId = item.origId
                                    cou.createAt = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                                    ses.add(cou)
                                    ses.commit()
                                    for it in ite['courseLessons']:
                                        detail = courseDetail()
                                        detail.Title = it['theme']
                                        detail.origId = it['id']
                                        detail.status = 'ready'
                                        detail.createAt = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                                        detail.catalogId = cou.id
                                        detail.origCourseId = it['courseId']
                                        detail.courseId = item.id
                                        ses.add(detail)
                                        ses.commit()
                                    cou.status = "done"
                                # NOTE(review): course is flagged done inside
                                # the section loop — an empty section list
                                # would leave it 'ready' for a retry.
                                item.status = 'done'
                            ses.commit()

    def getCourseDetail(self):
        """Fill in lesson text for every 'ready' courseDetail row.

        Opens each lesson's detail URL, captures /getCourseLessonDetail from
        the HAR, stores its textContent and marks the row 'done'.
        """
        courses = ses.query(courseDetail).filter_by(status='ready').all()
        for item in courses:
            self.proxy.new_har("dataStore", options={'captureContent': True})
            self.browser.get(
                "https://kaiwu.lagou.com/course/courseInfo.htm?courseId=" + str(
                    item.origCourseId) + "#/detail/pc?id=" + str(item.origId))
            time.sleep(4)
            entries = self.proxy.har['log']["entries"]
            for entry in entries:
                if 'request' in entry.keys():
                    ur = entry['request']['url']
                    print(ur)
                    if "/getCourseLessonDetail" in ur:
                        _response = entry['response']
                        if 'text' in _response['content']:
                            textContent = json.loads(_response['content']['text'])['content']['textContent']
                            item.Content = textContent
                            item.status = "done"
                            ses.commit()


def _run_pipeline():
    """Run the three scrape stages in order: courses, catalogs, details."""
    spider = lagouSpider()
    spider.getCourse()
    spider.getCatalog()
    spider.getCourseDetail()
    print('跑完了')


if __name__ == '__main__':
    _run_pipeline()

 说明:

使用browsermob-proxy生成代理

开启新线程通过cmd_process 方法打开固定端口浏览器

使用debuggerAddress连接打开的浏览器

 

支付宝打赏 微信打赏

如果文章对你有帮助,欢迎点击上方按钮打赏作者

 工具推荐 更多»