import json
import os
import threading
import time
from urllib import parse
from browsermobproxy import Server
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from mailService import mailService
from model import course, ses, catalog, courseDetail
class lagouSpider:
    """Scraper for purchased courses on kaiwu.lagou.com.

    browsermob-proxy records the browser's network traffic as a HAR while
    Selenium drives a Chrome instance attached through its remote debugging
    port. JSON API responses found in the captured traffic are persisted via
    the session/models imported from ``model``.
    """

    def __init__(self):
        # Launcher script for browsermob-proxy (Windows build; the Linux
        # path is kept below for reference).
        browsermobproxy_location = r"F:\browsermob-proxy-2.1.4\bin\browsermob-proxy.bat"
        # browsermobproxy_location = r"/opt/browsermob-proxy-2.1.4/bin/browsermob-proxy"
        # Renamed from `dict`: the original shadowed the builtin.
        server_options = {'port': 9528}
        self.server = Server(path=browsermobproxy_location, options=server_options)
        self.server.start()
        self.proxy = self.server.create_proxy()
        # proxy.proxy is a bare "host:port" string; with no scheme, urlparse
        # leaves it in the .path component.
        url = parse.urlparse(self.proxy.proxy).path
        print(url)
        # Launch the proxied Chrome from a worker thread: cmd_process blocks
        # inside os.system() for as long as Chrome runs.
        sub = threading.Thread(target=self.cmd_process, args=(url,))
        sub.start()
        chrome_options = webdriver.ChromeOptions()
        capabilities = DesiredCapabilities.CHROME
        # Don't block on full page loads -- the scraper methods sleep and
        # then read whatever traffic the proxy captured.
        capabilities["pageLoadStrategy"] = "none"
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('lang=zh_CN.UTF-8')
        # chrome_options.add_argument('--proxy-server=%s' % self.proxy.proxy)
        # Attach to the Chrome started by cmd_process via its debug port.
        chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9527")
        self.browser = webdriver.Chrome(executable_path=r'F:\Program Files (x86)\webdriver\chromedriver.exe',
                                        chrome_options=chrome_options, desired_capabilities=capabilities)
        self.wait = WebDriverWait(self.browser, 60)
        self.fileCount = 0
        self.sender = mailService()
        # self.log = Logger('./logs/all.log', level='debug')
        self.h = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}

    def cmd_process(self, url):
        """Start Chrome with remote debugging on port 9527, routed through
        the given proxy; blocks until that Chrome process exits.

        :param url: "host:port" address of the browsermob proxy.
        """
        # Raw string: the original literal relied on sequences like \P and
        # \G not being escapes, which is deprecated in modern Python.
        os.chdir(r'C:\Program Files (x86)\Google\Chrome\Application')
        # you should set these options in this place
        cmd_ = '''
        chrome.exe --remote-debugging-port=9527 --user-data-dir="D:/test" --ignore-certificate-errors --proxy-server={0}'''.format(
            url)
        os.system(cmd_)

    def login(self):
        """Open the purchased-courses page and pause so the operator can log
        in manually in the attached browser window."""
        self.browser.get("https://kaiwu.lagou.com/hasBuy/special")
        # Fixed manual-login window -- TODO: replace with an explicit wait.
        time.sleep(50)

    def getCourse(self):
        """Capture the purchased-course list from network traffic and insert
        unseen courses with status 'ready'."""
        # NOTE: the original options dict wrote 'captureContent' twice;
        # a single entry is equivalent.
        self.proxy.new_har("dataStore", options={'captureContent': True})
        self.browser.get("https://kaiwu.lagou.com/hasBuy/special")
        time.sleep(10)  # give the page time to issue its XHRs
        entries = self.proxy.har['log']["entries"]
        for entry in entries:
            if 'request' in entry.keys():
                ur = entry['request']['url']
                print(ur)
                if "/getAllCoursePurchasedRecordForPC" in ur:
                    _response = entry['response']
                    if 'text' in _response['content']:
                        # Hard-coded [1] selects one fixed purchase group --
                        # assumes a stable payload layout; TODO confirm.
                        datas = json.loads(_response['content']['text'])['content']['allCoursePurchasedRecord'][1][
                            'courseRecordList']
                        for item in datas:
                            exists = ses.query(course).filter_by(origId=item['id']).all()
                            if not exists:  # skip courses already stored
                                cou = course()
                                cou.title = item['name']
                                cou.cover = item['image']
                                cou.description = ''
                                cou.origId = item['id']
                                cou.status = "ready"
                                cou.createAt = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                                ses.add(cou)
                                ses.commit()

    def getCatalog(self):
        """For every course in status 'ready', capture its section list and
        create catalog + courseDetail rows, then mark them 'done'."""
        courses = ses.query(course).filter_by(status='ready').all()
        for item in courses:
            self.proxy.new_har("dataStore", options={'captureContent': True})
            # str(): origId may be stored as an int. The original relied on
            # it already being a string here while getCourseDetail cast it.
            self.browser.get("https://kaiwu.lagou.com/course/courseInfo.htm?courseId=" + str(item.origId))
            time.sleep(10)
            entries = self.proxy.har['log']["entries"]
            for entry in entries:
                if 'request' in entry.keys():
                    ur = entry['request']['url']
                    print(ur)
                    if "/getCourseLessons" in ur:
                        _response = entry['response']
                        if 'text' in _response['content']:
                            datas = json.loads(_response['content']['text'])['content']['courseSectionList']
                            for ite in datas:
                                exists = ses.query(catalog).filter_by(origId=ite['id']).all()
                                if not exists:
                                    cou = catalog()
                                    cou.title = ite['sectionName']
                                    cou.origId = ite['id']
                                    cou.status = "ready"
                                    cou.courseId = item.id
                                    cou.origCourseId = item.origId
                                    cou.createAt = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                                    ses.add(cou)
                                    # Commit first so cou.id is populated for
                                    # the child rows below.
                                    ses.commit()
                                    for it in ite['courseLessons']:
                                        detail = courseDetail()
                                        detail.Title = it['theme']
                                        detail.origId = it['id']
                                        detail.status = 'ready'
                                        detail.createAt = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                                        detail.catalogId = cou.id
                                        detail.origCourseId = it['courseId']
                                        detail.courseId = item.id
                                        ses.add(detail)
                                        ses.commit()
                                    cou.status = "done"
            item.status = 'done'
            ses.commit()

    def getCourseDetail(self):
        """For every lesson row in status 'ready', capture the lesson detail
        API response and store its text content, then mark it 'done'."""
        courses = ses.query(courseDetail).filter_by(status='ready').all()
        for item in courses:
            self.proxy.new_har("dataStore", options={'captureContent': True})
            self.browser.get(
                "https://kaiwu.lagou.com/course/courseInfo.htm?courseId=" + str(
                    item.origCourseId) + "#/detail/pc?id=" + str(item.origId))
            time.sleep(4)
            entries = self.proxy.har['log']["entries"]
            for entry in entries:
                if 'request' in entry.keys():
                    ur = entry['request']['url']
                    print(ur)
                    if "/getCourseLessonDetail" in ur:
                        _response = entry['response']
                        if 'text' in _response['content']:
                            textContent = json.loads(_response['content']['text'])['content']['textContent']
                            item.Content = textContent
                            item.status = "done"
                            ses.commit()
if __name__ == '__main__':
    # Pipeline: course list -> per-course catalogs -> per-lesson details.
    crawler = lagouSpider()
    crawler.getCourse()
    crawler.getCatalog()
    crawler.getCourseDetail()
    print('跑完了')
# 说明 (Notes):
# 使用browsermob-proxy生成代理 (use browsermob-proxy to create a capturing proxy)
# 开启新线程通过cmd_process 方法打开固定端口浏览器 (a new thread runs cmd_process to launch Chrome on a fixed debug port)
# 使用debuggerAddress连接打开的浏览器 (Selenium attaches to that browser via debuggerAddress)