import requests
from lxml import etree
import re
import sys
import parsel
# Request headers used for the www.dianping.com page request.
headers = {
    # Fill in your own Cookie.
    # NOTE(review): this is a hard-coded session cookie; avoid committing a real one.
    "Cookie": "fspop=test; cy=110; cye=hefei; _lxsdk_cuid=17dfb8bf972c8-0c4c616511a98a-4303066-144000-17dfb8bf972c8; _lxsdk=17dfb8bf972c8-0c4c616511a98a-4303066-144000-17dfb8bf972c8; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1640602795; _hc.v=d150de83-84c5-2ee1-a1c6-aa83653791e1.1640602795; _dp.ac.v=c8f2cef9-b940-47e4-b32e-9c884fbaf54d; thirdtoken=581f2cb3-714f-4c48-a7b8-292e1947347f; ll=7fd06e815b796be3df069dec7836c3df; ua=SuperHeart; ctu=bcdbecb4bc930198b20a4167e454de79d3a04ce69dd35fbacb12c5802ac08084; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1640615271",
    "Host": "www.dianping.com",
    "Referer": "http://www.dianping.com/shop/l1MNeHgCskgs6qiN",
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/87.0.4280.88 Safari/537.36"
    ),
}

# Minimal header set (User-Agent only, no Cookie/Host/Referer); it carries the
# same User-Agent string as `headers`.
headers1 = {"User-Agent": headers["User-Agent"]}
# Review page to scrape.
url = "http://www.dianping.com/shop/G6KeVfRgAOACdDbG/review_all"
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url=url, headers=headers, timeout=10)
response.encoding = 'UTF-8'  # decode the page body as UTF-8
# Save the fetched HTML under sys.path[0] (the script's directory when run as a script).
with open(sys.path[0] + "/" + '大众点评.html', 'w', encoding='UTF-8') as f:
    f.write(response.text)

content = response.content
html = etree.HTML(content)
# The stylesheet is located purely by position: the 4th <link> inside <head>.
css_links = html.xpath('/html/head/link[4]/@href')
if not css_links:
    # Fail with a readable message instead of a bare IndexError.
    raise RuntimeError(
        "no /html/head/link[4] href found in the fetched page; "
        "the page layout differs or the request was blocked "
        "(inspect the saved HTML file)"
    )
# The href is prefixed with 'http:' to form the full stylesheet URL.
css_url = 'http:' + css_links[0]
print(css_url)

css_response = requests.get(url=css_url, headers=headers1, timeout=10)
# The original assigned this encoding to `response` (the HTML page) instead of
# the stylesheet response, which left the CSS decoding untouched and changed
# how any later `response.text` access would decode the page.
css_response.encoding = 'windows-1252'
print(css_response.text)
# Keep a copy of the stylesheet on disk as well.
with open(sys.path[0] + "/" + r'大众点评.css', 'w', encoding='UTF-8') as f:
    f.write(css_response.text)
# XPath cannot be used on the stylesheet, so a regex extracts the svgmtsi rule:
# group 1 is the class-name prefix, group 2 the background-image (SVG) URL.
svg_group = re.search(r'svgmtsi\[class\^="(\w+)"].*?background-image: url\((.*?)\);', css_response.text)
if svg_group is None:
    # Fail with a readable message instead of "'NoneType' is not subscriptable".
    raise RuntimeError(
        "no svgmtsi[class^=...] rule with a background-image URL was found "
        "in the downloaded stylesheet"
    )
# key_letter is the prefix shared by the class attribute values of the glyph tags.
key_letter = svg_group[1]
svg_url = 'http:' + svg_group[2]

# A timeout keeps the script from hanging forever on a stalled connection.
svg_response = requests.get(url=svg_url, headers=headers1, timeout=10)
# Use the downloaded text directly rather than writing the file and reading it back.
svg_html = svg_response.text
# Keep a copy of the SVG on disk.
with open(sys.path[0] + "/" + "大众点评.svg", mode='w', encoding='UTF-8') as f:
    f.write(svg_html)
# Build resDic: CSS class name -> the single character it stands for in the SVG.

# Each <text> row of the SVG is indexed by its integer y attribute.
texts = re.findall(r'text x="(.*?)" y="(.*?)">(.*?)</text>', svg_html)
lines = {int(row_y): row_chars for _, row_y, row_chars in texts}

with open(sys.path[0] + "/" + '大众点评.css', 'r', encoding='utf-8') as f:
    css_html = f.read()

# Rules look like `.<prefix>xxx{background:-X.0px -Y.0px;}`. Raw strings avoid
# invalid-escape warnings, and the prefix is escaped before being interpolated.
class_pattern = (
    r'\.(' + re.escape(key_letter) + r'\w+)\{background:-(\d+)\.0px -(\d+)\.0px;\}'
)
class_map = [
    (cls_name, int(x), int(y))
    for cls_name, x, y in re.findall(class_pattern, css_html)
]

resDic = {}
for cls_name, x, y in class_map:
    # A class belongs to the row whose y attribute is exactly 23 more than the
    # CSS y offset; the character index is the CSS x offset divided by 14.
    # NOTE(review): 23 and 14 are hard-coded here — presumably tied to the
    # SVG's row layout and glyph width; confirm against the current SVG.
    row = lines.get(y + 23)
    index = x // 14
    # Classes with no matching row or column are skipped instead of crashing.
    if row is not None and index < len(row):
        resDic[cls_name] = row[index]
# Decode and print every review found in the saved HTML page.
with open(sys.path[0] + "/" + '大众点评.html', mode="r", encoding="UTF-8") as f:
    html = f.read()

# Capture the text between the review container's opening tag and its
# "less-words" div.
comment_list = re.findall(
    r'<div class="review-words Hide">\s+(.*?)\s+<div class="less-words">', html
)
# Matches placeholders such as <svgmtsi class="izu41"></svgmtsi>.
glyph_tag = re.compile(r'<svgmtsi class="(\w+)"></svgmtsi>')

for raw_comment in comment_list:
    # Replace each placeholder with its mapped character in a single pass;
    # a class missing from resDic leaves the tag in place instead of raising
    # KeyError.
    comment = glyph_tag.sub(
        lambda match: resDic.get(match.group(1), match.group(0)), raw_comment
    )
    # Remove <img .../> tags (the emoji images embedded in reviews).
    comment = re.sub(r'<img (.*?)/>', "", comment)
    # Remove HTML entities such as &nbsp; or &#x20;. The pattern is limited to
    # entity syntax so ordinary text between a stray '&' and a later ';' is kept.
    comment = re.sub(r'&#?\w+;', "", comment)
    print(comment + "\n")