Python如何爬取京东的评价信息
admin
2023-07-06 18:44:29
0

Python如何爬取京东的评价信息

所需模块:requests、BeautifulSoup(bs4,解析时使用 lxml 解析器)

import re
import time
import csv
import requests
from bs4 import BeautifulSoup

def write_a_row_in_csv(data, csv_doc, encoding=None):
    """Append one record as a single row to a CSV file.

    The file is opened in append mode on every call, so it is created if it
    does not exist and existing rows are never overwritten.

    Args:
        data: Iterable of cell values for the row (e.g. a tuple of product
            fields); each value is converted by ``csv.writer``.
        csv_doc: Path of the CSV file to append to.
        encoding: Text encoding used to open the file. ``None`` (the default)
            keeps the platform default encoding, exactly as before; pass
            e.g. ``'utf-8-sig'`` for output that is portable across systems.
            Use the same encoding as whatever wrote the rest of the file.
    """
    # newline='' lets the csv module control line endings itself, which avoids
    # blank lines between rows on Windows.
    with open(csv_doc, 'a', newline='', encoding=encoding) as f:
        csv.writer(f).writerow(data)

# Download the JD search result page (the keyword in the query string is the
# URL-encoded form of "华为p20") with a browser-like User-Agent header and
# print the HTTP status code of the reply.
url = 'https://search.jd.com/Search?keyword=%E5%8D%8E%E4%B8%BAp20&enc=utf-8&suggest=1.def.0.V13&wq=%E5%8D%8E%E4%B8%BA&pvid=f47b5d05bba84d9dbfabf983575a6875'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
}
# requests applies no timeout unless one is given, so without this a stalled
# connection would block the script indefinitely.
response = requests.get(url, headers=headers, timeout=10)
print(response.status_code)

# Keep a UTF-8 copy of the downloaded search page on disk as html.html.
with open('html.html', mode='w', encoding='utf8') as html_file:
    html_file.write(response.text)

# Start phone.csv from scratch ('w' truncates any previous file) with just
# the column header row; the product rows are appended to it afterwards.
fields = ('id', '名称', '价格', '评价人数', '好评率')
with open('phone.csv', 'w', newline='') as f:
    csv.writer(f).writerow(fields)

# Parse the search page, then for each of the first three result items:
# extract name, link/id and price, request the comment summary for that
# product id, and append one row (id, name, price, comment count, good rate)
# to phone.csv.
soup_all = BeautifulSoup(response.content, 'lxml')
sp_all_items = soup_all.find_all('li', attrs={'class': 'gl-item'})
for soup in sp_all_items[:3]:
    print('-' * 50)

    # find() returns None when an element is absent, so every lookup is
    # checked before use; an item with a different layout is skipped instead
    # of aborting the whole run with an AttributeError/IndexError.
    name_div = soup.find('div', attrs={'class': 'p-name p-name-type-2'})
    name_tag = name_div.find('em') if name_div is not None else None
    item = name_div.find('a') if name_div is not None else None
    href = item.get('href') if item is not None else None
    # The product id is taken as the first run of digits in the item link.
    id_match = re.search(r'(\d+)', href) if href else None
    price = soup.find_all('div', attrs={'class': 'p-price'})
    price_tag = price[0].i if price else None
    if name_tag is None or id_match is None or price_tag is None:
        print('skip: search result item is missing name, link or price')
        continue

    name = name_tag.text
    item_id = id_match.group()
    price_text = price_tag.string
    print('name: ', name)
    print('item: ', href, item_id)
    print('price:', price_text)

    # The comment link is only printed for information, so it is optional.
    comment = soup.find_all('div', attrs={'class': 'p-commit'})
    comment_link = comment[0].find('a') if comment else None
    if comment_link is not None:
        print('comment url:', comment_link.get('href'))
    # Short pause between the requests made for consecutive items.
    time.sleep(0.2)

    # Original author's note: the comment request needs a referer header,
    # here pointing at the product's own item page.
    url = f'https://sclub.jd.com/comment/productPageComments.action?productId={item_id}&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1'
    headers = {
        "referer": f"https://item.jd.com/{item_id}.html",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    # Explicit timeout so one stalled request cannot hang the loop.
    response = requests.get(url, headers=headers, timeout=10)
    # Dump the raw reply (overwritten on every iteration) for inspection.
    with open('html.json', 'w', encoding='utf8') as f:
        f.write(response.text)

    # A non-JSON body makes response.json() raise a ValueError subclass, and
    # an unexpected payload shape raises KeyError/TypeError; report and move
    # on to the next item rather than losing the rows already written.
    try:
        data = response.json()
        summary = data['productCommentSummary']
        comment_count = summary['commentCount']
        good_rate = summary['goodRate']
    except (ValueError, KeyError, TypeError):
        print('skip: no usable comment summary for item', item_id,
              '- HTTP status', response.status_code)
        continue
    print('评价人数:', comment_count)
    print('好评率:', good_rate)

    # record data into CSV sheet
    write_a_row_in_csv(('id' + item_id, name, price_text, comment_count, good_rate), 'phone.csv')

相关内容

热门资讯

我国科学家为细胞信号“导航”开... 新华社济南5月31日电(记者张力元)人体细胞犹如一座精密的通信城市,每天都有大量“指令”穿梭传递,调...
极端大风突袭哈尔滨!过山车停摆... 极目新闻记者 詹钘5月31日,受强对流天气影响,哈尔滨国际会展中心体育场相关设施受到损坏,原计划当晚...
三原电缆取得电缆接头连接用防护... 国家知识产权局信息显示,上海三原电缆附件有限公司取得一项名为“一种电缆接头连接用防护结构”的专利,授...
原创 识... 还是那句话,机圈苦大屏久已…… 虽然大屏有大屏的美,但是小屏也有小屏的俏。在大屏旗舰占据主流的手机市...
玄戒技术取得分频电路专利,实现... 国家知识产权局信息显示,北京玄戒技术有限公司取得一项名为“分频电路、分频器、射频芯片和电子设备”的专...
为什么今年香会基调明显变了 5月29日—31日在新加坡举行的第23届香格里拉对话会(简称“香会”),见证着元首引领下大国关系继续...
成本几毛钱、假驱蚊液香精兑水,... 入夏升温,蚊虫进入活跃期,驱蚊防护成为民生刚需,《财经调查》持续接到消费者投诉,他们买到的多款网红驱...
越来越多80后90后,正在丧失... 六一儿童节到来之际,朋友圈里开始出现一种熟悉的热闹。有人晒出零食礼包,有人半开玩笑地向伴侣讨礼物,还...
洋保电子取得用于低温环境的电气... 国家知识产权局信息显示,洋保电子(太仓)有限公司取得一项名为“一种用于低温环境的电气柜”的专利,授权...
中日韩飞手争霸宁波!2026无... 潮新闻客户端 记者 陈冲 通讯员 朱凝 5月31日,2026小遛·无人机竞速世界杯(中国·宁波鄞州站...