Python爬取豆瓣高分电影前250名
admin
2023-07-12 04:43:30
0
import requests
import pymysql
import time
import re
import xlwt
from lxml import etree

# HTTP request headers used for every page fetch: a desktop Chrome
# User-Agent string plus a fixed Cookie string.
# NOTE(review): the cookie value is hard-coded; presumably it was captured
# from a browser session and may have expired — confirm before relying on it.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
          'Cookie': 'gr_user_id = c6f58a39 - ea25 - 4f58 - b448 - 545070192c4e;59a81cc7d8c04307ba183d331c373ef6_gr_session_id = e8e4b66f - 440a - 4ae7 - a76a - fe2dd2b34a26;59a81cc7d8c04307ba183d331c373ef6_gr_last_sent_sid_with_cs1 = e8e4b66f - 440a - 4ae7 - a76a - fe2dd2b34a26;59a81cc7d8c04307ba183d331c373ef6_gr_last_sent_cs1 = N % 2FA;59a81cc7d8c04307ba183d331c373ef6_gr_session_id_e8e4b66f - 440a - 4ae7 - a76a - fe2dd2b34a26 = true;grwng_uid = 9ec14ad9 - 5ac0 - 4bb1 - 81c1 - bc60d2685710;abtest_ABTest4SearchDate = b;xzuuid = 79426b52;_uab_collina = 154660443606130958890473;TY_SESSION_ID = 907f32df - c060 - 49ca - b945 - 98215cc03475;rule_math = pvzq3r06hi'}

# Module-level MySQL connection to the local `doubanmovie` database. It is
# opened as soon as this module is executed or imported.
# NOTE(review): the user name and password are hard-coded in source —
# consider loading them from configuration or the environment instead.
conn = pymysql.connect(host= 'localhost',user= 'root',passwd='momiao5201314',db='doubanmovie',port=3306,charset='utf8')
cursor = conn.cursor() # create the cursor object used for all SQL statements

# Disabled Excel-export setup. It is kept as a bare triple-quoted string, so
# none of the statements inside are executed.
# NOTE(review): if this is ever re-enabled, `workbook.write(...)` presumably
# should be `worksheet.write(...)`, and the header entry 'actors,style' looks
# like two column names fused into one string — confirm before use.
'''
# 创建一个workbook设置编码
workbook = xlwt.Workbook(encoding = 'utf-8')
# 创建一个worksheet
worksheet = workbook.add_sheet('My Worksheet')
#定义表头
header = ['movie_name','director','actors,style','country','release_time','time','score']
for h in range(len(header)):
    workbook.write(0,h,header[h])
'''

def get_movie_url(url):
    """Fetch one listing page and process every movie it links to.

    Downloads *url* with the module-level ``headers``, collects the ``href``
    of each ``<a>`` directly under a ``<div class="hd">`` element, and hands
    each link to ``get_movie_info`` in document order.
    """
    response = requests.get(url, headers=headers)
    tree = etree.HTML(response.text)
    for detail_link in tree.xpath('//div[@class="hd"]/a/@href'):
        get_movie_info(detail_link)

def get_movie_info(url):
    """Fetch one movie detail page and insert its fields into the database.

    Downloads *url* with the module-level ``headers``, extracts eight fields
    (name, director, cast, genre, country, release date, runtime, score)
    using XPath and regular expressions, prints the movie name, and inserts
    one row into the ``doubanmovie`` table through the module-level
    ``cursor``. Nothing is committed here; the caller commits.

    List-valued fields are stored via ``str(list)``, exactly as the original
    code did, so existing rows and new rows keep the same format.

    A page without the cast node is skipped (no row is inserted) and the
    skip is reported on stdout instead of being silently ignored.
    """
    html = requests.get(url, headers=headers)
    page = html.text
    selector = etree.HTML(page)

    # 1. movie name
    # NOTE(review): the heading is addressed as h2 here, as in the original;
    # confirm against the live page markup.
    movie_name = selector.xpath('//*[@id="content"]/h2/span[1]/text()')
    # 2. director
    director = selector.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')

    # 3. cast: take the whole text content of the cast container node.
    cast_nodes = selector.xpath('//*[@id="info"]/span[3]/span[2]')
    if not cast_nodes:
        # Same outcome as the original IndexError path (no insert), but
        # visible to whoever is running the crawl.
        print('skipped {}: cast node not found'.format(url))
        return
    actor = cast_nodes[0].xpath('string(.)')

    # NOTE(review): the HTML tag anchors in the four patterns below were
    # missing from the source (the bare '(.*?)' that remained can only match
    # empty strings). They are reconstructed from Douban's detail-page
    # markup — confirm against a live page.

    # 4. genre: the first two genres concatenated, as before. Pages with a
    # single genre are now kept instead of being dropped by an IndexError.
    genres = re.findall(r'<span property="v:genre">(.*?)</span>', page, re.S)
    style = ''.join(genres[:2])
    # 5. country / region of production
    country = re.findall(
        r'<span class="pl">制片国家/地区:</span>(.*?)<br/>', page, re.S)
    # 6. release date(s)
    release_time = re.findall(r'上映日期:</span>.*?>(.*?)</span>', page, re.S)
    # 7. runtime (named `runtime` so the `time` module is not shadowed)
    runtime = re.findall(r'片长:</span>.*?>(.*?)</span>', page, re.S)
    # 8. score
    score = selector.xpath(
        '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')

    print(str(movie_name))
    # Parameterized insert: the driver escapes the values.
    cursor.execute(
        "insert into doubanmovie(name,director,actor,style,country,release_time,time,score) values(%s,%s,%s,%s,%s,%s,%s,%s)",
        (str(movie_name), str(director), str(actor), str(style),
         str(country), str(release_time), str(runtime), str(score)))


if __name__ == '__main__':
    # The Top-250 listing is paged 25 movies at a time: start=0,25,...,225.
    urls = ['https://movie.douban.com/top250?start={}&filter='.format(num)
            for num in range(0, 250, 25)]
    try:
        for url in urls:
            get_movie_url(url)
            time.sleep(2)  # pause between listing pages to limit request rate
        conn.commit()
    finally:
        # Release the database resources whether or not the crawl completed.
        cursor.close()
        conn.close()

Python爬取豆瓣高分电影前250名

相关内容

热门资讯

OpenAI,正式组建机器人事... 人工智能(AI)领域巨头OpenAI发布公告,宣布大力扩张内部机器人事业部,正式全面切入硬件赛道,实...
星火空间完成近亿元Pre-A轮... 据星火空间消息,6月1日,合肥星火空间科技有限公司完成近亿元Pre-A轮融资。本轮融资由云泽资本和轨...
刚刚,宇树IPO闪电过会!王兴... 智东西 作者 | 许丽思 编辑 | 漠影 智东西6月1日报道,刚刚,宇树通过上交所上市委会议审议。 ...
京东工业发起百川计划 携手上游... 京东工业大模型生态发布会6月1日在北京举行,京东工业携手合作伙伴正式开启“百川计划”,从数据、模型、...
强脑科技预计今年机械手销量大涨... IT之家 6 月 2 日消息,据彭博社 2 日(今天)报道,强脑科技预计,随着中国人形机器人产业快速...
一图看懂差距!iPhone 1... 快科技6月2日消息,iPhone 18 Pro不同版本电池容量不同的相关话题冲上社交平台热搜榜,引发...
iPhone 18 Pro 或... 据科技狐,近日,知名爆料人 Sonny Dickson 分享了 iPhone 18 Pro 全套机模...
武契奇:不排除卸任总统后担任总... 塞尔维亚总统武契奇近期密集释放政坛人事与大选相关信号,明确无意在 2027 年总统任期届满后谋求连任...
6月新机夯到拉盘点,告诉你哪台... 现在这形势,手机升价是不可能躲得过的了,而且涨价期至少持续两年。那既然内存涨价躲不过,就只能选升级大...
伊朗公开已故最高领袖哈梅内伊安... 新华社德黑兰6月2日电 据伊朗伊斯兰共和国通讯社2日报道,根据伊朗已故最高领袖阿里·哈梅内伊生前遗愿...