How to scrape Zhihu topics with Python?
admin
2023-03-14 08:21:06

I'm building a "viewpoints" feature whose rooms work much like Zhihu's topics, so I had to find a way to crawl them. It took half a day of fiddling, but in the end it works reliably. The code is written in Python; if you don't know Python, you'll have to learn it on your own. If you do, just read the code below, which is definitely usable.

#coding:utf-8
"""
@author:haoning
@create time:2015.8.5
"""
from __future__ import division  # true division
from Queue import Queue
import json
import os
import re
import platform
import uuid
import urllib
import urllib2
import sys
import time
import MySQLdb as mdb
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding( "utf-8" )

headers = {
   'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
   'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
   'X-Requested-With':'XMLHttpRequest',
   'Referer':'https://www.zhihu.com/topics',
   'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}
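# NOTE: the Cookie above is tied to one login session and will expire;
# replace _xsrf, cap_id and the rest with values copied from your own
# logged-in browser session before running.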

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

queue= Queue() # work queue of topics to visit
nodeSet=set()
keywordSet=set()
stop=0
offset=-20
level=0
maxLevel=7
counter=0
base=""

conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()

def get_html(url):
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req,None,3) # a proxy should be added here
        html = response.read()
        return html
    except:
        pass
    return None

def getTopics():
    url = 'https://www.zhihu.com/topics'
    print url
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req) # a proxy should be added here
        html = response.read().decode('utf-8')
        print html
        soup = BeautifulSoup(html, 'html.parser')
        lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'})

        for li in lis:
            data_id=li.get('data-id')
            name=li.text
            curr.execute('select id from classify_new where name=%s',(name,))
            y= curr.fetchone()
            if not y:
                curr.execute('INSERT INTO classify_new(data_id,name)VALUES(%s,%s)',(data_id,name))
        conn.commit()
    except Exception as e:
        print "get topic error",e

def get_extension(name):  
    where=name.rfind('.')
    if where!=-1:
        return name[where:len(name)]
    return None

def which_platform():
    sys_str = platform.system()
    return sys_str

def GetDateString():
    when=time.strftime('%Y-%m-%d',time.localtime(time.time()))
    foldername = str(when)
    return foldername 

def makeDateFolder(par,classify):
    try:
        if os.path.isdir(par):
            newFolderName=par + '//' + GetDateString() + '//'  +str(classify)
            if which_platform()=="Linux":
                newFolderName=par + '/' + GetDateString() + "/" +str(classify)
            if not os.path.isdir( newFolderName ):
                os.makedirs( newFolderName )
            return newFolderName
        else:
            return None 
    except Exception as e:
        print "makeDateFolder error",e
    return None 

def download_img(url,classify):
    try:
        extention=get_extension(url)
        if(extention is None):
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req,None,3)
        dataimg=resp.read()
        name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention
        top="E://topic_pic"
        folder=makeDateFolder(top, classify)
        filename=None
        if folder is not None:
            filename  =folder+"//"+name
        try:
            if "e82bab09c_m" in str(url):
                return True
            if not os.path.exists(filename):
                file_object = open(filename,'w+b')
                file_object.write(dataimg)
                file_object.close()
                return '/room/default/'+GetDateString()+'/'+str(classify)+"/"+name
            else:
                print "file exist"
                return None
        except IOError as e1:
            print "e1=",e1
    except Exception as e:
        print "eee",e
        pass
    return None # if the download failed, the caller falls back to the original site's link

def getChildren(node,name):
    global queue,nodeSet
    try:
        url="https://www.zhihu.com/topic/"+str(node)+"/hot"
        html=get_html(url)
        if html is None:
            return
        soup = BeautifulSoup(html, 'html.parser')
        p_ch='父话题'
        node_name=soup.find('div', {'id' : 'zh-topic-title'}).find('h2').text
        topic_cla=soup.find('div', {'class' : 'child-topic'})
        if topic_cla is not None:
            try:
                p_ch=str(topic_cla.text)
                aList = soup.find_all('a', {'class' : 'zm-item-tag'}) # all child-topic links
                if u'子话题' in p_ch:
                    for a in aList:
                        token=a.get('data-token')
                        a=str(a).replace('\n','').replace('\t','').replace('\r','')
                        start=str(a).find('>')
                        end=str(a).rfind('<') # topic name sits between the first '>' and the last '<'
                        new_node=str(str(a)[start+1:end])
                        curr.execute('select id from rooms where name=%s',(new_node,)) # make sure the name is unique first
                        y= curr.fetchone()
                        if not y:
                            print "y=",y,"new_node=",new_node,"token=",token
                            queue.put((token,new_node,node_name))
            except Exception as e:
                print "add queue error",e
    except Exception as e:
        print "get html error",e

def getContent(n,name,p,top_id):
    try:
        global counter
        curr.execute('select id from rooms where name=%s',(name,)) # make sure the name is unique first
        y= curr.fetchone()
        print "exist?? ",y,"n=",n
        if not y:
            url="https://www.zhihu.com/topic/"+str(n)+"/hot"
            html=get_html(url)
            if html is None:
                return
            soup = BeautifulSoup(html, 'html.parser')
            title=soup.find('div', {'id' : 'zh-topic-title'}).find('h2').text
            pic_path=soup.find('a',{'id':'zh-avartar-edit-form'}).find('img').get('src')
            description=soup.find('div',{'class':'zm-editable-content'})
            if description is not None:
                description=description.text

            if (u"未归类" in title or u"根话题" in title): #允许入库,避免死循环
                description=None

            tag_path=download_img(pic_path,top_id)
            print "tag_path=",tag_path
            if tag_path is not None: # True means "keep the default avatar"
                if tag_path==True:
                    tag_path=None
                father_id=2 # defaults to the "misc" category
                curr.execute('select id from rooms where name=%s',(p,))
                results = curr.fetchall()
                for r in results:
                    father_id=r[0]
                name=title
                curr.execute('select id from rooms where name=%s',(name,)) # make sure the name is unique first
                y= curr.fetchone()
                print "store see..",y
                if not y:
                    friends_num=0
                    temp = time.time()
                    x = time.localtime(float(temp))
                    create_time = time.strftime("%Y-%m-%d %H:%M:%S",x) # current time
                    creater_id=None
                    room_avatar=tag_path
                    is_pass=1
                    has_index=0
                    reason_id=None  
                    #print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id
                    ###################### content that qualifies for insertion
                    counter=counter+1
                    curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id))
                    conn.commit() # commit immediately, otherwise later lookups cannot find the parent node
                    if counter % 200==0:
                        print "current node",name,"num",counter
    except Exception as e:
        print "get content error",e       

def work():
    global queue
    curr.execute('select id,node,parent,name from classify where status=1')
    results = curr.fetchall()
    for r in results:
        top_id=r[0]
        node=r[1]
        parent=r[2]
        name=r[3]
        try:
            queue.put((node,name,parent)) # enqueue the seed first
            while queue.qsize() >0:
                n,name,p=queue.get() # dequeue the head node; items are (node, name, parent) triples
                getContent(n,name,p,top_id)
                getChildren(n,name) # enqueue the children of the dequeued node
            conn.commit()
        except Exception as e:
            print "what's wrong",e  

def new_work():
    global queue
    curr.execute('select id,data_id,name from classify_new_copy where status=1')
    results = curr.fetchall()
    for r in results:
        top_id=r[0]
        data_id=r[1]
        name=r[2]
        try:
            get_topis(data_id,name,top_id)
        except Exception as e:
            print "new_work error",e

def get_topis(data_id,name,top_id):
    global queue
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    isGet = True
    offset = -20
    data_id=str(data_id)
    while isGet:
        offset = offset + 20
        values = {'method': 'next', 'params': '{"topic_id":'+data_id+',"offset":'+str(offset)+',"hash_id":""}'}
        try:
            msg=None
            try:
                data = urllib.urlencode(values)
                request = urllib2.Request(url,data,headers)
                response = urllib2.urlopen(request,None,5)
                html=response.read().decode('utf-8')
                json_str = json.loads(html)
                ms=json_str['msg']
                if len(ms) <5:
                    break
                msg=ms[0]
            except Exception as e:
                print "eeeee",e
            #print msg
            if msg is not None:
                soup = BeautifulSoup(str(msg), 'html.parser')
                blks = soup.find_all('div', {'class' : 'blk'})
                for blk in blks:
                    page=blk.find('a').get('href')
                    if page is not None:
                        node=page.replace("/topic/","") # store additional seed topics
                        parent=name
                        ne=blk.find('strong').text
                        try:
                            queue.put((node,ne,parent)) # enqueue the seed first
                            while queue.qsize() >0:
                                n,name,p=queue.get() # dequeue the head node
                                size=queue.qsize()
                                if size > 0:
                                    print size
                                getContent(n,name,p,top_id)
                                getChildren(n,name) # enqueue the children of the dequeued node
                            conn.commit()
                        except Exception as e:
                            print "what's wrong",e  
        except urllib2.URLError as e:
            print "error is",e

if __name__ == '__main__':
    i=0
    while i<400:
        new_work()
        i=i+1
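
The script above is Python 2 only (urllib2, Queue, MySQLdb). If you are on Python 3, a minimal sketch of the paging loop in get_topis might look like the code below. It assumes the third-party requests package, and that Zhihu's TopicsPlazzaListV2 endpoint still accepts the same form payload as when this was written, which may no longer hold:

# Python 3 sketch of get_topis' paging loop; `requests` and the endpoint
# behaviour are assumptions, not part of the original script.
import requests
from bs4 import BeautifulSoup

def fetch_topics_py3(data_id, headers):
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    offset = 0
    while True:
        payload = {
            'method': 'next',
            'params': '{"topic_id":%s,"offset":%d,"hash_id":""}' % (data_id, offset),
        }
        resp = requests.post(url, data=payload, headers=headers, timeout=5)
        msg = resp.json().get('msg', [])
        if len(msg) < 5:  # same stop condition as the original code
            break
        soup = BeautifulSoup(msg[0], 'html.parser')
        for blk in soup.find_all('div', {'class': 'blk'}):
            link = blk.find('a')
            if link is not None:
                # yield (topic id, topic name) pairs for the caller to enqueue
                yield link.get('href').replace('/topic/', ''), blk.find('strong').text
        offset += 20

Storing the results would mirror the INSERT statements above, just with pymysql or mysqlclient in place of MySQLdb.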

A quick note on the database: I'm not uploading an attachment here. Build the tables yourself from the field names used in the code, since it really is that simple. I used MySQL; adapt the schema to your own needs.
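
If it helps, here is a rough schema reconstructed from the SELECT/INSERT statements in the script. The column types are my guesses rather than the original definitions, so adjust them to your needs:

# Rough MySQL schema inferred from the queries above; all types are guesses.
import MySQLdb as mdb

SCHEMA = """
CREATE TABLE IF NOT EXISTS classify (
    id INT AUTO_INCREMENT PRIMARY KEY,
    node VARCHAR(32),
    parent VARCHAR(255),
    name VARCHAR(255),
    status TINYINT DEFAULT 1
);
CREATE TABLE IF NOT EXISTS classify_new (
    id INT AUTO_INCREMENT PRIMARY KEY,
    data_id VARCHAR(32),
    name VARCHAR(255)
);
CREATE TABLE IF NOT EXISTS classify_new_copy (
    id INT AUTO_INCREMENT PRIMARY KEY,
    data_id VARCHAR(32),
    name VARCHAR(255),
    status TINYINT DEFAULT 1
);
CREATE TABLE IF NOT EXISTS rooms (
    id INT AUTO_INCREMENT PRIMARY KEY,
    father_id INT,
    name VARCHAR(255),
    friends_num INT,
    description TEXT,
    create_time DATETIME,
    creater_id INT,
    room_avatar VARCHAR(255),
    is_pass TINYINT,
    has_index TINYINT,
    reason_id INT
);
"""

conn = mdb.connect('127.0.0.1', 'root', 'root', 'zhihu', charset='utf8')
cur = conn.cursor()
for statement in SCHEMA.split(';'):
    if statement.strip():
        cur.execute(statement)
conn.commit()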

If anything is unclear, come find me at 去转盘网, which I also developed; the current QQ group number is kept up to date there. I'm not leaving a QQ number here, to avoid getting banned by the system.
