使用Python编写的知乎内容爬取脚本
要使用Python爬取知乎内容,可以使用requests库获取网页源代码,然后使用BeautifulSoup库解析HTML,提取所需信息。首先安装这两个库:`pip install requests beautifulsoup4`。然后编写脚本,导入库,设置请求头模拟浏览器访问,发送请求获取网页源代码。使用BeautifulSoup解析HTML,根据需要提取信息。最后将提取到的信息保存到文件或数据库中。注意遵守知乎的爬虫政策,合理设置爬取频率,避免对网站造成过大压力。
这是题主数据科学导论课程的作业:在舆情分析项目中,题主负责信息爬取部分。
这份代码可能会对大家有点帮助;如果有哪里写得不太好,希望可以告诉我。
如果不想看讲解、只想直接使用,把下面的 js 代码保存为 g_encrypt.js,并与 Python 代码放在同一级目录即可(需要先搭建 Node.js 环境并安装 jsdom,具体可以参考《Nodejs安装及环境配置》- 简书 (jianshu.com))。
"""Crawl Zhihu search results for a list of keywords and export them to Excel.

The search API expects an ``x-zse-86`` signature. It is produced by the
companion ``g_encrypt.js`` script, executed through ``execjs`` (which needs a
Node.js runtime).

Third-party packages (``execjs``, ``requests``, ``pandas``) are imported at the
point of use so the pure parsing helpers can be imported without them.
"""
import hashlib
import json
import re
import time
import urllib.parse

# ``d_c0`` cookie value, including its surrounding double quotes. The same
# value feeds both the signature input and the cookie header, so it lives in
# one place to keep the two in sync.
D_C0 = '"AFAaXJyCNBKPTmvMea6p4UCHCN4Ji2J2wk8=|1605541545"'

# JavaScript file exposing ``b(md5_hex)``; resolved relative to the working dir.
ENCRYPT_SCRIPT = 'g_encrypt.js'

# Working directory handed to execjs so Node can resolve the script's modules.
# NOTE(review): the path separators look lost in this listing - adjust to the
# local node_modules directory before running.
NODE_MODULES_DIR = r'D:programnode_modules'

ZHIHU_HOST = "https://www.zhihu.com"
SEARCH_PAGE_URL = ZHIHU_HOST + "/search?type=content&q="

# Seconds to wait for each HTTP response before giving up.
REQUEST_TIMEOUT = 10
# Pause between keywords to keep the request rate modest.
REQUEST_DELAY_SECONDS = 1

# Excel column headers, in the same order as the tuple returned by getData().
COLUMN_NAMES = ('标题', '简介', '内容', '点赞数', '评论数', '创建时间', '文章id', '问题', '问题id')


def stampTotime(timestamp):
    """Format a Unix timestamp as ``YYYY-MM-DD HH:MM:SS`` in local time."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))


def strToUrl(str):  # noqa: A002 - parameter name kept for existing callers
    """Percent-encode *str* so it can be embedded in a URL."""
    return urllib.parse.quote(str)


def _sign_request(api_path):
    """Return the encrypted signature for *api_path* computed by g_encrypt.js."""
    import execjs

    plain = "+".join(["3_2.0", api_path, D_C0])
    digest = hashlib.md5(plain.encode()).hexdigest()
    with open(ENCRYPT_SCRIPT, 'r', encoding='utf-8') as script:
        ctx = execjs.compile(script.read(), cwd=NODE_MODULES_DIR)
    return ctx.call('b', digest)


def _fetch_x_ab_pb(search_page_url):
    """Extract the ``encodedParams`` value embedded in the HTML search page.

    Raises:
        IndexError: if the page does not contain ``encodedParams``.
    """
    import requests

    page = requests.get(
        search_page_url,
        headers={
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/90.0.4430.93 Safari/537.36",
            "referer": "https://www.zhihu.com/",
        },
        timeout=REQUEST_TIMEOUT,
    ).text
    # The page contains a fragment shaped like: "encodedParams":"=="},"
    match = re.search('encodedParams":"(.*?)"},"trigge', page, re.DOTALL)
    if match is None:
        raise IndexError("encodedParams not found in the Zhihu search page")
    # Slashes may appear JSON-escaped as \u002F. Replace the escaped form first
    # so no stray backslash is left, then the bare form the script also handled.
    return match.group(1).replace("\\u002F", "/").replace("u002F", "/")


def getDataFromKey(keyword):
    """Query the Zhihu search API for *keyword* and return the raw JSON text.

    Args:
        keyword: Search term (unencoded).

    Returns:
        The response body of ``/api/v4/search_v3`` as a string.

    Raises:
        IndexError: if the search page does not expose ``encodedParams``.
    """
    import requests

    keyword_url = strToUrl(keyword)
    api_path = (
        f"/api/v4/search_v3?t=general&q={keyword_url}&correction=1"
        "&offset=0&limit=40&lc_idx=0&show_all_topics=0"
    )
    search_page_url = SEARCH_PAGE_URL + keyword_url

    signature = _sign_request(api_path)
    x_ab_pb = _fetch_x_ab_pb(search_page_url)

    headers = {
        "authority": "www.zhihu.com",
        "method": "GET",
        "referer": search_page_url,
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/84.0.4147.89 Safari/537.36",
        "x-ab-pb": x_ab_pb,
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        'cookie': f'd_c0={D_C0};',
        "x-ab-param": "tp_zrec=1;top_test_4_liguangyi=1;li_sp_mqbk=0;"
                      "li_edu_page=old;zr_expslotpaid=1;tp_contents=2;"
                      "li_panswer_topic=0;se_ffzx_jushen1=0;zr_slotpaidexp=1;"
                      "pf_adjust=1;li_vip_verti_search=0;tp_dingyue_video=0;"
                      "li_paid_answer_exp=0;pf_noti_entry_num=2;"
                      "tp_topic_style=0;qap_question_visitor=0;"
                      "qap_question_author=0",
        "x-api-version": "3.0.91",
        "x-zse-83": "3_2.0",
        "x-zse-86": f"2.0_{signature}",
    }
    response = requests.get(ZHIHU_HOST + api_path, headers=headers,
                            timeout=REQUEST_TIMEOUT)
    print(response.text)
    return response.text


def getData(data_json, i):
    """Extract one result row from the parsed search response.

    Args:
        data_json: Parsed API response; ``data_json["data"]`` is the result list.
        i: Index of the result to extract.

    Returns:
        A tuple ordered like ``COLUMN_NAMES``: title, description, content,
        upvote count, comment count, creation time, id, question name,
        question id.

    Raises:
        KeyError, IndexError: if the entry lacks one of the expected fields
            (the code only handles entries carrying a ``question`` object).
    """
    item = data_json["data"][i]
    highlight = item["highlight"]
    obj = item["object"]
    row = (
        highlight["title"],
        highlight["description"],
        obj["content"],
        obj["voteup_count"],
        obj["comment_count"],
        stampTotime(int(obj["created_time"])),
        obj["id"],
        obj["question"]["name"],
        obj["question"]["id"],
    )
    # Progress output; prints the full row, comment count included.
    print(row)
    return row


def getAllData(keyword):
    """Return every usable search result for *keyword* as a list of dicts.

    Entries that do not have the expected shape are skipped with a message.
    All returned entries are processed rather than a fixed first 30, since the
    request asks for up to 40.
    """
    data_json = json.loads(getDataFromKey(keyword))
    items = data_json.get("data") if isinstance(data_json, dict) else None

    page_data = []
    for i in range(len(items or [])):
        try:
            row = getData(data_json, i)
        except (KeyError, IndexError, TypeError, ValueError):
            print(f"索引异常,跳过本次{i}")
            continue
        page_data.append(dict(zip(COLUMN_NAMES, row)))
    return page_data


def packDateToexcel(all_data):
    """Write *all_data* (a list of row dicts) to ``zhihu<timestamp>.xlsx``."""
    import pandas as pd

    filename = 'zhihu' + time.strftime("%Y-%m-%d-%H-%M") + '.xlsx'
    pd.DataFrame(all_data).to_excel(filename, index=False)


def main():
    """Crawl every keyword and export the combined rows to one workbook."""
    # Keywords
    keywords = ["耐克", "Nike", "李宁", "LI-NING", "阿迪达斯", "adidas",
                "安踏", "ANTA", "新疆棉", "棉花", "新疆", "国产崛起"]
    all_data = []
    for keyword in keywords:
        try:
            all_data += getAllData(keyword)
        except Exception as exc:  # top-level boundary: report and keep going
            # One failing keyword must not discard rows already collected.
            print(f"Failed to fetch {keyword!r}: {exc}")
        time.sleep(REQUEST_DELAY_SECONDS)
    packDateToexcel(all_data)


if __name__ == '__main__':
    main()
js文件
// Signature helper for the Python crawler: exposes b(e), which logs its
// argument and returns __g._encrypt(encodeURIComponent(e)).
//
// Contents, in order:
//   * A jsdom instance supplies the window / document / XMLHttpRequest globals.
//   * s, i, h, a, c, n, e, o, r, Q, C, B, f, g, u, w are instruction types. Each
//     constructor unpacks bit fields from an integer, and each prototype.e(state)
//     applies that instruction to a G instance.
//   * k(str) decodes a string with a running XOR (seed 66, constant 24).
//   * G holds the interpreter state (r: four-slot array, C: index of the next
//     instruction word, f: stack that instructions push to and pop from,
//     J: table mapping opcode -> instruction type).
//     G.prototype.D base64-decodes a program with window.atob into 16-bit
//     words (this.G) followed by a table of length-prefixed strings (this.b).
//     G.prototype.v runs the fetch/dispatch loop; it returns early when more
//     than 500 ms elapse between two consecutive instructions.
//     G.prototype.e selects the instruction type from the top four bits of a
//     word: (61440 & e) >> 12.
//   * (new G).v("...") executes the embedded base64 program at load time.
//     __g starts as {} and b() calls __g._encrypt, so that program presumably
//     installs _encrypt on __g - TODO confirm.
//
// NOTE(review): C.prototype.e case 2 passes a value popped from the stack to eval().
// NOTE(review): G.prototype.D and b() console.log their input - presumably
// leftover debug output; confirm before removing.
// NOTE(review): this listing has no whitespace between keywords and identifiers
// (e.g. `functiont(e)`, `vart=66`, `newG`) and the base64 literal is broken
// across a line. Whitespace appears to have been lost in extraction and has to
// be restored before Node can parse the file.
constjsdom=require("jsdom");const{JSDOM}=jsdom;constdom=newJSDOM(`<!DOCTYPEhtml><p>Helloworld</p>`);window=dom.window;document=window.document;XMLHttpRequest=window.XMLHttpRequest;varexports={}functiont(e){return(t="function"==typeofSymbol&&"symbol"==typeofSymbol.A?function(e){returntypeofe}:function(e){returne&&"function"==typeofSymbol&&e.constructor===Symbol&&e!==Symbol.prototype?"symbol":typeofe})(e)}Object.defineProperty(exports,"__esModule",{value:!0});varA="2.0",__g={};functions(){}functioni(e){this.t=(2048&e)>>11,this.s=(1536&e)>>9,this.i=511&e,this.h=511&e}functionh(e){this.s=(3072&e)>>10,this.h=1023&e}functiona(e){this.a=(3072&e)>>10,this.c=(768&e)>>8,this.n=(192&e)>>6,this.t=63&e}functionc(e){this.s=e>>10&3,this.i=1023&e}functionn(){}functione(e){this.a=(3072&e)>>10,this.c=(768&e)>>8,this.n=(192&e)>>6,this.t=63&e}functiono(e){this.h=(4095&e)>>2,this.t=3&e}functionr(e){this.s=e>>10&3,this.i=e>>2&255,this.t=3&e}s.prototype.e=function(e){e.o=!1},i.prototype.e=function(e){switch(this.t){case0:e.r[this.s]=this.i;break;case1:e.r[this.s]=e.k[this.h]}},h.prototype.e=function(e){e.k[this.h]=e.r[this.s]},a.prototype.e=function(e){switch(this.t){case0:e.r[this.a]=e.r[this.c]+e.r[this.n];break;case1:e.r[this.a]=e.r[this.c]-e.r[this.n];break;case2:e.r[this.a]=e.r[this.c]*e.r[this.n];break;case3:e.r[this.a]=e.r[this.c]/e.r[this.n];break;case4:e.r[this.a]=e.r[this.c]%e.r[this.n];break;case5:e.r[this.a]=e.r[this.c]==e.r[this.n];break;case6:e.r[this.a]=e.r[this.c]>=e.r[this.n];break;case7:e.r[this.a]=e.r[this.c]||e.r[this.n];break;case8:e.r[this.a]=e.r[this.c]&&e.r[this.n];break;case9:e.r[this.a]=e.r[this.c]!==e.r[this.n];break;case10:e.r[this.a]=t(e.r[this.c]);break;case11:e.r[this.a]=e.r[this.c]ine.r[this.n];break;case12:e.r[this.a]=e.r[this.c]>e.r[this.n];break;case13:e.r[this.a]=-e.r[this.c];break;case14:e.r[this.a]=e.r[this.c]<e.r[this.n];break;case15:e.r[this.a]=e.r[this.c]&e.r[this.n];break;case16:e.r[this.a]=e.r[this.c]^e.r[this.n];break;case17:e.r[this.a]=e.r[this
.c]<<e.r[this.n];break;case18:e.r[this.a]=e.r[this.c]>>>e.r[this.n];break;case19:e.r[this.a]=e.r[this.c]|e.r[this.n];break;case20:e.r[this.a]=!e.r[this.c]}},c.prototype.e=function(e){e.Q.push(e.C),e.B.push(e.k),e.C=e.r[this.s],e.k=[];for(vart=0;t<this.i;t++)e.k.unshift(e.f.pop());e.g.push(e.f),e.f=[]},n.prototype.e=function(e){e.C=e.Q.pop(),e.k=e.B.pop(),e.f=e.g.pop()},e.prototype.e=function(e){switch(this.t){case0:e.u=e.r[this.a]>=e.r[this.c];break;case1:e.u=e.r[this.a]<=e.r[this.c];break;case2:e.u=e.r[this.a]>e.r[this.c];break;case3:e.u=e.r[this.a]<e.r[this.c];break;case4:e.u=e.r[this.a]==e.r[this.c];break;case5:e.u=e.r[this.a]!=e.r[this.c];break;case6:e.u=e.r[this.a];break;case7:e.u=!e.r[this.a]}},o.prototype.e=function(e){switch(this.t){case0:e.C=this.h;break;case1:e.u&&(e.C=this.h);break;case2:e.u||(e.C=this.h);break;case3:e.C=this.h,e.w=null}e.u=!1},r.prototype.e=function(e){switch(this.t){case0:for(vart=[],n=0;n<this.i;n++)t.unshift(e.f.pop());e.r[3]=e.r[this.s](t[0],t[1]);break;case1:for(varr=e.f.pop(),o=[],i=0;i<this.i;i++)o.unshift(e.f.pop());e.r[3]=e.r[this.s][r](o[0],o[1]);break;case2:for(vara=[],c=0;c<this.i;c++)a.unshift(e.f.pop());e.r[3]=newe.r[this.s](a[0],a[1])}};vark=function(e){for(vart=66,n=[],r=0;r<e.length;r++){varo=24^e.charCodeAt(r)^t;n.push(String.fromCharCode(o)),t=o}returnn.join("")};functionQ(e){this.t=(4095&e)>>10,this.s=(1023&e)>>8,this.i=1023&e,this.h=63&e}functionC(e){this.t=(4095&e)>>10,this.a=(1023&e)>>8,this.c=(255&e)>>6}functionB(e){this.s=(3072&e)>>10,this.h=1023&e}functionf(e){this.h=4095&e}functiong(e){this.s=(3072&e)>>10}functionu(e){this.h=4095&e}functionw(e){this.t=(3840&e)>>8,this.s=(192&e)>>6,this.i=63&e}functionG(){this.r=[0,0,0,0],this.C=0,this.Q=[],this.k=[],this.B=[],this.f=[],this.g=[],this.u=!1,this.G=[],this.b=[],this.o=!1,this.w=null,this.U=null,this.F=[],this.R=0,this.J={0:s,1:i,2:h,3:a,4:c,5:n,6:e,7:o,8:r,9:Q,10:C,11:B,12:f,13:g,14:u,15:w}}Q.prototype.e=function(e){switch(this.t){case0:e.f.push(e.r[this.s]);break
;case1:e.f.push(this.i);break;case2:e.f.push(e.k[this.h]);break;case3:e.f.push(k(e.b[this.h]))}},C.prototype.e=function(A){switch(this.t){case0:vart=A.f.pop();A.r[this.a]=A.r[this.c][t];break;case1:vars=A.f.pop(),i=A.f.pop();A.r[this.c][s]=i;break;case2:varh=A.f.pop();A.r[this.a]=eval(h)}},B.prototype.e=function(e){e.r[this.s]=k(e.b[this.h])},f.prototype.e=function(e){e.w=this.h},g.prototype.e=function(e){throwe.r[this.s]},u.prototype.e=function(e){vart=this,n=[0];e.k.forEach(function(e){n.push(e)});varr=function(r){varo=newG;returno.k=n,o.k[0]=r,o.v(e.G,t.h,e.b,e.F),o.r[3]};r.toString=function(){return"(){[nativecode]}"},e.r[3]=r},w.prototype.e=function(e){switch(this.t){case0:for(vart={},n=0;n<this.i;n++){varr=e.f.pop();t[e.f.pop()]=r}e.r[this.s]=t;break;case1:for(varo=[],i=0;i<this.i;i++)o.unshift(e.f.pop());e.r[this.s]=o}},G.prototype.D=function(e){console.log(window.atob(e));for(vart=window.atob(e),n=t.charCodeAt(0)<<8|t.charCodeAt(1),r=[],o=2;o<n+2;o+=2)r.push(t.charCodeAt(o)<<8|t.charCodeAt(o+1));this.G=r;for(vari=[],a=n+2;a<t.length;){varc=t.charCodeAt(a)<<8|t.charCodeAt(a+1),s=t.slice(a+2,a+2+c);i.push(s),a+=c+2}this.b=i},G.prototype.v=function(e,t,n){for(t=t||0,n=n||[],this.C=t,"string"==typeofe?this.D(e):(this.G=e,this.b=n),this.o=!0,this.R=Date.now();this.o;){varr=this.G[this.C++];if("number"!=typeofr)break;varo=Date.now();if(500<o-this.R)return;this.R=o;try{this.e(r)}catch(e){this.U=e,this.w&&(this.C=this.w)}}},G.prototype.e=function(e){vart=(61440&e)>>12;newthis.J[t](e).e(this)},(newG).v("AxjgB5MAnACoAJwBpAAAABAAIAKcAqgAMAq0AzRJZAZwUpwCqACQACACGAKcBKAAIAOcBagAIAQYAjAUGgKcBqFAuAc5hTSHZAZwqrAIGgA0QJEAJAAYAzAUGgOcCaFANRQ0R2QGcOKwChoANECRACQAsAuQABgDnAmgAJwMgAGcDYwFEAAzBmAGcSqwDhoANECRACQAGAKcD6AAGgKcEKFANEcYApwRoAAxB2AGcXKwEhoANECRACQAGAKcE6AAGgKcFKFANEdkBnGqsBUaADRAkQAkABgCnBagAGAGcdKwFxoANECRACQAGAKcGKAAYAZx+rAZGgA0QJEAJAAYA5waoABgBnIisBsaADRAkQAkABgCnBygABoCnB2hQDRHZAZyWrAeGgA0QJEAJAAYBJwfoAAwFGAGcoawIBoANECRACQAGAOQALAJkAAYBJwfgAlsBnK+sCEaADRAkQAkABgD
kACwGpAAGAScH4AJbAZy9rAiGgA0QJEAJACwI5AAGAScH6AAkACcJKgAnCWgAJwmoACcJ4AFnA2MBRAAMw5gBnNasCgaADRAkQAkABgBEio0R5EAJAGwKSAFGACcKqAAEgM0RCQGGAYSATRFZAZzshgAtCs0QCQAGAYSAjRFZAZz1hgAtCw0QCQAEAAgB7AtIAgYAJwqoAASATRBJAkYCRIANEZkBnYqEAgaBxQBOYAoBxQEOYQ0giQKGAmQABgAnC6ABRgBGgo0UhD/MQ8zECALEAgaBxQBOYAoBxQEOYQ0gpEAJAoYARoKNFIQ/zEPkAAgChgLGgkUATmBkgAaAJwuhAUaCjdQFAg5kTSTJAsQCBoHFAE5gCgHFAQ5hDSCkQAkChgBGgo0UhD/MQ+QACAKGAsaCRQCOYGSABoAnC6EBRoKN1AUEDmRNJMkCxgFGgsUPzmPkgAaCJwvhAU0wCQFGAUaCxQGOZISPzZPkQAaCJwvhAU0wCQFGAUaCxQMOZISPzZPkQAaCJwvhAU0wCQFGAUaCxQSOZISPzZPkQAaCJwvhAU0wCQFGAkSAzRBJAlz/B4FUAAAAwUYIAAIBSITFQkTERwABi0GHxITAAAJLwMSGRsXHxMZAAk0Fw8HFh4NAwUABhU1EBceDwAENBcUEAAGNBkTGRcBAAFKAAkvHg4PKz4aEwIAAUsACDIVHB0QEQ4YAAsuAzs7AAoPKToKDgAHMx8SGQUvMQABSAALORoVGCQgERcCAxoACAU3ABEXAgMaAAsFGDcAERcCAxoUCgABSQAGOA8LGBsPAAYYLwsYGw8AAU4ABD8QHAUAAU8ABSkbCQ4BAAFMAAktCh8eDgMHCw8AAU0ADT4TGjQsGQMaFA0FHhkAFz4TGjQsGQMaFA0FHhk1NBkCHgUbGBEPAAFCABg9GgkjIAEmOgUHDQ8eFSU5DggJAwEcAwUAAUMAAUAAAUEADQEtFw0FBwtdWxQTGSAACBwrAxUPBR4ZAAkqGgUDAwMVEQ0ACC4DJD8eAx8RAAQ5GhUYAAFGAAAABjYRExELBAACWhgAAVoAQAg/PTw0NxcQPCQ5C3JZEBs9fkcnDRcUAXZia0Q4EhQgXHojMBY3MWVCNT0uDhMXcGQ7AUFPHigkQUwQFkhaAkEACjkTEQspNBMZPC0ABjkTEQsrLQ==");functionb(e){console.log(e);console.log(encodeURIComponent(e));return__g._encrypt(encodeURIComponent(e))};