Sharing a Python Weibo crawler script
Below is a simple Python Weibo crawler that uses the requests and BeautifulSoup libraries to fetch and parse a page. First install the two libraries with `pip install requests` and `pip install beautifulsoup4`. Then send a GET request with requests, parse the returned HTML with BeautifulSoup, and finally pull out the fields you need, such as the user name and the publish time.

```python
import requests
from bs4 import BeautifulSoup

url = 'https://weibo.com/u/xxxxxx'  # replace with the profile URL of the Weibo user you want to crawl
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Extract the user name (the selector is illustrative; Weibo's markup changes often,
# and the node may be missing if the page is rendered by JavaScript)
name_node = soup.find('div', {'class': 'name'})
print('Username:', name_node.text if name_node else 'not found')

# Extract the post texts (the class name is likewise illustrative)
for post in soup.find_all('div', {'class': 'WB_text'}):
    print('Post:', post.get_text(strip=True))
```
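Note that weibo.com profile pages are nowadays rendered largely by JavaScript, so a plain requests + BeautifulSoup fetch like the one above may come back with very little parseable HTML. The keyword-search script shared below therefore goes through s.weibo.com instead, which still serves server-rendered result pages when a valid account cookie is sent along, and parses them with lxml.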
A Python Weibo crawler script: type in a keyword, adjust the time range of the posts to crawl, and it is ready to run.
You need to obtain the Cookies of your own account and paste them into the marked cookies spot in the script.
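Two spots in the script below normally need editing before a run: the cookie string assigned to headers['cookie'] inside get_html(), and the start_time / end_time variables inside run() that define the crawl window. If you would rather not paste the cookie into the source file, here is a minimal sketch of reading it from an environment variable instead; the variable name WEIBO_COOKIE is only an assumption for illustration, not something the original script uses.

```python
import os

# Hypothetical variation: keep the account cookie out of the source code.
# Set it first in the shell, e.g.  export WEIBO_COOKIE='SUB=...; SUBP=...'
WEIBO_COOKIE = os.environ.get('WEIBO_COOKIE', '')
if not WEIBO_COOKIE:
    raise SystemExit('Please set the WEIBO_COOKIE environment variable first')

# Inside get_html() you would then write:
#     headers['cookie'] = WEIBO_COOKIE
```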
```python
import datetime
import json
import random
import re
import time
import traceback

import pymysql
import requests
import urllib3
import openpyxl
from lxml import etree

urllib3.disable_warnings()

# User-Agent pool: a random one is picked for the request headers
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
]

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    # 'Cache-Control': 'no-cache',
    # 'Connection': 'keep-alive',
    # 'referer': 'https://www.google.com/',
    # 'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(user_agents),
}


# Request a URL, retrying until a usable response comes back
def get_html(url):
    num = 0
    while True:
        num += 1
        try:
            print("Requesting url:", url)
            time.sleep(2)
            headers['cookie'] = 'PASTE YOUR OWN ACCOUNT COOKIES HERE'
            response = requests.get(url, headers=headers, timeout=10, verify=False, proxies={})
            if response.status_code == 200:
                return response
            elif response.status_code == 404:
                return ''
            else:
                print('Bad status code: {} for url {}, retrying'.format(response.status_code, url))
        except Exception:
            print("Waiting for the proxy to recover ...")
            time.sleep(10)


# Decode raw bytes, guessing the charset (gbk / utf-8) when necessary
def decodeContent(html):
    import cchardet as chardet
    gbk_list = ["gb2312", "GB2312", "GBK", "GB18030"]
    if isinstance(html, bytes):
        char = chardet.detect(html)
        confidence = char['confidence']
        if "encoding" in char and confidence > 0.7:
            items = [char["encoding"]]
        else:
            items = re.compile(r'charset=([^\'\"]*?)[\'\"/\s]*?>').findall(str(html))
            if not items:
                items = re.compile(r'charset=[\'\"](.*?)[\'\"]').findall(str(html))
            if not items:
                items = re.compile(r'charset=(.*?)[\'\"]').findall(str(html))
        if items:
            charset = 'gbk' if items[0] in gbk_list else items[0]
            try:
                res = html.decode(charset)
            except Exception:
                if charset == 'gbk':
                    try:
                        res = html.decode('gbk', 'ignore')
                    except Exception:
                        res = ""
                else:
                    try:
                        res = html.decode('utf-8', 'ignore')
                    except Exception:
                        res = ""
        else:
            try:
                res = html.decode('utf-8')
            except Exception:
                try:
                    res = html.decode('gbk')
                except Exception:
                    try:
                        res = html.decode('utf-8', 'ignore')
                    except Exception:
                        res = ""
        return res
    return html


# Workbook that collects the extracted post texts
wb = openpyxl.Workbook()
ws = wb.active
ws.title = 'Sheet1'
ws.append(["content"])


# Parse one search-result page and append every post's text to the workbook
def comment_info(res, keyword):
    try:
        contents_lis = res.xpath('//div[@id="pl_feedlist_index"]/div[2]//div[@class="card-wrap"]//div[@class="content"]')
        # Like counts and user nodes are fetched but not used further below
        digg = res.xpath('//div[@id="pl_feedlist_index"]/div[2]//div[@class="card-wrap"]//div[@class="card-act"]')
        user_lis = res.xpath('//div[@id="pl_feedlist_index"]/div[2]//div[@class="card-wrap"]//div[@class="avator"]')
        print(len(contents_lis))
        for index, i in enumerate(contents_lis):
            try:
                content = ''.join(i.xpath('p[@node-type="feed_list_content"]//text()')).replace("\n", '').strip()
                print("@@@@@@@@@@@@@@", content)
                ws.append([content])
                wb.save('weibo_info.xlsx')
            except Exception:
                traceback.print_exc()
    except Exception:
        pass


# Given a day offset, return that day and the previous day as datetimes
def time_end_start(i, start_time):
    aaa = datetime.datetime.strptime(start_time, '%Y-%m-%d')
    threeDayAgo = aaa + datetime.timedelta(days=i)
    threeDayAgosss = threeDayAgo - datetime.timedelta(days=1)
    return threeDayAgo, threeDayAgosss


# Main loop: walk the date range day by day and crawl each keyword
def run(lkll):
    # Keywords
    lis = [lkll]
    # Start / end of the crawl window
    start_time = "2021-01-01"
    end_time = "2022-01-01"
    d1 = datetime.datetime.strptime(start_time, '%Y-%m-%d')
    d2 = datetime.datetime.strptime(end_time, '%Y-%m-%d')
    delta = d2 - d1
    ccc = delta.days
    print(ccc)
    for i in range(0, int(ccc) + 1):
        tim, threeDayAgosss = time_end_start(i, start_time)
        # str(datetime) gives "YYYY-MM-DD 00:00:00"; keep only the date part
        tim = str(tim).replace("00:00:00", "").replace(" ", "")
        threeDayAgosss = str(threeDayAgosss).replace("00:00:00", "").replace(" ", "")
        print(tim)
        if tim:
            for j in lis:
                print(tim, threeDayAgosss, j)
                get_page(tim, threeDayAgosss, j)
        else:
            time.sleep(60)


# Crawl every result page of one keyword for one day
def get_page(tim, threeDayAgosss, j):
    page = 1
    while True:
        try:
            print("________________ page {} ________________".format(page))
            url = 'https://s.weibo.com/weibo?q={}&typeall=1&suball=1&timescope=custom:{}:{}&Refer=g&page={}'.format(
                j, threeDayAgosss + '-0', tim + '-0', page)
            print("############", url)
            res = get_html(url)
            res = etree.HTML(res.text)
            comment_info(res, j)
            # The last pager item reads "第n页"; strip the Chinese characters to get n
            pagss = ''.join(res.xpath("//div[@class='m-page']/div/span/ul/li[last()]//text()"))
            print("!!!!!!!", pagss)
            pages = pagss.replace("第", '').replace("页", '')
            print(pages)
            if pages:
                if page < int(pages):
                    page += 1
                else:
                    break
            else:
                break
        except Exception:
            print("The Weibo cookie has expired, please replace it")
            traceback.print_exc()
            break  # stop this day's crawl instead of retrying forever


# Entry point
if __name__ == '__main__':
    lkll = input("Enter a keyword: ")
    run(lkll)
```
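For each day in the window the script builds a search URL of the form https://s.weibo.com/weibo?q=&lt;keyword&gt;&typeall=1&suball=1&timescope=custom:&lt;day-1&gt;-0:&lt;day&gt;-0&Refer=g&page=&lt;n&gt; and walks through every result page, appending each post's text to weibo_info.xlsx (a single column named content). A minimal sketch for reading the results back afterwards with openpyxl, which the script already imports:

```python
import openpyxl

# Load the workbook written by the crawler and print the collected posts
wb = openpyxl.load_workbook('weibo_info.xlsx')
ws = wb['Sheet1']
for row in ws.iter_rows(min_row=2, values_only=True):  # row 1 is the "content" header
    print(row[0])
```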