
Writing a Python Script to Crawl Full-Site HD Wallpapers

Published: 2024-01-05 · Category: Python tutorials · Tags: Python crawling, Wallhaven

To crawl a site's full catalog of HD wallpapers, you can use Python's requests and BeautifulSoup libraries. Here is a simple example:

```python
import requests
from bs4 import BeautifulSoup
import os


def get_wallpaper_urls(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # the 'wallpaper' class is generic example markup; adjust it to the target page
    img_tags = soup.find_all('img', class_='wallpaper')
    wallpaper_urls = [img['src'] for img in img_tags]
    return wallpaper_urls


def download_wallpapers(wallpaper_urls, save_path):
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    for url in wallpaper_urls:
        response = requests.get(url)
        # name each file after the last path segment of its URL
        filename = os.path.join(save_path, url.split('/')[-1])
        with open(filename, 'wb') as f:
            f.write(response.content)
```
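A quick, hypothetical way to drive those two helpers (the listing URL below is a placeholder, and the `class_='wallpaper'` selector in `get_wallpaper_urls` would need to match the target site's actual markup):

```python
if __name__ == "__main__":
    # placeholder listing page; substitute a real gallery URL whose
    # <img> tags carry the class the scraper looks for
    listing_url = "https://example.com/wallpapers?page=1"
    urls = get_wallpaper_urls(listing_url)
    download_wallpapers(urls, "wallpapers")
```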

Without further ado, here is the code. It is commented throughout; if anything is unclear, feel free to ask, and if you have a better approach, please share it so we can all learn from each other.

```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
import urllib.parse
from lxml import etree
import re
import os


# Custom error type
class Error(Exception):
    def __init__(self, message):
        self.message = message


class Wallpaper:
    """A resolution of 0x0 means all resolutions, e.g. 1920x1080."""

    def __init__(self, url, path, page, min_resolution):
        self.url = url
        self.path = path
        self.params = {"page": page}
        self.min_resolution = min_resolution
        # POST request parameters
        self.data = {
            # pagination
            "view": "paged",
            # resolution
            "min_resolution": min_resolution,
            # whether the resolution must be exactly equal, or may be at least (>=) the value
            "resolution_equals": "=",
            # sort order:
            #   newest = most recently uploaded
            #   rating = most upvoted
            "sort": "newest",
        }

    # Create a folder
    @staticmethod
    def create_dir(path):
        if not os.path.exists(path):
            os.makedirs(path)
        else:
            return "Folder already exists"

    # Return the folder name for the images, depending on the type of link
    def get_folder_name(self):
        # Tag link, e.g. https://wall.alphacoders.com/tag/ahri-(league-of-legends)-wallpapers?lang=Chinese
        pattern = r"tag/([\w-]+)-(.*)\?lang=Chinese"
        match = re.search(pattern, self.url)
        if match:
            content1 = match.group(1)  # first capture group
            content2 = match.group(2)  # second capture group
            image_dir_name = content1 + content2
            return image_dir_name
        # Category link, e.g. https://wall.alphacoders.com/by_sub_category.php?id=169908&name=%E8%8B%B1%E9%9B%84%E8%81%94%E7%9B%9F+%E5%A3%81%E7%BA%B8&lang=Chinese
        else:
            try:
                params = {}
                url_params = self.url.split("?")[1]
                temp = url_params.split("&")
                for param in temp:
                    key = param.split("=")[0]
                    value = param.split("=")[1]
                    params[key] = value
                name = params.get("name", None)  # category name
                if name is not None:
                    image_dir_name = urllib.parse.unquote(name.split("+")[0])
                    return image_dir_name
                # No name parameter, so the link is a resolution link, e.g.
                # https://wall.alphacoders.com/by_resolution.php?w=3840&h=2160&lang=Chinese
                else:
                    w = params.get("w")
                    h = params.get("h")
                    image_dir_name = w + "x" + h
                    return image_dir_name
            # Search link, e.g. https://wall.alphacoders.com/search.php?search=landscape&lang=Chinese
            except AttributeError:
                pattern = r"search=([^&]+)&lang=Chinese"
                match = re.search(pattern, self.url)
                if match:
                    image_dir_name = match.group(1)
                    return image_dir_name

    # Collect the image URLs from all the image pages on the current listing page
    def get_image_urls(self):
        url = "https://wall.alphacoders.com"
        response = requests.post(self.url, params=self.params, data=self.data, allow_redirects=False)
        # Page numbers past the last page redirect back to the last page;
        # refusing redirects lets us detect that and avoid crawling duplicates
        if response.status_code == 200:
            html = etree.HTML(response.text)
            image_page_params = html.xpath('//*[@id="page_container"]//div//div[@class="thumb-container"]//div[@class="boxgrid"]//@href')
            # Check whether the current page has any images at all
            if len(image_page_params) == 0:
                raise Error("No images found for this page number; please check that it is valid!")
            else:
                result = []
                for image_page_param in image_page_params:
                    image_page_url = url + image_page_param
                    response_image = requests.get(image_page_url).text
                    html = etree.HTML(response_image)
                    image_urls = html.xpath("/html/body/div[2]/div[2]/div[2]/div[1]/img//@src")
                    # a list comprehension would also work here
                    for i in image_urls:
                        result.append(i)
                return len(result), result
        raise Error("No images found for this page number; please check that it is valid!")

    def download_image(self):
        mun = 0
        error = 0
        self.create_dir(self.path)
        images_dir_name = self.get_folder_name()
        images_mun, images_urls = self.get_image_urls()
        for image_url in images_urls:
            # only .png and .jpg images are handled; anything else is skipped
            image_name_temp = re.search(r'https://[^/]+/[^/]+/(\d+)(\.png|\.jpg)', image_url)
            if image_name_temp is not None:
                image_name = image_name_temp.group(1)
                # skip images that have already been downloaded
                if not os.path.exists(os.path.join(self.path, images_dir_name, image_name + ".png")):
                    self.create_dir(os.path.join(self.path, images_dir_name))
                    download = requests.get(image_url).content
                    with open(os.path.join(self.path, images_dir_name, image_name + ".png"), "wb") as f:
                        f.write(download)
                    print("Image {} downloaded, URL: {}".format(image_name, image_url))
                    mun = mun + 1
                    if mun == images_mun:
                        print("All images on this page downloaded, {} in total".format(mun))
                else:
                    print("Image already exists: {}, URL: {}".format(image_name, image_url))
                    continue
            else:
                error = error + 1
                print("{} image(s) failed to download".format(error))


if __name__ == '__main__':
    url = input("Please enter the wallpaper URL!\n")
    # folder where the images are stored
    path = "images"
    # a resolution of 0x0 means all resolutions
    print("""Common resolutions:
1920x1080
2560x1440
2560x1600
3840x2160
5120x2880
7680x4320""")
    resolution = input("Please enter the resolution to download!\n")
    # Change 200 to the maximum number of pages you want to crawl; whether the
    # link actually has more or fewer pages, the redirect check above still
    # crawls only up to the real last page, so pick a reasonable middle value
    for page_num in range(1, 200):
        print("Downloading images from page {}".format(page_num))
        page_one = Wallpaper(url, path, str(page_num), resolution)
        page_one.download_image()
```
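One caveat: when `page_num` runs past the site's real last page, `get_image_urls` raises the custom `Error`, which escapes the loop above as a traceback. A minimal sketch of a gentler main loop, assuming the class definitions and the `url`/`path`/`resolution` inputs from the script above are in scope:

```python
# Replacement for the for-loop at the bottom of the script: stop cleanly
# once the site runs out of pages instead of crashing with a traceback.
for page_num in range(1, 200):
    print("Downloading images from page {}".format(page_num))
    page_one = Wallpaper(url, path, str(page_num), resolution)
    try:
        page_one.download_image()
    except Error as e:
        # get_image_urls() raises Error when the requested page has no
        # images, i.e. we have walked past the last real page.
        print(e.message)
        break
```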