圆月山庄资源网 Design By www.vgjia.com
本文实例为大家分享了python实现爬取图书封面的具体代码,供大家参考,具体内容如下
kongfuzi.py
利用更换代理ip,延迟提交数据,设置请求头破解网站的反爬虫机制
import requests
import random
import time


class DownLoad():
    """HTTP downloader that works around simple anti-scraping defenses:
    it rotates proxy IPs, randomizes the User-Agent header, and backs off
    (sleeps 10s) between retries.
    """

    def __init__(self):
        # Candidate HTTP proxies as "host:port" strings; callers may append
        # their own entries before calling get().
        self.ip_list = ['191.33.179.242:8080', '122.72.108.53:80', '93.190.142.214:80',
                        '189.8.88.125:65301', '36.66.55.181:8080', '170.84.102.5:8080',
                        '177.200.72.214:20183', '115.229.115.190:9000']
        # User-Agent candidates.  NOTE: the entries still carry the literal
        # "User-Agent:" prefix from the original article; get() strips it
        # before sending, so only the value goes into the header.
        self.user_agent_list = [
            'User-Agent:Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'User-Agent:Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
        ]

    def _random_proxy(self):
        # Pick a random proxy in the mapping form requests expects
        # ({scheme: "host:port"}).
        return {'http': random.choice(self.ip_list).strip()}

    def get(self, url, proxy=None, timeout=20, num=5):
        """GET *url*, retrying up to *num* times with a 10s back-off.

        Starts without a proxy; once direct attempts are exhausted it
        switches to random proxies from ``self.ip_list``.  Returns the
        ``requests.Response`` on success, or ``None`` when every proxied
        retry has failed.
        """
        print("正在请求%s" % url)
        # BUG FIX: strip the "User-Agent:" prefix kept inside the list
        # entries; the original sent it as part of the header *value*.
        UA = random.choice(self.user_agent_list).split(':', 1)[-1].strip()
        headers = {'User-Agent': UA}
        if proxy is None:
            try:
                return requests.get(url, headers=headers, timeout=timeout)
            except requests.RequestException:
                if num > 0:
                    time.sleep(10)
                    # Pass timeout through (the original silently reset it
                    # to the default on this path).
                    return self.get(url, timeout=timeout, num=num - 1)
                # Direct attempts exhausted: fall back to a random proxy.
                time.sleep(10)
                return self.get(url, proxy=self._random_proxy(), timeout=timeout)
        try:
            # BUG FIX: requests takes the proxy mapping via the ``proxies=``
            # keyword; the original passed ``proxy=`` which raises TypeError.
            # (A fresh random proxy is drawn per attempt, as in the original.)
            return requests.get(url, headers=headers, proxies=self._random_proxy(),
                                timeout=timeout)
        except requests.RequestException:
            if num > 0:
                time.sleep(10)
                proxy = self._random_proxy()
                print("正在更换代理")
                print("当前代理%s" % proxy)
                return self.get(url, proxy=proxy, timeout=timeout, num=num - 1)
            # Retries exhausted: implicitly returns None, as the original did.
main.py
将爬取的图片保存到本地,然后展示到界面
import kongfuzi
import os
import requests
import bs4
from tkinter import *
from PIL import Image, ImageTk


# Download the cover images and build the image-path / book-info lists.
def download():
    """Build the search URL from the keyword entry and render the results."""
    baseUrl = "http://search.kongfz.com"
    keyword = e1.get()
    url = baseUrl + "/product_result/" + keyword
    print("下载链接:" + url)
    show(url)


def changesoup(html):
    """Decode a requests response body as UTF-8 and parse it with bs4."""
    html_doc = str(html.content, 'utf-8')
    return bs4.BeautifulSoup(html_doc, "html.parser")


def bookinfo(soup):
    """Extract parallel result lists from the parsed search page.

    Returns (bookname, price, place, storename) — one entry per result,
    scraped from the kongfz.com search-result markup.
    """
    # Book prices.
    price = [i.string for i in soup.select(".first-info .f_right .bold")]
    # Shop names.  BUG FIX: the original removed empty nodes from the list
    # *while iterating it*, which skips the element after each removal;
    # filtering into a new list is the safe equivalent.
    storename = [i.string for i in soup.select(".text a span")
                 if i.string is not None]
    # Seller locations.
    place = [i.string for i in soup.select(".user-place")]
    # Book titles.
    bookname = [each.get_text() for each in soup.select(
        ".search-wrap .search-main .search-main-result .result-content "
        ".result-list .item .item-info .title .link")]
    return bookname, price, place, storename


def imgsave(soup):
    """Download every result cover image into ./image; return local paths."""
    dirName = "image"
    os.makedirs(dirName, exist_ok=True)
    filePathList = []
    imgUrl = soup.select(
        ".search-main-result .result-content .result-list .item .item-img .img-box img")
    if not imgUrl:
        print("没有找到当前节点下图片")
        return filePathList
    for i, imageUrls in enumerate(imgUrl):
        downloadUrl = imageUrls.get('src')
        print("打印要下载的图片地址:", downloadUrl)
        # Prefix with the index so distinct results never collide on disk.
        fileName = str(i) + "-" + os.path.basename(downloadUrl.split("/")[-1])
        print("文件名:" + fileName)
        filePath = os.path.join(dirName, fileName)
        filePathList.append(filePath)
        if not os.path.exists(filePath):
            imageUrlPath = requests.get(downloadUrl)
            # Fail loudly if the image request did not succeed.
            imageUrlPath.raise_for_status()
            # BUG FIX: 'with' guarantees the handle is closed even when a
            # chunk write raises (the original leaked it on error).
            with open(filePath, 'wb') as imageFile:
                for image in imageUrlPath.iter_content(10000):
                    imageFile.write(image)
    return filePathList


def show(url):
    """Fetch the result page and display covers plus metadata in a Toplevel."""
    xz = kongfuzi.DownLoad()
    # BUG FIX: the user-supplied proxy must be registered BEFORE the request
    # is made — the original appended it after xz.get() had already run.
    add_ip = e2.get()
    if add_ip:
        xz.ip_list.append(add_ip)
    html = xz.get(url)
    soup = changesoup(html)
    bookname, price, place, storename = bookinfo(soup)
    filePathList = imgsave(soup)

    root1 = Toplevel()
    root1.geometry("1720x800")
    root1.title("孔网图片爬取")
    # Keep PhotoImage references alive for the lifetime of the window,
    # otherwise Tk garbage-collects them and the labels go blank.
    photo = []
    for each in filePathList:
        img = Image.open(each)
        photo.append(ImageTk.PhotoImage(img))

    canvas = Canvas(root1, width=1700, height=800, scrollregion=(0, 0, 0, 4000))
    canvas.place(x=10, y=10)
    frame = Frame(canvas)  # the frame lives inside the scrollable canvas
    frame.place(width=1680, height=800)

    # BUG FIX: clamp to the data actually scraped — the original hard-coded
    # range(50) and raised IndexError whenever fewer results came back.
    count = min(50, len(photo), len(bookname), len(price), len(place), len(storename))
    for i in range(count):
        rownum = i // 5   # 5 covers per visual row
        columnnum = i % 5
        Label(frame, image=photo[i], width=280, height=280).grid(
            row=rownum * 5, column=columnnum, padx=10, pady=5)
        Label(frame, text="书名:" + bookname[i], bg="#FFF8DC", justify=LEFT).grid(
            row=rownum * 5 + 1, column=columnnum, padx=45, pady=2, sticky=W)
        Label(frame, text="价格:" + price[i] + "元", bg="#FFF8DC", justify=LEFT).grid(
            row=rownum * 5 + 2, column=columnnum, padx=45, pady=2, sticky=W)
        Label(frame, text="发货地区:" + place[i], bg="#FFF8DC", justify=LEFT).grid(
            row=rownum * 5 + 3, column=columnnum, padx=45, pady=2, sticky=W)
        Label(frame, text="书店:" + storename[i], bg="#FFF8DC", justify=LEFT).grid(
            row=rownum * 5 + 4, column=columnnum, padx=45, pady=2, sticky=W)

    vbar = Scrollbar(canvas, orient=VERTICAL)  # vertical scrollbar
    vbar.place(x=1680, width=20, height=800)
    vbar.configure(command=canvas.yview)
    canvas.config(yscrollcommand=vbar.set)
    canvas.create_window((800, 2000), window=frame)
    mainloop()


if __name__ == '__main__':
    # Main window: keyword entry, proxy-ip entry, search button.
    root = Tk()
    root.title("孔网图片爬取")
    e1 = Entry(root)
    e2 = Entry(root)
    e1.grid(row=0, column=0, padx=20, pady=20)
    e2.grid(row=0, column=2, padx=20, pady=20)
    Label(root, text="关键字", width=10).grid(row=0, column=1, padx=10, pady=5)
    Label(root, text="添加代理ip", width=10).grid(row=0, column=3, padx=10, pady=5)
    Button(root, text="搜索", width=10, command=download).grid(
        row=1, column=1, padx=10, pady=5)
    mainloop()
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持。
圆月山庄资源网 Design By www.vgjia.com
广告合作:本站广告合作请联系QQ:858582 申请时备注:广告合作(否则不回)
免责声明:本站文章均来自网站采集或用户投稿,网站不提供任何软件下载或自行开发的软件! 如有用户或公司发现本站内容信息存在侵权行为,请邮件告知! 858582#qq.com
免责声明:本站文章均来自网站采集或用户投稿,网站不提供任何软件下载或自行开发的软件! 如有用户或公司发现本站内容信息存在侵权行为,请邮件告知! 858582#qq.com
圆月山庄资源网 Design By www.vgjia.com
暂无评论...
P70系列延期,华为新旗舰将在下月发布
3月20日消息,近期博主@数码闲聊站 透露,原定三月份发布的华为新旗舰P70系列延期发布,预计4月份上市。
而博主@定焦数码 爆料,华为的P70系列在定位上已经超过了Mate60,成为了重要的旗舰系列之一。它肩负着重返影像领域顶尖的使命。那么这次P70会带来哪些令人惊艳的创新呢?
根据目前爆料的消息来看,华为P70系列将推出三个版本,其中P70和P70 Pro采用了三角形的摄像头模组设计,而P70 Art则采用了与上一代P60 Art相似的不规则形状设计。这样的外观是否好看见仁见智,但辨识度绝对拉满。
更新日志
2024年11月07日
2024年11月07日
- 雨林唱片《赏》新曲+精选集SACD版[ISO][2.3G]
- 罗大佑与OK男女合唱团.1995-再会吧!素兰【音乐工厂】【WAV+CUE】
- 草蜢.1993-宝贝对不起(国)【宝丽金】【WAV+CUE】
- 杨培安.2009-抒·情(EP)【擎天娱乐】【WAV+CUE】
- 周慧敏《EndlessDream》[WAV+CUE]
- 彭芳《纯色角3》2007[WAV+CUE]
- 江志丰2008-今生为你[豪记][WAV+CUE]
- 罗大佑1994《恋曲2000》音乐工厂[WAV+CUE][1G]
- 群星《一首歌一个故事》赵英俊某些作品重唱企划[FLAC分轨][1G]
- 群星《网易云英文歌曲播放量TOP100》[MP3][1G]
- 方大同.2024-梦想家TheDreamer【赋音乐】【FLAC分轨】
- 李慧珍.2007-爱死了【华谊兄弟】【WAV+CUE】
- 王大文.2019-国际太空站【环球】【FLAC分轨】
- 群星《2022超好听的十倍音质网络歌曲(163)》U盘音乐[WAV分轨][1.1G]
- 童丽《啼笑姻缘》头版限量编号24K金碟[低速原抓WAV+CUE][1.1G]