
Python beginner's learning notes: scraping the Doutula (斗图啦) meme site with a producer-consumer model (source code still needs fixing)

```python
from lxml import etree
import requests
from urllib import request
import time
import os
from queue import Queue
import threading
import re


class Procuder(threading.Thread):
    """Producer: takes detail-page URLs from page_queue, parses each page and
    pushes (save_path, image_url) pairs into image_queue."""

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super(Procuder, self).__init__(*args, **kwargs)
        self.image_queue = image_queue
        self.page_queue = page_queue

    def run(self):
        while True:
            # NOTE: empty() followed by a separate get() is not atomic across threads.
            if self.page_queue.empty():
                break
            href = self.page_queue.get()
            print(href)
            self.get_package(href)

    def get_package(self, href):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36"
        }
        time.sleep(0.4)
        resp = requests.get(href, headers=headers)
        result = resp.text
        html = etree.HTML(result)
        imagetitle = html.xpath('//div[@class]/h1/a/text()')[0]
        imagenames = html.xpath('//div[@class="artile_des"]//img/@alt')
        imagecontentes = html.xpath('//div[@class="artile_des"]//img/@src')
        global number  # 'number' is never used below; leftover from an earlier version
        path = "f:/testimages/" + "--" + str(imagetitle)
        # Strip or rewrite characters that are not allowed in Windows directory names.
        path = path.replace(".", "")
        path = path.replace(",", "[")
        path = path.replace("", "[")  # as written this inserts "[" between every character; the intended character to replace seems to have been lost
        path = path.replace("<", "[")
        path = path.replace(">", "]")
        path = path.replace("?", "")
        path = path.replace("|", "")
        os.makedirs(path)
        # print(path)
        for index in range(len(imagecontentes)):
            suffix = os.path.splitext(imagecontentes[index])
            indexname = str(imagenames[index]).replace("?", "")
            # request.urlretrieve(imagecontentes[index], path + "/" + indexname + "--" + str(index) + "--" + suffix[1])
            self.image_queue.put((path + "/" + indexname + "--" + str(index) + "--" + suffix[1],
                                  imagecontentes[index]))


class Consumer(threading.Thread):
    """Consumer: takes (save_path, image_url) pairs from image_queue and downloads them."""

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.image_queue = image_queue
        self.page_queue = page_queue

    def run(self):
        while True:
            # Exits once both queues look empty; may also block on get() if another
            # consumer drains image_queue between this check and the get().
            if self.image_queue.empty() and self.page_queue.empty():
                break
            image_path, image_content = self.image_queue.get()
            request.urlretrieve(image_content, image_path)
            # res = requests.get(image_content)
            # print(image_content)
            # res.raise_for_status()
            # playFile = open(image_path, 'wb')
            # for chunk in res.iter_content(100000):
            #     playFile.write(chunk)
            print(image_content, " 已完成")


def getpackagetag(url, page_queue):
    # Stub left over from an earlier version; its call in main() is commented out.
    time.sleep(0.4)


def main():
    image_queue = Queue(1000)
    page_queue = Queue(1000)
    # os.makedirs("f:/testimages")
    for x in range(1, 10):
        url = "https://www.doutula.com/article/list/?page=%s" % x
        print(url)
        # getpackagetag(url, page_queue)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36"
        }
        resp = requests.get(url, headers=headers)
        time.sleep(0.4)
        result = resp.text
        print(result)
        html = etree.HTML(result)
        # Collect the links to every emoji-package detail page on this list page.
        listhref = html.xpath('//a[(@class="list-group-item random_list tg-article" or @class="list-group-item random_list") and @href]/@href')
        # listtitle = html.xpath('//a[(@class="list-group-item random_list tg-article" or @class="list-group-item random_list") and @href]/div[@class="random_title"]/text()')
        for index in range(len(listhref)):
            # page_queue.put((listtitle[index], listhref[index]))
            # while True:
            #     if page_queue.empty():
            #         break
            #     title, href = page_queue.get()
            #     print(" title ", title, " href ", href)
            print(listhref[index])
            page_queue.put(listhref[index])
    # Start 3 producer threads and 3 consumer threads.
    for index in range(3):
        x = Procuder(page_queue, image_queue)
        x.start()
    for index in range(3):
        x = Consumer(page_queue, image_queue)
        x.start()


if __name__ == '__main__':
    main()
```

Where the code goes wrong:

  With a single thread, nothing goes wrong.

 

With multiple threads, though,

when URLs are put into the queue it looks as though several URLs get placed into the same position at once (roughly?? I'm not sure),

which then causes errors when those URLs are parsed.
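
For reference (this is my own guess, not something confirmed above): `queue.Queue` is itself thread-safe, so two URLs cannot actually occupy one slot. What is racy is the `empty()` check followed by a separate `get()`, and interleaved `print()` calls from several threads can also make two URLs show up on one output line even though each `get()` returned a single URL. Below is a minimal sketch of a worker loop that avoids both issues; `worker`, `print_lock` and the example URLs are illustrative names, not part of the original code.

```python
import threading
from queue import Queue, Empty

print_lock = threading.Lock()          # serialise console output across workers

def worker(page_queue: Queue) -> None:
    while True:
        try:
            # get_nowait() either returns one item or raises Empty atomically,
            # so there is no gap between "check" and "take".
            href = page_queue.get_nowait()
        except Empty:
            break                      # queue drained: this worker is done
        with print_lock:
            print(href)                # one URL per line, even with many threads
        # ... parse `href` here, e.g. self.get_package(href) in the original code

if __name__ == "__main__":
    q = Queue()
    for u in ["https://www.doutula.com/article/list/?page=1",
              "https://www.doutula.com/article/list/?page=2"]:
        q.put(u)
    threads = [threading.Thread(target=worker, args=(q,)) for _ in range(3)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
```

Another common option is to push one sentinel value (e.g. `None`) per worker after all real URLs have been queued, and have each worker exit when it receives one; that also works when producers are still adding items while consumers run.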

