"""Multi-threaded scraper for meme image packages on doutula.com.

Producer threads pull article-page URLs from ``page_queue``, parse each page
for its image list, and push ``(save_path, image_url)`` jobs onto
``image_queue``.  Consumer threads pop those jobs and download the images
to disk under ``f:/testimages/``.
"""

from lxml import etree
import requests
from urllib import request
import time
import os
import re
import threading
from queue import Queue, Empty

# Shared request headers: the site rejects requests without a browser UA.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/80.0.3987.100 Safari/537.36",
}


class Procuder(threading.Thread):
    """Producer thread: fetch article pages and enqueue image-download jobs.

    (Name kept as-is — ``main`` instantiates the class by this name.)
    """

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super(Procuder, self).__init__(*args, **kwargs)
        self.image_queue = image_queue
        self.page_queue = page_queue

    def run(self):
        while True:
            # BUG FIX: the original did `if empty(): break` followed by
            # `get()`.  With several threads that pair is racy — another
            # thread can drain the queue between the two calls, leaving
            # this one blocked forever on get().  A non-blocking get()
            # with Empty handling is a single atomic operation.
            try:
                href = self.page_queue.get(block=False)
            except Empty:
                break
            print(href)
            self.get_package(href)

    def get_package(self, href):
        """Fetch one article page and enqueue every image it contains.

        :param href: absolute URL of a doutula article page.
        """
        time.sleep(0.4)  # polite crawl delay
        resp = requests.get(href, headers=HEADERS)
        html = etree.HTML(resp.text)
        # Package title becomes the directory name; alt text becomes the
        # per-image file name.
        imagetitle = html.xpath('//div[@class]/h1/a/text()')[0]
        imagenames = html.xpath('//div[@class="artile_des"]//img/@alt')
        imagecontentes = html.xpath('//div[@class="artile_des"]//img/@src')
        path = self._make_package_dir(imagetitle)
        for index, src in enumerate(imagecontentes):
            suffix = os.path.splitext(src)[1]
            indexname = str(imagenames[index]).replace("?", "")
            self.image_queue.put(
                (path + "/" + indexname + "--" + str(index) + "--" + suffix,
                 src))

    @staticmethod
    def _make_package_dir(imagetitle):
        """Build a filesystem-safe directory for one package and create it.

        Replaces/strips characters Windows forbids in path components.
        Returns the created directory path.
        """
        path = "f:/testimages/" + "--" + str(imagetitle)
        # One pass per forbidden character (the original had a duplicated
        # `replace(",", "[")` line, dropped here).
        for bad, good in ((".", ""), (",", "["), ("<", "["), (">", "]"),
                          ("?", ""), ("|", "")):
            path = path.replace(bad, good)
        # BUG FIX: exist_ok avoids a crash when two packages sanitize to
        # the same title.  (The original's `global number` referenced a
        # name defined nowhere and was removed.)
        os.makedirs(path, exist_ok=True)
        return path


class Consumer(threading.Thread):
    """Consumer thread: download queued images to their target paths."""

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.image_queue = image_queue
        self.page_queue = page_queue

    def run(self):
        while True:
            # BUG FIX: the original tested `image_queue.empty() and
            # page_queue.empty()` and then called a blocking get().  That
            # could either exit while producers were still parsing pages,
            # or block forever after the check passed.  Waiting with a
            # timeout and only quitting once BOTH queues are drained is
            # race-free.
            try:
                image_path, image_content = self.image_queue.get(timeout=1)
            except Empty:
                if self.page_queue.empty():
                    break
                continue  # producers still working — keep waiting
            request.urlretrieve(image_content, image_path)
            print(image_content, " 已完成")


def getpackagetag(url, page_queue):
    """Unused stub kept for interface compatibility.

    NOTE(review): its body only sleeps; the page-listing logic it was
    presumably meant to hold lives inline in main() — confirm before
    deleting.
    """
    time.sleep(0.4)


def main():
    """Crawl list pages 1-9, then start 3 producer and 3 consumer threads."""
    image_queue = Queue(1000)
    page_queue = Queue(1000)
    for x in range(1, 10):
        url = "https://www.doutula.com/article/list/?page=%s" % x
        print(url)
        resp = requests.get(url, headers=HEADERS)
        time.sleep(0.4)  # polite crawl delay
        # (The original also printed the full HTML of every list page —
        # a debug leftover, removed.)
        html = etree.HTML(resp.text)
        listhref = html.xpath(
            '//a[(@class="list-group-item random_list tg-article"'
            ' or @class="list-group-item random_list") and @href]/@href')
        for href in listhref:
            print(href)
            page_queue.put(href)
    for _ in range(3):
        Procuder(page_queue, image_queue).start()
    for _ in range(3):
        Consumer(page_queue, image_queue).start()


if __name__ == '__main__':
    main()
源码问题记录:
使用单线程时不会出错;
使用多线程时,各线程先调用 queue.empty() 判断再调用 get(),
这两步之间存在竞态条件:一个线程判断队列非空后,队列可能已被其他线程取空,
导致该线程永久阻塞,或多个线程处理到异常的 url,解析 url 时报错。
修复方法:改用非阻塞的 get(block=False) 并捕获 queue.Empty 异常。
- 海报