"""Concurrent image downloader.

Reads one image URL per line from img_path.txt and saves each image under a
local directory mirroring the URL's path component, using a small thread pool.
"""
import requests
import os
import re  # kept: regex module was imported by the original file
import time
import concurrent.futures
from urllib.parse import urlparse
from fake_useragent import UserAgent


# Save one image to disk.
def save(path, title, img_url, headers: dict = None):
    """Download *img_url* and write its bytes to ./<path>/<title>.

    path    -- URL directory component (starts with '/'), mirrored locally
    title   -- file name to save under
    img_url -- full image URL to fetch
    headers -- optional request headers; when omitted, a browser-like default
               (random User-Agent plus Referer/Host derived from the URL) is built
    """
    download_parse = urlparse(img_url)
    if headers is None:
        agent = UserAgent()
        headers = {
            'User-Agent': agent.random,
            'Referer': f'{download_parse.scheme}://{download_parse.netloc}',
            'Host': download_parse.netloc,
        }
    # BUG FIX: the original issued an extra, unguarded requests.get() here
    # (no headers, no timeout), downloading every image twice and crashing
    # the worker on any network error. That call is removed.
    img_data = None
    try:
        img_data = requests.get(img_url, headers=headers, timeout=20).content
    except requests.exceptions.Timeout:
        # BUG FIX: requests raises requests.exceptions.Timeout, not the
        # builtin TimeoutError the original tried (and failed) to catch.
        print(f'下载图片超时:{img_url}')
    except Exception as e:
        print(f'下载图片失败:{img_url} -> 原因:{e}')
    if img_data is None:
        # BUG FIX: the original fell through on a failed download and hit a
        # NameError (or wrote stale data) at the write below; bail out instead.
        return
    real_path = "." + path + '/'
    mkdirFile(real_path)
    try:
        with open(real_path + title, mode='wb') as f:
            f.write(img_data)
    except Exception as e:
        print(f'保存图片失败:{img_url} -> 原因:{e}')


# Create the target folder.
def mkdirFile(path):
    """Create *path* (and any missing parents) if it does not already exist."""
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists() test and is a no-op when the folder is present.
    os.makedirs(path, exist_ok=True)


# Main worker: one URL in, one file out.
def main(url):
    """Split *url*'s path into (directory, filename) and download via save()."""
    parsed = urlparse(url)
    path, title = os.path.split(parsed.path)
    save(path, title, url)


# Program entry point.
if __name__ == '__main__':
    time_1 = time.time()
    # BUG FIX: the original iterated open(...) without ever closing it; use a
    # context manager, strip the trailing newline, and skip blank lines.
    with open("img_path.txt", "r") as url_file:
        data = [line.rstrip('\n') for line in url_file if line.strip()]
    # The with-block guarantees shutdown (wait for all downloads) on exit.
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as exe:
        for url in data:
            exe.submit(main, url)
    time_2 = time.time()
    use_time = int(time_2) - int(time_1)
    print(f'总计耗时:{use_time}秒')