import requests
# import parsel
import os
import re # 正则表达式模块 内置模块
import time
import concurrent.futures
from urllib.parse import urlparse
from fake_useragent import UserAgent
# Save one remote image to disk.
def save(path, title, img_url, headers: dict = None):
    """Download `img_url` and write its bytes to ./<path>/<title>.

    Args:
        path: URL-style directory part (e.g. "/uploads"); "." is prepended,
            so the file lands under the current working directory.
        title: file name to write.
        img_url: full URL of the image.
        headers: optional request headers; when omitted, a browser-like
            default is built (random User-Agent plus Referer/Host derived
            from the image URL — many image hosts reject bare requests).
    """
    parsed = urlparse(img_url)
    if headers is None:
        headers = {
            'User-Agent': UserAgent().random,
            'Referer': f'{parsed.scheme}://{parsed.netloc}',
            'Host': parsed.netloc,
        }
    try:
        # Single guarded request (the old code fired an extra unguarded,
        # header-less request first, which could crash the whole worker).
        resp = requests.get(img_url, headers=headers, timeout=20)
        resp.raise_for_status()  # don't save a 404/403 error page as an image
        img_data = resp.content
    except requests.exceptions.Timeout:
        # requests raises its own Timeout, not the builtin TimeoutError.
        print(f'下载图片超时:{img_url}')
        return
    except Exception as e:
        print(f'下载图片失败:{img_url} -> 原因:{e}')
        return

    real_path = "." + path + '/'
    mkdirFile(real_path)
    try:
        with open(real_path + title, mode='wb') as f:
            f.write(img_data)
    except Exception as e:
        print(f'保存图片失败:{img_url} -> 原因:{e}')
# Create a directory (including parents) if it does not already exist.
def mkdirFile(path):
    """Ensure `path` exists as a directory.

    Uses `exist_ok=True` instead of the old os.path.exists() check, which
    had a race: another thread could create the directory between the
    check and the makedirs call, raising FileExistsError.
    """
    os.makedirs(path, exist_ok=True)
# Per-URL worker: split the URL into directory + filename, then download.
def main(url):
    """Download one image given its full URL.

    The URL's path is split into a directory part and a file name, which
    `save` uses to mirror the remote layout under the current directory.
    """
    parsed = urlparse(url)
    directory, filename = os.path.split(parsed.path)
    save(directory, filename, url)
# Script entry point: read one image URL per line from img_path.txt and
# download them concurrently with a small thread pool.
if __name__ == '__main__':
    time_1 = time.time()
    # `with` guarantees the URL file is closed (the old code leaked the
    # file handle by iterating open(...) directly).
    with open("img_path.txt", "r") as url_file:
        # Strip only the trailing newline from each line.
        data = [line.rstrip('\n') for line in url_file]
    # The executor's context manager calls shutdown() on exit, waiting
    # for all submitted downloads to finish.
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as exe:
        for url in data:
            exe.submit(main, url)
    time_2 = time.time()
    use_time = int(time_2) - int(time_1)
    print(f'总计耗时:{use_time}秒')