Download the links

The script below reads one URL per line from g:\url.txt, downloads each file into G:\xz using a random User-Agent, retries on HTTP 429 rate limiting, and records any URLs that fail in failed_urls.txt.
import os
import requests
import time
import random

# Directory where downloaded files are saved
download_dir = r'G:\xz'
# Create the directory if it does not already exist
if not os.path.exists(download_dir):
    os.makedirs(download_dir)
# Collect URLs that fail to download
failed_urls = []
# Pool of User-Agent strings to mimic different browsers and devices
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',  # IE 11
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0',
    'Mozilla/5.0 (Android 11; Mobile; rv:68.0) Gecko/68.0 Firefox/88.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36'
]
# Read one URL per line from the txt file
with open(r'g:\url.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Download each URL, with a delay between requests
for line in lines:
    url = line.strip()  # strip the trailing newline
    if not url:
        continue  # skip blank lines
    # Derive a file name from the URL, replacing characters that are unsafe in Windows file names
    file_name = url.split('/')[-1].replace("?", "_").replace("&", "_").replace("=", "_").replace("#", "_")
    file_path = os.path.join(download_dir, file_name)
    # Skip files that have already been downloaded
    if os.path.exists(file_path):
        print(f"File already exists, skipping: {file_path}")
        continue
    # Retry settings
    max_retries = 5  # maximum number of attempts
    base_delay = 10  # base delay (seconds)
    for attempt in range(max_retries):
        # Pick a random User-Agent for this attempt
        headers = {'User-Agent': random.choice(user_agents)}
        try:
            # Send the GET request with the random User-Agent
            response = requests.get(url, headers=headers, timeout=15)
            if response.status_code == 200:
                # Save the response body to the target path
                with open(file_path, 'wb') as f:
                    f.write(response.content)
                print(f"Downloaded {url} to {file_path}")
                break  # success, leave the retry loop
            elif response.status_code == 429:
                # Rate limited: wait a randomized 15-25 seconds before retrying
                wait_time = base_delay + random.uniform(5, 15)
                print(f"Request failed: {url}, status code 429, waiting {wait_time:.2f} s before retrying ({attempt + 1}/{max_retries})")
                time.sleep(wait_time)
            else:
                print(f"Request failed: {url}, status code {response.status_code}")
                failed_urls.append(url)  # record the failed URL
                break  # do not retry on other status codes
        except requests.exceptions.RequestException as e:
            print(f"Request error: {url}, details: {e}")
            failed_urls.append(url)  # record the failed URL
            break
        # If the last retry still failed, record the URL and give up
        if attempt == max_retries - 1:
            print(f"Reached the maximum number of retries, giving up: {url}")
            failed_urls.append(url)
    # Random base delay of 2-3 seconds between consecutive URLs
    time.sleep(random.uniform(2, 3))
# Write the failed URLs to a file
if failed_urls:
    with open(os.path.join(download_dir, 'failed_urls.txt'), 'w', encoding='utf-8') as f:
        for url in failed_urls:
            f.write(f"{url}\n")
    print(f"Saved {len(failed_urls)} failed URLs to {os.path.join(download_dir, 'failed_urls.txt')}")
else:
    print("All URLs processed, no failures recorded")