A Complete Guide to Downloading Files from a URL
When you need to download files from the Internet with Python, you have several reliable approaches to choose from. This guide covers everything from basic downloads to handling large files and common edge cases. Let's walk through the practical options.
Basic File Downloads with urllib
The 'urllib' library is built into Python and handles simple downloads well:
from urllib.request import urlretrieve

def download_file_simple(url, filename):
    try:
        urlretrieve(url, filename)
        print(f"Successfully downloaded {filename}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
url = "https://example.com/sample.pdf"
download_file_simple(url, "sample.pdf")
This approach works for basic downloads, but it lacks progress tracking and more advanced features. Let's look at a better option.
Using requests: The Recommended Approach
The 'requests' library offers more functionality and better error handling:
import requests

def download_file(url, filename):
    try:
        # Send a GET request to the URL
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Raises an HTTPError for bad responses

        # Open the local file to write the downloaded content
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        return True
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        return False

# Example usage
url = "https://example.com/large-file.zip"
success = download_file(url, "large-file.zip")
if success:
    print("Download completed successfully")
The 'chunk_size=8192' parameter helps manage memory usage when downloading large files by reading the content in smaller pieces.
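To make the difference concrete, here's a minimal sketch contrasting the two modes (the URL is a placeholder): without 'stream=True', requests buffers the entire response body in memory before you can write it out.

import requests

url = "https://example.com/large-file.zip"  # placeholder URL

# Non-streaming: response.content holds the whole file in memory at once.
# Fine for small files, risky for multi-gigabyte ones.
response = requests.get(url)
with open("large-file.zip", "wb") as file:
    file.write(response.content)

# Streaming: only about chunk_size bytes are held in memory at a time.
with requests.get(url, stream=True) as response:
    response.raise_for_status()
    with open("large-file.zip", "wb") as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)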
Adding Progress Tracking
Let's add a progress bar so we can see how a download is going:
import requests
from tqdm import tqdm

def download_with_progress(url, filename):
    try:
        # Send GET request
        response = requests.get(url, stream=True)
        response.raise_for_status()

        # Get the file size from headers
        total_size = int(response.headers.get('content-length', 0))

        # Open file and create progress bar
        with open(filename, 'wb') as file, \
             tqdm(desc=filename,
                  total=total_size,
                  unit='iB',
                  unit_scale=True) as progress_bar:
            for data in response.iter_content(chunk_size=8192):
                size = file.write(data)
                progress_bar.update(size)
        return True
    except requests.exceptions.RequestException as e:
        print(f"Download error: {e}")
        return False

# Example usage
url = "https://example.com/large-file.zip"
download_with_progress(url, "large-file.zip")
This version displays a progress bar with the download speed and estimated time remaining.
Handling Different File Types
Here's a more robust function that handles a variety of file types and includes basic validation:
import requests
import os
import re
import mimetypes
from urllib.parse import urlparse
from tqdm import tqdm

def smart_download(url, output_dir="."):
    try:
        # Send HEAD request first to get headers
        head_response = requests.head(url)
        head_response.raise_for_status()

        # Get filename from Content-Disposition or the URL
        content_disposition = head_response.headers.get('content-disposition')
        if content_disposition:
            fname = re.findall("filename=(.+)", content_disposition)
            if fname:
                filename = fname[0].strip('"')
            else:
                filename = os.path.basename(urlparse(url).path)
        else:
            filename = os.path.basename(urlparse(url).path)

        # If no extension in filename, try to guess from content-type
        if '.' not in filename:
            content_type = head_response.headers.get('content-type')
            if content_type:
                ext = mimetypes.guess_extension(content_type.split(';')[0].strip())
                if ext:
                    filename = f"download{ext}"

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        filepath = os.path.join(output_dir, filename)

        # Download the file with progress tracking
        print(f"Downloading {url} to {filepath}")
        response = requests.get(url, stream=True)
        response.raise_for_status()

        total_size = int(response.headers.get('content-length', 0))
        with open(filepath, 'wb') as file, \
             tqdm(desc=filename,
                  total=total_size,
                  unit='iB',
                  unit_scale=True) as progress_bar:
            for data in response.iter_content(chunk_size=8192):
                size = file.write(data)
                progress_bar.update(size)
        return filepath
    except requests.exceptions.RequestException as e:
        print(f"Download failed: {e}")
        return None

# Example usage
urls = [
    "https://example.com/document.pdf",
    "https://example.com/image.jpg",
    "https://example.com/data.csv"
]
for url in urls:
    downloaded_file = smart_download(url, "downloads")
    if downloaded_file:
        print(f"Successfully downloaded to {downloaded_file}")
Handling Authentication and Headers
When downloading from APIs or protected resources, you may need to handle authentication:
import requests
from tqdm import tqdm

def download_with_auth(url, filename, headers=None, auth=None):
    try:
        # Set default headers if none provided
        if headers is None:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }

        # Make request with authentication and headers
        response = requests.get(url,
                                headers=headers,
                                auth=auth,
                                stream=True)
        response.raise_for_status()

        # Download with progress tracking
        total_size = int(response.headers.get('content-length', 0))
        with open(filename, 'wb') as file, \
             tqdm(desc=filename,
                  total=total_size,
                  unit='iB',
                  unit_scale=True) as progress_bar:
            for chunk in response.iter_content(chunk_size=8192):
                size = file.write(chunk)
                progress_bar.update(size)
        return True
    except requests.exceptions.RequestException as e:
        print(f"Download failed: {e}")
        return False

# Example usage with basic auth
url = "https://api.example.com/files/document.pdf"
headers = {
    'Authorization': 'Bearer your-access-token',
    'Accept': 'application/pdf'
}
auth = ('username', 'password')  # Basic authentication
success = download_with_auth(url, "document.pdf", headers=headers, auth=auth)
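If you're downloading many files from the same API, a 'requests.Session' is worth a look: it reuses the underlying connection and applies your headers to every request. A brief sketch, assuming the same hypothetical endpoint, token, and file names as above:

import requests

session = requests.Session()
session.headers.update({'Authorization': 'Bearer your-access-token'})

# Hypothetical file names on the same API
for name in ["report-q1.pdf", "report-q2.pdf"]:
    url = f"https://api.example.com/files/{name}"
    with session.get(url, stream=True) as response:
        response.raise_for_status()
        with open(name, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)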
Real-World Example: Downloading Multiple Files
Here's a practical example that downloads several files concurrently:
import requests
import os
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def download_file_threaded(args):
    url, output_dir = args
    try:
        filename = os.path.basename(urlparse(url).path)
        filepath = os.path.join(output_dir, filename)

        response = requests.get(url, stream=True)
        response.raise_for_status()

        with open(filepath, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        return filepath
    except Exception:
        # Signal failure to the caller; the summary below counts these
        return None

def download_multiple_files(urls, output_dir="downloads", max_workers=5):
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Prepare arguments for thread pool
    args = [(url, output_dir) for url in urls]

    # Download files using thread pool
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(tqdm(
            executor.map(download_file_threaded, args),
            total=len(urls),
            desc="Downloading files"
        ))

    # Process results
    successful = [r for r in results if r is not None]
    failed = len(results) - len(successful)

    print(f"\nDownload complete:")
    print(f"- Successfully downloaded: {len(successful)} files")
    print(f"- Failed downloads: {failed} files")
    return successful

# Example usage
urls = [
    "https://example.com/file1.pdf",
    "https://example.com/file2.jpg",
    "https://example.com/file3.zip",
]
downloaded_files = download_multiple_files(urls, "downloads", max_workers=3)
Important Notes
1. Always use 'raise_for_status()' to catch HTTP errors
2. Use 'stream=True' with 'iter_content()' to stream large files
3. Add proper error handling for network issues
4. Use progress bars for a better user experience
5. Validate downloaded files when content integrity matters
6. Consider rate limiting when downloading multiple files
7. Handle timeouts on slow connections (see the sketch after this list)
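For the timeout and rate-limit points above, here's one possible approach built on requests' retry support ('urllib3.util.retry.Retry'); the timeout values and retry policy are illustrative, not prescriptive:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def download_with_timeout(url, filename):
    # Retry transient failures, including 429 (rate limited), with
    # progressively longer waits between attempts
    retries = Retry(total=3, backoff_factor=1,
                    status_forcelist=[429, 500, 502, 503, 504])
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=retries))
    session.mount("http://", HTTPAdapter(max_retries=retries))

    # (connect timeout, read timeout) in seconds
    response = session.get(url, stream=True, timeout=(5, 30))
    response.raise_for_status()
    with open(filename, "wb") as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)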
Here's a quick example of file validation:
import hashlib

def validate_download(filepath, expected_hash):
    sha256_hash = hashlib.sha256()
    with open(filepath, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    actual_hash = sha256_hash.hexdigest()
    return actual_hash == expected_hash

# Example usage
filepath = "downloaded_file.zip"
expected_hash = "a1b2c3..."  # Expected SHA-256 hash
if validate_download(filepath, expected_hash):
    print("File integrity verified")
else:
    print("File may be corrupted")
With these methods and patterns, you can reliably download files from the Internet while handling common problems and edge cases. Remember to always consider the security implications and implement error handling appropriate to your specific use case.