Downloading Files from a URL in Python

A Complete Guide to Downloading Files from a URL

When you need to download files from the Internet with Python, there are several reliable approaches. This guide covers everything from basic downloads to handling large files and managing common edge cases. Let's explore practical ways to get this done.

Basic File Downloads with urllib

The 'urllib' library is built into Python and handles simple downloads well:

from urllib.request import urlretrieve

def download_file_simple(url, filename):
    try:
        urlretrieve(url, filename)
        print(f"Successfully downloaded {filename}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
url = "https://example.com/sample.pdf"
download_file_simple(url, "sample.pdf")

This approach works for basic downloads, but it lacks progress tracking and more advanced features. Let's look at a better option.

Using requests: The Recommended Approach

The 'requests' library offers more features and better error handling:

import requests

def download_file(url, filename):
    try:
        # Send a GET request to the URL
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Raises an HTTPError for bad responses
        
        # Open the local file to write the downloaded content
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        
        return True
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        return False

# Example usage
url = "https://example.com/large-file.zip"
success = download_file(url, "large-file.zip")
if success:
    print("Download completed successfully")

The 'chunk_size=8192' parameter helps manage memory usage when downloading large files by reading the content in smaller pieces rather than all at once.
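
For contrast, here is a minimal sketch of the non-streaming form (the function name is just illustrative). Without 'stream=True', 'response.content' buffers the entire file in memory, which is fine for small files but wasteful for large ones:

import requests

def download_small_file(url, filename):
    # No stream=True: the whole response body is held in memory at once
    response = requests.get(url)
    response.raise_for_status()
    with open(filename, 'wb') as file:
        file.write(response.content)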

Adding Progress Tracking

Let's add a progress bar (using the third-party 'tqdm' package) so we can see how the download is progressing:

import requests
from tqdm import tqdm

def download_with_progress(url, filename):
    try:
        # Send GET request
        response = requests.get(url, stream=True)
        response.raise_for_status()
        
        # Get the file size from headers
        total_size = int(response.headers.get('content-length', 0))
        
        # Open file and create progress bar
        with open(filename, 'wb') as file, \
             tqdm(desc=filename,
                  total=total_size,
                  unit='iB',
                  unit_scale=True) as progress_bar:
            
            for data in response.iter_content(chunk_size=8192):
                size = file.write(data)
                progress_bar.update(size)
                
        return True
    except requests.exceptions.RequestException as e:
        print(f"Download error: {e}")
        return False

# Example usage
url = "https://example.com/large-file.zip"
download_with_progress(url, "large-file.zip")

This version displays a progress bar that includes the download speed and an estimate of the time remaining.
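
If you would rather avoid a third-party dependency, the standard library's 'urlretrieve' accepts a 'reporthook' callback that is called after each block is transferred. A minimal sketch of a plain-text progress readout (the helper name is just illustrative):

from urllib.request import urlretrieve

def print_progress(block_num, block_size, total_size):
    # reporthook arguments: blocks transferred so far, block size in bytes, total size from headers
    if total_size > 0:
        downloaded = min(block_num * block_size, total_size)
        percent = downloaded * 100 / total_size
        print(f"\rDownloaded {percent:.1f}%", end="")

urlretrieve("https://example.com/large-file.zip", "large-file.zip", reporthook=print_progress)
print()  # move past the carriage-return line when finished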

Handling Different File Types

Here is a more robust function that handles various file types and includes basic validation:

import requests
import os
import re
from urllib.parse import urlparse
import mimetypes
from tqdm import tqdm

def smart_download(url, output_dir="."):
    try:
        # Send HEAD request first to get headers
        head_response = requests.head(url)
        head_response.raise_for_status()
        
        # Get filename from URL or Content-Disposition
        content_disposition = head_response.headers.get('content-disposition')
        if content_disposition:
            fname = re.findall("filename=(.+)", content_disposition)
            if fname:
                filename = fname[0].strip('"')
            else:
                filename = os.path.basename(urlparse(url).path)
        else:
            filename = os.path.basename(urlparse(url).path)
        
        # If no extension in filename, try to guess from content-type
        if '.' not in filename:
            content_type = head_response.headers.get('content-type')
            if content_type:
                ext = mimetypes.guess_extension(content_type.split(';')[0].strip())
                if ext:
                    filename = f"download{ext}"
        
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        filepath = os.path.join(output_dir, filename)
        
        # Download the file with progress tracking
        print(f"Downloading {url} to {filepath}")
        response = requests.get(url, stream=True)
        response.raise_for_status()
        
        total_size = int(response.headers.get('content-length', 0))
        
        with open(filepath, 'wb') as file, \
             tqdm(desc=filename,
                  total=total_size,
                  unit='iB',
                  unit_scale=True) as progress_bar:
            
            for data in response.iter_content(chunk_size=8192):
                size = file.write(data)
                progress_bar.update(size)
        
        return filepath
    
    except requests.exceptions.RequestException as e:
        print(f"Download failed: {e}")
        return None

# Example usage
urls = [
    "https://example.com/document.pdf",
    "https://example.com/image.jpg",
    "https://example.com/data.csv"
]

for url in urls:
    downloaded_file = smart_download(url, "downloads")
    if downloaded_file:
        print(f"Successfully downloaded to {downloaded_file}")

Handling Authentication and Headers

When downloading from APIs or protected resources, you may need to handle authentication:

import requests
from tqdm import tqdm

def download_with_auth(url, filename, headers=None, auth=None):
    try:
        # Set default headers if none provided
        if headers is None:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
        
        # Make request with authentication and headers
        response = requests.get(url,
                              headers=headers,
                              auth=auth,
                              stream=True)
        response.raise_for_status()
        
        # Download with progress tracking
        total_size = int(response.headers.get('content-length', 0))
        
        with open(filename, 'wb') as file, \
             tqdm(desc=filename,
                  total=total_size,
                  unit='iB',
                  unit_scale=True) as progress_bar:
            
            for chunk in response.iter_content(chunk_size=8192):
                size = file.write(chunk)
                progress_bar.update(size)
        
        return True
    
    except requests.exceptions.RequestException as e:
        print(f"Download failed: {e}")
        return False

# Example usage with basic auth
url = "https://api.example.com/files/document.pdf"
headers = {
    'Authorization': 'Bearer your-access-token',
    'Accept': 'application/pdf'
}
auth = ('username', 'password')  # Basic authentication

success = download_with_auth(url, "document.pdf", headers=headers, auth=auth)
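
If you need several files from the same API, a 'requests.Session' lets you set headers or credentials once and reuses the underlying connection between requests. A short sketch along the same lines (the URLs and token are placeholders):

import requests

session = requests.Session()
session.headers.update({'Authorization': 'Bearer your-access-token'})
# session.auth = ('username', 'password')  # uncomment for basic auth instead

for url in ["https://api.example.com/files/report1.pdf",
            "https://api.example.com/files/report2.pdf"]:
    response = session.get(url, stream=True)
    response.raise_for_status()
    filename = url.rsplit('/', 1)[-1]
    with open(filename, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)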

Practical Example: Downloading Multiple Files

Here is a practical example that downloads several files at the same time:

import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from urllib.parse import urlparse
import os

def download_file_threaded(args):
    url, output_dir = args
    try:
        filename = os.path.basename(urlparse(url).path)
        filepath = os.path.join(output_dir, filename)
        
        response = requests.get(url, stream=True)
        response.raise_for_status()
        
        with open(filepath, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        
        return filepath
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return None

def download_multiple_files(urls, output_dir="downloads", max_workers=5):
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Prepare arguments for thread pool
    args = [(url, output_dir) for url in urls]
    
    # Download files using thread pool
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(tqdm(
            executor.map(download_file_threaded, args),
            total=len(urls),
            desc="Downloading files"
        ))
    
    # Process results
    successful = [r for r in results if r is not None]
    failed = len(results) - len(successful)
    
    print(f"\nDownload complete:")
    print(f"- Successfully downloaded: {len(successful)} files")
    print(f"- Failed downloads: {failed} files")
    
    return successful

# Example usage
urls = [
    "https://example.com/file1.pdf",
    "https://example.com/file2.jpg",
    "https://example.com/file3.zip",
]

downloaded_files = download_multiple_files(urls, "downloads", max_workers=3)
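
Threads work well here because downloads are I/O-bound: while one thread waits on the network, Python releases the GIL and the other threads can keep transferring data. Keep 'max_workers' modest so you do not overwhelm the remote server with simultaneous requests.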

Important Notes

1. Always use 'raise_for_status()' to catch HTTP errors
2. Use 'stream=True' with 'iter_content()' to stream large files
3. Add proper error handling for network problems
4. Use progress bars for a better user experience
5. Validate downloaded files when content integrity matters
6. Consider rate limiting when downloading many files
7. Handle timeouts for slow connections (points 6 and 7 are sketched right after this list)
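
To illustrate points 6 and 7, here is a minimal sketch that passes a timeout to 'requests.get()' and spaces requests out with a fixed delay (the one-second delay and 15-second timeout are arbitrary example values):

import time
import requests

def polite_download(urls, delay_seconds=1.0, timeout=15):
    for url in urls:
        try:
            # timeout covers both connecting and each read of the response body
            response = requests.get(url, stream=True, timeout=timeout)
            response.raise_for_status()
            filename = url.rsplit('/', 1)[-1] or "download.bin"
            with open(filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        except requests.exceptions.RequestException as e:
            print(f"Skipping {url}: {e}")
        # Simple rate limiting: pause before requesting the next file
        time.sleep(delay_seconds)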

Here is a quick example of file validation:

import hashlib

def validate_download(filepath, expected_hash):
    sha256_hash = hashlib.sha256()
    with open(filepath, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    actual_hash = sha256_hash.hexdigest()
    return actual_hash == expected_hash

# Example usage
filepath = "downloaded_file.zip"
expected_hash = "a1b2c3..."  # Expected SHA-256 hash
if validate_download(filepath, expected_hash):
    print("File integrity verified")
else:
    print("File may be corrupted")

By using these methods and patterns, you can reliably download files from the Internet while handling common problems and edge cases. Remember to always consider the security implications and to implement error handling appropriate to your specific use case.
