How to Implement Multi-Threaded File Downloading with Proxy IPs in Python

2023-04-17 00:00:00 Files Multithreading How-to

Implementing multi-threaded file downloading through proxy IPs can be broken into the following steps:
1. Build a pool of proxy IPs
2. Create multi-threaded download tasks and assign a proxy IP to each
3. Support resuming interrupted downloads
4. Merge the downloaded parts into one file when finished
The sections below walk through each step in Python.
1. Build a pool of proxy IPs
We can use free proxies published by third-party sites such as http://www.xicidaili.com/. Here we use the requests library to fetch the page, parse the proxy IPs and ports out of the HTML with BeautifulSoup, and store them in a proxy pool. Free proxies go stale quickly, so it is worth filtering the pool before use; a small liveness check is sketched after the code below.

import requests
from bs4 import BeautifulSoup
def get_proxy_ip():
    url = 'http://www.xicidaili.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    trs = soup.find_all('tr')
    proxy_ips = []
    for tr in trs[1:]:  # skip the table header row
        tds = tr.find_all('td')
        if len(tds) != 0:
            # On this listing page, the second and third columns hold the IP and port.
            ip = tds[1].text
            port = tds[2].text
            proxy_ips.append(ip + ':' + port)
    return proxy_ips
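
Free proxies from listing pages like this die quickly, so it helps to filter the pool before using it. Below is a minimal sketch of such a check; the check_proxy/filter_proxies helpers and the test URL http://httpbin.org/ip are our own illustrative choices, not part of the original article. A typical call would be proxy_ips = filter_proxies(get_proxy_ip()).

import requests

def check_proxy(proxy_ip, test_url='http://httpbin.org/ip', timeout=5):
    # Return True if the proxy answers a simple GET within the timeout.
    proxies = {'http': 'http://' + proxy_ip, 'https': 'http://' + proxy_ip}
    try:
        r = requests.get(test_url, proxies=proxies, timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False

def filter_proxies(proxy_ips):
    # Keep only the proxies that pass the liveness check.
    return [p for p in proxy_ips if check_proxy(p)]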
2. Create multi-threaded download tasks and assign proxy IPs
Using Python's built-in threading module, we start several threads and give each one a proxy IP. Each thread downloads one byte range of the file, which speeds up the overall download, and requests takes care of the proxy configuration. Range requests assume the server supports them; a quick check for that is sketched right after the code below.
import threading
import requests
proxy_ips = get_proxy_ip()
def download_file(url, start, end, file_name, proxy_ip):
    headers = {
        'Range': 'bytes={}-{}'.format(start, end),
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    # Route both http and https traffic through the (plain HTTP) proxy.
    proxies = {
        'http': 'http://' + proxy_ip,
        'https': 'http://' + proxy_ip,
    }
    try:
        r = requests.get(url, headers=headers, proxies=proxies, stream=True, timeout=10)
        # Write this byte range into the pre-allocated file at its offset.
        with open(file_name, 'r+b') as f:
            f.seek(start)
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    except requests.RequestException:
        pass  # a failed range is silently skipped here; a real tool would retry with another proxy
def multi_thread_download(url, num_thread=8):
    # Read the response headers to learn the file size, then close the connection.
    r = requests.get(url, stream=True)
    file_name = url.split('/')[-1]
    total_size = int(r.headers['Content-Length'])
    r.close()
    # Split the file into num_thread byte ranges (Range offsets are inclusive).
    part = total_size // num_thread
    intervals = []
    for i in range(num_thread):
        start = i * part
        end = start + part - 1
        if i == num_thread - 1:
            end = total_size - 1  # the last range also covers the remainder
        intervals.append((start, end))
    # Pre-allocate the target file so each thread can seek to its own offset.
    with open(file_name, 'wb') as f:
        f.truncate(total_size)
    threads = []
    proxy_ip = proxy_ips[0]
    for start, end in intervals:
        t = threading.Thread(target=download_file, args=(url, start, end, file_name, proxy_ip))
        threads.append(t)
        proxy_ips.append(proxy_ips.pop(0))  # rotate the pool so each thread uses a different proxy
        proxy_ip = proxy_ips[0]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
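
Splitting a download into byte ranges only works when the server honors Range requests. A quick way to check this before calling multi_thread_download is to look at the Accept-Ranges response header; the supports_range helper below is our own addition for illustration, not part of the original article.

def supports_range(url):
    # Most servers that accept byte-range requests advertise it in Accept-Ranges.
    r = requests.head(url, allow_redirects=True)
    return r.headers.get('Accept-Ranges', '').lower() == 'bytes'

# Fall back to a single thread when range requests are not supported:
# num_thread = 8 if supports_range(url) else 1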
3. Support resuming interrupted downloads
If the program exits abnormally or the download is interrupted, we want to keep the data that has already been written and only fetch what is still missing. Before downloading, we check whether a partially downloaded file already exists; if it does, we read its size and continue with the remaining parts. Note that because multi_thread_download pre-allocates the file to its full size, a size-based check alone cannot tell which ranges actually completed, so the code below simply re-dispatches every range; a more robust approach that tracks per-part progress is sketched after the code.
import os
def could_continue(file_name, total_size):
    if os.path.exists(file_name):
        size = os.path.getsize(file_name)
        return total_size > size
    return False
def continue_download(url, num_thread=8):
    r = requests.head(url)  # a HEAD request is enough to read the file size
    total_size = int(r.headers['Content-Length'])
    file_name = url.split('/')[-1]
    if not could_continue(file_name, total_size):
        multi_thread_download(url, num_thread)
    else:
        # A partial file already exists: dispatch one thread per byte range again.
        threads = []
        part = total_size // num_thread
        for i in range(num_thread):
            start = i * part
            end = start + part - 1
            if i == num_thread - 1:
                end = total_size - 1
            proxy_ips.append(proxy_ips.pop(0))  # rotate the pool so each thread uses a different proxy
            proxy_ip = proxy_ips[0]
            t = threading.Thread(target=download_file, args=(url, start, end, file_name, proxy_ip))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:
            t.join()
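
Because the file is pre-allocated to its full size, a simple size comparison cannot tell which ranges actually finished, so the code above ends up re-downloading every range. A more reliable resume scheme records each completed part separately. The sketch below, with a hypothetical .progress sidecar file and load_progress/mark_part_done helpers, shows one way this could be done; it is not part of the original article.

import json

def _progress_path(file_name):
    # Hypothetical sidecar file that stores the indices of completed parts.
    return file_name + '.progress'

def load_progress(file_name):
    # Return the set of part indices that have already finished.
    try:
        with open(_progress_path(file_name)) as f:
            return set(json.load(f))
    except (OSError, ValueError):
        return set()

def mark_part_done(file_name, part_index):
    # Record that one byte range has been fully written.
    done = load_progress(file_name)
    done.add(part_index)
    with open(_progress_path(file_name), 'w') as f:
        json.dump(sorted(done), f)

# continue_download could then skip any part whose index is in load_progress(file_name),
# and each worker could call mark_part_done(file_name, i) after its range is written successfully.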
4. Merge the files when the download finishes
If each thread saves its byte range to a separate part file, the pieces have to be concatenated into one complete file once every download finishes; Python's built-in file handling plus the os module is enough for this. (In the multi_thread_download shown above, the ranges are written directly into one pre-allocated file, so no merge is needed in that flow.) A short sketch of the per-part-file variant follows the merge_files code below.
def merge_files(file_names):
    # Concatenate the part files in download order, removing each part once it is copied.
    with open('merged_file', 'wb') as f:
        for file_name in file_names:
            with open(file_name, 'rb') as fi:
                f.write(fi.read())
            os.remove(file_name)
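
For completeness, here is a minimal sketch of the variant that merge_files is actually meant for, in which each thread saves its byte range to its own part file; the download_part helper and the .part{} naming convention are our own and do not appear in the original article.

def download_part(url, start, end, part_name, proxy_ip):
    # Fetch one byte range and save it to its own small part file.
    headers = {'Range': 'bytes={}-{}'.format(start, end)}
    proxies = {'http': 'http://' + proxy_ip, 'https': 'http://' + proxy_ip}
    r = requests.get(url, headers=headers, proxies=proxies, stream=True, timeout=10)
    with open(part_name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)

# After all part downloads finish, the pieces are concatenated in order, e.g.:
# merge_files(['test10Mb.db.part0', 'test10Mb.db.part1', 'test10Mb.db.part2', ...])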

The complete code is as follows:

import os
import threading
import requests
from bs4 import BeautifulSoup
def get_proxy_ip():
    url = 'http://www.xicidaili.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    trs = soup.find_all('tr')
    proxy_ips = []
    for tr in trs[1:]:
        tds = tr.find_all('td')
        if len(tds) != 0:
            ip = tds[1].text
            port = tds[2].text
            proxy_ips.append(ip + ':' + port)
    return proxy_ips
proxy_ips = get_proxy_ip()
def download_file(url, start, end, file_name, proxy_ip):
    headers = {
        'Range': 'bytes={}-{}'.format(start, end),
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    # Route both http and https traffic through the (plain HTTP) proxy.
    proxies = {
        'http': 'http://' + proxy_ip,
        'https': 'http://' + proxy_ip,
    }
    try:
        r = requests.get(url, headers=headers, proxies=proxies, stream=True, timeout=10)
        # Write this byte range into the pre-allocated file at its offset.
        with open(file_name, 'r+b') as f:
            f.seek(start)
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    except requests.RequestException:
        pass  # a failed range is silently skipped here; a real tool would retry with another proxy
def multi_thread_download(url, num_thread=8):
    # Read the response headers to learn the file size, then close the connection.
    r = requests.get(url, stream=True)
    file_name = url.split('/')[-1]
    total_size = int(r.headers['Content-Length'])
    r.close()
    # Split the file into num_thread byte ranges (Range offsets are inclusive).
    part = total_size // num_thread
    intervals = []
    for i in range(num_thread):
        start = i * part
        end = start + part - 1
        if i == num_thread - 1:
            end = total_size - 1  # the last range also covers the remainder
        intervals.append((start, end))
    # Pre-allocate the target file so each thread can seek to its own offset.
    with open(file_name, 'wb') as f:
        f.truncate(total_size)
    threads = []
    proxy_ip = proxy_ips[0]
    for start, end in intervals:
        t = threading.Thread(target=download_file, args=(url, start, end, file_name, proxy_ip))
        threads.append(t)
        proxy_ips.append(proxy_ips.pop(0))  # rotate the pool so each thread uses a different proxy
        proxy_ip = proxy_ips[0]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
def could_continue(file_name, total_size):
    if os.path.exists(file_name):
        size = os.path.getsize(file_name)
        return total_size > size
    return False
def continue_download(url, num_thread=8):
    r = requests.head(url)  # a HEAD request is enough to read the file size
    total_size = int(r.headers['Content-Length'])
    file_name = url.split('/')[-1]
    if not could_continue(file_name, total_size):
        multi_thread_download(url, num_thread)
    else:
        # A partial file already exists: dispatch one thread per byte range again.
        threads = []
        part = total_size // num_thread
        for i in range(num_thread):
            start = i * part
            end = start + part - 1
            if i == num_thread - 1:
                end = total_size - 1
            proxy_ips.append(proxy_ips.pop(0))  # rotate the pool so each thread uses a different proxy
            proxy_ip = proxy_ips[0]
            t = threading.Thread(target=download_file, args=(url, start, end, file_name, proxy_ip))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:
            t.join()
def merge_files(file_names):
    # Concatenate the part files in download order, removing each part once it is copied.
    with open('merged_file', 'wb') as f:
        for file_name in file_names:
            with open(file_name, 'rb') as fi:
                f.write(fi.read())
            os.remove(file_name)
if __name__ == '__main__':
    url = 'http://speedtest.ftp.otenet.gr/files/test10Mb.db'
    # The byte ranges are written directly into one pre-allocated file (test10Mb.db),
    # so no merge step is needed here; merge_files is only used when each thread
    # saves its range to a separate part file.
    continue_download(url, num_thread=8)
