如何使用Python从代理IP池中获取IP

2023-04-17 00:00:00 标签:获取、如何使用、代理IP池
  1. 安装requests和bs4库
pip install requests
pip install bs4
  2. 爬取代理IP池网站

我们可以从代理IP池网站上爬取免费的代理IP地址,这里以http://www.xiladaili.com/为例。

import requests 
from bs4 import BeautifulSoup

def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Raises requests.HTTPError (via raise_for_status) on a non-2xx reply.
    """
    # Send a desktop-browser User-Agent so the proxy site serves the page.
    ua = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
          'AppleWebKit/537.36 (KHTML, like Gecko) '
          'Chrome/58.0.3029.110 Safari/537.3')
    response = requests.get(url, headers={'User-Agent': ua}, timeout=30)
    response.raise_for_status()
    # Force UTF-8 decoding regardless of the declared charset.
    response.encoding = 'utf-8'
    return response.text

def get_proxy_ip():
    """Scrape xiladaili's high-anonymity list page.

    Returns:
        list[str]: proxy URLs of the form 'http://ip:port'. Returns an
        empty list if the expected table is missing from the page
        (previously this crashed with AttributeError on ``None.tbody``).
    """
    url = 'http://www.xiladaili.com/gaoni/'
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', attrs={'class': 'fl-table'})
    # Fix: soup.find returns None when the layout changes or an error
    # page is served; guard instead of raising AttributeError.
    if table is None or table.tbody is None:
        return []
    ip_list = []
    for tr in table.tbody.find_all('tr'):
        tds = tr.find_all('td')
        # Data rows carry exactly 7 cells; skip header/ad rows.
        if len(tds) != 7:
            continue
        ip = tds[0].text.strip()
        port = tds[1].text.strip()
        proxy_type = tds[3].text.strip()
        if proxy_type == 'HTTP':
            ip_list.append(f'http://{ip}:{port}')
    return ip_list

if __name__ == '__main__':
    # Demo: scrape the pool and print every proxy URL collected.
    print(get_proxy_ip())
  3. 随机选择一个代理IP
import random

# Pick one proxy at random from the pool scraped in the previous step
# (`ip_list`) and shape it the way requests' `proxies=` expects.
# NOTE(review): random.choice raises IndexError if ip_list is empty —
# check the list first in real code.
proxies = {'http': random.choice(ip_list)}
  4. 使用代理IP访问网站
import requests

# Route the request through the chosen proxy. requests applies the
# proxy whose key matches the URL scheme only.
# NOTE(review): this URL is https, so a proxies dict with only an
# 'http' key leaves the request UNPROXIED; an 'https' entry is needed.
url = 'https://www.pidancode.com'
response = requests.get(url, proxies=proxies)
print(response.text)

完整代码:

import random
import requests
from bs4 import BeautifulSoup

def get_html(url):
    """Download *url* and return its body as UTF-8 text."""
    headers = {
        # Masquerade as desktop Chrome so the site returns the page.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()  # surface HTTP errors to the caller
    response.encoding = 'utf-8'  # decode as UTF-8 explicitly
    return response.text

def get_proxy_ip():
    """Scrape xiladaili's high-anonymity list page.

    Returns:
        list[str]: proxy URLs of the form 'http://ip:port'. Returns an
        empty list if the expected table is missing from the page
        (previously this crashed with AttributeError on ``None.tbody``).
    """
    url = 'http://www.xiladaili.com/gaoni/'
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', attrs={'class': 'fl-table'})
    # Fix: soup.find returns None when the layout changes or an error
    # page is served; guard instead of raising AttributeError.
    if table is None or table.tbody is None:
        return []
    ip_list = []
    for tr in table.tbody.find_all('tr'):
        tds = tr.find_all('td')
        # Data rows carry exactly 7 cells; skip header/ad rows.
        if len(tds) != 7:
            continue
        ip = tds[0].text.strip()
        port = tds[1].text.strip()
        proxy_type = tds[3].text.strip()
        if proxy_type == 'HTTP':
            ip_list.append(f'http://{ip}:{port}')
    return ip_list

if __name__ == '__main__':
    ip_list = get_proxy_ip()
    if not ip_list:
        # Fix: random.choice on an empty list raises IndexError;
        # report the empty pool instead of crashing.
        print('No proxies found')
    else:
        proxy = random.choice(ip_list)
        # Fix: requests applies the proxy whose key matches the URL
        # scheme. The target URL is https, so the original
        # {'http': ...} dict silently left the request unproxied;
        # register the proxy for both schemes.
        proxies = {'http': proxy, 'https': proxy}
        url = 'https://www.pidancode.com'
        response = requests.get(url, proxies=proxies)
        print(response.text)

相关文章