How to Fetch an IP from a Proxy IP Pool with Python
- Install the requests and bs4 libraries
```bash
pip install requests
pip install bs4   # bs4 is a thin shim package that installs beautifulsoup4
```
- Scrape a proxy IP pool website
We can scrape free proxy IP addresses from a proxy pool website; here we use http://www.xiladaili.com/ as an example.
```python
import requests
from bs4 import BeautifulSoup


def get_html(url):
    """Fetch a page and return its decoded HTML text."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    r.encoding = 'utf-8'
    return r.text


def get_proxy_ip():
    """Parse the high-anonymity list and return proxy URLs like http://ip:port."""
    url = 'http://www.xiladaili.com/gaoni/'
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', attrs={'class': 'fl-table'})
    trs = table.tbody.find_all('tr')
    ip_list = []
    for tr in trs:
        tds = tr.find_all('td')
        if len(tds) == 7:  # skip header or malformed rows
            ip = tds[0].text.strip()
            port = tds[1].text.strip()
            proxy_type = tds[3].text.strip()
            if proxy_type == 'HTTP':  # keep only HTTP-type proxies
                ip_list.append('http://' + ip + ':' + port)
    return ip_list


if __name__ == '__main__':
    ip_list = get_proxy_ip()
    print(ip_list)
```
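Free proxies scraped this way are often stale or unreachable, so it is worth verifying each candidate before use. Below is a minimal validation sketch; the `check_proxy` helper name, the test URL http://httpbin.org/ip, and the 5-second timeout are illustrative assumptions, not part of the original tutorial.

```python
import requests


def check_proxy(proxy, test_url='http://httpbin.org/ip', timeout=5):
    """Return True if the proxy completes a simple GET within the timeout.
    test_url and timeout are illustrative assumptions."""
    try:
        r = requests.get(test_url,
                         proxies={'http': proxy, 'https': proxy},
                         timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False


# Hypothetical usage: keep only proxies that actually respond
# live_proxies = [p for p in get_proxy_ip() if check_proxy(p)]
```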
- Randomly pick a proxy IP
```python
import random

# requests selects a proxy by the target URL's scheme, so register the
# chosen proxy under both keys; otherwise https:// requests would bypass it.
proxy = random.choice(ip_list)
proxies = {'http': proxy, 'https': proxy}
```
- Access a website through the proxy IP
```python
import requests

url = 'https://www.pidancode.com'
response = requests.get(url, proxies=proxies)
print(response.text)
```
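A single randomly chosen free proxy can easily fail mid-request, so a common refinement is to try proxies from the pool one by one until a request succeeds. The sketch below assumes the `ip_list` returned by the scraper above; the `get_via_pool` helper name and the 10-second timeout are illustrative choices, not from the original code.

```python
import random
import requests


def get_via_pool(url, ip_list, timeout=10):
    """Try proxies from ip_list in random order until one succeeds.
    A sketch; the timeout value is an assumption."""
    candidates = ip_list[:]
    random.shuffle(candidates)
    for proxy in candidates:
        try:
            return requests.get(url,
                                proxies={'http': proxy, 'https': proxy},
                                timeout=timeout)
        except requests.RequestException:
            continue  # dead or slow proxy; move on to the next one
    raise RuntimeError('No working proxy in the pool')
```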
Full code:
```python
import random
import requests
from bs4 import BeautifulSoup


def get_html(url):
    """Fetch a page and return its decoded HTML text."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    r.encoding = 'utf-8'
    return r.text


def get_proxy_ip():
    """Parse the high-anonymity list and return proxy URLs like http://ip:port."""
    url = 'http://www.xiladaili.com/gaoni/'
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', attrs={'class': 'fl-table'})
    trs = table.tbody.find_all('tr')
    ip_list = []
    for tr in trs:
        tds = tr.find_all('td')
        if len(tds) == 7:  # skip header or malformed rows
            ip = tds[0].text.strip()
            port = tds[1].text.strip()
            proxy_type = tds[3].text.strip()
            if proxy_type == 'HTTP':  # keep only HTTP-type proxies
                ip_list.append('http://' + ip + ':' + port)
    return ip_list


if __name__ == '__main__':
    ip_list = get_proxy_ip()
    proxy = random.choice(ip_list)
    proxies = {'http': proxy, 'https': proxy}  # cover both URL schemes
    url = 'https://www.pidancode.com'
    response = requests.get(url, proxies=proxies)
    print(response.text)
```