Below is a simple Python script that scrapes proxy IPs, checks whether each one works, and saves the usable proxies to a file named IP.txt:
```python
import requests
from bs4 import BeautifulSoup
# Fetch the list of proxy IPs from the Xici proxy site
def get_proxy_list():
    url = 'https://www.xicidaili.com/nn/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', attrs={'id': 'ip_list'})
    trs = table.find_all('tr')[1:]  # skip the table header row
    proxy_list = []
    for tr in trs:
        tds = tr.find_all('td')
        ip = tds[1].text.strip()
        port = tds[2].text.strip()
        protocol = tds[5].text.strip().lower()
        proxy = protocol + '://' + ip + ':' + port
        proxy_list.append(proxy)
    return proxy_list

# Check whether a proxy IP is usable
def check_proxy(proxy):
    url = 'https://www.baidu.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        response = requests.get(url, headers=headers, proxies={'http': proxy, 'https': proxy}, timeout=5)
        return response.status_code == 200
    except requests.RequestException:
        return False

# Save the usable proxy IPs to a file
def save_proxy(proxy_list):
    with open('IP.txt', 'w') as f:
        for proxy in proxy_list:
            if check_proxy(proxy):
                f.write(proxy + '\n')

if __name__ == '__main__':
    proxy_list = get_proxy_list()
    save_proxy(proxy_list)
```
The script first uses the requests and BeautifulSoup libraries to fetch and parse the free proxy list from the Xici proxy site, then verifies each proxy by sending a request to Baidu through it, and finally writes every proxy that responds successfully to IP.txt, one per line.
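If you want to use the collected proxies later, a minimal sketch like the one below reads IP.txt and routes a request through a randomly chosen proxy. The target URL https://httpbin.org/ip is only an assumed example site that echoes back the requesting IP; it is not part of the original script.
```python
import random
import requests

# Assumed usage sketch: load the proxies saved by the script above
with open('IP.txt') as f:
    proxies = [line.strip() for line in f if line.strip()]

# Pick one proxy at random and send a request through it
proxy = random.choice(proxies)
resp = requests.get('https://httpbin.org/ip',
                    proxies={'http': proxy, 'https': proxy},
                    timeout=5)
print(proxy, resp.text)  # the response body shows the IP the server saw
```
Since free proxies tend to go stale quickly, it is worth re-checking a proxy shortly before using it rather than trusting an old IP.txt.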