Building an IP Pool in Python

September 20, 2021

Building an IP pool

An IP pool is pretty much essential for large-scale scraping: some sites rate-limit requests per IP, and one slip gets your IP banned. When that happens you have two options: ① pay for proxy IPs (stable), or ② scrape free proxy IPs and build your own pool (unstable).
Either way works; it just depends on how deep your pockets are.
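
Whichever route you take, plugging a proxy into requests looks the same. A minimal sketch (the proxy address below is a made-up placeholder, not a working proxy):

# Minimal sketch: route one request through a proxy with requests.
# 1.2.3.4:8080 is a placeholder; substitute a real ip:port.
import requests

proxy = '1.2.3.4:8080'
proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
resp = requests.get('https://www.baidu.com/', proxies=proxies, timeout=5)
print(resp.status_code)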

There are plenty of free proxy sites: search Baidu for "ip代理" and a pile of them will come up; pick a few at random to scrape from.

Most proxy sites have essentially no anti-scraping measures; simple XPath plus a regex will do the job, and the full script below does exactly that. Do watch your request rate, though: these are small sites that can't take much punishment, and since we're taking their data for free, don't push it.

# -*- coding: utf-8 -*-
import datetime
import os
import re
import time

import fake_useragent
import requests
from lxml import etree

# Load the User-Agent list from a local file instead of the online cache.
# Note: the path / verify_ssl / use_cache_server keyword arguments come from
# older fake_useragent releases (e.g. 0.1.11) and were removed later.
location = os.path.join(os.getcwd(), 'headers.csv')
ka = fake_useragent.UserAgent(path=location, verify_ssl=False, use_cache_server=False)

# Minimal headers for the index page: a random User-Agent is usually enough
headers = {
	'User-Agent': ka.random
}

# Full browser-like headers for the detail pages; the Cookie value is
# session-specific and should be replaced with your own
he = {
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
	'User-Agent':ka.random,
	'Accept-Encoding': 'gzip, deflate, br',
	'Accept-Language': 'zh-CN,zh;q=0.9',
	'Cache-Control': 'max-age=0',
	'Cookie': 'acw_tc=76b20f4415724078231512183e3c6a7d68346bb9b735affcb7e7adae8bce5f; ASPSESSIONIDSQBAQBBS=MNPFALHAAAEJLDPPPGNLILAL; __51cke__=; Hm_lvt_8fd158bb3e69c43ab5dd05882cf0b234=1572407823; ASPSESSIONIDSQAATBBT=IDCDBJIAMJKDLENIPNCAGHHH; __tins__16949115=%7B%22sid%22%3A%201572412362318%2C%20%22vd%22%3A%206%2C%20%22expires%22%3A%201572415815590%7D; __51laig__=9; Hm_lpvt_8fd158bb3e69c43ab5dd05882cf0b234=1572414016',
	'Host': 'www.zdaye.com',
	'Referer': 'https://www.zdaye.com/dayProxy.html',
	'Sec-Fetch-Site': 'same-origin',
	'Sec-Fetch-Mode': 'navigate',
	'Sec-Fetch-User': '?1',
	'Upgrade-Insecure-Requests': '1'
}
def main(url_one, url_two):
	path = 'https://www.zdaye.com/dayProxy.html'
	requests.packages.urllib3.disable_warnings()
	response = requests.get(url=path, headers=headers, verify=False)
	response.encoding = 'utf-8'
	source = etree.HTML(response.text)
	# collect the link to each day's proxy-list thread from the index page
	new_list = source.xpath('//div[@class="thread_item"]/div[@class="thread_content"]/h3/a/@href')
	print('------- scraper started: %s pages to crawl -------' % len(new_list))
	start = datetime.datetime.now()  # start time
	for new in new_list:
		url = 'https://www.zdaye.com' + str(new)
		res = requests.get(url=url, headers=he, verify=False)
		res.encoding = 'utf-8'
		# pull the "ip:port" strings out of the page with a regex
		ip_list = re.findall('<br><a href=".*?">(.*?)@HTTP', res.text)

		for ip in ip_list:
			ip = ip.replace('</a>', '')
			# verify each IP against two different sites; keep it only if both pass
			if checkip(url_one, ip):
				if checkip(url_two, ip):
					write(ip)
					print(ip)
		time.sleep(4)  # be polite: pause between pages
	end = datetime.datetime.now()
	diff = gettimediff(start, end)
	ips = read()
	print('Scraped %s proxy IPs in total, elapsed: %s \n' % (len(ips), diff))


def checkip(targeturl, ip):
	# route both http and https traffic through the candidate proxy
	proxies = {"http": "http://" + ip, "https": "http://" + ip}
	try:
		status = requests.get(url=targeturl, proxies=proxies, headers=headers, timeout=5).status_code
		return status == 200
	except requests.RequestException:
		return False



# Append a validated IP to the file
def write(text):
	with open('ip_test.txt', 'a', encoding='utf-8') as f:
		f.write(text)
		f.write('\n')


# Clear the file (opening in 'w' mode truncates it)
def truncatefile():
	with open('ip_test.txt', 'w', encoding='utf-8'):
		pass


# Read the saved IPs back into a list
def read():
	with open('ip_test.txt', 'r', encoding='utf-8') as f:
		return [s.strip() for s in f.readlines()]

# Compute the elapsed time, formatted as HH:MM:SS
def gettimediff(start, end):
	seconds = (end - start).seconds
	m, s = divmod(seconds, 60)
	h, m = divmod(m, 60)
	diff = ("%02d:%02d:%02d" % (h, m, s))
	return diff


if __name__ == '__main__':
	# clear the output file before a fresh run
	truncatefile()
	# sites used to validate each proxy
	url_one = 'https://blog.csdn.net'
	url_two = 'https://www.baidu.com/'
	main(url_one, url_two)
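
Once ip_test.txt has some entries, consuming the pool is just a matter of picking a random proxy per request and dropping the ones that die. A minimal sketch, assuming the file produced by the script above (get_with_pool is a hypothetical helper, not part of the original script):

# Minimal sketch: pick a random proxy from the pool for each request,
# discarding proxies that fail. Assumes ip_test.txt was produced by the
# script above; get_with_pool is a hypothetical helper name.
import random
import requests

def get_with_pool(url, pool_file='ip_test.txt', retries=3):
	with open(pool_file, 'r', encoding='utf-8') as f:
		pool = [line.strip() for line in f if line.strip()]
	for _ in range(retries):
		if not pool:
			break
		ip = random.choice(pool)
		proxies = {'http': 'http://' + ip, 'https': 'http://' + ip}
		try:
			return requests.get(url, proxies=proxies, timeout=5)
		except requests.RequestException:
			pool.remove(ip)  # drop the dead proxy and try another
	raise RuntimeError('no working proxy in the pool')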
    Original author: 查无此字 ๑
    Original post: https://blog.csdn.net/weixin_42195144/article/details/103257258