Building an IP Proxy Pool (Simple Version)
Two recommended projects for Scrapy proxies:
The first is a free proxy plugin, no payment required: https://github.com/aivarsk/scrapy-proxies
The second is a paid proxy plugin: https://github.com/scrapy-plugins/scrapy-crawlera
A proxy pool example picked up from a video course:
Fetch the proxy list from Xici (xicidaili.com) and store it in a MySQL database:
import requests
import MySQLdb
from scrapy import Selector

# MySQL connection shared by crawl_xici() and the GetIP class below
# (host / user / password / database name are placeholders -- adjust to your setup)
conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="root",
                       db="article_spider", charset="utf8")
cursor = conn.cursor()


def crawl_xici():
    # walk through the high-anonymity proxy list pages on xicidaili.com
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}
    for i in range(1, 3411):  # page numbers start at 1
        res = requests.get("http://www.xicidaili.com/nn/{}".format(i), headers=headers)

        ip_list = []
        selector = Selector(text=res.text)
        all_trs = selector.css("#ip_list tr")
        for tr in all_trs[1:]:  # skip the header row
            speed = 0
            speed_str = tr.css(".bar::attr(title)").extract()[0]
            if speed_str:
                speed = float(speed_str.split("秒")[0])  # the title looks like "0.123秒"
            all_texts = tr.css("td::text").extract()
            ip = all_texts[0]
            port = all_texts[1]
            proxy_type = all_texts[5]
            ip_list.append((ip, port, proxy_type, speed))

        # an earlier version built the INSERT with str.format(); the parameterized
        # query below is safer and also handles duplicate ips as an upsert
        for ip_info in ip_list:
            insert_sql = """
                insert into proxy_ip(ip, port, speed, proxy_type)
                VALUES (%s, %s, %s, %s)
                ON DUPLICATE KEY UPDATE ip=VALUES(ip), port=VALUES(port), speed=VALUES(speed), proxy_type=VALUES(proxy_type)
            """
            params = (ip_info[0], ip_info[1], ip_info[3], ip_info[2])
            cursor.execute(insert_sql, params)
            conn.commit()
A class that returns a random proxy on demand (and deletes invalid proxies along the way):
class GetIP(object):
    # remove an ip from the database
    def delete_ip(self, ip):
        delete_sql = """
            delete from proxy_ip WHERE ip=%s
        """
        cursor.execute(delete_sql, (ip,))
        conn.commit()
        print("ip deleted")
        return True

    # check whether ip:port is still usable by sending a request through it
    def judge_ip(self, ip, port):
        http_url = "http://www.baidu.com"
        proxy_url = "http://{0}:{1}".format(ip, port)
        try:
            proxy_dict = {
                "http": proxy_url
            }
            res = requests.get(http_url, proxies=proxy_dict, timeout=10)
        except Exception as e:
            print("invalid ip and port")
            self.delete_ip(ip)
            return False
        else:
            code = res.status_code
            if 200 <= code < 300:
                print("effective ip")
                return True
            else:
                print("invalid ip and port")
                self.delete_ip(ip)
                return False

    # pull one random ip from the database; recurse until a working one is found
    def get_random_ip(self):
        select_sql = """
            SELECT ip, port FROM proxy_ip ORDER BY RAND() LIMIT 1
        """
        cursor.execute(select_sql)
        for ip_info in cursor.fetchall():
            ip = ip_info[0]
            port = ip_info[1]
            judge_re = self.judge_ip(ip, port)
            if judge_re:
                return "http://{0}:{1}".format(ip, port)
            else:
                return self.get_random_ip()


# crawl_xici()  # uncomment to (re)fill the proxy_ip table
if __name__ == '__main__':
    get_ip = GetIP()
    get_ip.get_random_ip()
Add it to the project's middlewares file:
# remember to add RandomProxyMiddleware to DOWNLOADER_MIDDLEWARES in settings before using it
from tools.crawl_xici_ip import GetIP


# random ip proxy middleware
class RandomProxyMiddleware(object):
    def process_request(self, request, spider):
        get_ip = GetIP()
        request.meta['proxy'] = get_ip.get_random_ip()
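For reference, a sketch of how it could be registered in settings.py (the ArticleSpider module path follows the example further below, and the priority value 605 is an arbitrary choice):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'ArticleSpider.middlewares.RandomProxyMiddleware': 605,
}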
Usage examples of custom pipelines
Pipeline that stores items as JSON (hand-rolled JSON storage):
import codecs
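A fuller sketch of such a codecs-based pipeline; the class name, output file name and the dict-convertible item are placeholders, and the pipeline still has to be registered in ITEM_PIPELINES:

import codecs
import json


class JsonWithEncodingPipeline(object):
    def __init__(self):
        # codecs.open avoids encoding headaches when writing non-ASCII text
        self.file = codecs.open('article.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # one JSON object per line
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        self.file.close()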
Pipeline that stores JSON using Scrapy's built-in exporter:
from scrapy.exporters import JsonItemExporter
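A sketch of the exporter-based variant; the class name and output file name are placeholders:

from scrapy.exporters import JsonItemExporter


class JsonExporterPipeline(object):
    def __init__(self):
        # JsonItemExporter wants a binary file handle
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()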
Pipeline that writes to MySQL (blocking):
import MySQLdb
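A sketch of the blocking variant, assuming placeholder connection parameters, table and item fields; every process_item call waits for MySQL to commit, which is why this version can slow the spider down:

import MySQLdb


class MysqlPipeline(object):
    def __init__(self):
        # connection parameters are placeholders -- adjust to your setup
        self.conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="root",
                                    db="article_spider", charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # table and column names are placeholders matching a hypothetical article item
        insert_sql = """
            insert into article(title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["url"],
                                         item["create_date"], item["fav_nums"]))
        # commit() blocks until MySQL answers
        self.conn.commit()
        return item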
Pipeline that writes to MySQL (asynchronous):
class MysqlTwistedPipline(object):
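A sketch of the asynchronous variant built on Twisted's adbapi connection pool; the MYSQL_* setting names, table and item fields are placeholders. The insert runs in a thread pool, so the spider never blocks on the database:

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class MysqlTwistedPipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        # MYSQL_* keys are assumed to be defined in settings.py
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # runInteraction hands do_insert to a pooled thread instead of blocking here
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        # log the database error instead of silently swallowing it
        print(failure)

    def do_insert(self, cursor, item):
        # table and column names are placeholders matching a hypothetical article item
        insert_sql = """
            insert into article(title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        """
        cursor.execute(insert_sql, (item["title"], item["url"],
                                    item["create_date"], item["fav_nums"]))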
How do you rotate the User-Agent randomly in Scrapy?
Random UA downloader middleware (first version)
The settings file:
DOWNLOADER_MIDDLEWARES = {
    'ArticleSpider.middlewares.RandomUserAgentMiddleware': 543,
}
The middlewares file:
from fake_useragent import UserAgent


# randomly switch the User-Agent for every request
class RandomUserAgentMiddleware(object):
    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        # which fake_useragent attribute to use: "random", "chrome", "firefox", ...
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)
        request.headers.setdefault('User-Agent', get_ua())
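Since ua_type is read from the RANDOM_UA_TYPE setting, the settings file needs that key as well. A sketch; the built-in UserAgentMiddleware is disabled because it runs earlier and would already have filled the User-Agent header, turning the setdefault above into a no-op:

# settings.py
RANDOM_UA_TYPE = "random"   # any fake_useragent attribute: "random", "chrome", "firefox", ...
DOWNLOADER_MIDDLEWARES = {
    'ArticleSpider.middlewares.RandomUserAgentMiddleware': 543,
    # disable Scrapy's own UserAgentMiddleware so it cannot set the header first
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}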
What if the data was stored in the wrong place?
Importing data from Redis into MongoDB:
import json, redis, pymongo
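A sketch of such a migration script, assuming the spider pushed JSON-encoded items onto a Redis list; the key name, database and collection names, and connection parameters are all placeholders:

import json
import redis
import pymongo


def redis_to_mongodb():
    r = redis.Redis(host="127.0.0.1", port=6379, db=0)
    client = pymongo.MongoClient("mongodb://127.0.0.1:27017")
    collection = client["spider_db"]["items"]

    while True:
        # blpop blocks until an item is available; give up after 30s of silence
        source = r.blpop(["spider:items"], timeout=30)
        if source is None:
            break
        data = json.loads(source[1])
        collection.insert_one(data)


if __name__ == '__main__':
    redis_to_mongodb()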
Importing Redis data into MySQL:
import redis, json, time
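A sketch of the MySQL variant under the same assumption (JSON items sitting in a Redis list); the key name, table, columns and credentials are placeholders:

import json
import time
import redis
import MySQLdb


def redis_to_mysql():
    r = redis.Redis(host="127.0.0.1", port=6379, db=0)
    conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="root",
                           db="article_spider", charset="utf8")
    cursor = conn.cursor()

    while True:
        source = r.lpop("spider:items")
        if source is None:
            # nothing queued right now; wait a bit before polling again
            time.sleep(5)
            continue
        item = json.loads(source)
        cursor.execute(
            "insert into article(title, url) VALUES (%s, %s)",
            (item.get("title"), item.get("url")),
        )
        conn.commit()


if __name__ == '__main__':
    redis_to_mysql()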