Practical Scrapy Pipeline Examples

Hands-on examples of using item pipelines.

Storing JSON from a pipeline (custom JSON export)

import codecs
import json

class JsonWithEncodingPipeline(object):
    # Custom JSON file export: write each item as one JSON object per line
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        # Called automatically by Scrapy when the spider closes
        self.file.close()
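To enable a pipeline, register it in settings.py under ITEM_PIPELINES. A minimal sketch, assuming a hypothetical project module named ArticleSpider with the class above defined in ArticleSpider/pipelines.py:

# settings.py -- "ArticleSpider" is a hypothetical project name
ITEM_PIPELINES = {
    # Lower numbers run earlier (valid range: 0-1000)
    'ArticleSpider.pipelines.JsonWithEncodingPipeline': 300,
}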

Storing JSON from a pipeline (using Scrapy's built-in exporter)

from scrapy.exporters import JsonItemExporter

class JsonExporterPipeline(object):
    # Use Scrapy's built-in JsonItemExporter to export a JSON file
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
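Note the difference in output: JsonItemExporter writes all items as a single JSON array (opened by start_exporting() and closed by finish_exporting()), while the custom pipeline above writes one JSON object per line. If you want the line-per-item format from a built-in component, Scrapy also ships JsonLinesItemExporter; a minimal sketch of swapping it in (the output filename is arbitrary):

from scrapy.exporters import JsonLinesItemExporter

class JsonLinesExporterPipeline(object):
    # Same lifecycle as above, but each item is written as its own JSON line
    def __init__(self):
        self.file = open('articleexport.jl', 'wb')
        self.exporter = JsonLinesItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item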

Storing to MySQL from a pipeline (synchronous, blocking)

import MySQLdb
import MySQLdb.cursors

class MysqlPipeline(object):
    # Write to MySQL synchronously: each insert blocks the crawl until it commits
    def __init__(self):
        self.conn = MySQLdb.connect('192.168.0.106', 'root', 'root', 'article_spider', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"]))
        self.conn.commit()
        return item
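The insert assumes a jobbole_article table already exists in the article_spider database. The original post does not show the schema; a plausible minimal sketch (column names come from the INSERT above, types and lengths are assumptions):

-- Guessed schema; column types and lengths are assumptions
CREATE TABLE jobbole_article (
    title       VARCHAR(200) NOT NULL,
    url         VARCHAR(255) NOT NULL,
    create_date DATE,
    fav_nums    INT DEFAULT 0,
    PRIMARY KEY (url)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;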

Storing to MySQL from a pipeline (asynchronous)

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi

class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        # Build the connection parameters from the project settings
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Use Twisted to run the MySQL insert asynchronously
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle exceptions
        return item

    def handle_error(self, failure, item, spider):
        # Handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        """
        cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"]))

    # def do_insert(self, cursor, item):
    #     # Execute the actual insert:
    #     # build a different SQL statement for each item type and insert it into MySQL
    #     insert_sql, params = item.get_insert_sql()
    #     print(insert_sql, params)
    #     cursor.execute(insert_sql, params)
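Since from_settings reads its connection parameters from the project settings, settings.py must define the MYSQL_* keys it looks up. A minimal sketch (the values are placeholders mirroring the synchronous example above):

# settings.py -- placeholder values
MYSQL_HOST = "192.168.0.106"
MYSQL_DBNAME = "article_spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"

The commented-out do_insert variant expects each item to supply its own SQL through a get_insert_sql() method, which keeps the pipeline generic across item types. That method is not part of Scrapy itself; a minimal sketch of such an item, with a hypothetical class name and the field set taken from the inserts above:

import scrapy

class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    create_date = scrapy.Field()
    fav_nums = scrapy.Field()

    def get_insert_sql(self):
        # Each item type supplies its own SQL statement and parameters
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        """
        params = (self["title"], self["url"], self["create_date"], self["fav_nums"])
        return insert_sql, params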