Dropping duplicate item value in Scrapy pipeline
I'm storing some results in a .json file, in the following format
(one item per line):
{"category": ["ctg1"],"pages": 3,"websites": ["x1.com","x2.com","x5.com"]}
{"category": ["ctg2"],"pages": 2,"websites": ["x1.com","d4.com"]}
...
I'm trying to remove the duplicate values without dropping the whole item, but so far without success.
Code:
import scrapy
import json
import codecs
from scrapy.exceptions import DropItem


class ResultPipeline(object):

    def __init__(self):
        self.ids_seen = set()
        self.file = codecs.open('results.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        for sites in item['websites']:
            if sites in self.ids_seen:
                raise DropItem("Duplicate item found: %s" % sites)
            else:
                self.ids_seen.add(sites)
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def spider_closed(self, spider):
        self.file.close()
Instead of dropping the duplicate item, just rebuild its 'websites' list from the sites that are not already in ids_seen:
import json

line1 = '{"category": ["ctg1"],"pages": 3,"websites": ["x1.com","x2.com","x5.com"]}'
line2 = '{"category": ["ctg2"],"pages": 2,"websites": ["x1.com","d4.com"]}'
lines = (line1, line2)

ids_seen = set()


def process_item(item):
    item_unique_sites = []
    for site in item['websites']:
        if site not in ids_seen:
            ids_seen.add(site)
            item_unique_sites.append(site)
    # Delete the duplicates
    item['websites'] = item_unique_sites
    line = json.dumps(dict(item), ensure_ascii=False) + "\n"
    print(line)
    # self.file.write(line)
    return item


for line in lines:
    json_data = json.loads(line)
    process_item(json_data)
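Folded back into the Scrapy pipeline, the same idea could look like the sketch below. It keeps the original fields and file handling from the question; the DropItem branch for items whose sites have all been seen before is an assumption about the desired behavior, not part of the original code:

import json
import codecs
from scrapy.exceptions import DropItem


class ResultPipeline(object):

    def __init__(self):
        self.ids_seen = set()
        self.file = codecs.open('results.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Keep only the sites that no earlier item has contributed yet
        unique_sites = [s for s in item['websites'] if s not in self.ids_seen]
        self.ids_seen.update(unique_sites)
        if not unique_sites:
            # Assumption: drop the item only when nothing new is left in it
            raise DropItem("All sites already seen in item: %s" % item)
        item['websites'] = unique_sites
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def spider_closed(self, spider):
        self.file.close()

This way a duplicate site only trims the 'websites' list, and the item itself still reaches the output file unless its list ends up empty.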