A web scraper built to search for specific information about a given compound (and its pseudonyms)
1# For more information on item pipelines, see the Scrapy documentation in:
2# http://doc.scrapy.org/en/latest/topics/item-pipeline.html
3import re
4
5from scrapy.exceptions import DropItem
6
7
class RemoveNonePipeline(object):
    """Item pipeline that swaps every None value of an item for an empty string."""

    def __init__(self):
        # This pipeline keeps no state, so there is nothing to set up.
        pass

    @staticmethod
    def process_item(item, spider):
        """
        Replace each None value in the item by an empty string
        :param item: The incoming item; it is modified in place
        :param spider: The spider which scraped the item (not used by this pipeline)
        :return: The same item object, with its None values replaced by ""
        """
        # Gather the affected keys first, then overwrite their values.
        empty_fields = [field for field in item if item[field] is None]
        for field in empty_fields:
            item[field] = ""
        return item
24
25
class DuplicatePipeline(object):
    """Item pipeline that drops items whose attribute, value and conditions were seen before."""

    def __init__(self):
        # (attribute, value, conditions) tuples of every item let through so far.
        self.known_values = set()

    def process_item(self, item, spider):
        """
        Let an item through the first time its attribute/value/conditions combination
        shows up and drop every later exact double
        :param item: The incoming item; must provide 'attribute', 'value' and 'conditions'
        :param spider: The spider which scraped the item (not used by this pipeline)
        :return: :raise DropItem: Returns the item if unique or drops it if it's already known
        """
        fingerprint = (item['attribute'], item['value'], item['conditions'])
        already_known = fingerprint in self.known_values
        if not already_known:
            # First occurrence: remember it and pass the item on.
            self.known_values.add(fingerprint)
            return item
        raise DropItem("Duplicate item found: %s" % item)  # [todo] append sources of first item.
43
44
class AttributeSelectionPipeline(object):
    """Item pipeline that only lets through items whose attribute was selected on the spider."""

    def __init__(self):
        # This pipeline keeps no state, so there is nothing to set up.
        pass

    @staticmethod
    def process_item(item, spider):
        """
        The items are processed using the selected attribute list available in the spider,
        items that don't match any of the selected attributes are dropped.
        :param item: The incoming item; its "attribute" value is matched against the patterns
        :param spider: The spider which scraped the item. Should have an attribute "selected_attributes"
            holding the regular expression patterns to match with.
        :return: :raise DropItem: Returns the item if it matches a selected attribute, else it is dropped.
        """
        # re.match only anchors at the start of the string, so a pattern selects every
        # attribute that begins with a match. any() stops at the first matching pattern
        # instead of building a list of all of them.
        if any(re.match(pattern, item["attribute"]) for pattern in spider.selected_attributes):
            return item
        raise DropItem("Attribute not selected by user: %s" % item)
61 raise DropItem("Attribute not selected by used: %s" % item)