A web scraper built to search for specific information about a given compound (and its pseudonyms)
at feature/executable 61 lines 2.2 kB view raw
# For more information on item pipelines, see the Scrapy documentation in:
# http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re

from scrapy.exceptions import DropItem


class RemoveNonePipeline(object):
    """Item pipeline that replaces None field values with empty strings."""

    @staticmethod
    def process_item(item, spider):
        """
        Processing the items so None values are replaced by empty strings
        :param item: The incoming item
        :param spider: The spider which scraped the item (not used here)
        :return: The same item, modified in place; this pipeline never drops items
        """
        # Only values are reassigned, never keys, so iterating over the item
        # while updating it is safe.
        for key in item:
            if item[key] is None:
                item[key] = ""
        return item


class DuplicatePipeline(object):
    """Item pipeline that drops items this pipeline instance has already seen."""

    def __init__(self):
        # (attribute, value, conditions) tuples of every item passed on so far.
        self.known_values = set()

    def process_item(self, item, spider):
        """
        Processing the items so exact doubles are dropped
        :param item: The incoming item; its 'attribute', 'value' and 'conditions'
                     fields together form the identity used for comparison
        :param spider: The spider which scraped the item (not used here)
        :return: The item if it has not been seen before
        :raise DropItem: If an item with the same identity was already processed
        """
        value = (item['attribute'], item['value'], item['conditions'])
        if value in self.known_values:
            raise DropItem("Duplicate item found: %s" % item)  # [todo] append sources of first item.
        self.known_values.add(value)
        return item


class AttributeSelectionPipeline(object):
    """Item pipeline that keeps only items whose attribute was selected on the spider."""

    @staticmethod
    def process_item(item, spider):
        """
        The items are processed using the selected attribute list available in the spider,
        items that don't match the selected items are dropped.
        :param item: The incoming item
        :param spider: The spider which scraped the item. Should have an attribute
                       "selected_attributes" holding regular expression patterns.
        :return: The item if its attribute matches at least one selected pattern
        :raise DropItem: If none of the selected patterns match the item's attribute
        """
        attribute = item["attribute"]
        # re.match anchors at the start of the string only; any() stops at the
        # first pattern that matches instead of building a throwaway list.
        if any(re.match(pattern, attribute) for pattern in spider.selected_attributes):
            return item
        raise DropItem("Attribute not selected by user: %s" % item)