A web scraper built to search for specific information about a given compound (and its pseudonyms)

Merge branch 'release/0.3.1'

+20 -3
+16
FourmiCrawler/pipelines.py
··· 5 5 import re 6 6 from scrapy.exceptions import DropItem 7 7 8 + class RemoveNonePipeline(object): 9 + 10 + def __init__(self): 11 + self.known_values = set() 12 + 13 + def process_item(self, item, spider): 14 + """ 15 + Processing the items so None values are replaced by empty strings 16 + :param item: The incoming item 17 + :param spider: The spider which scraped the spider 18 + :return: :raise DropItem: Returns the item if unique or drops them if it's already known 19 + """ 20 + for key in item: 21 + if item[key] is None: 22 + item[key] = "" 23 + return item 8 24 9 25 class DuplicatePipeline(object): 10 26
+3 -2
FourmiCrawler/settings.py
··· 11 11 SPIDER_MODULES = ['FourmiCrawler'] 12 12 NEWSPIDER_MODULE = 'FourmiCrawler' 13 13 ITEM_PIPELINES = { 14 - 'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100, 15 - 'FourmiCrawler.pipelines.DuplicatePipeline': 200, 14 + "FourmiCrawler.pipelines.RemoveNonePipeline": 100, 15 + 'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200, 16 + 'FourmiCrawler.pipelines.DuplicatePipeline': 300, 16 17 } 17 18 FEED_URI = 'results.json' 18 19 FEED_FORMAT = 'jsonlines'
+1 -1
fourmi.py
··· 80 80 81 81 82 82 if __name__ == '__main__': 83 - arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0') 83 + arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.1') 84 84 loader = SourceLoader() 85 85 86 86 if arguments["--include"]: