A web scraper built to search for specific information on a given compound (and its pseudonyms)

Merge branch 'release/v0.1.0'

+99 -53
-48
Fourmi.py
#!/usr/bin/env python
"""
Fourmi - An internet webcrawler searching for information on chemical
compounds. [todo] - Add some more useful text here.
Version: v0.0.1 - Empty Application that could do something but all logic of websites isn't there yet!
"""

import os
import inspect

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings

from FourmiCrawler.parsers.parser import Parser
from FourmiCrawler.spider import FourmiSpider


def load_parsers(rel_dir="FourmiCrawler/parsers"):
    """Instantiate every Parser subclass found in the modules under *rel_dir*.

    :param rel_dir: directory, relative to this file, scanned for parser modules
    :return: list holding one instance of each distinct Parser subclass
    """
    path = os.path.dirname(os.path.abspath(__file__))
    path += "/" + rel_dir
    parsers = []
    known_parsers = set()

    for py in [f[:-3] for f in os.listdir(path)
               if f.endswith('.py') and f != '__init__.py']:
        mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
        classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
        for cls in classes:
            # NOTE(review): issubclass(Parser, Parser) is True, so the Parser
            # base class itself is also instantiated here - confirm intended.
            if issubclass(cls, Parser) and cls not in known_parsers:
                parsers.append(cls())  # [review] - Would we ever need arguments for the parsers?
                known_parsers.add(cls)
    return parsers


def setup_crawler(searchables):
    """Configure a Scrapy crawler for the given compounds and start it.

    :param searchables: list of compound names handed to the FourmiSpider
    """
    spider = FourmiSpider(compounds=searchables)
    spider.add_parsers(load_parsers())
    settings = get_project_settings()
    crawler = Crawler(settings)
    # Stop the Twisted reactor as soon as the spider finishes.
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()


def start():
    """Entry point: crawl for "Methane" and block until the reactor stops."""
    setup_crawler(["Methane"])
    log.start()
    reactor.run()


if __name__ == '__main__':
    # [fix] - Guard the entry point: the unguarded module-level start() call
    # launched the crawler as a side effect of merely importing this module.
    start()
+3
FourmiCrawler/settings.py
ITEM_PIPELINES = {
    # Route every scraped item through the project pipeline (priority 100).
    'FourmiCrawler.pipelines.FourmiPipeline': 100
}
# Default feed export: one JSON object per line, written to results.json.
# (Presumably overridden at runtime by the command line front end - verify.)
FEED_URI = 'results.json'
FEED_FORMAT = 'jsonlines'


# Crawl responsibly by identifying yourself (and your website) on the
# user-agent
+2 -5
FourmiCrawler/spider.py
    # NOTE(review): these are class-level mutable attributes, shared by every
    # FourmiSpider instance; appending mutates them for all instances.
    __parsers = []
    synonyms = []

    def __init__(self, compound=None, *args, **kwargs):
        """Create a spider searching for a single *compound*.

        :param compound: compound name appended to the synonym list
            (the previous API accepted a list via a ``compounds=`` keyword)
        """
        super(FourmiSpider, self).__init__(*args, **kwargs)
        # Appends to the shared class-level list - see NOTE above.
        self.synonyms.append(compound)

    def parse(self, reponse):
        for parser in self.__parsers:
+94
fourmi.py
#!/usr/bin/env python
"""
Fourmi, a web scraper built to search for specific information on a given compound (and its pseudonyms).

Usage:
    fourmi search <compound>
    fourmi [options] search <compound>
    fourmi -h | --help
    fourmi --version

Options:
    -h --help                      Show this screen.
    --version                      Show version.
    --verbose                      Verbose logging output.
    --log=<file>                   Save log to a file.
    -o <file> --output=<file>      Output file [default: result.*format*]
    -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
"""

import os
import inspect

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings
import docopt

from FourmiCrawler.parsers.parser import Parser
from FourmiCrawler.spider import FourmiSpider


def load_parsers(rel_dir="FourmiCrawler/parsers"):
    """Instantiate every Parser subclass found in the modules under *rel_dir*.

    :param rel_dir: directory, relative to this file, scanned for parser modules
    :return: list holding one instance of each distinct Parser subclass
    """
    path = os.path.dirname(os.path.abspath(__file__))
    path += "/" + rel_dir
    parsers = []
    known_parsers = set()

    for py in [f[:-3] for f in os.listdir(path)
               if f.endswith('.py') and f != '__init__.py']:
        mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
        classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
        for cls in classes:
            # NOTE(review): issubclass(Parser, Parser) is True, so the Parser
            # base class itself is also instantiated here - confirm intended.
            if issubclass(cls, Parser) and cls not in known_parsers:
                parsers.append(cls())  # [review] - Would we ever need arguments for the parsers?
                known_parsers.add(cls)
    return parsers


def setup_crawler(searchable, settings):
    """Configure a Scrapy crawler for one compound and start it.

    :param searchable: single compound name handed to the FourmiSpider
    :param settings: Scrapy settings object used to build the crawler
    """
    spider = FourmiSpider(compound=searchable)
    spider.add_parsers(load_parsers())
    crawler = Crawler(settings)
    # Stop the Twisted reactor as soon as the spider finishes.
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()


def scrapy_settings_manipulation(arguments):
    """Translate parsed command line arguments into Scrapy settings.

    :param arguments: docopt argument dictionary
    :return: project settings with FEED_URI / FEED_FORMAT overridden
    """
    settings = get_project_settings()

    if arguments["--output"] != 'result.*format*':
        # An explicitly requested output file always wins.
        settings.overrides["FEED_URI"] = arguments["--output"]
    elif arguments["--format"] == "jsonlines":
        settings.overrides["FEED_URI"] = "results.json"
    elif arguments["--format"] is not None:
        settings.overrides["FEED_URI"] = "results." + arguments["--format"]

    if arguments["--format"] is not None:
        settings.overrides["FEED_FORMAT"] = arguments["--format"]

    return settings


def start_log(arguments):
    """Start Scrapy logging according to the --log and --verbose options."""
    # NOTE(review): logstdout=False on the verbose branches looks inverted
    # (verbose runs do NOT echo stdout into the log) - confirm intent.
    if arguments["--log"] is not None:
        if arguments["--verbose"]:
            log.start(logfile=arguments["--log"], logstdout=False, loglevel=log.DEBUG)
        else:
            log.start(logfile=arguments["--log"], logstdout=True, loglevel=log.WARNING)
    else:
        if arguments["--verbose"]:
            log.start(logstdout=False, loglevel=log.DEBUG)
        else:
            log.start(logstdout=True, loglevel=log.WARNING)


if __name__ == '__main__':
    arguments = docopt.docopt(__doc__, version='Fourmi - V0.1.0')
    start_log(arguments)
    settings = scrapy_settings_manipulation(arguments)
    # [fix] - Pass the compound string itself: FourmiSpider.__init__ appends
    # its `compound` argument directly to the synonym list, so wrapping the
    # value in a list would append a *list* instead of a compound name.
    setup_crawler(arguments["<compound>"], settings)
    reactor.run()