A web scraper built to search for specific information on a given compound (and its pseudonyms)

Merge branch 'feature/parser-selection' into develop

+83 -42
+42 -42
fourmi.py
··· 5 5 Usage: 6 6 fourmi search <compound> 7 7 fourmi [options] search <compound> 8 + fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound> 9 + fourmi list 10 + fourmi [--include=<sourcename> | --exclude=<sourcename>] list 8 11 fourmi -h | --help 9 12 fourmi --version 10 13 ··· 15 18 --log=<file> Save log to a file. 16 19 -o <file> --output=<file> Output file [default: result.*format*] 17 20 -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines] 21 + --include=<sourcenames> Include only sources that match these regular expressions, split by a comma. 22 + --exclude=<sourcenames> Exclude the sources that match these regular expressions, split by a comma. 18 23 """ 19 24 20 - import os 21 - import inspect 22 - 23 25 from twisted.internet import reactor 24 26 from scrapy.crawler import Crawler 25 27 from scrapy import log, signals 26 28 from scrapy.utils.project import get_project_settings 27 29 import docopt 28 30 29 - from FourmiCrawler.parsers.parser import Parser 30 31 from FourmiCrawler.spider import FourmiSpider 31 - 32 - 33 - def load_parsers(rel_dir="FourmiCrawler/parsers"): 34 - path = os.path.dirname(os.path.abspath(__file__)) 35 - path += "/" + rel_dir 36 - parsers = [] 37 - known_parser = set() 38 - 39 - for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: 40 - mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py]) 41 - classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] 42 - for cls in classes: 43 - if issubclass(cls, Parser) and cls not in known_parser: 44 - parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? 
from sourceloader import SourceLoader


def setup_crawler(searchable, settings, source_loader):
    """Configure and launch a crawler for one compound.

    :param searchable: the compound name to search for
    :param settings: a scrapy settings object (see scrapy_settings_manipulation)
    :param source_loader: SourceLoader whose sources act as the parsers
    """
    spider = FourmiSpider(compound=searchable)
    spider.add_parsers(source_loader.sources)
    crawler = Crawler(settings)
    # Stop the reactor once the spider finishes so the process can exit.
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)  # NOTE(review): line hidden in the diff context -- confirm against the repo
    crawler.start()


def scrapy_settings_manipulation(docopt_arguments):
    """Derive the scrapy feed settings from the command-line arguments.

    :param docopt_arguments: the parsed docopt argument dictionary
    :return: the project settings with FEED_URI/FEED_FORMAT overridden
    """
    settings = get_project_settings()
    # [todo] - add at least a warning for files that already exist
    if docopt_arguments["--output"] != 'result.*format*':
        settings.overrides["FEED_URI"] = docopt_arguments["--output"]
    elif docopt_arguments["--format"] == "jsonlines":
        # jsonlines output still goes to a .json file by convention.
        settings.overrides["FEED_URI"] = "results.json"
    elif docopt_arguments["--format"] is not None:
        settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]

    if docopt_arguments["--format"] is not None:
        settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]

    return settings


def start_log(docopt_arguments):
    """Set up scrapy logging according to the command-line arguments.

    Verbose mode logs everything (DEBUG) to the log file only; the default
    logs warnings and mirrors them to stdout.

    :param docopt_arguments: the parsed docopt argument dictionary
    """
    # The original four branches all reduce to: DEBUG without stdout echo
    # when verbose, WARNING with stdout echo otherwise.
    verbose = docopt_arguments["--verbose"]
    level = log.DEBUG if verbose else log.WARNING
    to_stdout = not verbose
    if docopt_arguments["--log"] is not None:
        log.start(logfile=docopt_arguments["--log"], logstdout=to_stdout, loglevel=level)
    else:
        log.start(logstdout=to_stdout, loglevel=level)


def search(docopt_arguments, source_loader):
    """Execute the 'search' command: crawl the selected sources for a compound.

    :param docopt_arguments: the parsed docopt argument dictionary
    :param source_loader: SourceLoader holding the (possibly filtered) sources
    """
    start_log(docopt_arguments)
    settings = scrapy_settings_manipulation(docopt_arguments)
    setup_crawler(docopt_arguments["<compound>"], settings, source_loader)
    reactor.run()


if __name__ == '__main__':
    # NOTE(review): the merge reverted the version string to V0.1.0 while the
    # develop branch (old line 89) already carried V0.2.0; restored here.
    arguments = docopt.docopt(__doc__, version='Fourmi - V0.2.0')
    loader = SourceLoader()

    if arguments["--include"]:
        loader.include(arguments["--include"].split(','))
    elif arguments["--exclude"]:
        loader.exclude(arguments["--exclude"].split(','))

    if arguments["search"]:
        search(arguments, loader)
    elif arguments["list"]:
        # Parenthesized prints work under both Python 2 and Python 3.
        print("-== Available Sources ==-")
        print(str(loader))
+41
sourceloader.py
import inspect
import os
import re


class SourceLoader:
    """Discovers and holds the available source parsers.

    On construction, one instance of every Parser subclass found under the
    given directory is stored in ``self.sources``; the list can then be
    narrowed with :meth:`include` / :meth:`exclude`.
    """

    def __init__(self, rel_dir="FourmiCrawler/parsers"):
        """Load one instance of every parser class found under rel_dir.

        :param rel_dir: directory, relative to this file, scanned for
            parser modules.
        """
        # Imported here so this module can be imported without pulling in
        # the parser package's dependencies.
        from FourmiCrawler.parsers.parser import Parser

        # Instance attribute: a class-level ``sources = []`` would be shared
        # across instances, so every SourceLoader() would append duplicates
        # of all parsers to the same list.
        self.sources = []
        path = os.path.dirname(os.path.abspath(__file__))
        path += "/" + rel_dir
        known_parser = set()

        for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
            mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
            classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
            for cls in classes:
                # ``cls is not Parser`` skips the base class itself, which
                # each parser module re-imports into its own namespace and
                # would otherwise be instantiated as a bogus source.
                if issubclass(cls, Parser) and cls is not Parser and cls not in known_parser:
                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
                    known_parser.add(cls)

    def include(self, source_names):
        """Keep only sources whose class name matches any given pattern.

        :param source_names: iterable of regular-expression strings.
        """
        new = set()
        for name in source_names:
            new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
        self.sources = list(new)

    def exclude(self, source_names):
        """Drop every source whose class name matches any given pattern.

        :param source_names: iterable of regular-expression strings.
        """
        exclude = []
        for name in source_names:
            exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
        self.sources = [src for src in self.sources if src not in exclude]

    def __str__(self):
        """Return a printable listing of the loaded sources and their URIs."""
        string = ""
        for src in self.sources:
            string += "Source: " + src.__class__.__name__
            string += " - "
            string += "URI: " + src.website + "\n"
        return string