···
 Usage:
     fourmi search <compound>
     fourmi [options] search <compound>
+    fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
+    fourmi list
+    fourmi [--include=<sourcename> | --exclude=<sourcename>] list
     fourmi -h | --help
     fourmi --version
 
···
     --log=<file>                   Save log to a file.
     -o <file> --output=<file>      Output file [default: result.*format*]
     -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
+    --include=<sourcenames>        Include only sources that match these regular expressions, split by a comma.
+    --exclude=<sourcenames>        Exclude the sources that match these regular expressions, split by a comma.
 """
 
-import os
-import inspect
-
 from twisted.internet import reactor
 from scrapy.crawler import Crawler
 from scrapy import log, signals
 from scrapy.utils.project import get_project_settings
 import docopt
 
-from FourmiCrawler.parsers.parser import Parser
 from FourmiCrawler.spider import FourmiSpider
-
-
-def load_parsers(rel_dir="FourmiCrawler/parsers"):
-    path = os.path.dirname(os.path.abspath(__file__))
-    path += "/" + rel_dir
-    parsers = []
-    known_parser = set()
-
-    for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
-        mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
-        classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
-        for cls in classes:
-            if issubclass(cls, Parser) and cls not in known_parser:
-                parsers.append(cls())  # [review] - Would we ever need arguments for the parsers?
-                known_parser.add(cls)
-    return parsers
+from sourceloader import SourceLoader
 
 
-def setup_crawler(searchable, settings):
+def setup_crawler(searchable, settings, source_loader):
     spider = FourmiSpider(compound=searchable)
-    spider.add_parsers(load_parsers())
+    spider.add_parsers(source_loader.sources)
     crawler = Crawler(settings)
     crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     crawler.configure()
···
     crawler.start()
 
 
-def scrapy_settings_manipulation(arguments):
+def scrapy_settings_manipulation(docopt_arguments):
     settings = get_project_settings()
-
-    if arguments["--output"] != 'result.*format*':
-        settings.overrides["FEED_URI"] = arguments["--output"]
-    elif arguments["--format"] == "jsonlines":
+    # [todo] - add at least a warning for files that already exist
+    if docopt_arguments["--output"] != 'result.*format*':
+        settings.overrides["FEED_URI"] = docopt_arguments["--output"]
+    elif docopt_arguments["--format"] == "jsonlines":
         settings.overrides["FEED_URI"] = "results.json"
-    elif arguments["--format"] is not None:
-        settings.overrides["FEED_URI"] = "results." + arguments["--format"]
+    elif docopt_arguments["--format"] is not None:
+        settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]
 
-    if arguments["--format"] is not None:
-        settings.overrides["FEED_FORMAT"] = arguments["--format"]
+    if docopt_arguments["--format"] is not None:
+        settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]
 
     return settings
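
[note] A quick illustration of what the function above yields — a sketch under the docopt defaults, not part of the diff:

    settings = scrapy_settings_manipulation({"--output": "result.*format*", "--format": "csv"})
    settings.overrides["FEED_URI"]     # "results.csv"
    settings.overrides["FEED_FORMAT"]  # "csv"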
 
 
-def start_log(arguments):
-    if arguments["--log"] is not None:
-        if arguments["--verbose"]:
-            log.start(logfile=arguments["--log"], logstdout=False, loglevel=log.DEBUG)
+def start_log(docopt_arguments):
+    if docopt_arguments["--log"] is not None:
+        if docopt_arguments["--verbose"]:
+            log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
         else:
-            log.start(logfile=arguments["--log"], logstdout=True, loglevel=log.WARNING)
+            log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
     else:
-        if arguments["--verbose"]:
+        if docopt_arguments["--verbose"]:
             log.start(logstdout=False, loglevel=log.DEBUG)
         else:
             log.start(logstdout=True, loglevel=log.WARNING)
 
 
-if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.2.0')
-    start_log(arguments)
-    settings = scrapy_settings_manipulation(arguments)
-    setup_crawler(arguments["<compound>"], settings)
+def search(docopt_arguments, source_loader):
+    start_log(docopt_arguments)
+    settings = scrapy_settings_manipulation(docopt_arguments)
+    setup_crawler(docopt_arguments["<compound>"], settings, source_loader)
     reactor.run()
 
+
+if __name__ == '__main__':
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.1.0')
+    loader = SourceLoader()
+
+    if arguments["--include"]:
+        loader.include(arguments["--include"].split(','))
+    elif arguments["--exclude"]:
+        loader.exclude(arguments["--exclude"].split(','))
+
+    if arguments["search"]:
+        search(arguments, loader)
+    elif arguments["list"]:
+        print "-== Available Sources ==-"
+        print str(loader)
sourceloader.py (new file, 41 additions)
+import inspect
+import os
+import re
+from FourmiCrawler.parsers.parser import Parser
+
+
+class SourceLoader:
+
+    def __init__(self, rel_dir="FourmiCrawler/parsers"):
+        # Instance attribute rather than a class-level list, so every loader
+        # starts with its own source collection instead of appending to state
+        # shared between instances.
+        self.sources = []
+        path = os.path.dirname(os.path.abspath(__file__))
+        path += "/" + rel_dir
+        known_parser = set()
+
+        # Import every module in rel_dir and instantiate each Parser subclass once.
+        for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
+            mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
+            classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
+            for cls in classes:
+                if issubclass(cls, Parser) and cls not in known_parser:
+                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
+                    known_parser.add(cls)
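
[note] The __import__ call with fromlist is what makes this return the leaf module rather than the top-level package — a standalone sketch of the idiom, module name hypothetical:

    mod = __import__('FourmiCrawler.parsers.ChemSpider', fromlist=['ChemSpider'])
    # without fromlist, __import__('FourmiCrawler.parsers.ChemSpider') would
    # return the FourmiCrawler package instead of the ChemSpider module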
+
+    def include(self, source_names):
+        # Keep only the sources whose class name matches one of the patterns.
+        new = set()
+        for name in source_names:
+            new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
+        self.sources = list(new)
+
+    def exclude(self, source_names):
+        # Drop the sources whose class name matches one of the patterns.
+        exclude = []
+        for name in source_names:
+            exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
+        self.sources = [src for src in self.sources if src not in exclude]
+
+    def __str__(self):
+        string = ""
+        for src in self.sources:
+            string += "Source: " + src.__class__.__name__
+            string += " - "
+            string += "URI: " + src.website + "\n"
+        return string
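
[note] include/exclude filter with re.match, which anchors only at the start of the class name — a short sketch of the resulting behaviour, not part of the diff, source names hypothetical:

    loader = SourceLoader()        # suppose it found ChemSpider and Wikipedia
    loader.include(['Chem'])       # keeps ChemSpider: 'Chem' matches as a prefix
    print str(loader)              # "Source: ChemSpider - URI: ..."

    other = SourceLoader()
    other.exclude(['.*pedia'])     # drops Wikipedia; 'pedia' alone would match nothing
    print str(other)               # "Source: ChemSpider - URI: ..."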