···343435353636def setup_crawler(searchable, settings, source_loader, attributes):
3737+ """
3838+ This function prepares and starts the crawler, which starts the actual search on the internet
3939+ :param searchable: The compound which should be searched
4040+ :param settings: A scrapy settings object
4141+ :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
4242+ :param attributes: A list of regular expressions which the attribute names should match.
4343+ """
3744 spider = FourmiSpider(compound=searchable, selected_attributes=attributes)
3845 spider.add_parsers(source_loader.sources)
3946 crawler = Crawler(settings)
···445145524653def scrapy_settings_manipulation(docopt_arguments):
5454+ """
5555+ This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
5656+ project these are command line arguments.
5757+ :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
5858+ """
4759 settings = get_project_settings()
4848- # [todo] - add at least a warning for files that already exist
6060+4961 if docopt_arguments["--output"] != 'result.*format*':
5062 settings.overrides["FEED_URI"] = docopt_arguments["--output"]
5163 elif docopt_arguments["--format"] == "jsonlines":
···607261736274def start_log(docopt_arguments):
7575+ """
7676+ This function starts the logging functionality of Scrapy using the settings given by the CLI.
7777+ :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
7878+ """
6379 if docopt_arguments["--log"] is not None:
6480 if docopt_arguments["--verbose"]:
6581 log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
def search(docopt_arguments, source_loader):
    """
    Run the search for one compound, driven entirely by the parsed CLI arguments.

    The steps are, in order: start logging, build the Scrapy settings, set up
    the crawler for the requested compound and finally run the reactor.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
    """
    # Logging is started before anything else is set up.
    start_log(docopt_arguments)
    crawl_settings = scrapy_settings_manipulation(docopt_arguments)

    compound = docopt_arguments["<compound>"]
    # "--attributes" is a single comma-separated string; the crawler setup expects a list.
    attribute_patterns = docopt_arguments["--attributes"].split(',')
    setup_crawler(compound, crawl_settings, source_loader, attribute_patterns)

    reactor.run()
8010181102103103+# The start for the Fourmi Command Line interface.
82104if __name__ == '__main__':
83105 arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0')
84106 loader = SourceLoader()