A web scraper built to search for specific information about a given compound (and its pseudonyms)

Added documentation to the Executable Python file

+23 -1
+23 -1
fourmi.py
··· 34 34 35 35 36 36 def setup_crawler(searchable, settings, source_loader, attributes): 37 + """ 38 + This function prepares and start the crawler which starts the actual search on the internet 39 + :param searchable: The compound which should be searched 40 + :param settings: A scrapy settings object 41 + :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used. 42 + :param attributes: A list of regular expressions which the attribute names should match. 43 + """ 37 44 spider = FourmiSpider(compound=searchable, selected_attributes=attributes) 38 45 spider.add_parsers(source_loader.sources) 39 46 crawler = Crawler(settings) ··· 44 51 45 52 46 53 def scrapy_settings_manipulation(docopt_arguments): 54 + """ 55 + This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi 56 + project these are command line arguments. 57 + :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. 58 + """ 47 59 settings = get_project_settings() 48 - # [todo] - add at least a warning for files that already exist 60 + 49 61 if docopt_arguments["--output"] != 'result.*format*': 50 62 settings.overrides["FEED_URI"] = docopt_arguments["--output"] 51 63 elif docopt_arguments["--format"] == "jsonlines": ··· 60 72 61 73 62 74 def start_log(docopt_arguments): 75 + """ 76 + This function starts the logging functionality of Scrapy using the settings given by the CLI. 77 + :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. 78 + """ 63 79 if docopt_arguments["--log"] is not None: 64 80 if docopt_arguments["--verbose"]: 65 81 log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG) ··· 73 89 74 90 75 91 def search(docopt_arguments, source_loader): 92 + """ 93 + The function that facilitates the search for a specific compound. 
94 + :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. 95 + :param source_loader: An initiated SourceLoader object pointed at the directory with the sources. 96 + """ 76 97 start_log(docopt_arguments) 77 98 settings = scrapy_settings_manipulation(docopt_arguments) 78 99 setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(',')) 79 100 reactor.run() 80 101 81 102 103 + # The start for the Fourmi Command Line interface. 82 104 if __name__ == '__main__': 83 105 arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0') 84 106 loader = SourceLoader()