A web scraper built to search for specific information on a given compound (and its pseudonyms)

Added documentation to the FourmiSpider

+33 -1
FourmiCrawler/spider.py
··· 1 + import re 2 + 1 3 from scrapy.spider import Spider 2 4 from scrapy import log 3 - import re 4 5 5 6 6 7 class FourmiSpider(Spider): 8 + """ 9 + A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data. 10 + """ 7 11 name = "FourmiSpider" 8 12 __parsers = [] 9 13 synonyms = [] 10 14 11 15 def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs): 16 + """ 17 + Initiation of the Spider 18 + :param compound: compound that will be searched. 19 + :param selected_attributes: A list of regular expressions that the attributes should match. 20 + """ 12 21 super(FourmiSpider, self).__init__(*args, **kwargs) 13 22 self.synonyms.append(compound) 14 23 self.selected_attributes = selected_attributes; 15 24 16 25 def parse(self, reponse): 26 + """ 27 + The function that is called when a response to a request is available. This function distributes this to a 28 + parser which should be able to handle parsing the data. 29 + :param reponse: A Scrapy Response object that should be parsed 30 + :return: A list of Result items and new Request to be handled by the scrapy core. 31 + """ 17 32 for parser in self.__parsers: 18 33 if re.match(parser.website, reponse.url): 19 34 log.msg("Url: " + reponse.url + " -> Source: " + parser.website, level=log.DEBUG) ··· 21 36 return None 22 37 23 38 def get_synonym_requests(self, compound): 39 + """ 40 + A function that generates new Scrapy Request for each source given a new synonym of a compound. 41 + :param compound: A compound name 42 + :return: A list of Scrapy Request objects 43 + """ 24 44 requests = [] 25 45 for parser in self.__parsers: 26 46 parser_requests = parser.new_compound_request(compound) ··· 29 49 return requests 30 50 31 51 def start_requests(self): 52 + """ 53 + The function called by Scrapy for it's first Requests 54 + :return: A list of Scrapy Request generated from the known synonyms using the available sources. 
55 + """ 32 56 requests = [] 33 57 for synonym in self.synonyms: 34 58 requests.extend(self.get_synonym_requests(synonym)) 35 59 return requests 36 60 37 61 def add_parsers(self, parsers): 62 + """ 63 + A function to add a new Parser objects to the list of available parsers. 64 + :param parsers: A list of Parser Objects. 65 + """ 38 66 for parser in parsers: 39 67 self.add_parser(parser) 40 68 41 69 def add_parser(self, parser): 70 + """ 71 + A function add a new Parser object to the list of available parsers. 72 + :param parser: A Parser Object 73 + """ 42 74 self.__parsers.append(parser) 43 75 parser.set_spider(self)