A web scraper built to search specific information for a given compound (and its pseudonyms)
at feature/executable 77 lines 2.9 kB view raw
import re

from scrapy.spider import Spider
from scrapy import log


class FourmiSpider(Spider):
    """
    A spider written for the Fourmi Project which calls upon all available sources to request and scrape data.
    """
    name = "FourmiSpider"

    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
        """
        Initiation of the Spider
        :param compound: compound that will be searched. When None, no initial synonym is registered.
        :param selected_attributes: A list of regular expressions that the attributes should match.
            When None, the list [".*"] is used.
        """
        # Sources and synonyms are per-instance state; keeping them on the class would share
        # one list/set between every spider instance created in the same process.
        self._sources = []
        self.synonyms = set()
        super(FourmiSpider, self).__init__(*args, **kwargs)
        # A missing compound must not become a synonym, otherwise None would be handed to
        # every source when the start requests are generated.
        if compound is not None:
            self.synonyms.add(compound)
        # None is used as the sentinel to avoid a mutable default argument shared across calls.
        if selected_attributes is None:
            selected_attributes = [".*"]
        self.selected_attributes = selected_attributes

    def parse(self, response):
        """
        The function that is called when a response to a request is available. This function distributes this to
        the first registered source whose website pattern matches the response URL.
        :param response: A Scrapy Response object that should be parsed
        :return: Whatever the matching source's parse method returns, or None when no source matches.
        """
        for source in self._sources:
            # source.website is used as a regular expression anchored at the start of the URL.
            if re.match(source.website, response.url):
                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                return source.parse(response)
        return None

    def get_synonym_requests(self, compound, force=False):
        """
        A function that generates new Scrapy Request for each source given a new synonym of a compound.
        :param compound: A compound name
        :param force: When True, requests are generated even if the compound is already a known synonym.
        :return: A list of the non-None results of new_compound_request for every source; empty when the
            compound was already known and force is False.
        """
        requests = []
        if force or compound not in self.synonyms:
            self.synonyms.add(compound)
            for parser in self._sources:
                parser_requests = parser.new_compound_request(compound)
                # Sources that return None for this compound are skipped.
                if parser_requests is not None:
                    requests.append(parser_requests)
        return requests

    def start_requests(self):
        """
        The function called by Scrapy for its first Requests
        :return: A list of Scrapy Request generated from the known synonyms using the available sources.
        """
        requests = []
        # Every synonym here is already registered, so the "already known" check has to be
        # bypassed with force=True; without it no start request would ever be produced.
        # Iterate over a snapshot because get_synonym_requests adds to self.synonyms.
        for synonym in list(self.synonyms):
            requests.extend(self.get_synonym_requests(synonym, force=True))
        return requests

    def add_sources(self, sources):
        """
        A function to add new Parser objects to the list of available sources.
        :param sources: A list of Source Objects.
        """
        for parser in sources:
            self.add_source(parser)

    def add_source(self, source):
        """
        A function to add a new Parser object to the list of available parsers. The source is also given a
        reference to this spider through its set_spider method.
        :param source: A Source Object
        """
        self._sources.append(source)
        source.set_spider(self)