A web scraper built to search for specific information on a given compound (and its synonyms)
at develop 81 lines 3.1 kB view raw
import re

from scrapy.spider import Spider
from scrapy import log


class FourmiSpider(Spider):
    """
    A spider written for the Fourmi Project which calls upon all available
    sources to request and scrape data.
    """
    name = "FourmiSpider"

    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
        """
        Initiation of the Spider.
        :param compound: compound that will be searched.
        :param selected_attributes: A list of regular expressions that the
            attributes should match; defaults to [".*"] (match everything).
        """
        self._sources = []
        self.synonyms = set()
        super(FourmiSpider, self).__init__(*args, **kwargs)
        # Only seed the synonym set with a real compound name; adding the
        # default None would make start_requests() emit bogus requests.
        if compound is not None:
            self.synonyms.add(compound)
        if selected_attributes is None:
            self.selected_attributes = [".*"]
        else:
            self.selected_attributes = selected_attributes

    def parse(self, response):
        """
        The function that is called when a response to a request is available.
        This function distributes this to a source which should be able to
        handle parsing the data.
        :param response: A Scrapy Response object that should be parsed
        :return: A list of Result items and new Requests to be handled by the
            scrapy core, or None when no source matches the response URL.
        """
        for source in self._sources:
            # source.website is a regular expression describing the URLs the
            # source can handle.
            if re.match(source.website, response.url):
                log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                return source.parse(response)
        log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO)
        return None

    def get_synonym_requests(self, compound, force=False):
        """
        A function that generates new Scrapy Requests for each source given a
        new synonym of a compound.
        :param compound: A compound name
        :param force: When True, generate requests even if the compound is
            already a known synonym.
        :return: A list of Scrapy Request objects
        """
        requests = []
        if force or compound not in self.synonyms:
            self.synonyms.add(compound)
            for parser in self._sources:
                parser_requests = parser.new_compound_request(compound)
                if parser_requests is not None:
                    requests.append(parser_requests)
        return requests

    def start_requests(self):
        """
        The function called by Scrapy for its first Requests.
        :return: A list of Scrapy Requests generated from the known synonyms
            using the available sources.
        """
        requests = []
        for synonym in self.synonyms:
            requests.extend(self.get_synonym_requests(synonym, force=True))
        return requests

    def add_sources(self, sources):
        """
        A function to add new Source objects to the list of available sources.
        :param sources: A list of Source Objects.
        """
        for parser in sources:
            self.add_source(parser)

    def add_source(self, source):
        """
        A function to add a new Source object to the list of available sources.
        :param source: A Source Object
        """
        self._sources.append(source)
        # Give the source a back-reference so it can reach spider state.
        source.set_spider(self)