A web scraper built to search for specific information about a given compound (and its pseudonyms)
1import re
2
3from scrapy.spider import Spider
4from scrapy import log
5
6
class FourmiSpider(Spider):
    """
    A spider written for the Fourmi Project which calls upon all available sources to request and scrape data.

    Each spider instance keeps its own list of sources and its own set of known synonyms.
    """
    name = "FourmiSpider"

    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
        """
        Initiation of the Spider
        :param compound: compound that will be searched. When None, no initial synonym is registered.
        :param selected_attributes: A list of regular expressions that the attributes should match.
            Defaults to [".*"] when omitted.
        """
        # Created per instance: class-level mutable containers would be shared between all spiders.
        self._sources = []
        self.synonyms = set()
        super(FourmiSpider, self).__init__(*args, **kwargs)
        if compound is not None:
            self.synonyms.add(compound)
        # A fresh list per instance avoids the shared mutable default argument pitfall.
        if selected_attributes is None:
            selected_attributes = [".*"]
        self.selected_attributes = selected_attributes

    def parse(self, response):
        """
        The function that is called when a response to a request is available. This function distributes this to a
        source which should be able to handle parsing the data.
        :param response: A Scrapy Response object that should be parsed
        :return: The result of the first source whose website pattern matches the response url, or None when no
            source matches.
        """
        for source in self._sources:
            # source.website is used as a regular expression matched against the start of the url.
            if re.match(source.website, response.url):
                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                return source.parse(response)
        return None

    def get_synonym_requests(self, compound, force=False):
        """
        A function that generates new Scrapy Request for each source given a new synonym of a compound.
        :param compound: A compound name
        :param force: When True, requests are generated even if the compound is already a known synonym.
        :return: A list of Scrapy Request objects; empty when the compound was already known and force is False.
        """
        requests = []
        if force or compound not in self.synonyms:
            self.synonyms.add(compound)
            for parser in self._sources:
                parser_requests = parser.new_compound_request(compound)
                # Sources may return None when they have nothing to request for this compound.
                if parser_requests is not None:
                    requests.append(parser_requests)
        return requests

    def start_requests(self):
        """
        The function called by Scrapy for its first Requests
        :return: A list of Scrapy Request generated from the known synonyms using the available sources.
        """
        requests = []
        # Iterate over a snapshot because get_synonym_requests adds to the set, and force the
        # generation because every synonym visited here is, by definition, already known.
        for synonym in list(self.synonyms):
            requests.extend(self.get_synonym_requests(synonym, force=True))
        return requests

    def add_sources(self, sources):
        """
        A function to add new Parser objects to the list of available sources.
        :param sources: A list of Source Objects.
        """
        for parser in sources:
            self.add_source(parser)

    def add_source(self, source):
        """
        A function to add a new Parser object to the list of available parsers.
        The source is also handed a reference to this spider through its set_spider method.
        :param source: A Source Object
        """
        self._sources.append(source)
        source.set_spider(self)