A web scraper built to search for specific information on a given compound (and its synonyms)

Merge branch 'release/v0.0.1'

+75 -25
+27 -13
Fourmi.py
··· 2 2 """ 3 3 Fourmi - An internet webcrawler searching for information on chemical 4 4 compounds. [todo] - Add some more useful text here. 5 + Version: v0.0.1 - Empty Application that could do something but all logic of websites isn't there yet! 5 6 """ 6 7 7 8 from twisted.internet import reactor ··· 9 10 from scrapy import log, signals 10 11 from FourmiCrawler.spider import FourmiSpider 11 12 from scrapy.utils.project import get_project_settings 13 + import os, inspect, re 12 14 15 + def load_parsers(rel_dir="FourmiCrawler/parsers"): 16 + path = os.path.dirname(os.path.abspath(__file__)) 17 + path += "/" + rel_dir 18 + parsers = [] 13 19 14 - def setup_crawler(searchable): 15 - # [TODO] - Initiate all parsers for the different websites and get 16 - # allowed URLs. 17 - spider = FourmiSpider(compound=searchable) 18 - settings = get_project_settings() 19 - crawler = Crawler(settings) 20 - crawler.signals.connect(reactor.stop, signal=signals.spider_closed) 21 - crawler.configure() 22 - crawler.crawl(spider) 23 - crawler.start() 20 + for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: 21 + mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py]) 22 + classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] 23 + for cls in classes: 24 + if re.match(path + "/*", inspect.getfile(cls)): 25 + parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? 26 + return parsers 27 + 28 + def setup_crawler(searchables): 29 + spider = FourmiSpider(compounds=searchables) 30 + spider.add_parsers(load_parsers()) 31 + settings = get_project_settings() 32 + crawler = Crawler(settings) 33 + crawler.signals.connect(reactor.stop, signal=signals.spider_closed) 34 + crawler.configure() 35 + crawler.crawl(spider) 36 + crawler.start() 24 37 25 38 26 39 def start(): 27 - setup_crawler("Methane") 28 - log.start() 29 - reactor.run() 40 + setup_crawler(["Methane"]) 41 + log.start() 42 + reactor.run() 43 + 30 44 31 45 start()
+16 -4
FourmiCrawler/parsers/parser.py
···
  from scrapy import log
+ # from scrapy.http import Request


  class Parser:
-     website = "http://localhost/*"
+     '''
+     website should be a regular expression matching the URLs of the requests this parser is able to parse.
+     '''
+     website = "http://something/*"
+     __spider = None

-     def parse(self, reponse):
-         log.msg("The parse function of the empty parser was used.", level=log.Warning)
-         pass
+     def parse(self, response):
+         log.msg("The parse function of the empty parser was used.", level=log.WARNING)
+         pass
+
+     def new_compound_request(self, compound):
+         # return Request(url=self.website[:-1] + compound, callback=self.parse)
+         pass
+
+     def set_spider(self, spider):
+         self.__spider = spider
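Note the convention this base class establishes: website does double duty as a match pattern for re.match in FourmiSpider.parse (below) and as a URL template for the still commented-out new_compound_request, which drops the trailing * and appends the compound name. A quick sketch of that convention, using a placeholder URL:

    import re

    website = "http://www.example.com/chemical/*"  # placeholder pattern

    # new_compound_request would build a lookup URL by dropping the wildcard:
    url = website[:-1] + "Methane"  # http://www.example.com/chemical/Methane

    # and the spider can later route the downloaded response back to this
    # parser, since the pattern matches the URL from its start:
    assert re.match(website, url) is not None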
+32 -8
FourmiCrawler/spider.py
···
  from scrapy.spider import Spider
+ from scrapy import log
+ import re


  class FourmiSpider(Spider):
      name = "FourmiSpider"
+     __parsers = []
+     synonyms = []

-     def __init__(self, compound=None, *args, **kwargs):
+     def __init__(self, compounds=None, *args, **kwargs):
          super(FourmiSpider, self).__init__(*args, **kwargs)
-         self.synonyms = [compound]
+         if isinstance(compounds, list):
+             self.synonyms.extend(compounds)
+         else:
+             self.synonyms.append(compounds)

+     def parse(self, response):
+         for parser in self.__parsers:
+             if re.match(parser.website, response.url):
+                 log.msg("Url: " + response.url + " -> Parser: " + parser.website, level=log.DEBUG)
+                 return parser.parse(response)
+         return None

-     def parse(self, reponse):
-         # [TODO] - This function should delegate it's functionality to other
-         # parsers.
-         pass
+     def get_synonym_requests(self, compound):
+         requests = []
+         for parser in self.__parsers:
+             requests.append(parser.new_compound_request(compound))
+         return requests

+     def start_requests(self):
+         requests = []
+         for synonym in self.synonyms:
+             requests.extend(self.get_synonym_requests(synonym))
+         return requests

-     def add_parser(self, parser):
-         self.parsers.add(parser)
+     def add_parsers(self, parsers):
+         for parser in parsers:
+             self.add_parser(parser)
+
+     def add_parser(self, parser):
+         self.__parsers.append(parser)
+         parser.set_spider(self)
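Taken together, the request life cycle introduced here is: start_requests asks every registered parser for one new_compound_request per synonym, and parse routes each downloaded response to the parser whose website pattern matches its URL. A minimal sketch of that round trip with a stub parser, bypassing Scrapy's downloader (all stub names are illustrative):

    from FourmiCrawler.spider import FourmiSpider
    from FourmiCrawler.parsers.parser import Parser


    class StubParser(Parser):
        website = "http://www.example.com/*"  # placeholder pattern

        def new_compound_request(self, compound):
            # A real parser would return a scrapy.http.Request; a plain
            # URL string is enough to show the flow here.
            return self.website[:-1] + compound

        def parse(self, response):
            return "parsed " + response.url


    class StubResponse(object):
        def __init__(self, url):
            self.url = url


    spider = FourmiSpider(compounds=["Methane"])
    spider.add_parsers([StubParser()])

    # One request per (synonym, parser) pair:
    print(spider.start_requests())  # ['http://www.example.com/Methane']

    # A matching response is delegated back to the stub parser:
    print(spider.parse(StubResponse("http://www.example.com/Methane")))

One thing to keep an eye on: __parsers and synonyms are class attributes, so every FourmiSpider instance shares the same lists. That is harmless with the single spider created in Fourmi.py, but worth a [review] if multiple spiders are ever instantiated.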