"""
Fourmi - An internet webcrawler searching for information on chemical
compounds. [todo] - Add some more useful text here.

Version: v0.0.1 - Empty application skeleton; the website-specific
crawling logic is not implemented yet.
"""
import inspect
import os
import re

from scrapy import log, signals
from scrapy.crawler import Crawler
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

from FourmiCrawler.spider import FourmiSpider
def load_parsers(rel_dir="FourmiCrawler/parsers"):
    """Dynamically load and instantiate every parser class found in *rel_dir*.

    Scans the directory (resolved relative to this file) for ``*.py``
    modules, imports each one, and instantiates every class that is
    actually defined inside that directory. Classes merely imported
    into a module from elsewhere are skipped.

    :param rel_dir: directory containing the parser modules, relative
        to this file (also used, slash-to-dot converted, as the package
        path for the import).
    :return: list of parser instances.
    """
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_dir)
    parsers = []
    modules = [f[:-3] for f in os.listdir(path)
               if f.endswith('.py') and f != '__init__.py']
    for py in modules:
        mod = __import__('.'.join([rel_dir.replace("/", "."), py]),
                         fromlist=[py])
        for attr_name in dir(mod):
            cls = getattr(mod, attr_name)
            if not inspect.isclass(cls):
                continue
            # Keep only classes whose defining file lives under *path*.
            # A plain prefix test replaces the old ``re.match(path + "/*", ...)``,
            # which wrongly treated the filesystem path as a regular
            # expression (broken for paths containing regex metacharacters).
            if inspect.getfile(cls).startswith(path):
                parsers.append(cls())  # [review] - Would we ever need arguments for the parsers?
    return parsers
def setup_crawler(searchables):
    """Configure and start a Scrapy crawler for the given compounds.

    :param searchables: list of compound names/synonyms to search for.
    """
    spider = FourmiSpider(compounds=searchables)
    spider.add_parsers(load_parsers())
    settings = get_project_settings()
    # Crawler comes from scrapy.crawler (see imports); it was previously
    # referenced without ever being imported.
    crawler = Crawler(settings)
    # Stop the Twisted reactor once the spider finishes, so start() returns.
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def start():
    """Crawl for the hard-coded test compound and block until the crawl ends."""
    setup_crawler(["Methane"])
    log.start()
    reactor.run()


if __name__ == "__main__":
    # Guard the entry point so importing this module does not start a crawl.
    start()
# ---- FourmiCrawler/parsers/parser.py (+16 -4) ----
···11from scrapy import log
22+# from scrapy.http import Request
class Parser:
    """Base class for website-specific parsers.

    ``website`` should be a regular expression matching the URLs of the
    requests this parser is able to parse. Subclasses override
    :meth:`parse` and :meth:`new_compound_request`.
    """
    website = "http://something/*"
    __spider = None  # set via set_spider(); back-reference to the owning spider

    def parse(self, response):
        """Fallback parse; logs a warning because subclasses must override it."""
        log.msg("The parse function of the empty parser was used.",
                level=log.WARNING)

    def new_compound_request(self, compound):
        """Build the initial request for *compound*; override in subclasses."""
        # return Request(url=self.website[:-1] + compound, callback=self.parse)
        pass

    def set_spider(self, spider):
        """Attach the spider this parser reports back to."""
        self.__spider = spider
# ---- FourmiCrawler/spider.py (+32 -8) ----
···11from scrapy.spider import Spider
22+from scrapy import log
33+import re
class FourmiSpider(Spider):
    """Spider that delegates parsing of responses to registered parsers."""

    name = "FourmiSpider"

    def __init__(self, compounds=None, *args, **kwargs):
        """
        :param compounds: a single compound name, or a list of names, to crawl.
        """
        super(FourmiSpider, self).__init__(*args, **kwargs)
        # Per-instance lists. The previous class-level ``__parsers = []`` /
        # ``synonyms = []`` were shared by every spider instance, so
        # synonyms and parsers accumulated across crawls.
        self.__parsers = []
        self.synonyms = []
        if compounds is not None:
            if isinstance(compounds, list):
                self.synonyms.extend(compounds)
            else:
                self.synonyms.append(compounds)

    def parse(self, response):
        """Delegate *response* to the first parser whose website pattern matches."""
        for parser in self.__parsers:
            if re.match(parser.website, response.url):
                log.msg("Url: " + response.url + " -> Parser: " + parser.website,
                        level=log.DEBUG)
                return parser.parse(response)
        return None

    def get_synonym_requests(self, compound):
        """Collect the initial requests for *compound* from every parser."""
        requests = []
        for parser in self.__parsers:
            request = parser.new_compound_request(compound)
            # The base Parser returns None; a None request would break Scrapy.
            if request is not None:
                requests.append(request)
        return requests

    def start_requests(self):
        """Generate the initial requests for all known synonyms."""
        requests = []
        for synonym in self.synonyms:
            requests.extend(self.get_synonym_requests(synonym))
        return requests

    def add_parsers(self, parsers):
        """Register multiple parsers at once."""
        for parser in parsers:
            self.add_parser(parser)

    def add_parser(self, parser):
        """Register *parser* and link it back to this spider."""
        self.__parsers.append(parser)
        parser.set_spider(self)