#!/usr/bin/env python
"""
Fourmi - An internet webcrawler searching for information on chemical
compounds. [todo] - Add some more useful text here.
Version: v0.0.1 - Skeleton application; the website-specific parsing logic is not implemented yet.
"""
77-88-from twisted.internet import reactor
99-from scrapy.crawler import Crawler
1010-from scrapy import log, signals
1111-from FourmiCrawler.parsers.parser import Parser
1212-from FourmiCrawler.spider import FourmiSpider
1313-from scrapy.utils.project import get_project_settings
1414-import os, inspect, re
1515-1616-def load_parsers(rel_dir="FourmiCrawler/parsers"):
1717- path = os.path.dirname(os.path.abspath(__file__))
1818- path += "/" + rel_dir
1919- parsers = []
2020- known_parser = set()
2121-2222- for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
2323- mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
2424- classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
2525- for cls in classes:
2626- if issubclass(cls, Parser) and cls not in known_parser:
2727- parsers.append(cls()) # [review] - Would we ever need arguments for the parsers?
2828- known_parser.add(cls)
2929- return parsers
3030-3131-def setup_crawler(searchables):
3232- spider = FourmiSpider(compounds=searchables)
3333- spider.add_parsers(load_parsers())
3434- settings = get_project_settings()
3535- crawler = Crawler(settings)
3636- crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
3737- crawler.configure()
3838- crawler.crawl(spider)
3939- crawler.start()
4040-4141-4242-def start():
4343- setup_crawler(["Methane"])
4444- log.start()
4545- reactor.run()
4646-4747-4848-start()
# --- Stray patch fragment for FourmiCrawler/settings.py ---
# NOTE(review): the lines below were appended here from a diff of another
# file. They belong in FourmiCrawler/settings.py, not in this script, and are
# kept commented out so this file remains importable. Move them there:
#
# ITEM_PIPELINES = {
#     'FourmiCrawler.pipelines.FourmiPipeline': 100
# }
# FEED_URI = 'results.json'
# FEED_FORMAT = 'jsonlines'
#
# # Crawl responsibly by identifying yourself (and your website) on the
# # user-agent