···44#Python Specific ignores
55*.pyc
6677+#may contain authentication information
88+sources.cfg
99+710#THINGS WE WOULD NEVER EVER WANT!
811#ignore thumbnails created by windows
912Thumbs.db
+1-1
.travis.yml
···10101111# command to run tests, e.g. python setup.py test
1212script:
1313- - nosetests --with-coverage --cover-package=FourmiCrawler tests
1313+ - nosetests --with-coverage --cover-package=FourmiCrawler,utils tests
14141515notifications:
1616 slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
+42-37
FourmiCrawler/sources/ChemSpider.py
···9910101111# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
1212-1212+# [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not
13131414class ChemSpider(Source):
1515 """ChemSpider scraper for synonyms and properties
···1919 The token required for the API should be in a configuration file
2020 somewhere.
2121 """
2222-2323- def __init__(self):
2424- Source.__init__(self)
25222623 website = 'http://www.chemspider.com/*'
27242828- # [TODO] - Save and access token of specific user.
2929- search = ('Search.asmx/SimpleSearch?query=%s&token='
3030- '052bfd06-5ce4-43d6-bf12-89eabefd2338')
2525+ search = 'Search.asmx/SimpleSearch?query=%s&token='
3126 structure = 'Chemical-Structure.%s.html'
3232- extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
3333- '052bfd06-5ce4-43d6-bf12-89eabefd2338')
2727+ extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
34283535- ignore_list = []
2929+ def __init__(self, config={}):
3030+ Source.__init__(self, config)
3131+ self.cfg = config
3232+ self.ignore_list = []
3333+ if 'token' not in self.cfg or self.cfg['token'] == '':
3434+ log.msg('ChemSpider token not set or empty, search/MassSpec API '
3535+ 'not available', level=log.WARNING)
3636+ self.cfg['token'] = ''
3737+ self.search += self.cfg['token']
3838+ self.extendedinfo += self.cfg['token']
3939+36403741 def parse(self, response):
3842 sel = Selector(response)
···44484549 return requests
46504747- @staticmethod
4848- def parse_properties(sel):
5151+ def parse_properties(self, sel):
4952 """scrape Experimental Data and Predicted ACD/Labs tabs"""
5053 properties = []
5154···7679 prop_value = m.group(1)
7780 prop_conditions = m.group(2)
78817979- new_prop = Result({
8080- 'attribute': prop_name,
8181- 'value': prop_value,
8282- 'source': 'ChemSpider Predicted - ACD/Labs Tab',
8383- 'reliability': 'Unknown',
8484- 'conditions': prop_conditions
8585- })
8282+ new_prop = self.newresult(
8383+ attribute=prop_name,
8484+ value=prop_value,
8585+ source='ChemSpider Predicted - ACD/Labs Tab',
8686+ conditions=prop_conditions
8787+ )
8688 properties.append(new_prop)
8789 log.msg('CS prop: |%s| |%s| |%s|' %
8890 (new_prop['attribute'], new_prop['value'], new_prop['source']),
···100102 if line.xpath('span/text()'):
101103 property_name = line.xpath('span/text()').extract()[0].rstrip()
102104 else:
103103- new_prop = Result({
104104- 'attribute': property_name[:-1],
105105- 'value': line.xpath('text()').extract()[0].rstrip(),
106106- 'source': line.xpath(
107107- 'strong/text()').extract()[0].rstrip(),
108108- 'reliability': 'Unknown',
109109- 'conditions': ''
110110- })
105105+ new_prop = self.newresult(
106106+ attribute=property_name[:-1],
107107+ value=line.xpath('text()').extract()[0].rstrip(),
108108+ source=line.xpath('strong/text()').extract()[0].rstrip(),
109109+ )
111110 properties.append(new_prop)
112111 log.msg('CS prop: |%s| |%s| |%s|' %
113112 (new_prop['attribute'], new_prop['value'],
···183182 }
184183 return synonym
185184186186- @staticmethod
187187- def parse_extendedinfo(response):
185185+ def parse_extendedinfo(self, response):
188186 """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
189187 sel = Selector(response)
190188 properties = []
191189 names = sel.xpath('*').xpath('name()').extract()
192190 values = sel.xpath('*').xpath('text()').extract()
193191 for (name, value) in zip(names, values):
194194- result = Result({
195195- 'attribute': name,
196196- 'value': value, # These values have no unit!
197197- 'source': 'ChemSpider ExtendedCompoundInfo',
198198- 'reliability': 'Unknown',
199199- 'conditions': ''
200200- })
192192+ result = self.newresult(
193193+ attribute=name,
194194+ value=value, # These values have no unit!
195195+ source='ChemSpider ExtendedCompoundInfo',
196196+ )
201197 if result['value']:
202198 properties.append(result)
203199 return properties
200200+201201+ def newresult(self, attribute, value, conditions='', source='ChemSpider'):
202202+ return Result({
203203+ 'attribute': attribute,
204204+ 'value': value,
205205+ 'source': source,
206206+ 'reliability': self.cfg['reliability'],
207207+ 'conditions': conditions
208208+ })
204209205210 def parse_searchrequest(self, response):
206211 """Parse the initial response of the ChemSpider Search API """
···224229 callback=self.parse_extendedinfo)]
225230226231 def new_compound_request(self, compound):
227227- if compound in self.ignore_list: # [TODO] - add regular expression
232232+ if compound in self.ignore_list or self.cfg['token'] == '':
228233 return None
229234 searchurl = self.website[:-1] + self.search % compound
230235 log.msg('chemspider compound', level=log.DEBUG)
···66 website = "http://something/*" # Regex of URI's the source is able to parse
77 _spider = None
8899- def __init__(self):
99+ def __init__(self, config={}):
1010 """
1111 Initiation of a new Source
1212 """
+2-2
FourmiCrawler/spider.py
···99 A spider written for the Fourmi Project which calls upon all available sources to request and scrape data.
1010 """
1111 name = "FourmiSpider"
1212- _sources = []
1313- synonyms = set()
14121513 def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
1614 """
···1816 :param compound: compound that will be searched.
1917 :param selected_attributes: A list of regular expressions that the attributes should match.
2018 """
1919+ self._sources = []
2020+ self.synonyms = set()
2121 super(FourmiSpider, self).__init__(*args, **kwargs)
2222 self.synonyms.add(compound)
2323 self.selected_attributes = selected_attributes
+2-2
README.md
···11# Fourmi
2233-**Master branch**: [](https://travis-ci.org/Recondor/Fourmi)
33+**Master branch**: [](https://travis-ci.org/jjdekker/Fourmi) [](https://coveralls.io/r/jjdekker/Fourmi?branch=master)
4455-**Developing branch**: [](https://travis-ci.org/Recondor/Fourmi)
55+**Developing branch**: [](https://travis-ci.org/jjdekker/Fourmi) [](https://coveralls.io/r/jjdekker/Fourmi?branch=develop)
6677Fourmi is a web scraper for chemical substances. The program is designed to be
88used as a search engine to search multiple chemical databases for a specific
+9-45
fourmi.py
···1717 --version Show version.
1818 --verbose Verbose logging output.
1919 --log=<file> Save log to an file.
2020- -o <file> --output=<file> Output file [default: result.*format*]
2121- -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
2020+ -o <file> --output=<file> Output file [default: results.*format*]
2121+ -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv]
2222 --include=<regex> Include only sources that match these regular expressions split by a comma.
2323 --exclude=<regex> Exclude the sources that match these regular expressions split by a comma.
2424"""
···3030import docopt
31313232from FourmiCrawler.spider import FourmiSpider
3333-from sourceloader import SourceLoader
3333+from utils.configurator import Configurator
3434+from utils.sourceloader import SourceLoader
343535363637def setup_crawler(compound, settings, source_loader, attributes):
···5051 crawler.start()
515252535353-def scrapy_settings_manipulation(docopt_arguments):
5454- """
5555- This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
5656- project these are command line arguments.
5757- :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
5858- """
5959- settings = get_project_settings()
6060-6161- if docopt_arguments["--output"] != 'result.*format*':
6262- settings.overrides["FEED_URI"] = docopt_arguments["--output"]
6363- elif docopt_arguments["--format"] == "jsonlines":
6464- settings.overrides["FEED_URI"] = "results.json"
6565- elif docopt_arguments["--format"] is not None:
6666- settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]
6767-6868- if docopt_arguments["--format"] is not None:
6969- settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]
7070-7171- return settings
7272-7373-7474-def start_log(docopt_arguments):
7575- """
7676- This function starts the logging functionality of Scrapy using the settings given by the CLI.
7777- :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
7878- """
7979- if docopt_arguments["--log"] is not None:
8080- if docopt_arguments["--verbose"]:
8181- log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
8282- else:
8383- log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
8484- else:
8585- if docopt_arguments["--verbose"]:
8686- log.start(logstdout=False, loglevel=log.DEBUG)
8787- else:
8888- log.start(logstdout=True, loglevel=log.WARNING)
8989-9090-9154def search(docopt_arguments, source_loader):
9255 """
9356 The function that facilitates the search for a specific compound.
9457 :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
9558 :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
9659 """
9797- start_log(docopt_arguments)
9898- settings = scrapy_settings_manipulation(docopt_arguments)
9999- setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
6060+ conf = Configurator()
6161+ conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
6262+ conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
6363+ setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
10064 reactor.run()
101651026610367# The start for the Fourmi Command Line interface.
10468if __name__ == '__main__':
105105- arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.2')
6969+ arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0')
10670 loader = SourceLoader()
1077110872 if arguments["--include"]:
+10-5
sourceloader.py
utils/sourceloader.py
···33import re
4455from FourmiCrawler.sources.source import Source
66-66+from utils.configurator import Configurator
7788class SourceLoader:
99 sources = []
10101111- def __init__(self, rel_dir="FourmiCrawler/sources"):
1111+ def __init__(self, rel_dir="../FourmiCrawler/sources"):
1212 """
1313 The initiation of a SourceLoader, selects and indexes a directory for usable sources.
1414+ Also loads a configuration file for Sources and passes the arguments in
1515+ the named section to the source
1416 :param rel_dir: A relative path to a directory.
1517 """
1618 path = os.path.dirname(os.path.abspath(__file__))
1719 path += "/" + rel_dir
1820 known_parser = set()
19212222+ config = Configurator.read_sourceconfiguration()
2323+2024 for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
2121- mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
2525+ mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
2226 classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
2327 for cls in classes:
2428 if issubclass(cls, Source) and cls not in known_parser:
2525- self.sources.append(cls()) # [review] - Would we ever need arguments for the parsers?
2929+ sourcecfg = Configurator.get_section(config, cls.__name__)
3030+ self.sources.append(cls(sourcecfg))
2631 known_parser.add(cls)
27322833 def include(self, source_names):
···5560 string += "Source: " + src.__class__.__name__
5661 string += " - "
5762 string += "URI: " + src.website + "\n"
5858- return string6363+ return string
···11+from scrapy import log
22+from scrapy.utils.project import get_project_settings
33+import ConfigParser
44+55+class Configurator:
66+ """
77+ A helper class in the fourmi class. This class is used to process the settings as set
88+ from one of the Fourmi applications.
99+ """
1010+1111+ def __init__(self):
1212+ self.scrapy_settings = get_project_settings()
1313+1414+1515+ def set_output(self, filename, fileformat):
1616+ """
1717+ This function manipulates the Scrapy output file settings that normally would be set in the settings file.
1818+ In the Fourmi project these are command line arguments.
1919+ :param filename: The filename of the file where the output will be put.
2020+ :param fileformat: The format in which the output will be.
2121+ """
2222+2323+ if filename != 'results.*format*':
2424+ self.scrapy_settings.overrides["FEED_URI"] = filename
2525+ elif fileformat == "jsonlines":
2626+ self.scrapy_settings.overrides["FEED_URI"] = "results.json"
2727+ elif fileformat is not None:
2828+ self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
2929+3030+ if fileformat is not None:
3131+ self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
3232+3333+3434+ def start_log(self, logfile, verbose):
3535+ """
3636+ This function starts the logging functionality of Scrapy using the settings given by the CLI.
3737+ :param logfile: The location where the logfile will be saved.
3838+ :param verbose: A boolean value to switch between loglevels.
3939+ """
4040+ if logfile is not None:
4141+ if verbose:
4242+ log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG)
4343+ else:
4444+ log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING)
4545+ else:
4646+ if verbose:
4747+ log.start(logstdout=False, loglevel=log.DEBUG)
4848+ else:
4949+ log.start(logstdout=True, loglevel=log.WARNING)
5050+5151+ @staticmethod
5252+ def read_sourceconfiguration():
5353+ """
5454+ This function reads sources.cfg in the main folder for configuration
5555+ variables for sources
5656+ :return a ConfigParser object of sources.cfg
5757+ """
5858+ config = ConfigParser.ConfigParser()
5959+ config.read('sources.cfg') # [TODO]: should be softcoded eventually
6060+ return config
6161+6262+ @staticmethod
6363+ def get_section(config, sourcename):
6464+ """
6565+ This function reads a config section labeled in variable sourcename and
6666+ tests whether the reliability variable is set else set to empty string.
6767+ Return the default section if the labeled config section does not exist
6868+ :param config: a ConfigParser object
6969+ :param sourcename: the name of the section to be read
7070+ :return a dictionary of the section in the config labeled in sourcename
7171+ """
7272+ section = dict()
7373+ if config.has_section(sourcename):
7474+ section = dict(config.items(sourcename))
7575+ elif config.defaults():
7676+ section = config.defaults()
7777+ if 'reliability' not in section:
7878+ log.msg('Reliability not set for %s' % sourcename,
7979+ level=log.WARNING)
8080+ section['reliability'] = ''
8181+ return section