···11+### v0.5.3
22+- FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
33+- FIX: Logging is now "actually" disabled if not using the verbose option.
44+- FEATURE: Added support for PubChem
55+66+### v0.5.2
77+- FIX: Signature used to contain untracked and older files, current signature
88+should be correct.
99+1010+### v0.5.1
1111+- UPDATED: Logging functionality from command line
1212+- DEV: Code cleanup and extra tests
+3-1
FourmiCrawler/settings.py
···1818FEED_URI = 'results.json'
1919FEED_FORMAT = 'jsonlines'
20202121-2221# Crawl responsibly by identifying yourself (and your website) on the
2322# user-agent
24232424+# [todo] - Check for repercussions on spoofing the user agent
2525+2526# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
2727+USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
+4-5
FourmiCrawler/sources/ChemSpider.py
···2626 structure = 'Chemical-Structure.%s.html'
2727 extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
28282929- def __init__(self, config={}):
2929+ def __init__(self, config=None):
3030 Source.__init__(self, config)
3131- self.cfg = config
3231 self.ignore_list = []
3332 if 'token' not in self.cfg or self.cfg['token'] == '':
3433 log.msg('ChemSpider token not set or empty, search/MassSpec API '
···3635 self.cfg['token'] = ''
3736 self.search += self.cfg['token']
3837 self.extendedinfo += self.cfg['token']
3939-40384139 def parse(self, response):
4240 sel = Selector(response)
···199197 return properties
def newresult(self, attribute, value, conditions='', source='ChemSpider'):
    """Wrap one scraped property in a Result item.

    The reliability is taken from this source's configuration; the source
    name defaults to 'ChemSpider'.
    """
    fields = {
        'attribute': attribute,
        'value': value,
        'source': source,
        'reliability': self.cfg['reliability'],
        'conditions': conditions
    }
    return Result(fields)
209208210209 def parse_searchrequest(self, response):
211210 """Parse the initial response of the ChemSpider Search API """
···11+from scrapy.http import Request
22+from scrapy import log
33+from source import Source
44+from scrapy.selector import Selector
55+from FourmiCrawler.items import Result
66+import re
class PubChem(Source):
    """PubChem scraper for chemical properties.

    This parser parses the part on PubChem pages that gives Chemical and
    Physical properties of a substance, including sources of the values of
    properties.
    """

    # PubChem has its data on compound name, properties and their values on
    # different html pages, so different URLs are used.
    website = 'https://*.ncbi.nlm.nih.gov/*'
    website_www = 'https://www.ncbi.nlm.nih.gov/*'
    website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
    search = 'pccompound?term=%s'
    data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'

    # Class-level cache of compounds/synonyms already handled, shared by all
    # instances so the same compound is never scraped twice.
    searched_compounds = set()

    def __init__(self, config=None):
        """
        :param config: optional dictionary with source configuration; the
                       base class stores it as ``self.cfg``.
        """
        # The base class already normalizes config into self.cfg, so no
        # extra assignment is needed here.
        Source.__init__(self, config)

    def parse(self, response):
        """
        Distributes the above described behaviour.

        :param response: The incoming search request
        :return: the follow-up requests if the response is for a new
                 compound, or None if the compound is already known
        """
        requests = []
        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)

        sel = Selector(response)
        compound = sel.xpath('//h1/text()').extract()[0]
        if compound in self.searched_compounds:
            return None

        # NOTE: add() stores the whole name; set.update() would insert every
        # single character of the string as a separate set element.
        self.searched_compounds.add(compound)
        raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
        for synonym in raw_synonyms.strip().split(', '):
            log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
            self.searched_compounds.add(synonym)
            # _spider is attached by the framework; assumes it is set before
            # responses arrive -- TODO confirm against the spider setup.
            self._spider.get_synonym_requests(synonym)
        log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)

        n = re.search(r'cid=(\d+)', response.url)
        if n:
            cid = n.group(1)
            # The cid identifies the compound on the separate html page which
            # contains the properties and their values.
            log.msg('cid: %s' % cid, level=log.DEBUG)

            # Use this cid to build the right url and scrape it.
            requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid,
                                    callback=self.parse_data))
        return requests

    def parse_data(self, response):
        """
        Parse data found in 'Chemical and Physical properties' part of a
        substance page.

        :param response: The response with the page to parse
        :return: a list of Result items with attribute, value and source
        """
        log.msg('parsing data', level=log.DEBUG)
        results = []

        sel = Selector(response)
        props = sel.xpath('//div')

        for prop in props:
            prop_name = ''.join(prop.xpath('b/text()').extract())  # name of the property being parsed
            if prop.xpath('a'):  # single value for this property
                prop_source = ''.join(prop.xpath('a/@title').extract())
                prop_value = ''.join(prop.xpath('a/text()').extract())
                results.append(self._newresult(prop_name, prop_value, prop_source))
            elif prop.xpath('ul'):  # multiple values (list) for this property
                for prop_li in prop.xpath('ul//li'):
                    prop_value = ''.join(prop_li.xpath('a/text()').extract())
                    prop_source = ''.join(prop_li.xpath('a/@title').extract())
                    results.append(self._newresult(prop_name, prop_value, prop_source))

        return results

    @staticmethod
    def _newresult(attribute, value, source):
        """Build and log one Result item; PubChem gives no reliability info."""
        new_prop = Result({
            'attribute': attribute,
            'value': value,
            'source': source,
            'reliability': 'Unknown',
            'conditions': ''
        })
        log.msg('PubChem prop: |%s| |%s| |%s|' %
                (new_prop['attribute'], new_prop['value'],
                 new_prop['source']), level=log.DEBUG)
        return new_prop

    def new_compound_request(self, compound):
        """Build the initial search request for a compound."""
        return Request(url=self.website_www[:-1] + self.search % compound,
                       callback=self.parse)
···66 website = "http://something/*" # Regex of URI's the source is able to parse
77 _spider = None
def __init__(self, config=None):
    """
    Initiation of a new Source.

    :param config: optional dictionary with configuration values for this
                   source; stored on the instance as ``self.cfg``.
    """
    # Default to a fresh empty dict per instance -- a shared mutable
    # default argument would leak state between sources.
    self.cfg = config if config is not None else {}
14171518 def parse(self, response):
+5-2
FourmiCrawler/spider.py
···1010 """
1111 name = "FourmiSpider"
12121313- def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
1313+ def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
1414 """
1515 Initiation of the Spider
1616 :param compound: compound that will be searched.
···2020 self.synonyms = set()
2121 super(FourmiSpider, self).__init__(*args, **kwargs)
2222 self.synonyms.add(compound)
2323- self.selected_attributes = selected_attributes
2323+ if selected_attributes is None:
2424+ self.selected_attributes = [".*"]
2525+ else:
2626+ self.selected_attributes = selected_attributes
24272528 def parse(self, response):
2629 """
+5-9
README.md
···23232424### Installing
25252626-If you're installing Fourmi, please take a look at our [installation guide](...)
2727-on our wiki. When you've installed the application, make sure to check our
2828-[usage guide](...).
2626+If you're installing Fourmi, please take a look at our installation guides
2727+on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our
2828+usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI).
29293030### Using the Source
31313232To use the Fourmi source code multiple dependencies are required. Take a look at
3333-the [wiki page](...) on using the application source code for a step by step
3333+our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code in a step by step
3434installation guide.
35353636When developing for the Fourmi project keep in mind that code readability is a
3737must. To maintain the readability, code should be conform with the
3838[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
3939code. More information about the different structures and principles of the
4040-Fourmi application can be found on our [wiki](...).
4040+Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki).
41414242### To Do
4343···45454646__Main goals:__
47474848-- Improve our documentation and guides. (Assignee: Dekker)
4948- Build a graphical user interface (GUI) as an alternative for the command line
5049interface(CLI). (Assignee: Harmen)
5150- Compile the source into a Windows executable. (Assignee: Bas)
5252-- Create an configuration file to hold logins and API keys.
5353-- Determine reliability of our data point.
5454-- Create an module to gather data from NIST. (Assignee: Rob)
5551- Create a module to gather data from PubChem. (Assignee: Nout)
56525753__Side goals:__
+101
SIGNED.md
···11+##### Signed by https://keybase.io/jdekker
22+```
33+-----BEGIN PGP SIGNATURE-----
44+Version: GnuPG v1.4.11 (GNU/Linux)
55+66+iQIcBAABAgAGBQJTn3GgAAoJEJrQ9RIUCT6/CI4P/RSAQrd6JugGZoQu/gNdW6eB
77+MYCybqYGZiieVhUaGOnFNVlp68YpXH+sP/Uc6hXEX30UQEsDmhMeT5NA7ZMS+zJ9
88+MNHGQdJq22lGb3+VoVBV4RTMdkQXOXvx6p5biskjIEtM3tfTxP529GvAX2TFUNnt
99+gGWk28EDr30M95XwDxwWo+57Xv8VtSb3VSvXEbrdwGYf8EoQo9oPtzYQ0YcdupcC
1010+ET8bukYVcwpAjoTnPlEy89TiHHohwmimr2ASXeQ64Ks5wfjzcF7NENCAmaAfR+KI
1111+VLLuGqdWMBx1ewVuAXTCZ0Mga/kBoRUaO0PC13UmL8LhhZY9Z3cwD4UnPU35/RQi
1212+IbLfQcZHf/gEvyMeiTYCsyWpm+/xxn1+EfHol4/Q9VSXzZgRBX05Ik6tqeCvjdgG
1313+4PyHBaJTTm/HfMNdg3mr1mbyjTv5UxglEyPv+Y4NdfoVfepkXsXbzvNSyVffZ3Bw
1414+UaFp7KzIC4Jugdpv63FleiAdDY0+iZ5shH86wD1+HJ0/a87kn5Ao1yESby7J7U+f
1515+poZQYeMFeuC0T5hY/3iYoyvZ68oH918ESESiucSulp5BvfwuqGL2+xo5uJIwGYXE
1616+3IDQC7xbA14JHX86IVJlSHAD33iWyiC+5yjw4/bRRVl37KPsLdHiXH3YIRnF5I2I
1717+ZbM/uDYyJdZbBe4UoCoF
1818+=AMhi
1919+-----END PGP SIGNATURE-----
2020+2121+```
2222+2323+<!-- END SIGNATURES -->
2424+2525+### Begin signed statement
2626+2727+#### Expect
2828+2929+```
3030+size exec file contents
3131+ ./
3232+375 .gitignore d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1
3333+464 .travis.yml 3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c
3434+428 Changelog.md c7791d1914ddca9ff1549d90468a79787a7feafe94cecd756e3d7cbd4bcbc7df
3535+ FourmiCrawler/
3636+0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
3737+304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806
3838+2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49
3939+914 settings.py 0be2eaf8e83e85ed27754c896421180fc80cb5ce44449aa9f1048e465d1a96f2
4040+ sources/
4141+9991 ChemSpider.py 847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d
4242+9898 NIST.py 97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644
4343+4754 PubChem.py 58ed4c92519e385f2768cf8034b006b18f8a21632cb1c5a0849b1a329a8c6ffb
4444+6907 WikipediaParser.py 5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97
4545+0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
4646+1262 source.py 16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4
4747+3026 spider.py 1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a
4848+1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c
4949+3965 README.md d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3
5050+3676 x fourmi.py 2ff89f97fd2a49d08417d9ab6cf08e88944d0c45f54ec84550b530be48676c23
5151+261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85
5252+ tests/
5353+1 __init__.py 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
5454+2837 test_configurator.py 4a0eb6e7121eb09a63ab5cb797570d1a42080c5346c3b8b365da56eefa599e80
5555+1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031
5656+1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869
5757+2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299
5858+ utils/
5959+0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
6060+3552 configurator.py e2b7e0ee6c1fef4373785dfe5df8ec6950f31ce6a5d9632b69a66ea3d1eaf921
6161+2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37
6262+```
6363+6464+#### Ignore
6565+6666+```
6767+/SIGNED.md
6868+```
6969+7070+#### Presets
7171+7272+```
7373+git # ignore .git and anything as described by .gitignore files
7474+dropbox # ignore .dropbox-cache and other Dropbox-related files
7575+kb # ignore anything as described by .kbignore files
7676+```
7777+7878+<!-- summarize version = 0.0.9 -->
7979+8080+### End signed statement
8181+8282+<hr>
8383+8484+#### Notes
8585+8686+With keybase you can sign any directory's contents, whether it's a git repo,
8787+source code distribution, or a personal documents folder. It aims to replace the drudgery of:
8888+8989+ 1. comparing a zipped file to a detached statement
9090+ 2. downloading a public key
9191+ 3. confirming it is in fact the author's by reviewing public statements they've made, using it
9292+9393+All in one simple command:
9494+9595+```bash
9696+keybase dir verify
9797+```
9898+9999+There are lots of options, including assertions for automating your checks.
100100+101101+For more info, check out https://keybase.io/docs/command_line/code_signing
+10-7
fourmi.py
···55Usage:
66 fourmi search <compound>
77 fourmi [options] search <compound>
88- fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
88+ fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
99 fourmi list
1010 fourmi [--include=<sourcename> | --exclude=<sourcename>] list
1111 fourmi -h | --help
···1515 --attributes=<regex> Include only that match these regular expressions split by a comma. [default: .*]
1616 -h --help Show this screen.
1717 --version Show version.
1818- --verbose Verbose logging output.
1818+ -v Verbose logging output. (Multiple occurrences increase logging level)
1919 --log=<file> Save log to an file.
2020 -o <file> --output=<file> Output file [default: results.*format*]
2121 -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv]
···25252626from twisted.internet import reactor
2727from scrapy.crawler import Crawler
2828-from scrapy import log, signals
2929-from scrapy.utils.project import get_project_settings
2828+from scrapy import signals, log
3029import docopt
31303231from FourmiCrawler.spider import FourmiSpider
···5857 :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
5958 """
6059 conf = Configurator()
6161- conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
6060+ conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
6261 conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
6363- setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
6262+ setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
6363+ source_loader, docopt_arguments["--attributes"].split(','))
6464+ if conf.scrapy_settings.getbool("LOG_ENABLED"):
6565+ log.start(conf.scrapy_settings.get("LOG_FILE"),
6666+ conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
6467 reactor.run()
656866696770# The start for the Fourmi Command Line interface.
6871if __name__ == '__main__':
6969- arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0')
7272+ arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.3')
7073 loader = SourceLoader()
71747275 if arguments["--include"]:
···1313 def test_none_pipeline(self):
1414 # Testing the pipeline that replaces the None values in items.
1515 self.testItem["value"] = "abc"
1616+ self.testItem["source"] = None
1617 pipe = pipelines.RemoveNonePipeline()
1718 processed = pipe.process_item(self.testItem, spider.FourmiSpider())
1819
-1
tests/test_spider.py
···4747 self.assertGreater(len(requests), 0)
4848 self.assertIsInstance(requests[0], Request)
49495050-5150 def test_synonym_requests(self):
5251 # A test for the synonym request function
5352 self.spi._sources = []
+28-18
utils/configurator.py
···11-from scrapy import log
11+import ConfigParser
22+23from scrapy.utils.project import get_project_settings
33-import ConfigParser
44+4556class Configurator:
67 """
···10111112 def __init__(self):
1213 self.scrapy_settings = get_project_settings()
1313-14141515 def set_output(self, filename, fileformat):
1616 """
···3030 if fileformat is not None:
3131 self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
32323333-3434- def start_log(self, logfile, verbose):
def set_logging(self, logfile=None, verbose=0):
    """
    Change the default settings of Scrapy's logging functionality
    using the settings given by the CLI.

    :param logfile: The location where the logfile will be saved, or None
                    to disable file logging.
    :param verbose: An integer verbosity level; 0 disables logging,
                    higher values select more detailed log levels.
    """
    # Logging is enabled only when at least one -v flag was given.
    self.scrapy_settings.overrides["LOG_ENABLED"] = verbose != 0

    if verbose == 1:
        self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
    elif verbose == 2:
        self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
    else:
        self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"

    # At high verbosity Scrapy captures stdout into the log itself, so
    # LOG_STDOUT is switched off to avoid feedback; at low verbosity the
    # log output is echoed to stdout.
    self.scrapy_settings.overrides["LOG_STDOUT"] = verbose <= 1

    # None means "no log file", matching Scrapy's default behaviour.
    self.scrapy_settings.overrides["LOG_FILE"] = logfile
50615162 @staticmethod
5263 def read_sourceconfiguration():
···5667 :return a ConfigParser object of sources.cfg
5768 """
5869 config = ConfigParser.ConfigParser()
5959- config.read('sources.cfg') # [TODO]: should be softcoded eventually
7070+ config.read('sources.cfg') # [TODO]: should be softcoded eventually
6071 return config
61726273 @staticmethod
···7586 elif config.defaults():
7687 section = config.defaults()
7788 if 'reliability' not in section:
7878- log.msg('Reliability not set for %s' % sourcename,
7979- level=log.WARNING)
8989+ print 'WARNING: Reliability not set for %s' % sourcename
8090 section['reliability'] = ''
8191 return section