···1212 """ Wikipedia scraper for chemical properties
13131414 This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
1515- It also returns requests with other external sources which contain information on parsed subject.
1515+ It also returns requests with other external sources which contain information on parsed subject.
1616 """
17171818 website = "http://en.wikipedia.org/wiki/*"
1919 __spider = None
2020 searched_compounds = []
21212222- cfg = {}
2323-2424- def __init__(self, config={}):
2222+ def __init__(self, config=None):
2523 Source.__init__(self, config)
2626- self.cfg = config
27242825 def parse(self, response):
2929- """ Distributes the above described behaviour """
2626+ """
2727+ Distributes the above described behaviour
2828+ :param response: The incoming search request
2929+ :return: Returns the found properties if response is unique or returns none if it's already known
3030+ """
3031 log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
3132 sel = Selector(response)
3233 compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0] # makes sure to use main page
···3839 return items
39404041 def parse_infobox(self, sel):
4141- """ scrape data from infobox on wikipedia. """
4242+ """
4343+ Scrape data from infobox on wikipedia.
4444+4545+ Data from two types of infoboxes: class="infobox bordered" and class="infobox" is scraped and returned.
4646+ :param sel: The selector with the html-information of the page to parse
4747+ :return: item_list: Returns a list of properties with their values, source, etc..
4848+ """
4949+4250 items = []
43514444- # be sure to get chembox (wikipedia template)
5252+ # scrape the chembox (wikipedia template)
5353+ items = self.parse_chembox(sel, items)
5454+5555+ # scrape the drugbox (wikipedia template)
5656+ items = self.parse_drugbox(sel, items)
5757+5858+ items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
5959+ item_list = self.clean_items(items)
6060+6161+ identifiers = self.get_identifiers(sel)
6262+6363+ #add extra sources to scrape from as requests
6464+ for i, identifier in enumerate(identifiers):
6565+ request = None
6666+ #discard internal wikipedia links
6767+ if re.match('//en\.wikipedia', identifier):
6868+ log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
6969+ #fix links starting with '//www.'
7070+ elif re.match('/{2}', identifier):
7171+ identifier = re.sub("/{2}", "http://", identifier)
7272+ request = Request(identifier)
7373+ else:
7474+ request = Request(identifier)
7575+ log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
7676+ item_list.append(request)
7777+7878+ return item_list
7979+8080+ def parse_chembox(self, sel, items):
8181+ """
8282+ Scrape data from chembox infobox on wikipedia.
8383+8484+ :param sel: The selector with the html-information of the page to parse
8585+ :param items: the list of items where the result have to be stored in
8686+ :return: items: the list of items with the new found and stored items
8787+ """
4588 tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
4689 xpath('normalize-space(string())')
4790 prop_names = tr_list[::2]
···5396 )
5497 items.append(item)
5598 log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
9999+ return items
561005757- #scrape the drugbox (wikipedia template)
101101+ def parse_drugbox(self, sel, items):
102102+ """
103103+ Scrape data from drugbox infobox on wikipedia.
104104+105105+ :param sel: The selector with the html-information of the page to parse
106106+ :param items: the list of items where the result have to be stored in
107107+ :return: items: the list of items with the new found and stored items
108108+ """
58109 tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
59110 log.msg('dit: %s' % tr_list2, level=log.DEBUG)
60111 for tablerow in tr_list2:
···69120 log.msg(
70121 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
71122 level=log.DEBUG)
7272-7373- items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
7474- item_list = self.clean_items(items)
7575-7676- identifiers = self.get_identifiers(sel)
7777-7878- #add extra sources to scrape from as requests
7979- for i, identifier in enumerate(identifiers):
8080- request = None
8181- #discard internal wikipedia links
8282- if re.match('//en\.wikipedia', identifier):
8383- log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
8484- #fix links starting with '//www.'
8585- elif re.match('/{2}', identifier):
8686- identifier = re.sub("/{2}", "http://", identifier)
8787- request = Request(identifier)
8888- else:
8989- request = Request(identifier)
9090- log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
9191- item_list.append(request)
9292-9393- return item_list
123123+ return items
9412495125 def new_compound_request(self, compound):
96126 return Request(url=self.website[:-1] + compound, callback=self.parse)
9712798128 @staticmethod
99129 def clean_items(items):
100100- """ clean up properties using regex, makes it possible to split the values from the units """
130130+131131+ """
132132+ Clean up properties using regex, makes it possible to split the values from the units
133133+134134+ Almost not in use, only cleans J/K/mol values and boiling/melting points.
135135+136136+ :param items: List of properties with their values, source, etc..
137137+ :return: items: List of now cleaned up items
138138+ """
101139 for item in items:
102140 value = item['value']
103141 m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F)
···110148111149 @staticmethod
112150 def get_identifiers(sel):
113113- """ find external links, named 'Identifiers' to different sources. """
151151+ """
152152+ Find external links, named 'Identifiers' to different sources.
153153+154154+ :param sel: The selector with the html-information of the page to parse
155155+ :return: links: New links which can be used to expand the crawlers search
156156+ """
114157 links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
115158 '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
116159 return links
117160118161 def newresult(self, attribute, value):
119119- return Result({
120120- 'attribute': attribute,
121121- 'value': value,
122122- 'source': 'Wikipedia',
123123- 'reliability': self.cfg['reliability'],
124124- 'conditions': ''
162162+ return Result(
163163+ {
164164+ 'attribute': attribute,
165165+ 'value': value,
166166+ 'source': 'Wikipedia',
167167+ 'reliability': self.cfg['reliability'],
168168+ 'conditions': ''
125169 })
+4-1
FourmiCrawler/sources/source.py
···66 website = "http://something/*" # Regex of URI's the source is able to parse
77 _spider = None
8899- def __init__(self, config={}):
99+ def __init__(self, config=None):
1010 """
1111 Initiation of a new Source
1212 """
1313+ self.cfg = {}
1414+ if config is not None:
1515+ self.cfg = config
1316 pass
14171518 def parse(self, response):
+5-2
FourmiCrawler/spider.py
···1010 """
1111 name = "FourmiSpider"
12121313- def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
1313+ def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
1414 """
1515 Initiation of the Spider
1616 :param compound: compound that will be searched.
···2020 self.synonyms = set()
2121 super(FourmiSpider, self).__init__(*args, **kwargs)
2222 self.synonyms.add(compound)
2323- self.selected_attributes = selected_attributes
2323+ if selected_attributes is None:
2424+ self.selected_attributes = [".*"]
2525+ else:
2626+ self.selected_attributes = selected_attributes
24272528 def parse(self, response):
2629 """
+5-9
README.md
···23232424### Installing
25252626-If you're installing Fourmi, please take a look at our [installation guide](...)
2727-on our wiki. When you've installed the application, make sure to check our
2828-[usage guide](...).
2626+If you're installing Fourmi, please take a look at our installation guides
2727+on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our
2828+usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI).
29293030### Using the Source
31313232To use the Fourmi source code multiple dependencies are required. Take a look at
3333-the [wiki page](...) on using the application source code for a step by step
3333+our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code for a step by step
3434installation guide.
35353636When developing for the Fourmi project keep in mind that code readability is a
3737must. To maintain the readability, code should be conform with the
3838[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
3939code. More information about the different structures and principles of the
4040-Fourmi application can be found on our [wiki](...).
4040+Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki).
41414242### To Do
4343···45454646__Main goals:__
47474848-- Improve our documentation and guides. (Assignee: Dekker)
4948- Build a graphical user interface (GUI) as an alternative for the command line
5049interface(CLI). (Assignee: Harmen)
5150- Compiling the source into a Windows executable. (Assignee: Bas)
5252-- Create an configuration file to hold logins and API keys.
5353-- Determine reliability of our data point.
5454-- Create an module to gather data from NIST. (Assignee: Rob)
5551- Create a module to gather data from PubChem. (Assignee: Nout)
56525753__Side goals:__
+103
SIGNED.md
···11+##### Signed by https://keybase.io/jdekker
22+```
33+-----BEGIN PGP SIGNATURE-----
44+Version: GnuPG v1.4.11 (GNU/Linux)
55+66+iQIcBAABAgAGBQJTnfAAAAoJEJrQ9RIUCT6/KZIQAME07yzAG5hnqsQof5ESoeQs
77+5wBxAhiBIX/0yn3qIT/eMh0ubCKUZsqJ3/PzUljeMJ6CGtwxFYfTWkgjYlOoAz9G
88+fS7CjPmRPyiu+MFo5he+oVRmLUMqfuLUrCyuIxJwMXq5YbQvzyqiffvxr8VRULtV
99+3c0drWfQMX1ZeAWSIYN0xuMndzvaqIAQU6o4tSQf/rUiKlM2NnTDNUHu2PY9FED/
1010+IJwM/IgAMAkJARyL7ltq6pHzORsu7sd2Nhv0esa0Gs2GSuRjKueeMZvJzpDAufy9
1111+bWn9EqKhVwPR6zWnXRmNj9Ymj1w167hIUYcBdFhC7kie5zv9+pDE6d/s7pw/Rejd
1212+L0k8LKBGtJ8o7SKYR9kcNLDWXEnHjfCraD+14FMYqQPcz2ekoV6Exv/mP8qRPwUc
1313+b+FtjJtW8fEiOMAyjMOvLTzYbCVwjdErAqgNdHeSByi1nxfrphjajRiNUt7fVimJ
1414+++QZzKCj6xN2MuTJ41KbZ8teiUXwQB4OKKij0fgoy0RBwW0vqH6MF7cCKm1zT1Qa
1515+9FGlBU2jSybQqUu4lJ/eUjO/3tQMhJErQJU/i+6lwi7OMnS9J/g17Heghp5Hxyhc
1616+VWvhR56pbWLIL2XQqDGGEqPDIzXohHnbRJ1N71b06akIvIIrTqc6Glu4PJeUG/Pe
1717+EF8/jBwydxbKUOyKRSQS
1818+=xWbc
1919+-----END PGP SIGNATURE-----
2020+2121+```
2222+2323+<!-- END SIGNATURES -->
2424+2525+### Begin signed statement
2626+2727+#### Expect
2828+2929+```
3030+size exec file contents
3131+ ./
3232+17591 .coverage 1dd1207846db74e407d3a4a1951b8e81934a4693385d39f6c337a224375bad39|1b7ead09cf213b5a9545557be982aaa30238b689bb54adf604f82b12ef521eb2
3333+375 .gitignore d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1
3434+464 .travis.yml 3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c
3535+97 Changelog.md bcbce9a33bbbbcd18fd7788e6dc3a9c4b13dff7128ea99968994c1b290ddc931
3636+ FourmiCrawler/
3737+0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
3838+304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806
3939+2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49
4040+716 settings.py 37a8f63e123bccc77076d574617a522b30c1d7c5e893ec3d78cc40e1563dd8a6
4141+ sources/
4242+9991 ChemSpider.py 847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d
4343+9898 NIST.py 97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644
4444+6907 WikipediaParser.py 5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97
4545+0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
4646+1262 source.py 16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4
4747+3026 spider.py 1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a
4848+1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c
4949+3965 README.md d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3
5050+3659 x fourmi.py 81781ed7299e447e6fc551fba69e62cd7a1d63f27dfa063927f4c5c10f5ac331
5151+200850 log.txt d76e741f9e7b67c2574e9cdbbe499ea4861f6e0bd11e5962fdaf9d8720effef8
5252+184692 results.csv 31132f7f394babeb5dfd249aaa714756017b2e1b314b6715f57e6ad9524e5be8|d0bb724f6d714ec7a4a1ad2052f70dd4510b5ac08d616e24b5e9a903dedab586
5353+261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85
5454+ tests/
5555+1 __init__.py 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
5656+2837 test_configurator.py 4a0eb6e7121eb09a63ab5cb797570d1a42080c5346c3b8b365da56eefa599e80
5757+1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031
5858+1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869
5959+2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299
6060+ utils/
6161+0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
6262+3552 configurator.py e2b7e0ee6c1fef4373785dfe5df8ec6950f31ce6a5d9632b69a66ea3d1eaf921
6363+2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37
6464+```
6565+6666+#### Ignore
6767+6868+```
6969+/SIGNED.md
7070+```
7171+7272+#### Presets
7373+7474+```
7575+git # ignore .git and anything as described by .gitignore files
7676+dropbox # ignore .dropbox-cache and other Dropbox-related files
7777+kb # ignore anything as described by .kbignore files
7878+```
7979+8080+<!-- summarize version = 0.0.9 -->
8181+8282+### End signed statement
8383+8484+<hr>
8585+8686+#### Notes
8787+8888+With keybase you can sign any directory's contents, whether it's a git repo,
8989+source code distribution, or a personal documents folder. It aims to replace the drudgery of:
9090+9191+ 1. comparing a zipped file to a detached statement
9292+ 2. downloading a public key
9393+ 3. confirming it is in fact the author's by reviewing public statements they've made, using it
9494+9595+All in one simple command:
9696+9797+```bash
9898+keybase dir verify
9999+```
100100+101101+There are lots of options, including assertions for automating your checks.
102102+103103+For more info, check out https://keybase.io/docs/command_line/code_signing
+9-6
fourmi.py
···55Usage:
66 fourmi search <compound>
77 fourmi [options] search <compound>
88+ fourmi [-v | -vv | -vvv] [options] search <compound>
89 fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
910 fourmi list
1011 fourmi [--include=<sourcename> | --exclude=<sourcename>] list
···1516 --attributes=<regex> Include only that match these regular expressions split by a comma. [default: .*]
1617 -h --help Show this screen.
1718 --version Show version.
1818- --verbose Verbose logging output.
1919+ -v Verbose logging output. (Multiple occurrences increase logging level)
1920 --log=<file> Save log to an file.
2021 -o <file> --output=<file> Output file [default: results.*format*]
2122 -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv]
···25262627from twisted.internet import reactor
2728from scrapy.crawler import Crawler
2828-from scrapy import log, signals
2929-from scrapy.utils.project import get_project_settings
2929+from scrapy import signals, log
3030import docopt
31313232from FourmiCrawler.spider import FourmiSpider
···5858 :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
5959 """
6060 conf = Configurator()
6161- conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
6161+ conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
6262 conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
6363- setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
6363+ setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
6464+ source_loader, docopt_arguments["--attributes"].split(','))
6565+ log.start(conf.scrapy_settings.get("LOG_FILE"),
6666+ conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
6467 reactor.run()
656866696770# The start for the Fourmi Command Line interface.
6871if __name__ == '__main__':
6969- arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0')
7272+ arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.1')
7073 loader = SourceLoader()
71747275 if arguments["--include"]:
···1313 def test_none_pipeline(self):
1414 # Testing the pipeline that replaces the None values in items.
1515 self.testItem["value"] = "abc"
1616+ self.testItem["source"] = None
1617 pipe = pipelines.RemoveNonePipeline()
1718 processed = pipe.process_item(self.testItem, spider.FourmiSpider())
1819
-1
tests/test_spider.py
···4747 self.assertGreater(len(requests), 0)
4848 self.assertIsInstance(requests[0], Request)
49495050-5150 def test_synonym_requests(self):
5251 # A test for the synonym request function
5352 self.spi._sources = []
+28-18
utils/configurator.py
···11-from scrapy import log
11+import ConfigParser
22+23from scrapy.utils.project import get_project_settings
33-import ConfigParser
44+4556class Configurator:
67 """
···10111112 def __init__(self):
1213 self.scrapy_settings = get_project_settings()
1313-14141515 def set_output(self, filename, fileformat):
1616 """
···3030 if fileformat is not None:
3131 self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
32323333-3434- def start_log(self, logfile, verbose):
3333+ def set_logging(self, logfile=None, verbose=0):
3534 """
3636- This function starts the logging functionality of Scrapy using the settings given by the CLI.
3535+ This function changes the default settings of Scrapy's logging functionality
3636+ using the settings given by the CLI.
3737 :param logfile: The location where the logfile will be saved.
3838- :param verbose: A boolean value to switch between loglevels.
3838+ :param verbose: A integer value to switch between loglevels.
3939 """
4040+ if verbose != 0:
4141+ self.scrapy_settings.overrides["LOG_ENABLED"] = True
4242+ else:
4343+ self.scrapy_settings.overrides["LOG_ENABLED"] = False
4444+4545+ if verbose == 1:
4646+ self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
4747+ elif verbose == 2:
4848+ self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
4949+ else:
5050+ self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"
5151+5252+ if verbose > 1:
5353+ self.scrapy_settings.overrides["LOG_STDOUT"] = False
5454+ else:
5555+ self.scrapy_settings.overrides["LOG_STDOUT"] = True
5656+4057 if logfile is not None:
4141- if verbose:
4242- log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG)
4343- else:
4444- log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING)
5858+ self.scrapy_settings.overrides["LOG_FILE"] = logfile
4559 else:
4646- if verbose:
4747- log.start(logstdout=False, loglevel=log.DEBUG)
4848- else:
4949- log.start(logstdout=True, loglevel=log.WARNING)
6060+ self.scrapy_settings.overrides["LOG_FILE"] = None
50615162 @staticmethod
5263 def read_sourceconfiguration():
···5667 :return a ConfigParser object of sources.cfg
5768 """
5869 config = ConfigParser.ConfigParser()
5959- config.read('sources.cfg') # [TODO]: should be softcoded eventually
7070+ config.read('sources.cfg') # [TODO]: should be softcoded eventually
6071 return config
61726273 @staticmethod
···7586 elif config.defaults():
7687 section = config.defaults()
7788 if 'reliability' not in section:
7878- log.msg('Reliability not set for %s' % sourcename,
7979- level=log.WARNING)
8989+ print 'WARNING: Reliability not set for %s' % sourcename
8090 section['reliability'] = ''
8191 return section