···33# For simplicity, this file contains only the most important settings by
44# default. All the other settings are documented here:
55#
66-# http://doc.scrapy.org/en/latest/topics/settings.html
66+# http://doc.scrapy.org/en/latest/topics/settings.html
77#
8899BOT_NAME = 'FourmiCrawler'
+1-1
FourmiCrawler/sources/ChemSpider.py
···6363 # Test for properties without values, with one hardcoded exception
6464 if (not re.match(r'^\d', prop_value) or
6565 (prop_name == 'Polarizability' and
6666- prop_value == '10-24cm3')):
6666+ prop_value == '10-24cm3')):
6767 continue
68686969 # Match for condition in parentheses
+6-7
FourmiCrawler/sources/NIST.py
···10101111# [TODO]: values can be '128.', perhaps remove the dot in that case?
1212# [TODO]: properties have references and comments which do not exist in the
1313-# Result item, but should be included eventually.
1313+# Result item, but should be included eventually.
14141515class NIST(Source):
1616 """NIST Scraper plugin
···1818 This plugin manages searching for a chemical on the NIST website
1919 and parsing the resulting page if the chemical exists on NIST.
2020 """
2121- website = "http://webbook.nist.gov/*"
2121+ website = "http://webbook.nist.gov/*"
22222323 search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
2424···7878 requests.extend(self.parse_generic_data(table, summary))
7979 else:
8080 log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
8181- continue #Assume unsupported
8181+ continue #Assume unsupported
8282 return requests
83838484 def parse_generic_info(self, sel):
···106106 data['IUPAC Standard InChI'] = raw_inchi.extract()[0]
107107108108 raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
109109- '/tt/text()')
109109+ '/tt/text()')
110110 data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]
111111112112 raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
···132132 results = []
133133 for tr in table.xpath('tr[td]'):
134134 extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
135135- '/a/@href').extract()
135135+ '/a/@href').extract()
136136 if extra_data_url:
137137 request = Request(url=self.website[:-1] + extra_data_url[0],
138138- callback=self.parse_individual_datapoints)
138138+ callback=self.parse_individual_datapoints)
139139 results.append(request)
140140 continue
141141 data = []
···182182 'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
183183 })
184184 results.append(result)
185185-186185187186 return results
188187
+1-1
FourmiCrawler/sources/WikipediaParser.py
···3838 """ scrape data from infobox on wikipedia. """
3939 items = []
40404141- #be sure to get chembox (wikipedia template)
4141+ # be sure to get chembox (wikipedia template)
4242 tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
4343 xpath('normalize-space(string())')
4444 prop_names = tr_list[::2]
+1-1
fourmi.py
···11-#!/usr/bin/env python
11+#!/usr/bin/env python
22"""
33Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).
44
+3-4
tests/test_pipeline.py
···778899class TestPipelines(unittest.TestCase):
1010-1110 def setUp(self):
1211 self.testItem = items.Result()
13121413 def test_NonePipeline(self):
1515- #Testing the pipeline that replaces the None values in items.
1414+ # Testing the pipeline that replaces the None values in items.
1615 self.testItem["value"] = "abc"
1716 pipe = pipelines.RemoveNonePipeline()
1817 processed = pipe.process_item(self.testItem, spider.FourmiSpider())
···2524 self.assertIs(processed[key], "")
26252726 def test_DuplicatePipeline(self):
2828- #Testing the pipeline that removes duplicates.
2727+ # Testing the pipeline that removes duplicates.
2928 self.testItem["attribute"] = "test"
3029 self.testItem["value"] = "test"
3130 self.testItem["conditions"] = "test"
···3938 self.assertEqual(pipe.process_item(otherItem, spider.FourmiSpider()), otherItem)
40394140 def test_AttributeSelection(self):
4242- #Testing the pipeline that selects attributes.
4141+ # Testing the pipeline that selects attributes.
4342 item1 = copy.deepcopy(self.testItem)
4443 item2 = copy.deepcopy(self.testItem)
4544
+2-3
tests/test_sourceloader.py
···445566class TestSourceloader(unittest.TestCase):
77-87 def setUp(self):
98 self.loader = SourceLoader()
109···1615 self.assertIn("Source: WikipediaParser", str(self.loader))
17161817 def test_include(self):
1919- #Tests for the include functionality.
1818+ # Tests for the include functionality.
2019 self.loader.include(["So.rc.*"])
21202221 self.assertIn("Source: Source", str(self.loader))
···2524 self.assertNotIn("Source: WikipediaParser", str(self.loader))
26252726 def test_exclude(self):
2828- #Tests for the exclude functionality.
2727+ # Tests for the exclude functionality.
2928 self.loader.exclude(["So.rc.*"])
30293130 self.assertNotIn("Source: Source", str(self.loader))