A web scraper built to search for specific information on a given compound (and its pseudonyms)

Code reformat

+19 -23
+4 -4
FourmiCrawler/pipelines.py
··· 4 4 5 5 from scrapy.exceptions import DropItem 6 6 7 - class RemoveNonePipeline(object): 8 7 8 + class RemoveNonePipeline(object): 9 9 def __init__(self): 10 10 pass 11 11 ··· 21 21 item[key] = "" 22 22 return item 23 23 24 - class DuplicatePipeline(object): 25 24 25 + class DuplicatePipeline(object): 26 26 def __init__(self): 27 27 self.known_values = set() 28 28 ··· 35 35 """ 36 36 value = (item['attribute'], item['value'], item['conditions']) 37 37 if value in self.known_values: 38 - raise DropItem("Duplicate item found: %s" % item) #[todo] append sources of first item. 38 + raise DropItem("Duplicate item found: %s" % item) # [todo] append sources of first item. 39 39 else: 40 40 self.known_values.add(value) 41 41 return item 42 42 43 - class AttributeSelectionPipeline(object): 44 43 44 + class AttributeSelectionPipeline(object): 45 45 def __init__(self): 46 46 pass; 47 47
+1 -1
FourmiCrawler/settings.py
··· 3 3 # For simplicity, this file contains only the most important settings by 4 4 # default. All the other settings are documented here: 5 5 # 6 - # http://doc.scrapy.org/en/latest/topics/settings.html 6 + # http://doc.scrapy.org/en/latest/topics/settings.html 7 7 # 8 8 9 9 BOT_NAME = 'FourmiCrawler'
+1 -1
FourmiCrawler/sources/ChemSpider.py
··· 63 63 # Test for properties without values, with one hardcoded exception 64 64 if (not re.match(r'^\d', prop_value) or 65 65 (prop_name == 'Polarizability' and 66 - prop_value == '10-24cm3')): 66 + prop_value == '10-24cm3')): 67 67 continue 68 68 69 69 # Match for condition in parentheses
+6 -7
FourmiCrawler/sources/NIST.py
··· 10 10 11 11 # [TODO]: values can be '128.', perhaps remove the dot in that case? 12 12 # [TODO]: properties have references and comments which do not exist in the 13 - # Result item, but should be included eventually. 13 + # Result item, but should be included eventually. 14 14 15 15 class NIST(Source): 16 16 """NIST Scraper plugin ··· 18 18 This plugin manages searching for a chemical on the NIST website 19 19 and parsing the resulting page if the chemical exists on NIST. 20 20 """ 21 - website = "http://webbook.nist.gov/*" 21 + website = "http://webbook.nist.gov/*" 22 22 23 23 search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' 24 24 ··· 78 78 requests.extend(self.parse_generic_data(table, summary)) 79 79 else: 80 80 log.msg('NIST table: NOT SUPPORTED', level=log.WARNING) 81 - continue #Assume unsupported 81 + continue #Assume unsupported 82 82 return requests 83 83 84 84 def parse_generic_info(self, sel): ··· 106 106 data['IUPAC Standard InChI'] = raw_inchi.extract()[0] 107 107 108 108 raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]' 109 - '/tt/text()') 109 + '/tt/text()') 110 110 data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0] 111 111 112 112 raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()') ··· 132 132 results = [] 133 133 for tr in table.xpath('tr[td]'): 134 134 extra_data_url = tr.xpath('td[last()][a="Individual data points"]' 135 - '/a/@href').extract() 135 + '/a/@href').extract() 136 136 if extra_data_url: 137 137 request = Request(url=self.website[:-1] + extra_data_url[0], 138 - callback=self.parse_individual_datapoints) 138 + callback=self.parse_individual_datapoints) 139 139 results.append(request) 140 140 continue 141 141 data = [] ··· 182 182 'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3]) 183 183 }) 184 184 results.append(result) 185 - 186 185 187 186 return results 188 187
+1 -1
FourmiCrawler/sources/WikipediaParser.py
··· 38 38 """ scrape data from infobox on wikipedia. """ 39 39 items = [] 40 40 41 - #be sure to get chembox (wikipedia template) 41 + # be sure to get chembox (wikipedia template) 42 42 tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \ 43 43 xpath('normalize-space(string())') 44 44 prop_names = tr_list[::2]
+1 -1
fourmi.py
··· 1 - #!/usr/bin/env python 1 + # !/usr/bin/env python 2 2 """ 3 3 Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms). 4 4
+3 -4
tests/test_pipeline.py
··· 7 7 8 8 9 9 class TestPipelines(unittest.TestCase): 10 - 11 10 def setUp(self): 12 11 self.testItem = items.Result() 13 12 14 13 def test_NonePipeline(self): 15 - #Testing the pipeline that replaces the None values in items. 14 + # Testing the pipeline that replaces the None values in items. 16 15 self.testItem["value"] = "abc" 17 16 pipe = pipelines.RemoveNonePipeline() 18 17 processed = pipe.process_item(self.testItem, spider.FourmiSpider()) ··· 25 24 self.assertIs(processed[key], "") 26 25 27 26 def test_DuplicatePipeline(self): 28 - #Testing the pipeline that removes duplicates. 27 + # Testing the pipeline that removes duplicates. 29 28 self.testItem["attribute"] = "test" 30 29 self.testItem["value"] = "test" 31 30 self.testItem["conditions"] = "test" ··· 39 38 self.assertEqual(pipe.process_item(otherItem, spider.FourmiSpider()), otherItem) 40 39 41 40 def test_AttributeSelection(self): 42 - #Testing the pipeline that selects attributes. 41 + # Testing the pipeline that selects attributes. 43 42 item1 = copy.deepcopy(self.testItem) 44 43 item2 = copy.deepcopy(self.testItem) 45 44
+2 -3
tests/test_sourceloader.py
··· 4 4 5 5 6 6 class TestSourceloader(unittest.TestCase): 7 - 8 7 def setUp(self): 9 8 self.loader = SourceLoader() 10 9 ··· 16 15 self.assertIn("Source: WikipediaParser", str(self.loader)) 17 16 18 17 def test_include(self): 19 - #Tests for the include functionality. 18 + # Tests for the include functionality. 20 19 self.loader.include(["So.rc.*"]) 21 20 22 21 self.assertIn("Source: Source", str(self.loader)) ··· 25 24 self.assertNotIn("Source: WikipediaParser", str(self.loader)) 26 25 27 26 def test_exclude(self): 28 - #Tests for the exclude functionality. 27 + # Tests for the exclude functionality. 29 28 self.loader.exclude(["So.rc.*"]) 30 29 31 30 self.assertNotIn("Source: Source", str(self.loader))
-1
tests/test_spider.py
··· 8 8 9 9 10 10 class TestFoumiSpider(unittest.TestCase): 11 - 12 11 def setUp(self): 13 12 self.compound = "test_compound" 14 13 self.attributes = ["a.*", ".*a"]