A web scraper built to search for specific information on a given compound (and its pseudonyms)

Merge branch 'feature/compound-name' into develop

+43 -39
+9 -9
FourmiCrawler/sources/ChemSpider.py
··· 89 89 90 90 # Test for properties without values, with one hardcoded exception 91 91 if (not re.match(r'^\d', prop_value) or 92 - (prop_name == 'Polarizability' and prop_value == '10-24cm3')): 92 + (prop_name == 'Polarizability' and prop_value == '10-24cm3')): 93 93 continue 94 94 95 95 m = re.match(r'(.*) \((.*)\)', prop_name) ··· 122 122 properties = [] 123 123 124 124 scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical ' 125 - 'Properties"]//li/table/tr/td') 125 + 'Properties"]//li/table/tr/td') 126 126 if not scraped_list: 127 127 return properties 128 128 # Format is: property name followed by a list of values 129 129 property_name = scraped_list.pop(0).xpath( 130 - 'span/text()').extract()[0].rstrip() 130 + 'span/text()').extract()[0].rstrip() 131 131 for line in scraped_list: 132 132 if line.xpath('span/text()'): 133 133 property_name = line.xpath('span/text()').extract()[0].rstrip() ··· 251 251 :return: A Result item 252 252 """ 253 253 return Result({ 254 - 'attribute': attribute, 255 - 'value': value, 256 - 'source': source, 257 - 'reliability': self.cfg['reliability'], 258 - 'conditions': conditions 259 - }) 254 + 'attribute': attribute, 255 + 'value': value, 256 + 'source': source, 257 + 'reliability': self.cfg['reliability'], 258 + 'conditions': conditions 259 + }) 260 260 261 261 def parse_searchrequest(self, response): 262 262 """
+7 -6
FourmiCrawler/sources/NIST.py
··· 313 313 :param conditions: optional conditions regarding the value 314 314 :return: A Result item 315 315 """ 316 - return Result({ 317 - 'attribute': attribute, 318 - 'value': value, 319 - 'source': 'NIST', 320 - 'reliability': self.cfg['reliability'], 321 - 'conditions': conditions 316 + return Result( 317 + { 318 + 'attribute': attribute, 319 + 'value': value, 320 + 'source': 'NIST', 321 + 'reliability': self.cfg['reliability'], 322 + 'conditions': conditions 322 323 }) 323 324 324 325 def new_compound_request(self, compound):
+14 -12
FourmiCrawler/sources/PubChem.py
··· 15 15 including sources of the values of properties. 16 16 """ 17 17 18 - #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used 18 + # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used 19 19 website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*' 20 20 website_www = 'http://www.ncbi.nlm.nih.gov/*' 21 21 website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*' ··· 54 54 n = re.search(r'cid=(\d+)', response.url) 55 55 if n: 56 56 cid = n.group(1) 57 - log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach 58 - # the seperate html page which contains the properties and their values 57 + log.msg('cid: %s' % cid, level=log.DEBUG) # getting the right id of the compound with which it can reach 58 + # the seperate html page which contains the properties and their values 59 59 60 - #using this cid to get the right url and scrape it 61 - requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data)) 60 + # using this cid to get the right url and scrape it 61 + requests.append( 62 + Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data)) 62 63 return requests 63 64 64 - def parse_data(self, response): 65 + @staticmethod 66 + def parse_data(response): 65 67 """ 66 68 Parse data found in 'Chemical and Physical properties' part of a substance page. 
67 69 :param response: The response with the page to parse ··· 74 76 props = sel.xpath('//div') 75 77 76 78 for prop in props: 77 - prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing 78 - if prop.xpath('a'): # parsing for single value in property 79 + prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing 80 + if prop.xpath('a'): # parsing for single value in property 79 81 prop_source = ''.join(prop.xpath('a/@title').extract()) 80 82 prop_value = ''.join(prop.xpath('a/text()').extract()) 81 83 new_prop = Result({ ··· 89 91 (new_prop['attribute'], new_prop['value'], 90 92 new_prop['source']), level=log.DEBUG) 91 93 requests.append(new_prop) 92 - elif prop.xpath('ul'): # parsing for multiple values (list) in property 94 + elif prop.xpath('ul'): # parsing for multiple values (list) in property 93 95 prop_values = prop.xpath('ul//li') 94 96 for prop_li in prop_values: 95 97 prop_value = ''.join(prop_li.xpath('a/text()').extract()) ··· 102 104 'conditions': '' 103 105 }) 104 106 log.msg('PubChem prop: |%s| |%s| |%s|' % 105 - (new_prop['attribute'], new_prop['value'], 106 - new_prop['source']), level=log.DEBUG) 107 + (new_prop['attribute'], new_prop['value'], 108 + new_prop['source']), level=log.DEBUG) 107 109 requests.append(new_prop) 108 110 109 111 return requests ··· 116 118 case the search request forwarded to the compound page 117 119 """ 118 120 119 - #check if pubchem forwarded straight to compound page 121 + # check if pubchem forwarded straight to compound page 120 122 m = re.match(self.website_pubchem, response.url) 121 123 if m: 122 124 log.msg('PubChem search forwarded to compound page',
+3 -3
fourmi.py
··· 17 17 --version Show version. 18 18 -v Verbose logging output. (Multiple occurrences increase logging level) 19 19 --log=<file> Save log to an file. 20 - -o <file> --output=<file> Output file [default: results.*format*] 20 + -o <file> --output=<file> Output file [default: <compound>.*format*] 21 21 -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv] 22 22 --include=<regex> Include only sources that match these regular expressions split by a comma. 23 23 --exclude=<regex> Exclude the sources that match these regular expressions split by a comma. ··· 58 58 """ 59 59 conf = Configurator() 60 60 conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"]) 61 - conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) 61 + conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"]) 62 62 setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, 63 63 source_loader, docopt_arguments["--attributes"].split(',')) 64 64 if conf.scrapy_settings.getbool("LOG_ENABLED"): 65 65 log.start(conf.scrapy_settings.get("LOG_FILE"), 66 - conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) 66 + conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) 67 67 reactor.run() 68 68 69 69
+5 -5
tests/test_configurator.py
··· 10 10 self.conf = Configurator() 11 11 12 12 def test_set_output(self): 13 - self.conf.set_output(filename="test.txt", fileformat="csv") 13 + self.conf.set_output(filename="test.txt", fileformat="csv", compound="test") 14 14 self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt") 15 15 self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") 16 16 17 - self.conf.set_output("results.*format*", "jsonlines") 18 - self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json") 17 + self.conf.set_output("<compound>.*format*", "jsonlines", "test") 18 + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json") 19 19 self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines") 20 20 21 - self.conf.set_output("results.*format*", "csv") 22 - self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv") 21 + self.conf.set_output("results.*format*", "csv", "test") 22 + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv") 23 23 self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") 24 24 25 25 def test_start_log(self):
+5 -4
utils/configurator.py
··· 3 3 4 4 from scrapy.utils.project import get_project_settings 5 5 6 + 6 7 class Configurator: 7 8 """ 8 9 A helper class in the fourmi class. This class is used to process the settings as set ··· 12 13 def __init__(self): 13 14 self.scrapy_settings = get_project_settings() 14 15 15 - def set_output(self, filename, fileformat): 16 + def set_output(self, filename, fileformat, compound): 16 17 """ 17 18 This function manipulates the Scrapy output file settings that normally would be set in the settings file. 18 19 In the Fourmi project these are command line arguments. ··· 20 21 :param fileformat: The format in which the output will be. 21 22 """ 22 23 23 - if filename != 'results.*format*': 24 + if filename != '<compound>.*format*': 24 25 self.scrapy_settings.overrides["FEED_URI"] = filename 25 26 elif fileformat == "jsonlines": 26 - self.scrapy_settings.overrides["FEED_URI"] = "results.json" 27 + self.scrapy_settings.overrides["FEED_URI"] = compound + ".json" 27 28 elif fileformat is not None: 28 - self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat 29 + self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat 29 30 30 31 if fileformat is not None: 31 32 self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat