A web scraper built to search for specific information on a given compound (and its pseudonyms)

added scraping for generic info except for synonyms

Rob tB 98f58ea4 50c79e3b

+37
+37
FourmiCrawler/sources/NIST.py
··· 18 18 19 19 requests = [] 20 20 21 + requests.extend(self.parse_generic_info(sel)) 22 + 21 23 symbol_table = {} 22 24 tds = sel.xpath('//table[@class="symbol_table"]/tr/td') 23 25 for (symbol_td, name_td) in zip(tds[::2], tds[1::2]): ··· 58 60 else: 59 61 log.msg('NIST table: NOT SUPPORTED', level=log.WARNING) 60 62 continue #Assume unsupported 63 + return requests 64 + 65 + def parse_generic_info(self, sel): 66 + ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]') 67 + li = ul.xpath('li') 68 + 69 + data = {} 70 + 71 + raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract() 72 + data['Chemical formula'] = ''.join(raw_formula[2:]).strip() 73 + 74 + raw_mol_weight = ul.xpath('li[strong/a="Molecular weight"]/text()') 75 + data['Molecular weight'] = raw_mol_weight.extract()[0].strip() 76 + 77 + raw_inchi = ul.xpath('li[strong="IUPAC Standard InChI:"]//tt/text()') 78 + data['IUPAC Standard InChI'] = raw_inchi.extract()[0] 79 + 80 + raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]' 81 + '/tt/text()') 82 + data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0] 83 + 84 + raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()') 85 + data['CAS Registry Number'] = raw_cas_number.extract()[0].strip() 86 + 87 + requests = [] 88 + for key, value in data.iteritems(): 89 + result = Result({ 90 + 'attribute': key, 91 + 'value': value, 92 + 'source': 'NIST', 93 + 'reliability': 'Unknown', 94 + 'conditions': '' 95 + }) 96 + requests.append(result) 97 + 61 98 return requests 62 99 63 100 def parse_aggregate_data(self, table, symbol_table):