A web scraper built to search for specific information on a given compound (and its pseudonyms)
at feature/executable 276 lines 10 kB view raw
import re

from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


# [TODO]: values can be '128.', perhaps remove the dot in that case?
# [TODO]: properties have references and comments which do not exist in the
#         Result item, but should be included eventually.

class NIST(Source):
    """NIST Scraper plugin

    This plugin manages searching for a chemical on the NIST website
    and parsing the resulting page if the chemical exists on NIST.
    """
    # Base URL pattern; the trailing '*' is stripped (website[:-1]) whenever
    # a concrete request URL is built.
    website = "http://webbook.nist.gov/*"

    # Search path appended to the base URL; '%s' is replaced by the compound.
    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

    # Names that were already requested from, or reported by, NIST.
    # Defined at class level, so the same set is shared by all instances.
    ignore_list = set()

    def __init__(self):
        Source.__init__(self)

    def parse(self, response):
        """Parse a NIST search result page.

        Used as the callback of the request built by new_compound_request.

        Returns a list holding Result items and follow-up Request objects,
        or None when the page title is 'Name Not Found'.
        """
        sel = Selector(response)

        title = sel.xpath('head/title/text()').extract()[0]
        if title == 'Name Not Found':
            log.msg('NIST: Chemical not found!', level=log.ERROR)
            return
        if title not in self.ignore_list:
            # add(), not update(): update() with a string would insert every
            # single character instead of the name itself.
            self.ignore_list.add(title)
            log.msg('NIST emit synonym: %s' % title, level=log.DEBUG)
            self._spider.get_synonym_requests(title)

        requests = []

        requests.extend(self.parse_generic_info(sel))

        # Map each symbol (raw inner markup of the first cell) to its
        # descriptive name; cells come in (symbol, name) pairs.
        symbol_table = {}
        tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
        for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
            symbol = ''.join(symbol_td.xpath('node()').extract())
            name = name_td.xpath('text()').extract()[0]
            symbol_table[symbol] = name
            log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                    level=log.DEBUG)

        # Dispatch every data table to the parser matching its layout.
        # The order of the checks matters: the first match wins.
        for table in sel.xpath('//table[@class="data"]'):
            summary = table.xpath('@summary').extract()[0]
            if summary == 'One dimensional data':
                log.msg('NIST table: Aggregrate data', level=log.DEBUG)
                requests.extend(
                    self.parse_aggregate_data(table, symbol_table))
            elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1':
                log.msg('NIST table; Enthalpy/entropy of phase transition',
                        level=log.DEBUG)
                requests.extend(self.parse_transition_data(table, summary))
            elif table.xpath('tr[1]/td'):
                # Recognised, but no data is extracted from these tables.
                log.msg('NIST table: Horizontal table', level=log.DEBUG)
            elif summary == 'Antoine Equation Parameters':
                log.msg('NIST table: Antoine Equation Parameters',
                        level=log.DEBUG)
                requests.extend(self.parse_antoine_data(table, summary))
            elif len(table.xpath('tr[1]/th')) == 5:
                log.msg('NIST table: generic 5 columns', level=log.DEBUG)
                # Symbol (unit) Temperature (K) Method Reference Comment
                requests.extend(self.parse_generic_data(table, summary))
            elif len(table.xpath('tr[1]/th')) == 4:
                log.msg('NIST table: generic 4 columns', level=log.DEBUG)
                # Symbol (unit) Temperature (K) Reference Comment
                requests.extend(self.parse_generic_data(table, summary))
            else:
                log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
                continue  # Assume unsupported
        return requests

    def parse_generic_info(self, sel):
        """Parses: synonyms, chemical formula, molecular weight, InChI,
        InChiKey, CAS number

        Every synonym found is added to the ignore list and handed to the
        spider through get_synonym_requests.

        Returns a list of Result items, one per generic property.
        """
        ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')

        raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
        # The "Other names:" entry may be missing; skip synonyms in that case
        # instead of failing on raw_synonyms[0].
        if raw_synonyms:
            for synonym in raw_synonyms[0].strip().split(';\n'):
                log.msg('NIST synonym: %s' % synonym, level=log.DEBUG)
                # add(), not update(): store the name, not its characters.
                self.ignore_list.add(synonym)
                self._spider.get_synonym_requests(synonym)

        data = {}

        raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()
        # The first two text nodes are skipped; the rest forms the formula.
        data['Chemical formula'] = ''.join(raw_formula[2:]).strip()

        raw_mol_weight = ul.xpath('li[strong/a="Molecular weight"]/text()')
        data['Molecular weight'] = raw_mol_weight.extract()[0].strip()

        raw_inchi = ul.xpath('li[strong="IUPAC Standard InChI:"]//tt/text()')
        data['IUPAC Standard InChI'] = raw_inchi.extract()[0]

        raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
                                '/tt/text()')
        data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]

        raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
        data['CAS Registry Number'] = raw_cas_number.extract()[0].strip()

        requests = []
        # items() instead of iteritems() so this runs on Python 2 and 3.
        for key, value in data.items():
            result = Result({
                'attribute': key,
                'value': value,
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': ''
            })
            requests.append(result)

        return requests

    def parse_aggregate_data(self, table, symbol_table):
        """Parses the table(s) which contain possible links to individual
        data points

        Rows linking to "Individual data points" produce a Request handled
        by parse_individual_datapoints; all other rows produce a Result.

        table: selector of the data table.
        symbol_table: dict mapping a symbol's markup to its name.

        Returns a list of Request objects and Result items.
        """
        results = []
        for tr in table.xpath('tr[td]'):
            extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                      '/a/@href').extract()
            if extra_data_url:
                request = Request(url=self.website[:-1] + extra_data_url[0],
                                  callback=self.parse_individual_datapoints)
                results.append(request)
                continue
            data = []
            for td in tr.xpath('td'):
                data.append(''.join(td.xpath('node()').extract()))

            name = symbol_table[data[0]]
            condition = ''

            # Names of the form "<property> at <condition>" are split up.
            m = re.match(r'(.*) at (.*)', name)
            if m:
                name = m.group(1)
                condition = m.group(2)

            result = Result({
                'attribute': name,
                'value': data[1] + ' ' + data[2],
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': condition
            })
            log.msg('NIST: |%s|' % data, level=log.DEBUG)
            results.append(result)
        return results

    @staticmethod
    def parse_transition_data(table, summary):
        """Parses the table containing properties regarding phase changes

        table: selector of the data table.
        summary: the table's summary text, used as the attribute name.

        Returns a list of Result items, one per data row.
        """
        results = []

        # The unit is the parenthesised part of the first header cell;
        # '!' is used as a placeholder when no unit is found.
        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
        m = re.search(r'\((.*)\)', tr_unit)
        unit = '!'
        if m:
            unit = m.group(1)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            result = Result({
                'attribute': summary,
                'value': tds[0] + ' ' + unit,
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
            })
            results.append(result)

        return results

    @staticmethod
    def parse_generic_data(table, summary):
        """Parses the common tables of 4 and 5 columns. Assumes they are of
        the form:
        Symbol (unit)|Temperature (K)|Method|Reference|Comment
        Symbol (unit)|Temperature (K)|Reference|Comment

        table: selector of the data table.
        summary: the table's summary text, used as the attribute name.

        Returns a list of Result items, one per data row.
        """
        results = []

        # The unit is the parenthesised part of the first header cell;
        # '!' is used as a placeholder when no unit is found.
        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
        m = re.search(r'\((.*)\)', tr_unit)
        unit = '!'
        if m:
            unit = m.group(1)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            result = Result({
                'attribute': summary,
                'value': tds[0] + ' ' + unit,
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': '%s K' % tds[1]
            })
            results.append(result)
        return results

    @staticmethod
    def parse_antoine_data(table, summary):
        """Parse table containing parameters for the Antoine equation

        table: selector of the data table.
        summary: the table's summary text, used as the attribute name.

        Returns a list of Result items, one per data row; the first cell is
        reported as the condition and the next three as A, B and C.
        """
        results = []

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            result = Result({
                'attribute': summary,
                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': '%s K' % tds[0]
            })
            results.append(result)

        return results

    @staticmethod
    def parse_individual_datapoints(response):
        """Parses the page linked from aggregate data

        Only the first table with class "data" on the page is read.

        Returns a list of Result items, one per data row.
        """
        sel = Selector(response)
        table = sel.xpath('//table[@class="data"]')[0]

        results = []

        # A summary of the form "<property> at <condition>" is split up.
        name = table.xpath('@summary').extract()[0]
        condition = ''
        m = re.match(r'(.*) at (.*)', name)
        if m:
            name = m.group(1)
            condition = m.group(2)

        # The unit is the parenthesised part of the first header cell;
        # '!' is used as a placeholder when no unit is found.
        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
        m = re.search(r'\((.*)\)', tr_unit)
        unit = '!'
        if m:
            unit = m.group(1)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            uncertainty = ''
            # The uncertainty, when present, is read from the last cell.
            m = re.search(r'Uncertainty assigned by TRC = (.*?) ', tds[-1])
            if m:
                uncertainty = '+- %s ' % m.group(1)
                # [TODO]: get the plusminus sign working in here
            result = Result({
                'attribute': name,
                'value': '%s %s%s' % (tds[0], uncertainty, unit),
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': condition
            })
            results.append(result)

        return results

    def new_compound_request(self, compound):
        """Build the NIST search request for a compound.

        Returns a Request whose callback is parse, or None (implicitly) when
        the compound is already in the ignore list.
        """
        if compound not in self.ignore_list:
            # add(), not update(): store the name, not its characters.
            self.ignore_list.add(compound)
            return Request(url=self.website[:-1] + self.search % compound,
                           callback=self.parse)