A web scraper built to search for specific information about a given compound (and its pseudonyms)
at develop 334 lines 12 kB view raw
import re

from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


# [TODO]: values can be '128.', perhaps remove the dot in that case?
# [TODO]: properties have references and comments which do not exist in the
#         Result item, but should be included eventually.

class NIST(Source):
    """
    NIST Scraper plugin
    This plugin manages searching for a chemical on the NIST website
    and parsing the resulting page if the chemical exists on NIST.
    """
    # Regular expression matching the URLs this source handles. The same
    # string doubles as the base URL once its regex syntax is stripped
    # (see _build_url).
    website = "http://webbook\\.nist\\.gov/.*"

    # Site-relative search path; '%s' is replaced by the compound name.
    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

    def __init__(self, config=None):
        """
        Initialization of NIST scraper
        :param config: configuration variables for this scraper, must contain
                       'reliability' key.
        """
        Source.__init__(self, config)
        # Names (page titles, synonyms, requested compounds) that this source
        # has already seen and should not search for again.
        self.ignore_list = set()

    def _build_url(self, path):
        """
        Build an absolute URL from a site-relative path.
        The base is derived from 'website' by dropping its trailing '.*' and
        removing the backslashes used for regex escaping.
        :param path: the part of the URL following the site root
        :return: the absolute URL as a string
        """
        return self.website[:-2].replace("\\", "") + path

    def parse(self, response):
        """
        This function is called when a Response matching the variable
        'website' is available for parsing the Response object.
        :param response: The Scrapy Response object to be parsed
        :return: a list of Result items and Request objects, or None when the
                 page title is 'Name Not Found'
        """
        sel = Selector(response)

        title = sel.xpath('head/title/text()').extract()[0]
        if title == 'Name Not Found':
            log.msg('NIST: Chemical not found!', level=log.ERROR)
            return
        if title not in self.ignore_list:
            # add(), not update(): update() on a string would insert every
            # single character instead of the title itself.
            self.ignore_list.add(title)
            log.msg('NIST emit synonym: %s' % title, level=log.DEBUG)
            # NOTE(review): the return value of get_synonym_requests is not
            # used here - confirm the spider schedules the requests itself.
            self._spider.get_synonym_requests(title)

        requests = []

        requests.extend(self.parse_generic_info(sel))

        # Map the raw HTML of each symbol cell to its human readable name;
        # the cells alternate symbol, name, symbol, name, ...
        symbol_table = {}
        tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
        for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
            symbol = ''.join(symbol_td.xpath('node()').extract())
            name = name_td.xpath('text()').extract()[0]
            symbol_table[symbol] = name
            log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                    level=log.DEBUG)

        requests.extend(self.parse_tables(sel, symbol_table))

        return requests

    def parse_tables(self, sel, symbol_table):
        """
        This function identifies and distributes parsing of tables to other
        functions below.
        :param sel: A Selector object of the whole page
        :param symbol_table: a dictionary containing translations of raw HTML
                             tags to human readable names
        :return: a list of Result items and Requests
        """
        requests = []

        # The order of the checks matters: the first matching branch decides
        # how (and whether) a table is parsed.
        for table in sel.xpath('//table[@class="data"]'):
            summary = table.xpath('@summary').extract()[0]
            if summary == 'One dimensional data':
                log.msg('NIST table: Aggregrate data', level=log.DEBUG)
                requests.extend(
                    self.parse_aggregate_data(table, symbol_table))
            elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1':
                log.msg('NIST table; Enthalpy/entropy of phase transition',
                        level=log.DEBUG)
                requests.extend(self.parse_transition_data(table, summary))
            elif table.xpath('tr[1]/td'):
                # Recognised but not parsed: no results are produced.
                log.msg('NIST table: Horizontal table', level=log.DEBUG)
            elif summary == 'Antoine Equation Parameters':
                log.msg('NIST table: Antoine Equation Parameters',
                        level=log.DEBUG)
                requests.extend(self.parse_antoine_data(table, summary))
            elif len(table.xpath('tr[1]/th')) == 5:
                log.msg('NIST table: generic 5 columns', level=log.DEBUG)
                # Symbol (unit) Temperature (K) Method Reference Comment
                requests.extend(self.parse_generic_data(table, summary))
            elif len(table.xpath('tr[1]/th')) == 4:
                log.msg('NIST table: generic 4 columns', level=log.DEBUG)
                # Symbol (unit) Temperature (K) Reference Comment
                requests.extend(self.parse_generic_data(table, summary))
            else:
                log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
                continue  # Assume unsupported
        return requests

    def parse_generic_info(self, sel):
        """
        This function parses: synonyms, chemical formula, molecular weight,
        InChI, InChiKey, CAS number
        :param sel: A Selector object of the entire page in the original
                    response
        :return: a list of Result items
        """
        ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')

        raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
        # extract() returns an empty list when there is no "Other names:"
        # entry; indexing it unconditionally would raise IndexError.
        if raw_synonyms:
            for synonym in raw_synonyms[0].strip().split(';\n'):
                log.msg('NIST synonym: %s' % synonym, level=log.DEBUG)
                # add(), not update(): the whole synonym must be stored, not
                # its individual characters.
                self.ignore_list.add(synonym)
                self._spider.get_synonym_requests(synonym)

        data = {}

        raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()
        # The first two text nodes are skipped; the rest is joined into the
        # formula string.
        data['Chemical formula'] = ''.join(raw_formula[2:]).strip()

        raw_mol_weight = ul.xpath('li[strong/a="Molecular weight"]/text()')
        data['Molecular weight'] = raw_mol_weight.extract()[0].strip()

        raw_inchi = ul.xpath('li[strong="IUPAC Standard InChI:"]//tt/text()')
        data['IUPAC Standard InChI'] = raw_inchi.extract()[0]

        raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
                                '/tt/text()')
        data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]

        raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
        data['CAS Registry Number'] = raw_cas_number.extract()[0].strip()

        # items() instead of iteritems(): identical result for this loop and
        # available on both Python 2 and Python 3.
        return [self.newresult(attribute=key, value=value)
                for key, value in data.items()]

    def parse_aggregate_data(self, table, symbol_table):
        """
        This function parses the table(s) which contain possible links to
        individual data points
        :param table: a Selector object of the table to be parsed
        :param symbol_table: a dictionary containing translations of raw HTML
                             tags to human readable names
        :return: a list of Result items and Request objects
        """
        results = []
        for tr in table.xpath('tr[td]'):
            extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                      '/a/@href').extract()
            if extra_data_url:
                # The row links to a page with the individual data points;
                # follow the link instead of parsing the row itself.
                request = Request(
                    url=self._build_url(extra_data_url[0]),
                    callback=self.parse_individual_datapoints)
                results.append(request)
                continue

            data = [''.join(td.xpath('node()').extract())
                    for td in tr.xpath('td')]

            name = symbol_table[data[0]]
            condition = ''

            # Names of the form "<property> at <condition>" are split so the
            # condition is reported separately.
            m = re.match(r'(.*) at (.*)', name)
            if m:
                name = m.group(1)
                condition = m.group(2)

            result = self.newresult(
                attribute=name,
                value=data[1] + ' ' + data[2],
                conditions=condition
            )
            log.msg('NIST: |%s|' % data, level=log.DEBUG)
            results.append(result)
        return results

    def parse_transition_data(self, table, summary):
        """
        This function parses the table containing properties regarding phase
        changes
        :param table: a Selector object of the table to be parsed
        :param summary: the name of the property
        :return: a list of Result items
        """
        results = []

        unit = self.get_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            result = self.newresult(
                attribute=summary,
                value=tds[0] + ' ' + unit,
                conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
            )
            results.append(result)

        return results

    def parse_generic_data(self, table, summary):
        """
        Parses the common tables of 4 and 5 columns. Assumes they are of the
        form:
        Symbol (unit)|Temperature (K)|Method|Reference|Comment
        Symbol (unit)|Temperature (K)|Reference|Comment
        :param table: a Selector object of the table to be parsed
        :param summary: the name of the property
        :return: a list of Result items
        """
        results = []

        unit = self.get_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            result = self.newresult(
                attribute=summary,
                value=tds[0] + ' ' + unit,
                conditions='%s K' % tds[1]
            )
            results.append(result)
        return results

    def parse_antoine_data(self, table, summary):
        """
        This function parses the table containing parameters for the Antoine
        equation
        :param table: a Selector object of the table to be parsed
        :param summary: the name of the property
        :return: a list of Result items
        """
        results = []

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            result = self.newresult(
                attribute=summary,
                value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
                conditions='%s K' % tds[0]
            )
            results.append(result)

        return results

    def parse_individual_datapoints(self, response):
        """
        This function parses the 'individual data points' page linked from
        the aggregate data table(s)
        :param response: the Scrapy Response object to be parsed
        :return: a list of Result items
        """
        sel = Selector(response)
        table = sel.xpath('//table[@class="data"]')[0]

        results = []

        name = table.xpath('@summary').extract()[0]
        condition = ''
        # Same "<property> at <condition>" split as for the aggregate table.
        m = re.match(r'(.*) at (.*)', name)
        if m:
            name = m.group(1)
            condition = m.group(2)

        unit = self.get_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            uncertainty = ''
            # The uncertainty, when present, is read from the last cell.
            m = re.search('Uncertainty assigned by TRC = (.*?) ', tds[-1])
            if m:
                uncertainty = '+- %s ' % m.group(1)
                # [TODO]: get the plusminus sign working in here
            result = self.newresult(
                attribute=name,
                value='%s %s%s' % (tds[0], uncertainty, unit),
                conditions=condition
            )
            results.append(result)

        return results

    @staticmethod
    def get_unit(table):
        """
        This function extracts the unit from the first header cell of the
        first row of a table: the text between the first '(' and the last
        ')' of that cell.
        :param table: a Selector object of the table to be inspected
        :return: the unit as a string, or '!' when the header cell contains
                 no parenthesised text
        """
        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
        m = re.search(r'\((.*)\)', tr_unit)
        unit = '!'
        if m:
            unit = m.group(1)

        return unit

    def newresult(self, attribute, value, conditions=''):
        """
        This function abstracts from the Result item and provides default
        values
        :param attribute: the name of the attribute
        :param value: the value of the attribute
        :param conditions: optional conditions regarding the value
        :return: A Result item
        """
        return Result(
            {
                'attribute': attribute,
                'value': value,
                'source': 'NIST',
                'reliability': self.cfg['reliability'],
                'conditions': conditions
            })

    def new_compound_request(self, compound):
        """
        This function is called when a new synonym is returned to the spider
        to generate new requests
        :param compound: the name of the compound to search for
        :return: a Request for the NIST search page, or None when the
                 compound is already in the ignore list
        """
        if compound not in self.ignore_list:
            # add(), not update(): store the name itself so the membership
            # test above recognises it next time.
            self.ignore_list.add(compound)
            return Request(url=self._build_url(self.search % compound),
                           callback=self.parse)
        return None