A web scraper built to search for specific information on a given compound (and its pseudonyms)
1import re
2
3from scrapy import log
4from scrapy.http import Request
5from scrapy.selector import Selector
6
7from source import Source
8from FourmiCrawler.items import Result
9
10
11# [TODO]: values can be '128.', perhaps remove the dot in that case?
12# [TODO]: properties have references and comments which do not exist in the
13# Result item, but should be included eventually.
14
class NIST(Source):
    """
    NIST Scraper plugin

    This plugin manages searching for a chemical on the NIST website
    and parsing the resulting page if the chemical exists on NIST.
    """
    # Regex matched by the spider against response URLs to route them here.
    website = "http://webbook\\.nist\\.gov/.*"

    # Search path template; %s is substituted with the compound name.
    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

    def __init__(self, config=None):
        """
        Initialization of NIST scraper
        :param config: configuration variables for this scraper, must contain
        'reliability' key.
        """
        Source.__init__(self, config)
        # Names (titles, synonyms, compounds) already handled; used to avoid
        # emitting duplicate synonym requests.
        self.ignore_list = set()

    def _base_url(self):
        """
        Return the plain site URL derived from the 'website' regex.
        :return: e.g. 'http://webbook.nist.gov/'
        """
        # 'website' is a regex: strip the trailing '.*' and unescape the
        # backslash-escaped dots to obtain a usable URL prefix.
        return self.website[:-2].replace("\\", "")

    def parse(self, response):
        """
        This function is called when a Response matching the variable
        'website' is available for parsing the Response object.
        :param response: The Scrapy Response object to be parsed
        :return: a list of Result items and Request objects
        """
        sel = Selector(response)

        title = sel.xpath('head/title/text()').extract()[0]
        if title == 'Name Not Found':
            log.msg('NIST: Chemical not found!', level=log.ERROR)
            return
        if title not in self.ignore_list:
            # set.add, not set.update: update(title) would insert every
            # individual character of the title as a separate element,
            # defeating the 'not in ignore_list' check above.
            self.ignore_list.add(title)
            log.msg('NIST emit synonym: %s' % title, level=log.DEBUG)
            self._spider.get_synonym_requests(title)

        requests = []

        requests.extend(self.parse_generic_info(sel))

        # Build a translation table from raw symbol markup to readable names;
        # the symbol table lists (symbol, name) cell pairs side by side.
        symbol_table = {}
        tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
        for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
            symbol = ''.join(symbol_td.xpath('node()').extract())
            name = name_td.xpath('text()').extract()[0]
            symbol_table[symbol] = name
            log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                    level=log.DEBUG)

        requests.extend(self.parse_tables(sel, symbol_table))

        return requests

    def parse_tables(self, sel, symbol_table):
        """
        This function identifies and distributes parsing of tables to other
        functions below.
        :param sel: A Selector object of the whole page
        :param symbol_table: a dictionary containing translations of raw HTML
        tags to human readable names
        :return: a list of Result items and Requests
        """
        requests = []

        for table in sel.xpath('//table[@class="data"]'):
            summary = table.xpath('@summary').extract()[0]
            if summary == 'One dimensional data':
                log.msg('NIST table: Aggregate data', level=log.DEBUG)
                requests.extend(
                    self.parse_aggregate_data(table, symbol_table))
            elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1':
                log.msg('NIST table: Enthalpy/entropy of phase transition',
                        level=log.DEBUG)
                requests.extend(self.parse_transition_data(table, summary))
            elif table.xpath('tr[1]/td'):
                # NOTE(review): this branch precedes the Antoine and generic
                # checks, so any table whose first row holds <td> cells is
                # dismissed as a horizontal table — confirm this ordering is
                # intentional.
                log.msg('NIST table: Horizontal table', level=log.DEBUG)
            elif summary == 'Antoine Equation Parameters':
                log.msg('NIST table: Antoine Equation Parameters',
                        level=log.DEBUG)
                requests.extend(self.parse_antoine_data(table, summary))
            elif len(table.xpath('tr[1]/th')) == 5:
                log.msg('NIST table: generic 5 columns', level=log.DEBUG)
                # Symbol (unit) Temperature (K) Method Reference Comment
                requests.extend(self.parse_generic_data(table, summary))
            elif len(table.xpath('tr[1]/th')) == 4:
                log.msg('NIST table: generic 4 columns', level=log.DEBUG)
                # Symbol (unit) Temperature (K) Reference Comment
                requests.extend(self.parse_generic_data(table, summary))
            else:
                log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
                continue  # Assume unsupported
        return requests

    def parse_generic_info(self, sel):
        """
        This function parses: synonyms, chemical formula, molecular weight,
        InChI, InChiKey, CAS number
        :param sel: A Selector object of the entire page in the original
        response
        :return: a list of Result items
        """
        ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')

        raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
        for synonym in raw_synonyms[0].strip().split(';\n'):
            log.msg('NIST synonym: %s' % synonym, level=log.DEBUG)
            # add(), not update(): update() would insert each character of
            # the synonym string separately.
            self.ignore_list.add(synonym)
            self._spider.get_synonym_requests(synonym)

        data = {}

        raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()
        data['Chemical formula'] = ''.join(raw_formula[2:]).strip()

        raw_mol_weight = ul.xpath('li[strong/a="Molecular weight"]/text()')
        data['Molecular weight'] = raw_mol_weight.extract()[0].strip()

        raw_inchi = ul.xpath('li[strong="IUPAC Standard InChI:"]//tt/text()')
        data['IUPAC Standard InChI'] = raw_inchi.extract()[0]

        raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
                                '/tt/text()')
        data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]

        raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
        data['CAS Registry Number'] = raw_cas_number.extract()[0].strip()

        requests = []
        # items() instead of Python-2-only iteritems(); equivalent here and
        # forward-compatible.
        for key, value in data.items():
            result = self.newresult(
                attribute=key,
                value=value
            )
            requests.append(result)

        return requests

    def parse_aggregate_data(self, table, symbol_table):
        """
        This function parses the table(s) which contain possible links to
        individual data points
        :param table: a Selector object of the table to be parsed
        :param symbol_table: a dictionary containing translations of raw HTML
        tags to human readable names
        :return: a list of Result items and Request objects
        """
        results = []
        for tr in table.xpath('tr[td]'):
            extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                      '/a/@href').extract()
            if extra_data_url:
                # Rows that link to individual data points are fetched
                # separately instead of being parsed in place.
                request = Request(url=self._base_url() + extra_data_url[0],
                                  callback=self.parse_individual_datapoints)
                results.append(request)
                continue
            data = []
            for td in tr.xpath('td'):
                data.append(''.join(td.xpath('node()').extract()))

            name = symbol_table[data[0]]
            condition = ''

            # Names like 'X at Y' are split into attribute and condition.
            m = re.match(r'(.*) at (.*)', name)
            if m:
                name = m.group(1)
                condition = m.group(2)

            result = self.newresult(
                attribute=name,
                value=data[1] + ' ' + data[2],
                conditions=condition
            )
            log.msg('NIST: |%s|' % data, level=log.DEBUG)
            results.append(result)
        return results

    def parse_transition_data(self, table, summary):
        """
        This function parses the table containing properties regarding phase
        changes
        :param table: a Selector object of the table to be parsed
        :param summary: the name of the property
        :return: a list of Result items
        """
        results = []

        unit = self.get_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            # Columns: value, temperature (K), initial phase, final phase.
            result = self.newresult(
                attribute=summary,
                value=tds[0] + ' ' + unit,
                conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
            )
            results.append(result)

        return results

    def parse_generic_data(self, table, summary):
        """
        Parses the common tables of 4 and 5 rows. Assumes they are of the
        form:
        Symbol (unit)|Temperature (K)|Method|Reference|Comment
        Symbol (unit)|Temperature (K)|Reference|Comment
        :param table: a Selector object of the table to be parsed
        :param summary: the name of the property
        :return: a list of Result items
        """
        results = []

        unit = self.get_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            result = self.newresult(
                attribute=summary,
                value=tds[0] + ' ' + unit,
                conditions='%s K' % tds[1]
            )
            results.append(result)
        return results

    def parse_antoine_data(self, table, summary):
        """
        This function parses the table containing parameters for the Antoine
        equation
        :param table: a Selector object of the table to be parsed
        :param summary: the name of the property
        :return: a list of Result items
        """
        results = []

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            # Columns: temperature range (K), A, B, C.
            result = self.newresult(
                attribute=summary,
                value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
                conditions='%s K' % tds[0]
            )
            results.append(result)

        return results

    def parse_individual_datapoints(self, response):
        """
        This function parses the 'individual data points' page linked from
        the aggregate data table(s)
        :param response: the Scrapy Response object to be parsed
        :return: a list of Result items
        """
        sel = Selector(response)
        table = sel.xpath('//table[@class="data"]')[0]

        results = []

        name = table.xpath('@summary').extract()[0]
        condition = ''
        # Names like 'X at Y' are split into attribute and condition.
        m = re.match(r'(.*) at (.*)', name)
        if m:
            name = m.group(1)
            condition = m.group(2)

        unit = self.get_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            uncertainty = ''
            m = re.search('Uncertainty assigned by TRC = (.*?) ', tds[-1])
            if m:
                uncertainty = '+- %s ' % m.group(1)
            # [TODO]: get the plusminus sign working in here
            result = self.newresult(
                attribute=name,
                value='%s %s%s' % (tds[0], uncertainty, unit),
                conditions=condition
            )
            results.append(result)

        return results

    @staticmethod
    def get_unit(table):
        """
        Extract the unit from the parenthesized part of a table's first
        header cell, e.g. 'T (K)' -> 'K'.
        :param table: a Selector object of the table to inspect
        :return: the unit string, or '!' when no unit could be found
        """
        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
        m = re.search(r'\((.*)\)', tr_unit)
        unit = '!'
        if m:
            unit = m.group(1)

        return unit

    def newresult(self, attribute, value, conditions=''):
        """
        This function abstracts from the Result item and provides default
        values
        :param attribute: the name of the attribute
        :param value: the value of the attribute
        :param conditions: optional conditions regarding the value
        :return: A Result item
        """
        return Result(
            {
                'attribute': attribute,
                'value': value,
                'source': 'NIST',
                'reliability': self.cfg['reliability'],
                'conditions': conditions
            })

    def new_compound_request(self, compound):
        """
        This function is called when a new synonym is returned to the spider
        to generate new requests
        :param compound: the name of the compound to search for
        """
        if compound not in self.ignore_list:
            # add(), not update(): update() would insert each character of
            # the compound name separately.
            self.ignore_list.add(compound)
            return Request(url=self._base_url() + self.search % compound,
                           callback=self.parse)