A web scraper built to search for specific information about a given compound (and its pseudonyms)
1import re
2
3from scrapy import log
4from scrapy.http import Request
5from scrapy.selector import Selector
6
7from source import Source
8from FourmiCrawler.items import Result
9
10
11# [TODO]: values can be '128.', perhaps remove the dot in that case?
12# [TODO]: properties have references and comments which do not exist in the
13# Result item, but should be included eventually.
14
class NIST(Source):
    """NIST Scraper plugin

    This plugin manages searching for a chemical on the NIST website
    and parsing the resulting page if the chemical exists on NIST.
    """
    # Base URL of the NIST WebBook. The trailing '*' is stripped with
    # website[:-1] whenever a concrete request URL is built.
    website = "http://webbook.nist.gov/*"

    # Search path relative to the base URL; '%s' receives the compound name.
    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

    # Names that were already requested from, or seen on, NIST. This is a
    # class attribute, so it is shared by every instance of the plugin.
    ignore_list = set()

    def __init__(self):
        """Initialise the plugin by delegating to Source.__init__."""
        Source.__init__(self)

    def parse(self, response):
        """Parse a NIST search result page.

        Emits the page title as a synonym (once), collects the generic
        compound information and then dispatches every table with
        class "data" to the parser matching its layout.

        :param response: the Scrapy response of a NIST compound page
        :return: a list of Result items and follow-up Requests, or None
                 when NIST reports that the name was not found
        """
        sel = Selector(response)

        title = sel.xpath('head/title/text()').extract()[0]
        if title == 'Name Not Found':
            log.msg('NIST: Chemical not found!', level=log.ERROR)
            return
        if title not in self.ignore_list:
            # add(), not update(): update() on a string would insert the
            # individual characters instead of the name itself.
            self.ignore_list.add(title)
            log.msg('NIST emit synonym: %s' % title, level=log.DEBUG)
            # NOTE(review): any return value is discarded here; presumably
            # the spider schedules the synonym requests itself - confirm.
            self._spider.get_synonym_requests(title)

        requests = []

        requests.extend(self.parse_generic_info(sel))

        # Maps the (HTML) symbol used in the data tables to its full name.
        symbol_table = {}
        tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
        # The cells alternate: symbol, name, symbol, name, ...
        for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
            symbol = ''.join(symbol_td.xpath('node()').extract())
            name = name_td.xpath('text()').extract()[0]
            symbol_table[symbol] = name
            log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                    level=log.DEBUG)

        for table in sel.xpath('//table[@class="data"]'):
            summary = table.xpath('@summary').extract()[0]
            if summary == 'One dimensional data':
                log.msg('NIST table: Aggregrate data', level=log.DEBUG)
                requests.extend(
                    self.parse_aggregate_data(table, symbol_table))
            elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1':
                log.msg('NIST table; Enthalpy/entropy of phase transition',
                        level=log.DEBUG)
                requests.extend(self.parse_transition_data(table, summary))
            elif table.xpath('tr[1]/td'):
                # Recognised but not parsed: nothing is added to requests.
                log.msg('NIST table: Horizontal table', level=log.DEBUG)
            elif summary == 'Antoine Equation Parameters':
                log.msg('NIST table: Antoine Equation Parameters',
                        level=log.DEBUG)
                requests.extend(self.parse_antoine_data(table, summary))
            elif len(table.xpath('tr[1]/th')) == 5:
                log.msg('NIST table: generic 5 columns', level=log.DEBUG)
                # Symbol (unit) Temperature (K) Method Reference Comment
                requests.extend(self.parse_generic_data(table, summary))
            elif len(table.xpath('tr[1]/th')) == 4:
                log.msg('NIST table: generic 4 columns', level=log.DEBUG)
                # Symbol (unit) Temperature (K) Reference Comment
                requests.extend(self.parse_generic_data(table, summary))
            else:
                # Assume unsupported
                log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
        return requests

    def parse_generic_info(self, sel):
        """Parses: synonyms, chemical formula, molecular weight, InChI,
        InChiKey, CAS number

        Every synonym listed under "Other names:" is added to the ignore
        list and handed to the spider; the remaining properties are
        returned as Result items.

        :param sel: Selector of the NIST compound page
        :return: a list of Result items, one per generic property
        """
        ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')

        raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
        # Guard against pages without an "Other names:" entry, which would
        # otherwise raise IndexError on raw_synonyms[0].
        if raw_synonyms:
            for synonym in raw_synonyms[0].strip().split(';\n'):
                log.msg('NIST synonym: %s' % synonym, level=log.DEBUG)
                # add(), not update(): store the name, not its characters.
                self.ignore_list.add(synonym)
                self._spider.get_synonym_requests(synonym)

        data = {}

        raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()
        # The first two text nodes are skipped; the rest form the formula.
        data['Chemical formula'] = ''.join(raw_formula[2:]).strip()

        raw_mol_weight = ul.xpath('li[strong/a="Molecular weight"]/text()')
        data['Molecular weight'] = raw_mol_weight.extract()[0].strip()

        raw_inchi = ul.xpath('li[strong="IUPAC Standard InChI:"]//tt/text()')
        data['IUPAC Standard InChI'] = raw_inchi.extract()[0]

        raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
                                '/tt/text()')
        data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]

        raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
        data['CAS Registry Number'] = raw_cas_number.extract()[0].strip()

        requests = []
        # items() instead of iteritems(): works on Python 2 and Python 3.
        for key, value in data.items():
            result = Result({
                'attribute': key,
                'value': value,
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': ''
            })
            requests.append(result)

        return requests

    def parse_aggregate_data(self, table, symbol_table):
        """Parses the table(s) which contain possible links to individual
        data points

        Rows whose last cell links to "Individual data points" yield a
        Request for that page; all other rows yield a Result directly.

        :param table: Selector of the aggregate data table
        :param symbol_table: dict mapping a symbol to its full name
        :return: a list of Result items and Request objects
        """
        results = []
        for tr in table.xpath('tr[td]'):
            extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                      '/a/@href').extract()
            if extra_data_url:
                request = Request(url=self.website[:-1] + extra_data_url[0],
                                  callback=self.parse_individual_datapoints)
                results.append(request)
                continue
            data = [''.join(td.xpath('node()').extract())
                    for td in tr.xpath('td')]

            # data[0] is the symbol, data[1] the value, data[2] the unit.
            name, condition = NIST._split_condition(symbol_table[data[0]])

            result = Result({
                'attribute': name,
                'value': data[1] + ' ' + data[2],
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': condition
            })
            log.msg('NIST: |%s|' % data, level=log.DEBUG)
            results.append(result)
        return results

    @staticmethod
    def _extract_unit(table):
        """Return the unit found between parentheses in the first header
        cell of the table, or '!' when no parenthesised unit is present.
        """
        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
        m = re.search(r'\((.*)\)', tr_unit)
        if m:
            return m.group(1)
        return '!'

    @staticmethod
    def _split_condition(name):
        """Split a name of the form "<name> at <condition>" into a
        (name, condition) tuple; the condition is '' when absent.
        """
        m = re.match(r'(.*) at (.*)', name)
        if m:
            return m.group(1), m.group(2)
        return name, ''

    @staticmethod
    def parse_transition_data(table, summary):
        """Parses the table containing properties regarding phase changes

        :param table: Selector of the phase transition table
        :param summary: the table summary, used as the attribute name
        :return: a list of Result items, one per data row
        """
        results = []

        unit = NIST._extract_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            result = Result({
                'attribute': summary,
                'value': tds[0] + ' ' + unit,
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
            })
            results.append(result)

        return results

    @staticmethod
    def parse_generic_data(table, summary):
        """Parses the common tables of 4 and 5 columns. Assumes they are of
        the form:
        Symbol (unit)|Temperature (K)|Method|Reference|Comment
        Symbol (unit)|Temperature (K)|Reference|Comment

        :param table: Selector of the data table
        :param summary: the table summary, used as the attribute name
        :return: a list of Result items, one per data row
        """
        results = []

        unit = NIST._extract_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            result = Result({
                'attribute': summary,
                'value': tds[0] + ' ' + unit,
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': '%s K' % tds[1]
            })
            results.append(result)
        return results

    @staticmethod
    def parse_antoine_data(table, summary):
        """Parse table containing parameters for the Antoine equation

        :param table: Selector of the Antoine parameter table
        :param summary: the table summary, used as the attribute name
        :return: a list of Result items, one per data row
        """
        results = []

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            # tds[0] is used as the condition, tds[1:4] as A, B and C.
            result = Result({
                'attribute': summary,
                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': '%s K' % tds[0]
            })
            results.append(result)

        return results

    @staticmethod
    def parse_individual_datapoints(response):
        """Parses the page linked from aggregate data

        The attribute name (and optional condition) is taken from the
        summary of the first table with class "data" on the page.

        :param response: the Scrapy response of the data points page
        :return: a list of Result items, one per data row
        """
        sel = Selector(response)
        table = sel.xpath('//table[@class="data"]')[0]

        results = []

        name, condition = NIST._split_condition(
            table.xpath('@summary').extract()[0])

        unit = NIST._extract_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            uncertainty = ''
            # The uncertainty, when given, is searched for in the last cell.
            m = re.search('Uncertainty assigned by TRC = (.*?) ', tds[-1])
            if m:
                uncertainty = '+- %s ' % m.group(1)
                # [TODO]: get the plusminus sign working in here
            result = Result({
                'attribute': name,
                'value': '%s %s%s' % (tds[0], uncertainty, unit),
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': condition
            })
            results.append(result)

        return results

    def new_compound_request(self, compound):
        """Build the NIST search request for a compound name.

        :param compound: the compound name to search for
        :return: a Request with parse() as its callback, or None when the
                 compound is already in the ignore list
        """
        if compound in self.ignore_list:
            return None
        # add(), not update(): store the name, not its characters, so the
        # membership test above can actually recognise repeated names.
        self.ignore_list.add(compound)
        return Request(url=self.website[:-1] + self.search % compound,
                       callback=self.parse)