FourmiCrawler/sources/NIST.py at develop

A web scraper built to search for specific information about a given compound (and its pseudonyms)
Fourmi / FourmiCrawler / sources / NIST.py
at develop 334 lines 12 kB view raw
wrap content
  1import re
  2
  3from scrapy import log
  4from scrapy.http import Request
  5from scrapy.selector import Selector
  6
  7from source import Source
  8from FourmiCrawler.items import Result
  9
 10
# [TODO]: values can be '128.', perhaps remove the dot in that case?
# [TODO]: properties have references and comments which do not exist in the
# Result item, but should be included eventually.
 14
 15class NIST(Source):
 16    """
 17    NIST Scraper plugin
 18    This plugin manages searching for a chemical on the NIST website
 19    and parsing the resulting page if the chemical exists on NIST.
 20    """
 21    website = "http://webbook\\.nist\\.gov/.*"
 22
 23    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 24
 25    def __init__(self, config=None):
 26        """
 27        Initialization of NIST scraper
 28        :param config: configuration variables for this scraper, must contain 
 29        'reliability' key.
 30        """
 31        Source.__init__(self, config)
 32        self.ignore_list = set()
 33
 34    def parse(self, response):
 35        """
 36        This function is called when a Response matching the variable 
 37        'website' is available for parsing the Response object.
 38        :param response: The Scrapy Response object to be parsed
 39        :return: a list of Result items and Request objects
 40        """
 41        sel = Selector(response)
 42
 43        title = sel.xpath('head/title/text()').extract()[0]
 44        if title == 'Name Not Found':
 45            log.msg('NIST: Chemical not found!', level=log.ERROR)
 46            return
 47        if title not in self.ignore_list:
 48            self.ignore_list.update(title)
 49            log.msg('NIST emit synonym: %s' % title, level=log.DEBUG)
 50            self._spider.get_synonym_requests(title)
 51
 52        requests = []
 53
 54        requests.extend(self.parse_generic_info(sel))
 55
 56        symbol_table = {}
 57        tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
 58        for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
 59            symbol = ''.join(symbol_td.xpath('node()').extract())
 60            name = name_td.xpath('text()').extract()[0]
 61            symbol_table[symbol] = name
 62            log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
 63                    level=log.DEBUG)
 64
 65        requests.extend(self.parse_tables(sel, symbol_table))
 66
 67        return requests
 68
 69    def parse_tables(self, sel, symbol_table):
 70        """
 71        This function identifies and distributes parsing of tables to other 
 72        functions below.
 73        :param sel: A Selector object of the whole page
 74        :param symbol_table: a dictionary containing translations of raw HTML 
 75        tags to human readable names
 76        :return: a list of Result items and Requests
 77        """
 78        requests = []
 79
 80        for table in sel.xpath('//table[@class="data"]'):
 81            summary = table.xpath('@summary').extract()[0]
 82            if summary == 'One dimensional data':
 83                log.msg('NIST table: Aggregrate data', level=log.DEBUG)
 84                requests.extend(
 85                    self.parse_aggregate_data(table, symbol_table))
 86            elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1':
 87                log.msg('NIST table; Enthalpy/entropy of phase transition',
 88                        level=log.DEBUG)
 89                requests.extend(self.parse_transition_data(table, summary))
 90            elif table.xpath('tr[1]/td'):
 91                log.msg('NIST table: Horizontal table', level=log.DEBUG)
 92            elif summary == 'Antoine Equation Parameters':
 93                log.msg('NIST table: Antoine Equation Parameters',
 94                        level=log.DEBUG)
 95                requests.extend(self.parse_antoine_data(table, summary))
 96            elif len(table.xpath('tr[1]/th')) == 5:
 97                log.msg('NIST table: generic 5 columns', level=log.DEBUG)
 98                # Symbol (unit) Temperature (K) Method Reference Comment
 99                requests.extend(self.parse_generic_data(table, summary))
100            elif len(table.xpath('tr[1]/th')) == 4:
101                log.msg('NIST table: generic 4 columns', level=log.DEBUG)
102                # Symbol (unit) Temperature (K) Reference Comment
103                requests.extend(self.parse_generic_data(table, summary))
104            else:
105                log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
106                continue  # Assume unsupported
107        return requests
108
109    def parse_generic_info(self, sel):
110        """
111        This function parses: synonyms, chemical formula, molecular weight, 
112        InChI, InChiKey, CAS number
113        :param sel: A Selector object of the entire page in the original 
114        response
115        :return: a list of Result items
116        """
117        ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
118
119        raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
120        for synonym in raw_synonyms[0].strip().split(';\n'):
121            log.msg('NIST synonym: %s' % synonym, level=log.DEBUG)
122            self.ignore_list.update(synonym)
123            self._spider.get_synonym_requests(synonym)
124
125        data = {}
126
127        raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()
128        data['Chemical formula'] = ''.join(raw_formula[2:]).strip()
129
130        raw_mol_weight = ul.xpath('li[strong/a="Molecular weight"]/text()')
131        data['Molecular weight'] = raw_mol_weight.extract()[0].strip()
132
133        raw_inchi = ul.xpath('li[strong="IUPAC Standard InChI:"]//tt/text()')
134        data['IUPAC Standard InChI'] = raw_inchi.extract()[0]
135
136        raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
137                                '/tt/text()')
138        data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]
139
140        raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
141        data['CAS Registry Number'] = raw_cas_number.extract()[0].strip()
142
143        requests = []
144        for key, value in data.iteritems():
145            result = self.newresult(
146                attribute=key,
147                value=value
148            )
149            requests.append(result)
150
151        return requests
152
153    def parse_aggregate_data(self, table, symbol_table):
154        """
155        This function parses the table(s) which contain possible links to 
156        individual data points
157        :param table: a Selector object of the table to be parsed
158        :param symbol_table: a dictionary containing translations of raw HTML 
159        tags to human readable names
160        :return: a list of Result items and Request objects
161        """
162        results = []
163        for tr in table.xpath('tr[td]'):
164            extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
165                                      '/a/@href').extract()
166            if extra_data_url:
167                request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0],
168                                  callback=self.parse_individual_datapoints)
169                results.append(request)
170                continue
171            data = []
172            for td in tr.xpath('td'):
173                data.append(''.join(td.xpath('node()').extract()))
174
175            name = symbol_table[data[0]]
176            condition = ''
177
178            m = re.match(r'(.*) at (.*)', name)
179            if m:
180                name = m.group(1)
181                condition = m.group(2)
182
183            result = self.newresult(
184                attribute=name,
185                value=data[1] + ' ' + data[2],
186                conditions=condition
187            )
188            log.msg('NIST: |%s|' % data, level=log.DEBUG)
189            results.append(result)
190        return results
191
192    def parse_transition_data(self, table, summary):
193        """
194        This function parses the table containing properties regarding phase 
195        changes
196        :param table: a Selector object of the table to be parsed
197        :param summary: the name of the property
198        :return: a list of Result items
199        """
200        results = []
201
202        unit = self.get_unit(table)
203
204        for tr in table.xpath('tr[td]'):
205            tds = tr.xpath('td/text()').extract()
206            result = self.newresult(
207                attribute=summary,
208                value=tds[0] + ' ' + unit,
209                conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
210            )
211            results.append(result)
212
213        return results
214
215    def parse_generic_data(self, table, summary):
216        """
217        Parses the common tables of 4 and 5 rows. Assumes they are of the
218        form:
219        Symbol (unit)|Temperature (K)|Method|Reference|Comment
220        Symbol (unit)|Temperature (K)|Reference|Comment
221        :param table: a Selector object of the table to be parsed
222        :param summary: the name of the property
223        :return: a list of Result items
224        """
225        results = []
226
227        unit = self.get_unit(table)
228
229        for tr in table.xpath('tr[td]'):
230            tds = tr.xpath('td/text()').extract()
231            result = self.newresult(
232                attribute=summary,
233                value=tds[0] + ' ' + unit,
234                conditions='%s K' % tds[1]
235            )
236            results.append(result)
237        return results
238
239    def parse_antoine_data(self, table, summary):
240        """
241        This function parses the table containing parameters for the Antione 
242        equation
243        :param table: a Selector object of the table to be parsed
244        :param summary: the name of the property
245        :return: a list of Result items
246        """
247        results = []
248
249        for tr in table.xpath('tr[td]'):
250            tds = tr.xpath('td/text()').extract()
251            result = self.newresult(
252                attribute=summary,
253                value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
254                conditions='%s K' % tds[0]
255            )
256            results.append(result)
257
258        return results
259
260    def parse_individual_datapoints(self, response):
261        """
262        This function parses the 'individual data points' page linked from 
263        the aggregate data table(s)
264        :param response: the Scrapy Response object to be parsed
265        :return: a list of Result items
266        """
267        sel = Selector(response)
268        table = sel.xpath('//table[@class="data"]')[0]
269
270        results = []
271
272        name = table.xpath('@summary').extract()[0]
273        condition = ''
274        m = re.match(r'(.*) at (.*)', name)
275        if m:
276            name = m.group(1)
277            condition = m.group(2)
278
279        unit = self.get_unit(table)
280
281        for tr in table.xpath('tr[td]'):
282            tds = tr.xpath('td/text()').extract()
283            uncertainty = ''
284            m = re.search('Uncertainty assigned by TRC =  (.*?) ', tds[-1])
285            if m:
286                uncertainty = '+- %s ' % m.group(1)
287                # [TODO]: get the plusminus sign working in here
288            result = self.newresult(
289                attribute=name,
290                value='%s %s%s' % (tds[0], uncertainty, unit),
291                conditions=condition
292            )
293            results.append(result)
294
295        return results
296
297    @staticmethod
298    def get_unit(table):
299        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
300        m = re.search(r'\((.*)\)', tr_unit)
301        unit = '!'
302        if m:
303            unit = m.group(1)
304
305        return unit
306
307    def newresult(self, attribute, value, conditions=''):
308        """
309        This function abstracts from the Result item and provides default 
310        values
311        :param attribute: the name of the attribute
312        :param value: the value of the attribute
313        :param conditions: optional conditions regarding the value
314        :return: A Result item
315        """
316        return Result(
317            {
318                'attribute': attribute,
319                'value': value,
320                'source': 'NIST',
321                'reliability': self.cfg['reliability'],
322                'conditions': conditions
323            })
324
325    def new_compound_request(self, compound):
326        """
327        This function is called when a new synonym is returned to the spider 
328        to generate new requests
329        :param compound: the name of the compound to search for
330        """
331        if compound not in self.ignore_list:
332            self.ignore_list.update(compound)
333            return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
334                           callback=self.parse)