A web scraper built to search for specific information on a given compound (and its synonyms)
at develop 298 lines 12 kB view raw
import re

from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.

class ChemSpider(Source):
    """
    ChemSpider scraper for synonyms and properties.

    This parser will manage searching for chemicals through the
    ChemsSpider API, and parsing the resulting ChemSpider page.
    The token required for the API should be in a configuration file
    somewhere.
    """

    # Regex matched against incoming Response URLs to decide whether this
    # source's parse() handles them (see parse() docstring below).
    website = 'http://www\\.chemspider\\.com/.*'

    # URL templates relative to the site root; the API token is appended to
    # 'search' and 'extendedinfo' in __init__, and '%s' is filled in later
    # with the query string / ChemSpider ID.
    search = 'Search.asmx/SimpleSearch?query=%s&token='
    structure = 'Chemical-Structure.%s.html'
    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='

    def __init__(self, config=None):
        """
        Initialization of ChemSpider scraper.
        :param config: a dictionary of settings for this scraper, must contain
                       'reliability' key; should contain a non-empty 'token'
                       key for the search and MassSpec APIs to be usable
        """
        Source.__init__(self, config)
        # Compound names already emitted as synonyms; new_compound_request
        # skips these to avoid requesting the same compound twice.
        self.ignore_list = []
        if 'token' not in self.cfg or self.cfg['token'] == '':
            log.msg('ChemSpider token not set or empty, search/MassSpec API '
                    'not available', level=log.WARNING)
            self.cfg['token'] = ''
        # Bake the token into the API URL templates once, up front.
        self.search += self.cfg['token']
        self.extendedinfo += self.cfg['token']

    def parse(self, response):
        """
        This function is called when a Response matching the variable
        'website' is available for parsing the Response object.
        :param response: the Scrapy Response object to be parsed
        :return: a list of Result items and Request objects
        """
        sel = Selector(response)
        requests = []
        requests_synonyms = self.parse_synonyms(sel)
        requests.extend(requests_synonyms)
        requests_properties = self.parse_properties(sel)
        requests.extend(requests_properties)

        return requests

    def parse_properties(self, sel):
        """
        This function scrapes the Experimental Data and Predicted ACD/Labs
        tabs.
        :param sel: a Selector object of the whole page
        :return: a list of Result items
        """
        properties = []

        properties.extend(self.parse_acdlabstab(sel))
        properties.extend(self.parse_experimentaldatatab(sel))

        return properties

    def parse_acdlabstab(self, sel):
        """
        This function scrapes the 'Predicted ACD/Labs tab' under Properties.
        :param sel: a Selector object of the whole page
        :return: a list of Result items
        """
        properties = []

        # The table alternates name/value cells, so even indices are names
        # and odd indices are the corresponding values.
        td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
            'normalize-space(string())')
        prop_names = td_list[::2]
        prop_values = td_list[1::2]
        for (prop_name, prop_value) in zip(prop_names, prop_values):
            # [:-1] is to remove the colon at the end, [TODO] - test for colon
            prop_name = prop_name.extract().encode('utf-8')[:-1]
            prop_value = prop_value.extract().encode('utf-8')
            prop_conditions = ''

            # Test for properties without values, with one hardcoded
            # exception: '10-24cm3' is presumably a bare unit shown without a
            # numeric value for Polarizability, so it is skipped as well.
            if (not re.match(r'^\d', prop_value) or
                    (prop_name == 'Polarizability' and
                     prop_value == '10-24cm3')):
                continue

            # A parenthesized suffix on the name, e.g. 'X (cond)', is treated
            # as measurement conditions rather than part of the name.
            m = re.match(r'(.*) \((.*)\)', prop_name)
            if m:
                prop_name = m.group(1)
                prop_conditions = m.group(2)

            # Likewise 'value at conditions' splits into value + conditions.
            m = re.match(r'(.*) at (.*)', prop_value)
            if m:
                prop_value = m.group(1)
                prop_conditions = m.group(2)

            new_prop = self.newresult(
                attribute=prop_name,
                value=prop_value,
                source='ChemSpider Predicted - ACD/Labs Tab',
                conditions=prop_conditions
            )
            properties.append(new_prop)

        return properties

    def parse_experimentaldatatab(self, sel):
        """
        This function scrapes Experimental Data tab, Physico-chemical
        properties in particular.
        :param sel: a Selector object of the whole page
        :return: a list of Result items
        """
        properties = []

        scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
                                 'Properties"]//li/table/tr/td')
        if not scraped_list:
            return properties
        # Format is: property name followed by a list of values
        property_name = scraped_list.pop(0).xpath(
            'span/text()').extract()[0].rstrip()
        for line in scraped_list:
            # A <span> marks the start of a new property; other rows are
            # values of the most recently seen property name.
            if line.xpath('span/text()'):
                property_name = line.xpath('span/text()').extract()[0].rstrip()
            else:
                new_prop = self.newresult(
                    # [:-1] strips the trailing colon from the property name.
                    attribute=property_name[:-1],
                    value=line.xpath('text()').extract()[0].rstrip(),
                    source=line.xpath('strong/text()').extract()[0].rstrip(),
                )
                properties.append(new_prop)

        return properties

    def parse_synonyms(self, sel):
        """
        This function scrapes the list of Names and Identifiers.

        Note: new search requests are queued via
        self._spider.get_synonym_requests(); the returned list itself stays
        empty.
        :param sel: a Selector object of the whole page
        :return: a list of Requests (currently always empty)
        """
        requests = []
        synonyms = []

        # Exact type for this is unknown, but equivalent to Validated by Expert
        for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_cn"]]'):
            name = syn.xpath('span[@class="synonym_cn"]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Experts"
        for syn in sel.xpath('//p[@class="syn"][strong]'):
            name = syn.xpath('strong/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Users"
        for syn in sel.xpath(
                '//p[@class="syn"][span[@class="synonym_confirmed"]]'):
            name = syn.xpath(
                'span[@class="synonym_confirmed"]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'user'))
        # These synonyms are labeled as "Non-validated" and assumed unreliable
        for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'):
            name = syn.xpath('span[@class=""]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'nonvalidated'))

        # [TODO] - confirm if English User-Validated synonyms are OK too
        for syn in synonyms:
            if syn['category'] == 'expert' and syn['language'] == 'English':
                log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG)
                self._spider.get_synonym_requests(syn['name'])

        return requests

    def new_synonym(self, sel, name, category):
        """
        This function scrapes for a single synonym at a given HTML tag.
        :param sel: a Selector object of the given HTML tag
        :param name: the name of the synonym in the tag
        :param category: the name of the category the synonym is labeled as
        :return: a dictionary containing data on the synonym: 'name',
                 'category', 'language' and a list of 'references'
        """
        # Every synonym seen is added to the ignore list so it is never
        # searched for again by new_compound_request.
        self.ignore_list.append(name)
        language = sel.xpath('span[@class="synonym_language"]/text()')
        if language:
            # The [1:-1] is to remove brackets around the language name
            language = language.extract()[0][1:-1]
        else:
            # If language is not given, English is assumed, [TODO] - confirm
            language = 'English'
        log.msg('CS synonym: %s (%s) (%s)' % (name, category, language),
                level=log.DEBUG)
        references = []
        # A synonym can have multiple references, each optionally with link
        for ref in sel.xpath('span[@class="synonym_ref"]'):
            refname = ref.xpath('normalize-space(string())')
            references.append({
                # [1:-1] strips the surrounding brackets from the ref name.
                'name': refname.extract()[0][1:-1],
                'URI': ''
            })
        for ref in sel.xpath('a[@class="synonym_ref"]'):
            references.append({
                'name': ref.xpath('@title').extract()[0],
                'URI': ref.xpath('@href').extract()[0]
            })
        for ref in references:
            log.msg('CS synonym ref: %s %s' % (ref['name'], ref['URI']),
                    level=log.DEBUG)
        synonym = {
            'name': name,
            'category': category,
            'language': language,
            'references': references
        }
        return synonym

    def parse_extendedinfo(self, response):
        """
        This function scrapes data from the ChemSpider GetExtendedCompoundInfo
        API, if a token is present in the configuration settings.
        :param response: a Response object to be parsed (XML; element names
                         are the attribute names, element text the values)
        :return: a list of Result items
        """
        sel = Selector(response)
        properties = []
        names = sel.xpath('*').xpath('name()').extract()
        values = sel.xpath('*').xpath('text()').extract()
        for (name, value) in zip(names, values):
            result = self.newresult(
                attribute=name,
                value=value,  # These values have no unit!
                source='ChemSpider ExtendedCompoundInfo',
            )
            # Skip attributes whose value is empty.
            if result['value']:
                properties.append(result)
        return properties

    def newresult(self, attribute, value, conditions='', source='ChemSpider'):
        """
        This function abstracts from the Result item and provides default
        values.
        :param attribute: the name of the attribute
        :param value: the value of the attribute
        :param conditions: optional conditions regarding the value
        :param source: the name of the source if it is not ChemSpider
        :return: A Result item
        """
        return Result({
            'attribute': attribute,
            'value': value,
            'source': source,
            'reliability': self.cfg['reliability'],
            'conditions': conditions
        })

    def parse_searchrequest(self, response):
        """
        This function parses the initial response of the ChemSpider Search API
        Requires a valid token to function.
        :param response: the Response object to be parsed
        :return: A Request for the information page and a Request for the
                 extendedinfo API call, or None if no substance was found
        """
        sel = Selector(response)
        log.msg('chemspider parse_searchrequest', level=log.DEBUG)
        sel.register_namespace('cs', 'http://www.chemspider.com/')
        csids = sel.xpath('.//cs:int/text()').extract()
        if len(csids) == 0:
            log.msg('ChemSpider found nothing', level=log.ERROR)
            return
        elif len(csids) > 1:
            log.msg('ChemSpider found multiple substances, taking first '
                    'element', level=log.DEBUG)
        csid = csids[0]
        # website[:-2] drops the trailing '.*' of the URL regex; removing the
        # backslashes then yields the plain base URL to prepend.
        structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
        extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
        log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
        return [Request(url=structure_url,
                        callback=self.parse),
                Request(url=extendedinfo_url,
                        callback=self.parse_extendedinfo)]

    def new_compound_request(self, compound):
        """
        This function is called when a new synonym is returned to the spider
        to generate new requests.
        :param compound: the name of the compound to search for
        :return: a Request for the search API, or None when the compound is
                 already known or no API token is configured
        """
        if compound in self.ignore_list or self.cfg['token'] == '':
            return None
        # Same base-URL reconstruction as in parse_searchrequest.
        # NOTE(review): 'compound' is not URL-encoded here — TODO confirm
        # whether names with spaces/special characters can reach this point.
        searchurl = self.website[:-2].replace("\\", "") + self.search % compound
        log.msg('chemspider compound', level=log.DEBUG)
        return Request(url=searchurl, callback=self.parse_searchrequest)