A web scraper built to search for specific information on a given compound (and its synonyms).
import re

from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


class ChemSpider(Source):
    """ChemSpider scraper for synonyms and properties.

    This parser will manage searching for chemicals through the
    ChemSpider API, and parsing the resulting ChemSpider page.
    The token required for the API should be in a configuration file
    somewhere.
    """

    def __init__(self):
        Source.__init__(self)

    website = 'http://www.chemspider.com/*'

    # [TODO] - Save and access token of specific user.
    search = ('Search.asmx/SimpleSearch?query=%s&token='
              '052bfd06-5ce4-43d6-bf12-89eabefd2338')
    structure = 'Chemical-Structure.%s.html'
    extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
                    '052bfd06-5ce4-43d6-bf12-89eabefd2338')

    # Compound names already requested, used to suppress duplicate searches.
    ignore_list = []

    @staticmethod
    def _extract_first(selector, default=''):
        """Return the first extracted string of `selector`, or `default`.

        Guards against the IndexError that a bare '.extract()[0]' raises
        when the selector matched nothing.
        """
        extracted = selector.extract()
        return extracted[0] if extracted else default

    def parse(self, response):
        """Parse a ChemSpider structure page.

        Returns the combined follow-up Requests/items produced by the
        synonym scraper and the property scraper.
        """
        sel = Selector(response)
        requests = []
        requests.extend(self.parse_synonyms(sel))
        requests.extend(self.parse_properties(sel))
        return requests

    @staticmethod
    def parse_properties(sel):
        """Scrape the Experimental Data and Predicted ACD/Labs tabs.

        Returns a list of Result items, one per scraped property.
        """
        properties = []

        # Predicted - ACD/Labs tab: the table holds alternating
        # name / value cells.
        td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
            'normalize-space(string())')
        prop_names = td_list[::2]
        prop_values = td_list[1::2]
        for (prop_name, prop_value) in zip(prop_names, prop_values):
            # Strip the trailing colon from the name only when present
            # (the old blind '[:-1]' ate the last character regardless).
            prop_name = prop_name.extract().encode('utf-8').rstrip(':')
            prop_value = prop_value.extract().encode('utf-8')
            prop_conditions = ''

            # Skip properties without a (numeric) value, plus one
            # hardcoded bogus Polarizability value ChemSpider emits.
            if (not re.match(r'^\d', prop_value) or
                    (prop_name == 'Polarizability' and
                     prop_value == '10-24cm3')):
                continue

            # Conditions may be given in parentheses after the name...
            m = re.match(r'(.*) \((.*)\)', prop_name)
            if m:
                prop_name = m.group(1)
                prop_conditions = m.group(2)

            # ...or inside the value, separated by an 'at'.
            m = re.match(r'(.*) at (.*)', prop_value)
            if m:
                prop_value = m.group(1)
                prop_conditions = m.group(2)

            new_prop = Result({
                'attribute': prop_name,
                'value': prop_value,
                'source': 'ChemSpider Predicted - ACD/Labs Tab',
                'reliability': 'Unknown',
                'conditions': prop_conditions
            })
            properties.append(new_prop)
            log.msg('CS prop: |%s| |%s| |%s|' %
                    (new_prop['attribute'], new_prop['value'],
                     new_prop['source']), level=log.DEBUG)

        # Experimental Data Tab, Physico-chemical properties in particular.
        scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
                                 'Properties"]//li/table/tr/td')
        if not scraped_list:
            return properties
        # Format is: a property-name cell followed by a list of value cells.
        property_name = ChemSpider._extract_first(
            scraped_list.pop(0).xpath('span/text()')).rstrip()
        for line in scraped_list:
            if line.xpath('span/text()'):
                # A new property-name cell: subsequent values belong to it.
                property_name = ChemSpider._extract_first(
                    line.xpath('span/text()')).rstrip()
            else:
                new_prop = Result({
                    'attribute': property_name.rstrip(':'),
                    'value': ChemSpider._extract_first(
                        line.xpath('text()')).rstrip(),
                    'source': ChemSpider._extract_first(
                        line.xpath('strong/text()')).rstrip(),
                    'reliability': 'Unknown',
                    'conditions': ''
                })
                properties.append(new_prop)
                log.msg('CS prop: |%s| |%s| |%s|' %
                        (new_prop['attribute'], new_prop['value'],
                         new_prop['source']), level=log.DEBUG)

        return properties

    def parse_synonyms(self, sel):
        """Scrape the list of Names and Identifiers.

        Returns the follow-up Requests generated for reliable
        (expert-validated, English) synonyms.
        """
        requests = []
        synonyms = []

        # Exact type for this is unknown, but equivalent to Validated by Expert
        for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_cn"]]'):
            name = self._extract_first(
                syn.xpath('span[@class="synonym_cn"]/text()'))
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Experts"
        for syn in sel.xpath('//p[@class="syn"][strong]'):
            name = self._extract_first(syn.xpath('strong/text()'))
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Users"
        for syn in sel.xpath(
                '//p[@class="syn"][span[@class="synonym_confirmed"]]'):
            name = self._extract_first(
                syn.xpath('span[@class="synonym_confirmed"]/text()'))
            synonyms.append(self.new_synonym(syn, name, 'user'))
        # These synonyms are labeled as "Non-validated" and assumed unreliable
        for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'):
            name = self._extract_first(syn.xpath('span[@class=""]/text()'))
            synonyms.append(self.new_synonym(syn, name, 'nonvalidated'))

        # [TODO] - confirm if English User-Validated synonyms are OK too
        for syn in synonyms:
            if syn['category'] == 'expert' and syn['language'] == 'English':
                log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG)
                # Collect the generated requests; previously the return
                # value was discarded and this method always returned [].
                new_requests = self._spider.get_synonym_requests(syn['name'])
                if new_requests:
                    requests.extend(new_requests)

        return requests

    def new_synonym(self, sel, name, category):
        """Build a synonym dict from a single synonym HTML tag.

        Also registers `name` on the ignore list so it is not searched
        for again.
        """
        self.ignore_list.append(name)
        language = sel.xpath('span[@class="synonym_language"]/text()')
        if language:
            # The [1:-1] is to remove brackets around the language name
            language = language.extract()[0][1:-1]
        else:
            # If language is not given, English is assumed, [TODO] - confirm
            language = 'English'
        log.msg('CS synonym: %s (%s) (%s)' % (name, category, language),
                level=log.DEBUG)
        references = []
        # A synonym can have multiple references, each optionally with link
        for ref in sel.xpath('span[@class="synonym_ref"]'):
            refname = ref.xpath('normalize-space(string())')
            references.append({
                # [1:-1] strips the surrounding brackets from the ref name.
                'name': self._extract_first(refname)[1:-1],
                'URI': ''
            })
        for ref in sel.xpath('a[@class="synonym_ref"]'):
            references.append({
                'name': self._extract_first(ref.xpath('@title')),
                'URI': self._extract_first(ref.xpath('@href'))
            })
        for ref in references:
            log.msg('CS synonym ref: %s %s' % (ref['name'], ref['URI']),
                    level=log.DEBUG)
        synonym = {
            'name': name,
            'category': category,
            'language': language,
            'references': references
        }
        return synonym

    @staticmethod
    def parse_extendedinfo(response):
        """Scrape data from the ChemSpider GetExtendedCompoundInfo API.

        Each child XML element becomes one Result; empty values are
        dropped.
        """
        sel = Selector(response)
        properties = []
        names = sel.xpath('*').xpath('name()').extract()
        values = sel.xpath('*').xpath('text()').extract()
        for (name, value) in zip(names, values):
            result = Result({
                'attribute': name,
                'value': value,  # These values have no unit!
                'source': 'ChemSpider ExtendedCompoundInfo',
                'reliability': 'Unknown',
                'conditions': ''
            })
            if result['value']:
                properties.append(result)
        return properties

    def parse_searchrequest(self, response):
        """Parse the initial response of the ChemSpider Search API.

        Returns Requests for the structure page and the extended-info
        API of the first matching ChemSpider ID, or None when nothing
        was found.
        """
        sel = Selector(response)
        log.msg('chemspider parse_searchrequest', level=log.DEBUG)
        sel.register_namespace('cs', 'http://www.chemspider.com/')
        csids = sel.xpath('.//cs:int/text()').extract()
        if len(csids) == 0:
            log.msg('ChemSpider found nothing', level=log.ERROR)
            return
        elif len(csids) > 1:
            log.msg('ChemSpider found multiple substances, taking first '
                    'element', level=log.DEBUG)
        csid = csids[0]
        # self.website ends in '*'; [:-1] drops the wildcard.
        structure_url = self.website[:-1] + self.structure % csid
        extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
        log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
        return [Request(url=structure_url,
                        callback=self.parse),
                Request(url=extendedinfo_url,
                        callback=self.parse_extendedinfo)]

    def new_compound_request(self, compound):
        """Return a search Request for `compound`, or None if ignored."""
        if compound in self.ignore_list:  # [TODO] - add regular expression
            return None
        searchurl = self.website[:-1] + self.search % compound
        log.msg('chemspider compound', level=log.DEBUG)
        return Request(url=searchurl, callback=self.parse_searchrequest)