A web scraper built to search for specific information on a given compound (and its pseudonyms)
at develop 149 lines 6.1 kB view raw
import re

from scrapy.http import Request
from scrapy import log
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


class PubChem(Source):
    """ PubChem scraper for chemical properties

    This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance,
    including sources of the values of properties.
    """

    # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
    website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'     # regex matching every NCBI host this source handles
    website_www = 'http://www.ncbi.nlm.nih.gov/*'         # glob-style base for search requests (trailing '*' stripped)
    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'  # regex base for compound pages (trailing '.*' stripped)
    search = 'pccompound?term=%s'                         # search path, %s is the compound name
    data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'      # properties page path, %s is the PubChem cid

    # NOTE(review): declared with name mangling (_PubChem__spider) but parse()
    # reads self._spider — presumably set by the parent Source class; confirm.
    __spider = None
    searched_compounds = set()  # compound/synonym names already handled, to avoid duplicate scraping

    def __init__(self, config):
        """Initialize the source with the framework configuration dict.

        :param config: configuration dictionary; must contain 'reliability'
        """
        Source.__init__(self, config)
        self.cfg = config

    def parse(self, response):
        """
        Parse a PubChem compound page: register the compound and its synonyms,
        then request the separate 'Chemical and Physical properties' page.

        :param response: The incoming search request
        :return: a list of Requests for the property page if the compound is
                 new, or None if it was already processed
        """
        requests = []
        log.msg('A response from %s just arrived!'
                % response.url, level=log.DEBUG)

        sel = Selector(response)
        compound = sel.xpath('//h1/text()').extract()[0]
        if compound in self.searched_compounds:
            return None

        # BUG FIX: set.update(<str>) adds each individual character of the
        # string; add() stores the whole compound name as a single entry,
        # which is what the membership test above relies on.
        self.searched_compounds.add(compound)
        raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
        for synonym in raw_synonyms.strip().split(', '):
            log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
            # BUG FIX: same update() -> add() correction as above.
            self.searched_compounds.add(synonym)
            self._spider.get_synonym_requests(synonym)
        log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)

        # Extract the compound id (cid) from the page URL; the properties
        # live on a separate HTML page addressed by this cid.
        n = re.search(r'cid=(\d+)', response.url)
        if n:
            cid = n.group(1)
            log.msg('cid: %s' % cid, level=log.DEBUG)

            # website_pubchem is a regex; dropping the trailing '.*' and the
            # backslash escapes turns it into a plain base URL.
            requests.append(
                Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid,
                        callback=self.parse_data))
        return requests

    def parse_data(self, response):
        """
        Parse data found in 'Chemical and Physical properties' part of a substance page.

        :param response: The response with the page to parse
        :return: a list of Result items (attribute, value, source, reliability,
                 conditions), one per property value found
        """
        log.msg('parsing data', level=log.DEBUG)
        requests = []

        sel = Selector(response)
        props = sel.xpath('//div')

        for prop in props:
            prop_name = ''.join(prop.xpath('b/text()').extract())  # name of property that it is parsing
            if prop.xpath('a'):  # parsing for single value in property
                prop_source = ''.join(prop.xpath('a/@title').extract())
                prop_value = ''.join(prop.xpath('a/text()').extract())
                new_prop = Result({
                    'attribute': prop_name,
                    'value': prop_value,
                    'source': prop_source,
                    'reliability': self.cfg['reliability'],
                    'conditions': ''
                })
                log.msg('PubChem prop: |%s| |%s| |%s|' %
                        (new_prop['attribute'], new_prop['value'],
                         new_prop['source']), level=log.DEBUG)
                requests.append(new_prop)
            elif prop.xpath('ul'):  # parsing for multiple values (list) in property
                prop_values = prop.xpath('ul//li')
                for prop_li in prop_values:
                    prop_value = ''.join(prop_li.xpath('a/text()').extract())
                    prop_source = ''.join(prop_li.xpath('a/@title').extract())
                    new_prop = Result({
                        'attribute': prop_name,
                        'value': prop_value,
                        'source': prop_source,
                        'reliability': self.cfg['reliability'],
                        'conditions': ''
                    })
                    log.msg('PubChem prop: |%s| |%s| |%s|' %
                            (new_prop['attribute'], new_prop['value'],
                             new_prop['source']), level=log.DEBUG)
                    requests.append(new_prop)

        return requests

    def parse_searchrequest(self, response):
        """
        This function parses the response to the new_compound_request Request

        :param response: the Response object to be parsed
        :return: A Request for the compound page or what self.parse returns in
                 case the search request forwarded to the compound page;
                 None when the search yielded no usable result
        """
        # check if pubchem forwarded straight to compound page
        m = re.match(self.website_pubchem, response.url)
        if m:
            log.msg('PubChem search forwarded to compound page',
                    level=log.DEBUG)
            return self.parse(response)

        sel = Selector(response)

        results = sel.xpath('//div[@class="rsltcont"]')
        if results:
            url = results[0].xpath('div/p/a[1]/@href')
        else:
            log.msg('PubChem search found nothing or xpath failed',
                    level=log.DEBUG)
            return None

        if url:
            # hrefs on the results page are protocol-relative
            url = 'http:' + ''.join(url[0].extract())
            log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
        else:
            log.msg('PubChem search found results, but no url in first result',
                    level=log.DEBUG)
            return None

        return Request(url=url, callback=self.parse)

    def new_compound_request(self, compound):
        """Build the initial search Request for a compound name.

        :param compound: the compound name to search for
        :return: a Request for the PubChem search page, parsed by
                 parse_searchrequest
        """
        # website_www ends with '*'; strip it to get the plain base URL.
        return Request(url=self.website_www[:-1] + self.search % compound,
                       callback=self.parse_searchrequest)