A web scraper built to search for specific information on a given compound (and its pseudonyms)
1import re
2
3from scrapy.http import Request
4from scrapy import log
5from scrapy.selector import Selector
6
7from source import Source
8from FourmiCrawler.items import Result
9
10
class WikipediaParser(Source):
    """Wikipedia scraper for chemical properties.

    Parses Wikipedia info boxes (both the class="infobox bordered" and the
    class="infobox" templates) to obtain properties and their values.  It also
    returns requests for other external sources which contain information on
    the parsed subject.
    """

    # Regex pattern of the URLs this source can handle (escaped dots, '.*' tail).
    website = "http://en\\.wikipedia\\.org/wiki/.*"
    __spider = None

    def __init__(self, config=None):
        Source.__init__(self, config)
        # Compounds already scraped.  A per-instance set gives O(1) membership
        # tests and avoids the shared-mutable-state bug of the previous
        # class-level list (every parser instance mutated the same list).
        self.searched_compounds = set()

    def parse(self, response):
        """Entry point for a downloaded Wikipedia page.

        :param response: The incoming search request
        :return: the found properties if the compound is new, or None if it
                 has already been scraped by this parser
        """
        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
        sel = Selector(response)
        # The first-heading span holds the canonical compound name of the main page.
        compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]
        if compound in self.searched_compounds:
            return None
        self.searched_compounds.add(compound)
        return self.parse_infobox(sel)

    def parse_infobox(self, sel):
        """Scrape data from the infobox(es) on a Wikipedia page.

        Data from two types of infoboxes (class="infobox bordered" and
        class="infobox") is scraped and cleaned, then supplemented with
        Requests for the external sources linked under "Identifiers".

        :param sel: The selector with the html-information of the page to parse
        :return: item_list: a list of Result items (properties with their
                 values, source, etc.) followed by Requests for external sources
        """
        items = []

        # Scrape the chembox (wikipedia template).
        items = self.parse_chembox(sel, items)

        # Scrape the drugbox (wikipedia template).
        items = self.parse_drugbox(sel, items)

        # Remove items with an empty value before cleaning.
        items = [item for item in items if item['value'] != '']
        item_list = self.clean_items(items)

        # Add extra sources to scrape from as requests.
        for identifier in self.get_identifiers(sel):
            # Discard internal wikipedia links.  (The previous implementation
            # still appended a None placeholder for this branch, which would
            # break consumers iterating the returned list.)
            if re.match(r'//en\.wikipedia', identifier):
                log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier,
                        level=log.WARNING)
                continue
            # Fix protocol-relative links starting with '//'; anchor the
            # substitution so only the leading '//' is rewritten, not every
            # occurrence inside the URL.
            if re.match(r'/{2}', identifier):
                identifier = re.sub(r'\A/{2}', 'http://', identifier)
            log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
            item_list.append(Request(identifier))

        return item_list

    def parse_chembox(self, sel, items):
        """Scrape data from the chembox infobox on wikipedia.

        The chembox lays out name/value pairs as alternating <td> cells, so
        the cells are paired positionally.

        :param sel: The selector with the html-information of the page to parse
        :param items: the list of items where the results have to be stored in
        :return: items: the list of items with the newly found items appended
        """
        cells = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
            xpath('normalize-space(string())')
        # zip() silently drops a trailing unpaired name cell, where indexing
        # value cells by position would raise IndexError on an odd cell count.
        for name_cell, value_cell in zip(cells[::2], cells[1::2]):
            item = self.newresult(
                attribute=name_cell.extract().encode('utf-8'),
                value=value_cell.extract().encode('utf-8')
            )
            items.append(item)
            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']),
                    level=log.DEBUG)
        return items

    def parse_drugbox(self, sel, items):
        """Scrape data from the drugbox infobox on wikipedia.

        Each table row with both a <th> (property name) and a <td> (value)
        yields one Result item.

        :param sel: The selector with the html-information of the page to parse
        :param items: the list of items where the results have to be stored in
        :return: items: the list of items with the newly found items appended
        """
        rows = sel.xpath('.//table[@class="infobox"]//tr')
        log.msg('dit: %s' % rows, level=log.DEBUG)
        for row in rows:
            # Evaluate each xpath once per row instead of three times.
            names = row.xpath('./th').xpath('normalize-space(string())')
            values = row.xpath('./td').xpath('normalize-space(string())')
            log.msg('item: %s' % names, level=log.DEBUG)
            if names and values:
                item = self.newresult(
                    attribute=names.extract()[0].encode('utf-8'),
                    value=values.extract()[0].encode('utf-8'),
                )
                items.append(item)
                log.msg(
                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
                    level=log.DEBUG)
        return items

    def new_compound_request(self, compound):
        """Build a Request for the Wikipedia page of the given compound.

        Strips the trailing '.*' and the regex escapes from the ``website``
        pattern to recover the plain base URL.
        """
        return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)

    @staticmethod
    def clean_items(items):
        """Clean up property values using regexes, separating values from units.

        Almost not in use; only cleans J/K/mol values and boiling/melting
        points.

        :param items: List of properties with their values, source, etc.
        :return: items: List of now cleaned up items
        """
        for item in items:
            value = item['value']
            # Keep only the numerical Kelvin value (printed after the Fahrenheit one).
            m = re.search(r'F;\s(\d+[\.,]?\d*)', value)
            if m:
                item['value'] = m.group(1) + " K"
            # Normalize J/K/mol values to '<number> J/K/mol'.
            m = re.match(r'(\d+[\.,]?\d*)\sJ\sK.+mol', value)
            if m:
                item['value'] = m.group(1) + " J/K/mol"
        return items

    @staticmethod
    def get_identifiers(sel):
        """Find external links, named 'Identifiers', to different sources.

        :param sel: The selector with the html-information of the page to parse
        :return: links: New links which can be used to expand the crawler's search
        """
        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
        return links

    def newresult(self, attribute, value):
        """Wrap an attribute/value pair in a Result item tagged with this source.

        Reliability is taken from the parser configuration; conditions are
        unknown for Wikipedia data and left empty.
        """
        return Result(
            {
                'attribute': attribute,
                'value': value,
                'source': 'Wikipedia',
                'reliability': self.cfg['reliability'],
                'conditions': ''
            })