A web scraper built to search for specific information on a given compound (and its synonyms)

Merge branch 'release/v0.3.0'

+153 -34
+21 -2
FourmiCrawler/pipelines.py
··· 2 2 # 3 3 # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 4 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 + import re 5 6 from scrapy.exceptions import DropItem 6 7 7 8 8 - class FourmiPipeline(object): 9 + class DuplicatePipeline(object): 9 10 10 11 def __init__(self): 11 12 self.known_values = set() ··· 17 18 :param spider: The spider which scraped the spider 18 19 :return: :raise DropItem: Returns the item if unique or drops them if it's already known 19 20 """ 20 - value = item['attribute'], item['value'] 21 + value = (item['attribute'], item['value'], item['conditions']) 21 22 if value in self.known_values: 22 23 raise DropItem("Duplicate item found: %s" % item) # #[todo] append sources of first item. 23 24 else: 24 25 self.known_values.add(value) 25 26 return item 27 + 28 + class AttributeSelectionPipeline(object): 29 + 30 + def __init__(self): 31 + pass; 32 + 33 + def process_item(self, item, spider): 34 + """ 35 + The items are processed using the selected attribute list available in the spider, 36 + items that don't match the selected items are dropped. 37 + :param item: The incoming item 38 + :param spider: The spider which scraped the item. Should have an attribute "selected_attributes". 39 + :return: :raise DropItem: Returns item if it matches an selected attribute, else it is dropped. 40 + """ 41 + if [x for x in spider.selected_attributes if re.match(x, item["attribute"])]: 42 + return item 43 + else: 44 + raise DropItem("Attribute not selected by used: %s" % item)
+2 -1
FourmiCrawler/settings.py
··· 11 11 SPIDER_MODULES = ['FourmiCrawler'] 12 12 NEWSPIDER_MODULE = 'FourmiCrawler' 13 13 ITEM_PIPELINES = { 14 - 'FourmiCrawler.pipelines.FourmiPipeline': 100 14 + 'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100, 15 + 'FourmiCrawler.pipelines.DuplicatePipeline': 200, 15 16 } 16 17 FEED_URI = 'results.json' 17 18 FEED_FORMAT = 'jsonlines'
+17 -5
FourmiCrawler/sources/ChemSpider.py
··· 47 47 properties = [] 48 48 49 49 # Predicted - ACD/Labs tab 50 - # [TODO] - test if tab contains data, some chemicals do not have data here 51 50 td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath( 52 51 'normalize-space(string())') 53 52 prop_names = td_list[::2] ··· 57 56 prop_name = prop_name.extract().encode('utf-8')[:-1] 58 57 prop_value = prop_value.extract().encode('utf-8') 59 58 prop_conditions = '' 59 + 60 + # Test for properties without values, with one hardcoded exception 61 + if (not re.match(r'^\d', prop_value) or 62 + (prop_name == 'Polarizability' and 63 + prop_value == '10-24cm3')): 64 + continue 60 65 61 66 # Match for condition in parentheses 62 67 m = re.match(r'(.*) \((.*)\)', prop_name) ··· 192 197 'reliability': 'Unknown', 193 198 'conditions': '' 194 199 }) 195 - properties.append(result) 200 + if result['value']: 201 + properties.append(result) 196 202 return properties 197 203 198 204 def parse_searchrequest(self, response): ··· 200 206 sel = Selector(response) 201 207 log.msg('chemspider parse_searchrequest', level=log.DEBUG) 202 208 sel.register_namespace('cs', 'http://www.chemspider.com/') 203 - csid = sel.xpath('.//cs:int/text()').extract()[0] 204 - # [TODO] - handle multiple csids in case of vague search term 209 + csids = sel.xpath('.//cs:int/text()').extract() 210 + if len(csids) == 0: 211 + log.msg('ChemSpider found nothing', level=log.ERROR) 212 + return 213 + elif len(csids) > 1: 214 + log.msg('ChemSpider found multiple substances, taking first ' 215 + 'element', level=log.DEBUG) 216 + csid = csids[0] 205 217 structure_url = self.website[:-1] + self.structure % csid 206 218 extendedinfo_url = self.website[:-1] + self.extendedinfo % csid 207 219 log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG) ··· 215 227 return None 216 228 searchurl = self.website[:-1] + self.search % compound 217 229 log.msg('chemspider compound', level=log.DEBUG) 218 - return Request(url=searchurl, 
callback=self.parse_searchrequest) 230 + return Request(url=searchurl, callback=self.parse_searchrequest)
+23 -3
FourmiCrawler/sources/WikipediaParser.py
··· 36 36 """ scrape data from infobox on wikipedia. """ 37 37 items = [] 38 38 39 - #be sure to get both chembox (wikipedia template) and drugbox (wikipedia template) to scrape 40 - tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').\ 39 + #be sure to get chembox (wikipedia template) 40 + tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \ 41 41 xpath('normalize-space(string())') 42 42 prop_names = tr_list[::2] 43 43 prop_values = tr_list[1::2] ··· 46 46 'attribute': prop_name.extract().encode('utf-8'), 47 47 'value': prop_values[i].extract().encode('utf-8'), 48 48 'source': "Wikipedia", 49 - 'reliability': "", 49 + 'reliability': "Unknown", 50 50 'conditions': "" 51 51 }) 52 52 items.append(item) 53 53 log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) 54 + 55 + #scrape the drugbox (wikipedia template) 56 + tr_list2 = sel.xpath('.//table[@class="infobox"]//tr') 57 + log.msg('dit: %s' % tr_list2, level=log.DEBUG) 58 + for tablerow in tr_list2: 59 + log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG) 60 + if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath( 61 + 'normalize-space(string())'): 62 + item = Result({ 63 + 'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'), 64 + 'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'), 65 + 'source': "Wikipedia", 66 + 'reliability': "Unknown", 67 + 'conditions': "" 68 + }) 69 + items.append(item) 70 + log.msg( 71 + 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), 72 + level=log.DEBUG) 73 + 54 74 items = filter(lambda a: a['value'] != '', items) # remove items with an empty value 55 75 item_list = self.clean_items(items) 56 76
+2 -1
FourmiCrawler/spider.py
··· 8 8 __parsers = [] 9 9 synonyms = [] 10 10 11 - def __init__(self, compound=None, *args, **kwargs): 11 + def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs): 12 12 super(FourmiSpider, self).__init__(*args, **kwargs) 13 13 self.synonyms.append(compound) 14 + self.selected_attributes = selected_attributes; 14 15 15 16 def parse(self, reponse): 16 17 for parser in self.__parsers:
+81
README.md
··· 1 + # Fourmi 2 + 3 + Fourmi is a web scraper for chemical substances. The program is designed to be 4 + used as a search engine to search multiple chemical databases for a specific 5 + substance. The program will produce all available attributes of the substance 6 + and conditions associated with the attributes. Fourmi also attempts to estimate 7 + the reliability of each data point to assist the user in deciding which data 8 + should be used. 9 + 10 + The Fourmi project is an open source project licensed under the MIT license. Feel 11 + free to contribute! 12 + 13 + Fourmi is based on the [Scrapy framework](http://scrapy.org/), an open source 14 + web scraping framework for Python. Most of the functionality of this project can 15 + be traced to this framework. Should the documentation for this application fall 16 + short, we suggest you take a close look at the [Scrapy architecture] 17 + (http://doc.scrapy.org/en/latest/topics/architecture.html) and the [Scrapy 18 + documentation](http://doc.scrapy.org/en/latest/index.html). 19 + 20 + ### Installing 21 + 22 + If you're installing Fourmi, please take a look at our [installation guide](...) 23 + on our wiki. When you've installed the application, make sure to check our 24 + [usage guide](...). 25 + 26 + ### Using the Source 27 + 28 + To use the Fourmi source code, multiple dependencies are required. Take a look at 29 + the [wiki page](...) on using the application source code for a step-by-step 30 + installation guide. 31 + 32 + When developing for the Fourmi project keep in mind that code readability is a 33 + must. To maintain the readability, code should conform to the 34 + [PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python 35 + code. More information about the different structures and principles of the 36 + Fourmi application can be found on our [wiki](...). 
37 + 38 + ### To Do 39 + 40 + The Fourmi project has the following goals for the near future: 41 + 42 + __Main goals:__ 43 + 44 + - Improve our documentation and guides. (Assignee: Dekker) 45 + - Build a graphical user interface(GUI) as an alternative for the command line 46 + interface(CLI). (Assignee: Harmen) 47 + - Compiling the source into a Windows executable. (Assignee: Bas) 48 + - Create a configuration file to hold logins and API keys. 49 + - Determine the reliability of our data points. 50 + - Create a module to gather data from NIST. (Assignee: Rob) 51 + - Create a module to gather data from PubChem. (Assignee: Rob) 52 + 53 + __Side goals:__ 54 + 55 + - Clean and unify data. 56 + - Extensive reliability analysis using statistical tests. 57 + - Test data with Descartes 1. 58 + 59 + ### Project Origin 60 + 61 + The Fourmi project was started in February of 2014 as part of a software 62 + engineering course at the Radboud University for students studying Computer 63 + Science, Information Science or Artificial Intelligence. Students participate in 64 + a real software development project as part of the 65 + [Giphouse](http://www.giphouse.nl/). 66 + 67 + This particular project was started on behalf of Ivo B. Rietveld. As a chemist 68 + he was in need of an application to automatically search information on chemical 69 + substances and create a phase diagram. The so-called "Descartes" project was 70 + split into two teams each creating a different application that has part of the 71 + functionality. We are the team Descartes 2 and as we were responsible for 72 + creating a web crawler, we've named our application Fourmi (English: Ants). 73 + 74 + The following people were part of the original team: 75 + 76 + - [Jip J. Dekker](http://jip.dekker.li) 77 + - Rob ten Berge 78 + - Harmen Prins 79 + - Bas van Berkel 80 + - Nout van Deijck 81 + - Michail Kuznetcov
-16
README.rst
··· 1 - We are the team Descartes 2. 2 - ---------------------------- 3 - 4 - Our team members are: 5 - 6 - + Rob ten Berge 7 - 8 - + Bas van Berkel 9 - 10 - + Nout van Deijck 11 - 12 - + Jip J. Dekker 13 - 14 - + Michail Kuznetcov 15 - 16 - + Harmen Prins
+7 -6
fourmi.py
··· 12 12 fourmi --version 13 13 14 14 Options: 15 + --attributes=<regex> Include only that match these regular expressions split by a comma. [default: .*] 15 16 -h --help Show this screen. 16 17 --version Show version. 17 18 --verbose Verbose logging output. 18 19 --log=<file> Save log to an file. 19 20 -o <file> --output=<file> Output file [default: result.*format*] 20 21 -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines] 21 - --include=<sourcenames> Include only sources that match the regular these expressions split by a comma. 22 - --exclude=<sourcenames> Exclude the sources that match the regular these expressions split by a comma. 22 + --include=<regex> Include only sources that match these regular expressions split by a comma. 23 + --exclude=<regex> Exclude the sources that match these regular expressions split by a comma. 23 24 """ 24 25 25 26 from twisted.internet import reactor ··· 32 33 from sourceloader import SourceLoader 33 34 34 35 35 - def setup_crawler(searchable, settings, source_loader): 36 - spider = FourmiSpider(compound=searchable) 36 + def setup_crawler(searchable, settings, source_loader, attributes): 37 + spider = FourmiSpider(compound=searchable, selected_attributes=attributes) 37 38 spider.add_parsers(source_loader.sources) 38 39 crawler = Crawler(settings) 39 40 crawler.signals.connect(reactor.stop, signal=signals.spider_closed) ··· 74 75 def search(docopt_arguments, source_loader): 75 76 start_log(docopt_arguments) 76 77 settings = scrapy_settings_manipulation(docopt_arguments) 77 - setup_crawler(docopt_arguments["<compound>"], settings, source_loader) 78 + setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(',')) 78 79 reactor.run() 79 80 80 81 81 82 if __name__ == '__main__': 82 - arguments = docopt.docopt(__doc__, version='Fourmi - V0.2.6') 83 + arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0') 83 84 loader = 
SourceLoader() 84 85 85 86 if arguments["--include"]: