A web scraper built to search for specific information on a given compound (and its pseudonyms)

Merge branch 'develop' of github.com:Recondor/Fourmi into develop

+122 -25
+17 -5
FourmiCrawler/sources/ChemSpider.py
··· 47 47 properties = [] 48 48 49 49 # Predicted - ACD/Labs tab 50 - # [TODO] - test if tab contains data, some chemicals do not have data here 51 50 td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath( 52 51 'normalize-space(string())') 53 52 prop_names = td_list[::2] ··· 57 56 prop_name = prop_name.extract().encode('utf-8')[:-1] 58 57 prop_value = prop_value.extract().encode('utf-8') 59 58 prop_conditions = '' 59 + 60 + # Test for properties without values, with one hardcoded exception 61 + if (not re.match(r'^\d', prop_value) or 62 + (prop_name == 'Polarizability' and 63 + prop_value == '10-24cm3')): 64 + continue 60 65 61 66 # Match for condition in parentheses 62 67 m = re.match(r'(.*) \((.*)\)', prop_name) ··· 192 197 'reliability': 'Unknown', 193 198 'conditions': '' 194 199 }) 195 - properties.append(result) 200 + if result['value']: 201 + properties.append(result) 196 202 return properties 197 203 198 204 def parse_searchrequest(self, response): ··· 200 206 sel = Selector(response) 201 207 log.msg('chemspider parse_searchrequest', level=log.DEBUG) 202 208 sel.register_namespace('cs', 'http://www.chemspider.com/') 203 - csid = sel.xpath('.//cs:int/text()').extract()[0] 204 - # [TODO] - handle multiple csids in case of vague search term 209 + csids = sel.xpath('.//cs:int/text()').extract() 210 + if len(csids) == 0: 211 + log.msg('ChemSpider found nothing', level=log.ERROR) 212 + return 213 + elif len(csids) > 1: 214 + log.msg('ChemSpider found multiple substances, taking first ' 215 + 'element', level=log.DEBUG) 216 + csid = csids[0] 205 217 structure_url = self.website[:-1] + self.structure % csid 206 218 extendedinfo_url = self.website[:-1] + self.extendedinfo % csid 207 219 log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG) ··· 215 227 return None 216 228 searchurl = self.website[:-1] + self.search % compound 217 229 log.msg('chemspider compound', level=log.DEBUG) 218 - return Request(url=searchurl, 
callback=self.parse_searchrequest) 230 + return Request(url=searchurl, callback=self.parse_searchrequest)
+23 -3
FourmiCrawler/sources/WikipediaParser.py
··· 36 36 """ scrape data from infobox on wikipedia. """ 37 37 items = [] 38 38 39 - #be sure to get both chembox (wikipedia template) and drugbox (wikipedia template) to scrape 40 - tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').\ 39 + #be sure to get chembox (wikipedia template) 40 + tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \ 41 41 xpath('normalize-space(string())') 42 42 prop_names = tr_list[::2] 43 43 prop_values = tr_list[1::2] ··· 46 46 'attribute': prop_name.extract().encode('utf-8'), 47 47 'value': prop_values[i].extract().encode('utf-8'), 48 48 'source': "Wikipedia", 49 - 'reliability': "", 49 + 'reliability': "Unknown", 50 50 'conditions': "" 51 51 }) 52 52 items.append(item) 53 53 log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) 54 + 55 + #scrape the drugbox (wikipedia template) 56 + tr_list2 = sel.xpath('.//table[@class="infobox"]//tr') 57 + log.msg('dit: %s' % tr_list2, level=log.DEBUG) 58 + for tablerow in tr_list2: 59 + log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG) 60 + if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath( 61 + 'normalize-space(string())'): 62 + item = Result({ 63 + 'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'), 64 + 'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'), 65 + 'source': "Wikipedia", 66 + 'reliability': "Unknown", 67 + 'conditions': "" 68 + }) 69 + items.append(item) 70 + log.msg( 71 + 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), 72 + level=log.DEBUG) 73 + 54 74 items = filter(lambda a: a['value'] != '', items) # remove items with an empty value 55 75 item_list = self.clean_items(items) 56 76
+81
README.md
··· 1 + # Fourmi 2 + 3 + Fourmi is a web scraper for chemical substances. The program is designed to be 4 + used as a search engine to search multiple chemical databases for a specific 5 + substance. The program will produce all available attributes of the substance 6 + and conditions associated with the attributes. Fourmi also attempts to estimate 7 + the reliability of each data point to assist the user in deciding which data 8 + should be used. 9 + 10 + The Fourmi project is an open source project licensed under the MIT license. Feel 11 + free to contribute! 12 + 13 + Fourmi is based on the [Scrapy framework](http://scrapy.org/), an open source 14 + web scraping framework for Python. Most of the functionality of this project can 15 + be traced to this framework. Should the documentation for this application fall 16 + short, we suggest you take a close look at the [Scrapy architecture] 17 + (http://doc.scrapy.org/en/latest/topics/architecture.html) and the [Scrapy 18 + documentation](http://doc.scrapy.org/en/latest/index.html). 19 + 20 + ### Installing 21 + 22 + If you're installing Fourmi, please take a look at our [installation guide](...) 23 + on our wiki. When you've installed the application, make sure to check our 24 + [usage guide](...). 25 + 26 + ### Using the Source 27 + 28 + To use the Fourmi source code multiple dependencies are required. Take a look at 29 + the [wiki page](...) on using the application source code for a step-by-step 30 + installation guide. 31 + 32 + When developing for the Fourmi project keep in mind that code readability is a 33 + must. To maintain the readability, code should conform to the 34 + [PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python 35 + code. More information about the different structures and principles of the 36 + Fourmi application can be found on our [wiki](...). 
37 + 38 + ### To Do 39 + 40 + The Fourmi project has the following goals for the near future: 41 + 42 + __Main goals:__ 43 + 44 + - Improve our documentation and guides. (Assignee: Dekker) 45 + - Build a graphical user interface (GUI) as an alternative to the command line 46 + interface (CLI). (Assignee: Harmen) 47 + - Compiling the source into a Windows executable. (Assignee: Bas) 48 + - Create a configuration file to hold logins and API keys. 49 + - Determine reliability of our data points. 50 + - Create a module to gather data from NIST. (Assignee: Rob) 51 + - Create a module to gather data from PubChem. (Assignee: Nout) 52 + 53 + __Side goals:__ 54 + 55 + - Clean and unify data. 56 + - Extensive reliability analysis using statistical tests. 57 + - Test data with Descartes 1. 58 + 59 + ### Project Origin 60 + 61 + The Fourmi project was started in February of 2014 as part of a software 62 + engineering course at the Radboud University for students studying Computer 63 + Science, Information Science or Artificial Intelligence. Students participate in 64 + a real software development project as part of the 65 + [Giphouse](http://www.giphouse.nl/). 66 + 67 + This particular project was started on behalf of Ivo B. Rietveld. As a chemist 68 + he was in need of an application to automatically search information on chemical 69 + substances and create a phase diagram. The so-called "Descartes" project was 70 + split into two teams, each creating a different application that has part of the 71 + functionality. We are the team Descartes 2 and as we were responsible for 72 + creating a web crawler, we've named our application Fourmi (English: Ant). 73 + 74 + The following people were part of the original team: 75 + 76 + - [Jip J. Dekker](http://jip.dekker.li) 77 + - Rob ten Berge 78 + - Harmen Prins 79 + - Bas van Berkel 80 + - Nout van Deijck 81 + - Michail Kuznetcov
-16
README.rst
··· 1 - We are the team Descartes 2. 2 - ---------------------------- 3 - 4 - Our team members are: 5 - 6 - + Rob ten Berge 7 - 8 - + Bas van Berkel 9 - 10 - + Nout van Deijck 11 - 12 - + Jip J. Dekker 13 - 14 - + Michail Kuznetcov 15 - 16 - + Harmen Prins
+1 -1
fourmi.py
··· 80 80 81 81 82 82 if __name__ == '__main__': 83 - arguments = docopt.docopt(__doc__, version='Fourmi - V0.2.6') 83 + arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0') 84 84 loader = SourceLoader() 85 85 86 86 if arguments["--include"]: