···4747 properties = []
48484949 # Predicted - ACD/Labs tab
5050- # [TODO] - test if tab contains data, some chemicals do not have data here
5150 td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
5251 'normalize-space(string())')
5352 prop_names = td_list[::2]
···5756 prop_name = prop_name.extract().encode('utf-8')[:-1]
5857 prop_value = prop_value.extract().encode('utf-8')
5958 prop_conditions = ''
5959+6060+ # Test for properties without values, with one hardcoded exception
6161+ if (not re.match(r'^\d', prop_value) or
6262+ (prop_name == 'Polarizability' and
6363+ prop_value == '10-24cm3')):
6464+ continue
60656166 # Match for condition in parentheses
6267 m = re.match(r'(.*) \((.*)\)', prop_name)
···192197 'reliability': 'Unknown',
193198 'conditions': ''
194199 })
195195- properties.append(result)
200200+ if result['value']:
201201+ properties.append(result)
196202 return properties
197203198204 def parse_searchrequest(self, response):
···200206 sel = Selector(response)
201207 log.msg('chemspider parse_searchrequest', level=log.DEBUG)
202208 sel.register_namespace('cs', 'http://www.chemspider.com/')
203203- csid = sel.xpath('.//cs:int/text()').extract()[0]
204204- # [TODO] - handle multiple csids in case of vague search term
209209+ csids = sel.xpath('.//cs:int/text()').extract()
210210+ if len(csids) == 0:
211211+ log.msg('ChemSpider found nothing', level=log.ERROR)
212212+ return
213213+ elif len(csids) > 1:
214214+ log.msg('ChemSpider found multiple substances, taking first '
215215+ 'element', level=log.DEBUG)
216216+ csid = csids[0]
205217 structure_url = self.website[:-1] + self.structure % csid
206218 extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
207219 log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
···215227 return None
216228 searchurl = self.website[:-1] + self.search % compound
217229 log.msg('chemspider compound', level=log.DEBUG)
218218- return Request(url=searchurl, callback=self.parse_searchrequest)230230+ return Request(url=searchurl, callback=self.parse_searchrequest)
+23-3
FourmiCrawler/sources/WikipediaParser.py
···3636 """ scrape data from infobox on wikipedia. """
3737 items = []
38383939- #be sure to get both chembox (wikipedia template) and drugbox (wikipedia template) to scrape
4040- tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').\
3939+ #be sure to get chembox (wikipedia template)
4040+ tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
4141 xpath('normalize-space(string())')
4242 prop_names = tr_list[::2]
4343 prop_values = tr_list[1::2]
···4646 'attribute': prop_name.extract().encode('utf-8'),
4747 'value': prop_values[i].extract().encode('utf-8'),
4848 'source': "Wikipedia",
4949- 'reliability': "",
4949+ 'reliability': "Unknown",
5050 'conditions': ""
5151 })
5252 items.append(item)
5353 log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
5454+5555+ #scrape the drugbox (wikipedia template)
5656+ tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
5757+ log.msg('dit: %s' % tr_list2, level=log.DEBUG)
5858+ for tablerow in tr_list2:
5959+ log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
6060+ if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
6161+ 'normalize-space(string())'):
6262+ item = Result({
6363+ 'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
6464+ 'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
6565+ 'source': "Wikipedia",
6666+ 'reliability': "Unknown",
6767+ 'conditions': ""
6868+ })
6969+ items.append(item)
7070+ log.msg(
7171+ 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
7272+ level=log.DEBUG)
7373+5474 items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
5575 item_list = self.clean_items(items)
5676
+81
README.md
···11+# Fourmi
22+33+Fourmi is a web scraper for chemical substances. The program is designed to be
44+used as a search engine to search multiple chemical databases for a specific
55+substance. The program will produce all available attributes of the substance
66+and conditions associated with the attributes. Fourmi also attempts to estimate
77+the reliability of each data point to assist the user in deciding which data
88+should be used.
99+1010+The Fourmi project is an open source project licensed under the MIT license. Feel
1111+free to contribute!
1212+1313+Fourmi is based on the [Scrapy framework](http://scrapy.org/), an open source
1414+web scraping framework for python. Most of the functionality of this project can
1515+be traced to this framework. Should the documentation for this application fall
1616+short, we suggest you take a close look at the [Scrapy architecture]
1717+(http://doc.scrapy.org/en/latest/topics/architecture.html) and the [Scrapy
1818+documentation](http://doc.scrapy.org/en/latest/index.html).
1919+2020+### Installing
2121+2222+If you're installing Fourmi, please take a look at our [installation guide](...)
2323+on our wiki. When you've installed the application, make sure to check our
2424+[usage guide](...).
2525+2626+### Using the Source
2727+2828+To use the Fourmi source code, multiple dependencies are required. Take a look at
2929+the [wiki page](...) on using the application source code for a step by step
3030+installation guide.
3131+3232+When developing for the Fourmi project keep in mind that code readability is a
3333+must. To maintain readability, code should conform to the
3434+[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
3535+code. More information about the different structures and principles of the
3636+Fourmi application can be found on our [wiki](...).
3737+3838+### To Do
3939+4040+The Fourmi project has the following goals for the near future:
4141+4242+__Main goals:__
4343+4444+- Improve our documentation and guides. (Assignee: Dekker)
4545+- Build a graphical user interface (GUI) as an alternative to the command line
4646+interface (CLI). (Assignee: Harmen)
4747+- Compile the source into a Windows executable. (Assignee: Bas)
4848+- Create a configuration file to hold logins and API keys.
4949+- Determine the reliability of our data points.
5050+- Create a module to gather data from NIST. (Assignee: Rob)
5151+- Create a module to gather data from PubChem. (Assignee: Nout)
5252+5353+__Side goals:__
5454+5555+- Clean and unify data.
5656+- Extensive reliability analysis using statistical tests.
5757+- Test data with Descartes 1.
5858+5959+### Project Origin
6060+6161+The Fourmi project was started in February of 2014 as part of a software
6262+engineering course at the Radboud University for students studying Computer
6363+Science, Information Science or Artificial Intelligence. Students participate in
6464+a real software development project as part of the
6565+[Giphouse](http://www.giphouse.nl/).
6666+6767+This particular project was started on behalf of Ivo B. Rietveld. As a chemist
6868+he was in need of an application to automatically search information on chemical
substances and create a phase diagram. The so-called "Descartes" project was
7070+split into two teams each creating a different application that has part of the
7171+functionality. We are the team Descartes 2 and as we were responsible for
7272+creating a web crawler, we've named our application Fourmi (English: Ants).
7373+7474+The following people were part of the original team:
7575+7676+- [Jip J. Dekker](http://jip.dekker.li)
7777+- Rob ten Berge
7878+- Harmen Prins
7979+- Bas van Berkel
8080+- Nout van Deijck
8181+- Michail Kuznetcov
-16
README.rst
···11-We are the team Descartes 2.
22-----------------------------
33-44-Our team members are:
55-66-+ Rob ten Berge
77-88-+ Bas van Berkel
99-1010-+ Nout van Deijck
1111-1212-+ Jip J. Dekker
1313-1414-+ Michail Kuznetcov
1515-1616-+ Harmen Prins