···22#
33# Don't forget to add your pipeline to the ITEM_PIPELINES setting
44# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
55+import re
56from scrapy.exceptions import DropItem
677888-class FourmiPipeline(object):
class DuplicatePipeline(object):
    """Drop items whose (attribute, value, conditions) triple was already scraped."""

    def __init__(self):
        # Set of (attribute, value, conditions) tuples seen so far.
        self.known_values = set()

    def process_item(self, item, spider):
        """
        Drop the item if an identical data point has already been scraped.
        :param item: The incoming item
        :param spider: The spider which scraped the item
        :return: :raise DropItem: Returns the item if unique or drops it if it's already known
        """
        value = (item['attribute'], item['value'], item['conditions'])
        if value in self.known_values:
            # [todo] append sources of first item.
            raise DropItem("Duplicate item found: %s" % item)
        self.known_values.add(value)
        return item
class AttributeSelectionPipeline(object):
    """Keep only items whose attribute matches one of the spider's selected patterns."""

    def process_item(self, item, spider):
        """
        The items are processed using the selected attribute list available in the
        spider; items that don't match any selected attribute are dropped.
        :param item: The incoming item
        :param spider: The spider which scraped the item. Should have an attribute "selected_attributes".
        :return: :raise DropItem: Returns item if it matches a selected attribute, else it is dropped.
        """
        # any() short-circuits on the first matching pattern, unlike the
        # previous list comprehension which always evaluated every pattern.
        if any(re.match(pattern, item["attribute"]) for pattern in spider.selected_attributes):
            return item
        raise DropItem("Attribute not selected by user: %s" % item)
···11+# Fourmi
Fourmi is a web scraper for chemical substances. The program is designed to be
44+used as a search engine to search multiple chemical databases for a specific
55+substance. The program will produce all available attributes of the substance
66+and conditions associated with the attributes. Fourmi also attempts to estimate
77+the reliability of each data point to assist the user in deciding which data
88+should be used.
The Fourmi project is an open source project licensed under the MIT license. Feel
free to contribute!
1212+1313+Fourmi is based on the [Scrapy framework](http://scrapy.org/), an open source
1414+web scraping framework for python. Most of the functionality of this project can
1515+be traced to this framework. Should the documentation for this application fall
1616+short, we suggest you take a close look at the [Scrapy architecture]
1717+(http://doc.scrapy.org/en/latest/topics/architecture.html) and the [Scrapy
1818+documentation](http://doc.scrapy.org/en/latest/index.html).
1919+2020+### Installing
2121+2222+If you're installing Fourmi, please take a look at our [installation guide](...)
2323+on our wiki. When you've installed the application, make sure to check our
2424+[usage guide](...).
2525+2626+### Using the Source
2727+2828+To use the Fourmi source code multiple dependencies are required. Take a look at
2929+the [wiki page](...) on using the application source code for a step by step
3030+installation guide.
3131+3232+When developing for the Fourmi project keep in mind that code readability is a
3333+must. To maintain the readability, code should be conform with the
3434+[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
3535+code. More information about the different structures and principles of the
3636+Fourmi application can be found on our [wiki](...).
3737+3838+### To Do
3939+4040+The Fourmi project has the following goals for the nearby future:
4141+4242+__Main goals:__
4343+4444+- Improve our documentation and guides. (Assignee: Dekker)
- Build a graphical user interface (GUI) as an alternative for the command line
interface (CLI). (Assignee: Harmen)
- Compile the source into a Windows executable. (Assignee: Bas)
- Create a configuration file to hold logins and API keys.
- Determine the reliability of our data points.
- Create a module to gather data from NIST. (Assignee: Rob)
- Create a module to gather data from PubChem. (Assignee: Rob)
5252+5353+__Side goals:__
5454+5555+- Clean and unify data.
5656+- Extensive reliability analysis using statistical tests.
5757+- Test data with Descartes 1.
5858+5959+### Project Origin
6060+6161+The Fourmi project was started in February of 2014 as part of a software
6262+engineering course at the Radboud University for students studying Computer
6363+Science, Information Science or Artificial Intelligence. Students participate in
6464+a real software development project as part of the
6565+[Giphouse](http://www.giphouse.nl/).
6666+6767+This particular project was started on behalf of Ivo B. Rietveld. As a chemist
6868+he was in need of an application to automatically search information on chemical
substances and create a phase diagram. The so-called "Descartes" project was
split into two teams, each creating a different application that has part of the
functionality. We are the team Descartes 2, and as we were responsible for
creating a web crawler, we've named our application Fourmi (English: ants).
7373+7474+The following people were part of the original team:
7575+7676+- [Jip J. Dekker](http://jip.dekker.li)
7777+- Rob ten Berge
7878+- Harmen Prins
7979+- Bas van Berkel
8080+- Nout van Deijck
8181+- Michail Kuznetcov
-16
README.rst
···11-We are the team Descartes 2.
22-----------------------------
33-44-Our team members are:
55-66-+ Rob ten Berge
77-88-+ Bas van Berkel
99-1010-+ Nout van Deijck
1111-1212-+ Jip J. Dekker
1313-1414-+ Michail Kuznetcov
1515-1616-+ Harmen Prins
+7-6
fourmi.py
···1212 fourmi --version
13131414Options:
  --attributes=<regex>         Include only attributes that match these regular expressions split by a comma. [default: .*]
1516 -h --help Show this screen.
1617 --version Show version.
1718 --verbose Verbose logging output.
1819 --log=<file> Save log to an file.
1920 -o <file> --output=<file> Output file [default: result.*format*]
2021 -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
2121- --include=<sourcenames> Include only sources that match the regular these expressions split by a comma.
2222- --exclude=<sourcenames> Exclude the sources that match the regular these expressions split by a comma.
2222+ --include=<regex> Include only sources that match these regular expressions split by a comma.
2323+ --exclude=<regex> Exclude the sources that match these regular expressions split by a comma.
2324"""
24252526from twisted.internet import reactor
···3233from sourceloader import SourceLoader
333434353535-def setup_crawler(searchable, settings, source_loader):
3636- spider = FourmiSpider(compound=searchable)
3636+def setup_crawler(searchable, settings, source_loader, attributes):
3737+ spider = FourmiSpider(compound=searchable, selected_attributes=attributes)
3738 spider.add_parsers(source_loader.sources)
3839 crawler = Crawler(settings)
3940 crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
def search(docopt_arguments, source_loader):
    """
    Run a full search: configure logging and Scrapy settings, start a crawler
    for the requested compound, and block until the reactor stops.
    :param docopt_arguments: parsed command-line arguments (docopt dictionary)
    :param source_loader: SourceLoader holding the available source parsers
    """
    start_log(docopt_arguments)
    settings = scrapy_settings_manipulation(docopt_arguments)
    attributes = docopt_arguments["--attributes"].split(',')
    setup_crawler(docopt_arguments["<compound>"], settings, source_loader, attributes)
    reactor.run()
798080818182if __name__ == '__main__':
8282- arguments = docopt.docopt(__doc__, version='Fourmi - V0.2.6')
8383+ arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0')
8384 loader = SourceLoader()
84858586 if arguments["--include"]: