A web scraper built to search for specific information about a given compound (and its pseudonyms)

Merge branch 'feature/GUI' into develop

+292 -4
+2
.gitignore
··· 6 6 7 7 #may contain authentication information 8 8 sources.cfg 9 + #Another of our config files 10 + GUI.cfg 9 11 10 12 #THINGS WE WOULD NEVER EVER WANT! 11 13 #ignore thumbnails created by windows
+6 -2
.travis.yml
··· 3 3 language: python 4 4 python: 2.7 5 5 6 + before_install: 7 + - "export DISPLAY=:99.0" 8 + - "sh -e /etc/init.d/xvfb start" 9 + 6 10 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors 7 11 install: 8 12 - pip install Scrapy docopt ··· 10 14 11 15 # command to run tests, e.g. python setup.py test 12 16 script: 13 - - nosetests --with-coverage --cover-package=FourmiCrawler,utils tests 17 + - nosetests --with-coverage --cover-package=FourmiCrawler,utils,GUI tests 14 18 15 19 notifications: 16 20 slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM 17 21 18 22 after_success: 19 - coveralls --verbose 23 + coveralls --verbose
+1 -2
FourmiCrawler/sources/PubChem.py
··· 62 62 Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data)) 63 63 return requests 64 64 65 - @staticmethod 66 - def parse_data(response): 65 + def parse_data(self, response): 67 66 """ 68 67 Parse data found in 'Chemical and Physical properties' part of a substance page. 69 68 :param response: The response with the page to parse
+10
GUI.cfg.sample
··· 1 + [GUI] 2 + # Personalize options in your User Interface 3 + 4 + # Commonly used parameters are listed in the GUI for easy selection 5 + CommonParameters = Weight, Polarity, Viscosity, Solubility, Name 6 + 7 + # Parameters that are always used in the search 8 + AlwaysParameters = Name 9 + 10 + OutputTypes = csv, json, jsonlines, xml
+1
GUI/__init__.py
··· 1 + import gui
+30
GUI/configImporter.py
try:
    import ConfigParser  # Python 2 module name
except ImportError:
    import configparser as ConfigParser  # Same module under its Python 3 name


class ConfigImporter(object):
    """Reads the GUI options from the [GUI] section of a configuration file.

    Every loader returns the raw, comma-separated option string exactly as it
    is written in the file; splitting it is left to the caller. When the
    section or the option is missing, a built-in default is returned instead.
    """

    # Name of the only section this importer reads from.
    SECTION = 'GUI'

    def __init__(self, filename):
        """Read the filename into the parser.

        :param filename: Path of the configuration file. A file that cannot
            be opened is skipped by the parser, so every loader then falls
            back to its default value.
        """
        self.filename = filename
        self.parser = ConfigParser.ConfigParser()
        self.parser.read(self.filename)

    def _load(self, option, default):
        """Returns the value of option in the GUI section, or default when it is unavailable.

        Only configuration errors (missing section, missing option, broken
        interpolation) trigger the fallback; any other exception is a real
        bug and is allowed to propagate.
        """
        try:
            return self.parser.get(self.SECTION, option)
        except ConfigParser.Error:
            return default

    def load_common_attributes(self):
        """Loads common attributes from the initialized file."""
        return self._load('CommonParameters', 'One, Two, Three')

    def load_output_types(self):
        """Loads output types from the initialized file."""
        return self._load('OutputTypes', 'csv')

    def load_always_attributes(self):
        """Loads attributes that are always searched for from the initialized file."""
        return self._load('AlwaysParameters', 'Name, Weight')
+196
GUI/gui.py
··· 1 + from Tkinter import * 2 + import os 3 + import shutil 4 + from tkFileDialog import asksaveasfilename 5 + 6 + from configImporter import * 7 + 8 + 9 + class GUI(): 10 + def __init__(self, search, config_file='GUI.cfg', sourceloader=None, in_source=True): 11 + """Boots the window, configuration.""" 12 + if not in_source: 13 + current_dir = os.path.dirname(os.path.abspath(__file__)) 14 + config_file = current_dir + '../' + config_file 15 + if not os.path.isfile(config_file): 16 + try: 17 + shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../GUI.cfg.sample", config_file) 18 + except IOError: 19 + print "GUI configuration couldn't be found and couldn't be created." 20 + sys.exit() 21 + self.configurator = ConfigImporter(config_file) 22 + self.sourceloader = sourceloader 23 + self.finish_with_search = False 24 + self.values = {} 25 + self.required_variables = ['substance'] 26 + self.search = search 27 + self.window, self.variables = self.generate_window(self.load_common_attributes(), self.load_output_types()) 28 + 29 + def load_common_attributes(self): 30 + """Calls the configuration parser for common attributes.""" 31 + return [x.strip() for x in self.configurator.load_common_attributes().split(',')] 32 + 33 + def load_output_types(self): 34 + """Calls the configuration parser for output types.""" 35 + return [x.strip() for x in self.configurator.load_output_types().split(',')] 36 + 37 + def load_always_attributes(self): 38 + """Calls the configuration parser for attributes that are always used.""" 39 + return ','.join([x.strip() for x in self.configurator.load_always_attributes().split(',')]) 40 + 41 + def set_output(self): 42 + self.variable_output_name.set(asksaveasfilename()) 43 + self.button_output_name.config(text=self.variable_output_name.get()) 44 + 45 + def generate_window(self, common_attributes, output_types): 46 + """Creates all widgets and variables in the window.""" 47 + window = Tk() 48 + window.wm_title("Fourmi Crawler") 49 + 50 + 
variables = {} 51 + 52 + variable_substance = StringVar(window) 53 + frame_substance = Frame(window) 54 + label_substance = Label(frame_substance, text="Substance: ") 55 + input_substance = Entry(frame_substance, font=("Helvetica", 12), width=25, textvariable=variable_substance) 56 + variables.update({"substance": variable_substance}) 57 + frame_substance.pack(side=TOP) 58 + label_substance.pack() 59 + input_substance.pack() 60 + input_substance.focus() 61 + 62 + frame_all_attributes = Frame(window) 63 + frame_selecting_attributes = Frame(frame_all_attributes) 64 + frame_new_attributes = Frame(frame_selecting_attributes) 65 + label_new_attributes = Label(frame_new_attributes, text="Parameters: ") 66 + input_new_attributes = Text(frame_new_attributes, font=("Helvetica", 8), width=25, height=7, padx=5, pady=5) 67 + variables.update({"new_attributes": input_new_attributes}) 68 + frame_new_attributes.pack(side=LEFT) 69 + label_new_attributes.pack() 70 + input_new_attributes.pack() 71 + 72 + frame_common_attributes = Frame(frame_selecting_attributes) 73 + label_common_attributes = Label(frame_common_attributes, text="Common Parameters: ") 74 + input_common_attributes = Listbox(frame_common_attributes, selectmode=MULTIPLE, height=7) 75 + scrollbar_common_attributes = Scrollbar(frame_common_attributes) 76 + input_common_attributes.config(yscrollcommand=scrollbar_common_attributes.set) 77 + scrollbar_common_attributes.config(command=input_common_attributes.yview) 78 + if common_attributes and len(common_attributes) > 0: 79 + input_common_attributes.insert(END, *common_attributes) 80 + variables.update({"common_attributes": input_common_attributes}) 81 + frame_common_attributes.pack(side=RIGHT) 82 + label_common_attributes.pack(side=TOP) 83 + input_common_attributes.pack(side=LEFT) 84 + scrollbar_common_attributes.pack(side=RIGHT, fill=Y) 85 + frame_selecting_attributes.pack() 86 + 87 + frame_last = Frame(window) 88 + search_button = Button(frame_last, text="Start search", 
command=self.prepare_search) 89 + cancel_button = Button(frame_last, text="Cancel", command=window.destroy) 90 + frame_last.pack(side=BOTTOM) 91 + search_button.pack(side=LEFT) 92 + cancel_button.pack(side=RIGHT) 93 + 94 + frame_name = Frame(window) 95 + frame_output_name = Frame(frame_name) 96 + label_output_name = Label(frame_output_name, text='Output file:') 97 + self.variable_output_name = StringVar() 98 + self.variable_output_name.set('results.csv') 99 + variables.update({'output_name':self.variable_output_name}) 100 + self.button_output_name = Button(frame_output_name, command=self.set_output, text="Select file") 101 + frame_output_name.pack(side=LEFT) 102 + label_output_name.pack() 103 + self.button_output_name.pack() 104 + frame_name.pack(side=BOTTOM) 105 + 106 + 107 + frame_checkboxes = Frame(window) 108 + frame_checkbox_attributes = Frame(frame_checkboxes) 109 + variable_all_attributes = BooleanVar() 110 + variable_all_attributes.set(True) 111 + input_all_attributes = Checkbutton(frame_checkbox_attributes, text="Search ALL parameters", 112 + variable=variable_all_attributes) 113 + variables.update({"all_attributes": variable_all_attributes}) 114 + frame_checkbox_attributes.pack(side=LEFT) 115 + input_all_attributes.pack() 116 + 117 + frame_logging = Frame(frame_checkboxes) 118 + variable_logging = BooleanVar() 119 + variable_logging.set(False) 120 + input_logging = Checkbutton(frame_logging, text="Verbose logging", variable=variable_logging) 121 + variables.update({'logging':variable_logging}) 122 + frame_logging.pack(side=RIGHT) 123 + frame_checkboxes.pack(side=BOTTOM) 124 + input_logging.pack() 125 + frame_all_attributes.pack() 126 + 127 + return window, variables 128 + 129 + def prepare_search(self): 130 + """Saves the values from the window for later retrieval.""" 131 + variables = self.variables 132 + values = {} 133 + 134 + values.update({"Always attributes": self.load_always_attributes()}) 135 + for name, var in variables.iteritems(): 136 + if 
var.__class__ is StringVar: 137 + values.update({name: var.get()}) 138 + elif var.__class__ is BooleanVar: 139 + values.update({name: var.get()}) 140 + elif var.__class__ is Text: 141 + values.update({name: str(var.get("1.0", END)).strip()}) 142 + elif var.__class__ is Listbox: 143 + values.update({name: ", ".join([var.get(int(i)) for i in var.curselection()])}) 144 + else: 145 + print "No known class, {}, {}".format(name, var) 146 + 147 + values.update({'output_name':self.variable_output_name.get()}) 148 + values.update({'output_type':self.check_output_type(values.get('output_name'))}) 149 + 150 + self.values = values 151 + if all([values.get(i) != '' for i in self.required_variables]): 152 + self.finish_with_search = True 153 + self.window.destroy() 154 + else: 155 + self.finish_with_search = False 156 + #tkMessageBox.showinfo('Not all required information was entered!') 157 + 158 + def execute_search(self): 159 + """Calls the Fourmi crawler with the values from the GUI""" 160 + if self.values.get('all_attributes'): 161 + attributes = ".*" 162 + else: 163 + attribute_types = ['attributes', 'Common attributes', 'Always attributes'] 164 + attributes = ','.join([str(self.values.get(attribute)) for attribute in attribute_types]) 165 + output_file = "file://" + str(self.values.get('output_name')) #Dealing with absolute paths 166 + 167 + arguments = {'--attributes': attributes, 168 + '--exclude': None, 169 + '--format': self.values.get('output_type'), 170 + '--help': False, 171 + '--include': None, 172 + '--log': 'log.txt', 173 + '--output': output_file, 174 + '-v': 0 if self.values.get('logging') else 3, 175 + '--version': False, 176 + '<compound>': self.values.get('substance'), 177 + 'list': False, 178 + 'search': True} 179 + 180 + self.search(arguments, self.sourceloader) 181 + 182 + def run(self): 183 + """Starts the window and the search.""" 184 + self.window.mainloop() 185 + if self.finish_with_search: 186 + self.execute_search() 187 + 188 + def 
check_output_type(self, filename): 189 + parts = str(filename).split('.') 190 + output_types = self.load_output_types() 191 + extension = parts[-1] 192 + 193 + for type in output_types: 194 + if extension==type: 195 + return extension 196 + return output_types[0]
+6
fourmi.py
··· 3 3 Fourmi, a web scraper build to search specific information for a given compound (and its pseudonyms). 4 4 5 5 Usage: 6 + fourmi 6 7 fourmi search <compound> 7 8 fourmi [options] search <compound> 8 9 fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound> ··· 24 25 """ 25 26 26 27 from twisted.internet import reactor 28 + 27 29 from scrapy.crawler import Crawler 28 30 from scrapy import signals, log 29 31 import docopt ··· 31 33 from FourmiCrawler.spider import FourmiSpider 32 34 from utils.configurator import Configurator 33 35 from utils.sourceloader import SourceLoader 36 + from GUI import gui 34 37 35 38 36 39 def setup_crawler(compound, settings, source_loader, attributes): ··· 82 85 elif arguments["list"]: 83 86 print "-== Available Sources ==-" 84 87 print str(loader) 88 + else: 89 + gui_window = gui.GUI(search, sourceloader=SourceLoader()) 90 + gui_window.run()
+1
tests/__init__.py
··· 1 + import test_configurator, test_gui, test_pipeline, test_sourceloader, test_spider 1 2
+32
tests/test_gui.py
import os
import unittest

from GUI import gui

# Locate the sample configuration relative to this file so the tests behave the same
# regardless of the directory the test runner is started from.
SAMPLE_CONFIG = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'GUI.cfg.sample')


class TestGUI(unittest.TestCase):
    """Drives the GUI through its main loop without user interaction."""

    def run_and_close(self):
        """Schedules a search attempt followed by closing the window, then runs the main loop."""
        self.test_gui.window.after(9, self.test_gui.prepare_search)
        self.test_gui.window.after(11, self.test_gui.window.destroy)
        self.test_gui.run()

    def test_empty_attributes(self):
        """An untouched window yields an empty substance and the default output settings."""
        self.test_gui = gui.GUI(None, config_file=SAMPLE_CONFIG, in_source=True)
        self.run_and_close()

        output_type = self.test_gui.configurator.load_output_types().split(',')[0]

        self.assertEqual(self.test_gui.values.get('substance'), '')
        self.assertEqual(self.test_gui.values.get('output_type'), output_type)
        self.assertEqual(self.test_gui.values.get('output_name'), 'results.csv')

    def test_no_configurations(self):
        """Without a readable configuration the importer's built-in defaults are used."""
        self.test_gui = gui.GUI(None, config_file=SAMPLE_CONFIG)
        self.test_gui.configurator = gui.ConfigImporter('')
        self.test_gui.finish_with_search = True
        self.run_and_close()

        self.assertEqual(self.test_gui.values.get('substance'), '')
        self.assertEqual(self.test_gui.values.get('output_type'), 'csv')
        self.assertEqual(self.test_gui.values.get('output_name'), 'results.csv')
+1
utils/__init__.py
··· 1 + import configurator, sourceloader
+6
utils/configurator.py
··· 1 1 import ConfigParser 2 2 import os 3 + import shutil 3 4 4 5 from scrapy.utils.project import get_project_settings 5 6 ··· 70 71 current_dir = os.path.dirname(os.path.abspath(__file__)) 71 72 config_path = current_dir + '/../sources.cfg' 72 73 # [TODO]: location of sources.cfg should be softcoded eventually 74 + if not os.path.isfile(config_path): 75 + try: 76 + shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../sources.cfg.sample", config_path) 77 + except IOError: 78 + print "WARNING: Source configuration couldn't be found and couldn't be created." 73 79 config = ConfigParser.ConfigParser() 74 80 config.read(config_path) 75 81 return config