Merge branch 'master' of github.com:Dekker1/ResearchMethods · dekker.one/monash-researchmethods@6ab555e

+42 -1

mini_proj/report/references.bib

··· 1 + @misc{openData, 2 + title={Open Database License (ODbL) v1.0}, 3 + url={https://opendatacommons.org/licenses/odbl/1.0/}, 4 + journal={Open Data Commons}, 5 + year={2018}, 6 + month={Feb} 7 + } 8 + @techreport{knn, 9 + title={Discriminatory analysis-nonparametric discrimination: consistency properties}, 10 + author={Fix, Evelyn and Hodges Jr, Joseph L}, 11 + year={1951}, 12 + institution={California Univ Berkeley} 13 + } 14 + @article{svm, 15 + title={Support-vector networks}, 16 + author={Cortes, Corinna and Vapnik, Vladimir}, 17 + journal={Machine learning}, 18 + volume={20}, 19 + number={3}, 20 + pages={273--297}, 21 + year={1995}, 22 + publisher={Springer} 23 + } 24 + @article{naivebayes, 25 + title={Idiot's Bayes—not so stupid after all?}, 26 + author={Hand, David J and Yu, Keming}, 27 + journal={International statistical review}, 28 + volume={69}, 29 + number={3}, 30 + pages={385--398}, 31 + year={2001}, 32 + publisher={Wiley Online Library} 33 + } 34 + @article{randomforest, 35 + title={Classification and regression by randomForest}, 36 + author={Liaw, Andy and Wiener, Matthew and others}, 37 + journal={R news}, 38 + volume={2}, 39 + number={3}, 40 + pages={18--22}, 41 + year={2002} 42 + } 1 43 @article{Kotsiantis2007, 2 44 abstract = {Supervised machine learning is the search for algorithms that reason from externally supplied instances to produce general hypotheses, which then make predictions about future instances. In other words, the goal of supervised learning is to build a concise model of the distribution of class labels in terms of predictor features. The resulting classifier is then used to assign class labels to the testing instances where the values of the predictor features are known, but the value of the class label is unknown. This paper describes various supervised machine learning classification techniques. 
Of course, a single article cannot be a complete review of all supervised machine learning classification algorithms (also known induction classification algorithms), yet we hope that the references cited will cover the major theoretical issues, guiding the researcher in interesting research directions and suggesting possible bias combinations that have yet to be explored.}, 3 45 author = {Kotsiantis, Sotiris B.}, ··· 14 56 volume = {31}, 15 57 year = {2007} 16 58 } 17 -

mini_proj/report/waldo.png

This is a binary file and will not be displayed.

+194 -53

mini_proj/report/waldo.tex

··· 6 6 \usepackage[justification=centering]{caption} % Used for captions 7 7 \captionsetup[figure]{font=small} % Makes captions small 8 8 \newcommand\tab[1][0.5cm]{\hspace*{#1}} % Defines a new command to use 'tab' in text 9 - % Math package 10 - \usepackage{amsmath} 9 + \usepackage[comma, numbers]{natbib} % Used for the bibliography 10 + \usepackage{amsmath} % Math package 11 11 % Enable that parameters of \cref{}, \ref{}, \cite{}, ... are linked so that a reader can click on the number an jump to the target in the document 12 12 \usepackage{hyperref} 13 13 %enable \cref{...} and \Cref{...} instead of \ref: Type of reference included in the link ··· 19 19 \usepackage{bookmark} 20 20 \usepackage{natbib} 21 21 22 + \usepackage{xcolor} 23 + \newcommand{\todo}[1]{\marginpar{{\textsf{TODO}}}{\textbf{\color{red}[#1]}}} 24 + 22 25 \begin{document} 23 - \title{Waldo discovery using Neural Networks} 26 + \title{What is Waldo?} 24 27 \author{Kelvin Davis \and Jip J. Dekker\and Anthony Silvestere} 25 28 \maketitle 26 29 ··· 30 33 31 34 \section{Introduction} 32 35 33 - \section{Background} 36 + Almost every child around the world knows about ``Where's Waldo?'', also 37 + known as ``Where's Wally?'' in some countries. This famous puzzle book has 38 + spread its way across the world and is published in more than 25 different 39 + languages. The idea behind the books is to find the character ``Waldo'', 40 + shown in \Cref{fig:waldo}, in the different pictures in the book. This is, 41 + however, not as easy as it sounds. Every picture in the book is full of tiny 42 + details and Waldo is only one out of many. The puzzle is made even harder by 43 + the fact that Waldo is not always fully depicted, sometimes it is just his 44 + head or his torso popping out from behind something else. 
Lastly, the reason 45 + that even adults will have trouble spotting Waldo is the fact that the 46 + pictures are full of ``Red Herrings'': things that look like (or are colored 47 + as) Waldo, but are not actually Waldo. 34 48 35 - This paper is mad \cite{Kotsiantis2007}. 49 + \begin{figure}[ht] 50 + \includegraphics[scale=0.35]{waldo} 51 + \centering 52 + \caption{ 53 + A headshot of the character ``Waldo'', or ``Wally''. Pictures of Waldo 54 + are copyrighted by Martin Handford and are used under the fair-use policy. 55 + } 56 + \label{fig:waldo} 57 + \end{figure} 36 58 37 - \section{Methods} 59 + The task of finding Waldo is something that relates to a lot of real life 60 + image recognition tasks. Fields like mining, astronomy, surveillance, 61 + radiology, and microbiology often have to analyse images (or scans) to find 62 + the tiniest details, sometimes undetectable by the human eye. These tasks 63 + are especially hard when the thing(s) you are looking for are similar to the 64 + rest of the images. These tasks are thus generally performed using computers 65 + to identify possible matches. 66 + 67 + ``Where's Waldo?'' offers us a great tool to study this kind of problem in a 68 + setting that is humanly tangible. In this report we will try to identify 69 + Waldo in the puzzle images using different classification methods. Every 70 + image will be split into different segments and every segment will have to 71 + be classified as either being ``Waldo'' or ``not Waldo''. We will compare 72 + various different classification methods from more classical machine 73 + learning, like naive Bayes classifiers, to the current state of the art, 74 + Neural Networks. 
In \Cref{sec:background} we will introduce the different 75 + classification methods, \Cref{sec:method} will explain the way in which 76 + these methods are trained and how they will be evaluated, in 77 + \Cref{sec:results} we will discuss the results, and \Cref{sec:conclusion} will 78 + offer our final conclusions. 79 + 80 + \section{Background} \label{sec:background} 81 + 82 + The classification methods used can be separated into two separate groups: 83 + classical machine learning methods and neural network architectures. Many of 84 + the classical machine learning algorithms have variations and improvements 85 + for various purposes; however, for this report we will be using only 86 + their basic versions. In contrast, we will use different neural network 87 + architectures, as this method is currently the most used for image 88 + classification. 89 + 90 + \textbf{ 91 + \\A couple of papers that may be useful (if needed): 92 + - LeNet: http://yann.lecun.com/exdb/publis/pdf/lecun-01a.pdf 93 + - AlexNet: http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks 94 + - General comparison of LeNet and AlexNet: 95 + "On the Performance of GoogLeNet and AlexNet Applied to Sketches", Pedro Ballester and Ricardo Matsumura Araujo 96 + - Deep NN Architecture: 97 + https://www-sciencedirect-com.ezproxy.lib.monash.edu.au/science/article/pii/S0925231216315533 98 + } 99 + 100 + \subsection{Classical Machine Learning Methods} 101 + 102 + The following paragraphs will give only brief descriptions of the different 103 + classical machine learning methods used in this report. For further reading 104 + we recommend reading ``Supervised machine learning: A review of 105 + classification techniques'' \cite{Kotsiantis2007}. 106 + 107 + \paragraph{Naive Bayes Classifier} 108 + 109 + \cite{naivebayes} 110 + 111 + \paragraph{$k$-Nearest Neighbors} 112 + 113 + ($k$-NN) \cite{knn} is one of the simplest machine learning algorithms. 
It 114 + classifies a new instance based on its ``distance'' to the known instances. 115 + It will find the $k$ closest instances to the new instance and assign the 116 + new instance the class that the majority of the $k$ closest instances has. 117 + The method has to be configured in several ways: the value of $k$, the 118 + distance measure, and (depending on $k$) a tie breaking measure all have to 119 + be chosen. 120 + 121 + \paragraph{Support Vector Machine} 122 + 123 + \cite{svm} 124 + 125 + \paragraph{Random Forest} 126 + 127 + \cite{randomforest} 128 + 129 + \subsection{Neural Network Architectures} 130 + \todo{Did we only do the three in the end? (Alexnet?)} 131 + Yeah, we implemented the LeNet architecture, then improved on it for a fairly standard convolutional neural network (CNN) that was deeper, extracted more features, and condensed that image information more. Then we implemented a more fully convolutional network (FCN) which contained only one dense layer for the final binary classification step. The FCN added an extra convolutional layer, meaning that before classifying each image, the network abstracted the data more than the other two. 132 + \begin{itemize} 133 + \item LeNet 134 + \item CNN 135 + \item FCN 136 + \end{itemize} 137 + 138 + \paragraph{Convolutional Neural Networks} 139 + 140 + \paragraph{LeNet} 141 + 142 + \paragraph{Fully Convolutional Neural Networks} 143 + 144 + 145 + \section{Method} \label{sec:method} 146 + \tab 147 + In order to effectively utilize the aforementioned modelling and classification techniques, a key consideration is the data they are acting on. 148 + A dataset containing Waldo and non-Waldo images was obtained from an Open Database\footnote{``The Open Database License (ODbL) is a license agreement intended to allow users to freely share, modify, and use [a] Database while maintaining [the] same freedom for others''\cite{openData}} hosted on the predictive modelling and analytics competition framework, Kaggle. 
149 + The distinction between images containing Waldo, and those that do not, was provided by the separation of the images in different sub-directories. 150 + It was therefore necessary to preprocess these images before they could be utilised by the proposed machine learning algorithms. 151 + 152 + \subsection{Image Processing} 153 + \tab 154 + The Waldo image database consists of images of size 64$\times$64, 128$\times$128, and 256$\times$256 pixels obtained by dividing complete Where's Waldo? puzzles. 155 + Within each set of images, those containing Waldo are located in a folder called `waldo', and those not containing Waldo, in a folder called `not\_waldo'. 156 + Since Where's Waldo? puzzles are usually densely populated and contain fine details, the 64$\times$64 pixel set of images was selected to train and evaluate the machine learning models. 157 + These images provide the added benefit of containing the most individual images of the three size groups. 158 + \\ 159 + \par 160 + Each of the 64$\times$64 pixel images was inserted into a Numpy 161 + \footnote{Numpy is a popular Python programming library for scientific computing} 162 + array of images, and a binary value was inserted into a separate list at the same index. 163 + These binary values form the labels for each image (waldo or not waldo). 164 + Colour normalisation was performed on each so that artefacts in an image's colour profile correspond to meaningful features of the image (rather than photographic method). 165 + \\ 166 + \par 167 + Each original puzzle is broken down into many images, and only contains one Waldo. Although Waldo might span multiple 64$\times$64 pixel squares, this means that the non-Waldo data far outnumbers the Waldo data. 168 + To combat the bias introduced by the skewed data, all Waldo images were artificially augmented by performing random rotations, reflections, and introducing random noise in the image to produce new images. 
169 + In this way, each original Waldo image was used to produce an additional 10 variations of the image, inserted into the image array. 170 + This provided more variation in the true positives of the data set and assisted in the development of more robust methods by exposing each technique to variations of the image during the training phase. 171 + \\ 172 + \par 173 + Despite the additional data, there were still over ten times as many non-Waldo images as Waldo images. 174 + Therefore, it was necessary to cull the non-Waldo data, so that there was an even split of Waldo and non-Waldo images, improving the representation of true positives in the image data set. 175 + \\ 38 176 39 - % Kelvin Start 40 - \subsection{Benchmarking}\label{benchmarking} 177 + % Kelvin Start 178 + \subsection{Benchmarking}\label{benchmarking} 41 179 42 - In order to benchmark the Neural Networks, the performance of these 43 - algorithms are evaluated against other Machine Learning algorithms. We 44 - use Support Vector Machines, K-Nearest Neighbours ($K=5$), Gaussian 45 - Naive Bayes and Random Forest classifiers, as provided in Scikit-Learn. 180 + In order to benchmark the Neural Networks, the performance of these 181 + algorithms is evaluated against other Machine Learning algorithms. We 182 + use Support Vector Machines, K-Nearest Neighbours ($K=5$), Gaussian 183 + Naive Bayes and Random Forest classifiers, as provided in Scikit-Learn. 46 184 47 - \subsection{Performance Metrics}\label{performance-metrics} 185 + \subsection{Performance Metrics}\label{performance-metrics} 48 186 49 - To evaluate the performance of the models, we record the time taken by 50 - each model to train, based on the training data and statistics about the 51 - predictions the models make on the test data. 
These prediction 52 - statistics include: 187 + To evaluate the performance of the models, we record the time taken by 188 + each model to train, based on the training data and statistics about the 189 + predictions the models make on the test data. These prediction 190 + statistics include: 53 191 54 - \begin{itemize} 55 - \tightlist 56 - \item 57 - \textbf{Accuracy:} 58 - \[a = \dfrac{|correct\ predictions|}{|predictions|} = \dfrac{tp + tn}{tp + tn + fp + fn}\] 59 - \item 60 - \textbf{Precision:} 61 - \[p = \dfrac{|Waldo\ predicted\ as\ Waldo|}{|predicted\ as\ Waldo|} = \dfrac{tp}{tp + fp}\] 62 - \item 63 - \textbf{Recall:} 64 - \[r = \dfrac{|Waldo\ predicted\ as\ Waldo|}{|actually\ Waldo|} = \dfrac{tp}{tp + fn}\] 65 - \item 66 - \textbf{F1 Measure:} \[f1 = \dfrac{2pr}{p + r}\] where $tp$ is the 67 - number of true positives, $tn$ is the number of true negatives, 68 - $fp$ is the number of false positives, and $tp$ is the number of 69 - false negatives. 70 - \end{itemize} 192 + \begin{itemize} 193 + \item 194 + \textbf{Accuracy:} 195 + \[a = \dfrac{|correct\ predictions|}{|predictions|} = \dfrac{tp + tn}{tp + tn + fp + fn}\] 196 + \item 197 + \textbf{Precision:} 198 + \[p = \dfrac{|Waldo\ predicted\ as\ Waldo|}{|predicted\ as\ Waldo|} = \dfrac{tp}{tp + fp}\] 199 + \item 200 + \textbf{Recall:} 201 + \[r = \dfrac{|Waldo\ predicted\ as\ Waldo|}{|actually\ Waldo|} = \dfrac{tp}{tp + fn}\] 202 + \item 203 + \textbf{F1 Measure:} \[f1 = \dfrac{2pr}{p + r}\] where $tp$ is the 204 + number of true positives, $tn$ is the number of true negatives, 205 + $fp$ is the number of false positives, and $fn$ is the number of 206 + false negatives. 207 + \end{itemize} 71 208 72 - Accuracy is a common performance metric used in Machine Learning, 73 - however in classification problems where the training data is heavily 74 - biased toward one category, sometimes a model will learn to optimize its 75 - accuracy by classifying all instances as one category. I.e. 
the 76 - classifier will classify all images that do not contain Waldo as not 77 - containing Waldo, but will also classify all images containing Waldo as 78 - not containing Waldo. Thus we use, other metrics to measure performance 79 - as well. 209 + Accuracy is a common performance metric used in Machine Learning, 210 + however in classification problems where the training data is heavily 211 + biased toward one category, sometimes a model will learn to optimize its 212 + accuracy by classifying all instances as one category. I.e. the 213 + classifier will classify all images that do not contain Waldo as not 214 + containing Waldo, but will also classify all images containing Waldo as 215 + not containing Waldo. Thus, we use other metrics to measure performance 216 + as well. 80 217 81 - \emph{Precision} returns the percentage of classifications of Waldo that 82 - are actually Waldo. \emph{Recall} returns the percentage of Waldos that 83 - were actually predicted as Waldo. In the case of a classifier that 84 - classifies all things as Waldo, the recall would be 0. \emph{F1-Measure} 85 - returns a combination of precision and recall that heavily penalises 86 - classifiers that perform poorly in either precision or recall. 87 - % Kelvin End 218 + \emph{Precision} returns the percentage of classifications of Waldo that 219 + are actually Waldo. \emph{Recall} returns the percentage of Waldos that 220 + were actually predicted as Waldo. In the case of a classifier that 221 + classifies all things as not Waldo, the recall would be 0. \emph{F1-Measure} 222 + returns a combination of precision and recall that heavily penalises 223 + classifiers that perform poorly in either precision or recall. 
224 + % Kelvin End 88 225 89 - \section{Results} 226 + \section{Results} \label{sec:results} 90 227 91 - \section{Discussion and Conclusion} 228 + \section{Conclusion} \label{sec:conclusion} 92 229 93 - \bibliographystyle{humannat} 230 + \clearpage % Ensures that the references are on a separate page 231 + \pagebreak 232 + % References 233 + \section{References} 234 + \renewcommand{\refname}{} 235 + \bibliographystyle{alpha} 94 236 \bibliography{references} 95 - 96 237 \end{document}

+73 -4

mini_proj/waldo_model.py

··· 25 25 ''' 26 26 Model definition define the network structure 27 27 ''' 28 - def FCN(): 28 + def CNN(): 29 29 ## List of model layers 30 30 inputs = Input((3, 64, 64)) 31 31 ··· 33 33 m_pool1 = MaxPooling2D(pool_size=(2, 2))(conv1) 34 34 35 35 conv2 = Conv2D(32, (3, 3), activation='relu', padding='same')(m_pool1) 36 - #drop1 = Dropout(0.2)(conv2) # Drop some portion of features to prevent overfitting 37 36 m_pool2 = MaxPooling2D(pool_size=(2, 2))(conv2) 38 37 39 38 conv3 = Conv2D(32, (3, 3), activation='relu', padding='same')(m_pool2) ··· 47 46 drop3 = Dropout(0.2)(dense) 48 47 classif = Dense(2, activation='sigmoid')(drop3) # Final layer to classify 49 48 50 - ## Define the model structure 49 + ## Define the model start and end 50 + model = Model(inputs=inputs, outputs=classif) 51 + # Optimizer recommended Adadelta values (lr=0.01) 52 + model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy', f1]) 53 + 54 + return model 55 + 56 + ''' 57 + Model definition for a fully convolutional (no dense layers) network structure 58 + ''' 59 + def FCN(): 60 + ## List of model layers 61 + inputs = Input((3, 64, 64)) 62 + 63 + conv1 = Conv2D(16, (3, 3), activation='relu', padding='same', input_shape=(64, 64, 3))(inputs) 64 + m_pool1 = MaxPooling2D(pool_size=(2, 2))(conv1) 65 + 66 + conv2 = Conv2D(32, (3, 3), activation='relu', padding='same')(m_pool1) 67 + m_pool2 = MaxPooling2D(pool_size=(2, 2))(conv2) 68 + 69 + conv3 = Conv2D(32, (3, 3), activation='relu', padding='same')(m_pool2) 70 + drop2 = Dropout(0.2)(conv3) # Drop some portion of features to prevent overfitting 71 + m_pool2 = MaxPooling2D(pool_size=(2, 2))(drop2) 72 + 73 + conv4 = Conv2D(64, (2, 2), activation='relu', padding='same')(m_pool2) 74 + 75 + flat = Flatten()(conv4) # Makes data 1D 76 + drop3 = Dropout(0.2)(flat) 77 + classif = Dense(2, activation='sigmoid')(drop3) # Final layer to classify 78 + 79 + ## Define the model start and end 51 80 model = Model(inputs=inputs, outputs=classif) 
52 81 # Optimizer recommended Adadelta values (lr=0.01) 53 82 model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy', f1]) 54 83 55 84 return model 56 85 86 + 87 + ''' 88 + Model definition for the network structure of LeNet 89 + Note: LeNet was designed to classify into 10 classes, but we are only performing binary classification 90 + ''' 91 + def LeNet(): 92 + ## List of model layers 93 + inputs = Input((3, 64, 64)) 94 + 95 + conv1 = Conv2D(6, (5, 5), activation='relu', padding='valid', input_shape=(64, 64, 3))(inputs) 96 + m_pool1 = MaxPooling2D(pool_size=(2, 2))(conv1) 97 + 98 + conv2 = Conv2D(16, (5, 5), activation='relu', padding='valid')(m_pool1) 99 + m_pool2 = MaxPooling2D(pool_size=(2, 2))(conv2) 100 + 101 + flat = Flatten()(m_pool2) # Makes data 1D 102 + 103 + dense1 = Dense(120, activation='relu')(flat) # Fully connected layer 104 + dense2 = Dense(84, activation='relu')(dense1) # Fully connected layer 105 + drop3 = Dropout(0.2)(dense2) 106 + classif = Dense(2, activation='sigmoid')(drop3) # Final layer to classify 107 + 108 + ## Define the model start and end 109 + model = Model(inputs=inputs, outputs=classif) 110 + model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy', f1]) 111 + 112 + return model 113 + 114 + ''' 115 + AlexNet architecture 116 + ''' 117 + def AlexNet(): 118 + inputs = Input(shape=(3, 64, 64)) 119 + 120 + 121 + return model 122 + 123 + 57 124 def f1(y_true, y_pred): 58 125 def recall(y_true, y_pred): 59 126 """Recall metric. 
··· 110 177 lbl_test = to_categorical(lbl_test) 111 178 112 179 ## Define model 180 + #model = CNN() 113 181 model = FCN() 182 + #model = LeNet() 114 183 # svm_iclf = ImageClassifier(svm.SVC) 115 184 # tree_iclf = ImageClassifier(tree.DecisionTreeClassifier) 116 185 # naive_bayes_iclf = ImageClassifier(naive_bayes.GaussianNBd) 117 186 # ensemble_iclf = ImageClassifier(ensemble.RandomForestClassifier) 118 187 119 188 ## Define training parameters 120 - epochs = 10 # an epoch is one forward pass and back propogation of all training data 189 + epochs = 25 # an epoch is one forward pass and back propogation of all training data 121 190 batch_size = 150 # batch size - number of training example used in one forward/backward pass 122 191 # (higher batch size uses more memory, smaller batch size takes more time) 123 192 #lrate = 0.01 # Learning rate of the model - controls magnitude of weight changes in training the NN