snipt

Ctrl+h for KB shortcuts

Python

blackbox_classification.py

#!/usr/bin/python
from __future__ import division
import sys, os, numpy as np
from sklearn import svm
from matplotlib import pyplot as plt
from csv import writer as csvwriter, reader as csvreader
from scipy.stats import nanmean

# Globals - you will need to alter these for your problem.

# Directory holding the input CSV files; main() also writes its predictions
# file here.
default_dir = r'C:\Users\Chris\Dropbox\ProsserSystems\Python\machine_learning\sklearn_data'
# Input files used when no paths are supplied on the command line.
default_trn_path = os.path.join(default_dir, 'train.csv')
default_tst_path = os.path.join(default_dir, 'test.csv')
# Field delimiter handed to np.genfromtxt.
delim = ','
# Zero-indexed column positions in the raw files. Do not adjust them for the
# excluded columns below (exclusions are handled separately).
# -1 means the file has no id column.
trn_id_col = -1
tst_id_col = -1
# Column of the training file that holds the labels.
labels_col = 0
# Zero-indexed column numbers to be left out of the feature matrix.
excl_trn_cols = []  # e.g. [3, 8, 10]
excl_tst_cols = []  # e.g. one less than the above: [i-1 for i in excl_trn_cols]
# Whether the first row of each file is a header row.
heads_in_trn_file, heads_in_tst_file = False, False

# Regularisation search.
# Number of values of the regularisation parameter to try.
# WARNING: the classifier is fitted auto_find_loops * iterations times.
# Set to 1 to turn the search off and use the default reg value
# (specified in main).
auto_find_loops = 1
# Size of each search step: reg is multiplied or divided by this number.
adj_rate = 2.0

# These should have reasonable defaults if you are not sure...
# Approximate percentage of rows used for training on each split; the
# remaining rows form the cross-validation set.
trn_perc = 90
# Number of random train / cross-validation splits per regularisation value.
iterations = 1000
# Print diagnostics. main() also passes this flag to feature_prep as its
# write_output_to_file argument.
verbose = False
# Plot the data by one feature against another. Only useful if you have a
# very small number of features, e.g. <= 8.
show_graphs = False

# Initialisations: column counts of the training and test files, filled in
# by import_data() from the first row of each file.
trn_cols, tst_cols = 0, 0

def scale_features(X, mu, sd, m, n):
    """
    Standardise each column of X and prepend a column of ones (x0).

    Every value is mapped to (x - mu) / sd using the mean and standard
    deviation of its column, which brings the features down to roughly
    -3 to 3 so that scale-sensitive algorithms converge more efficiently.
    Columns whose standard deviation is 0 are left unscaled.

    Args:
        X: 2-D numpy array of shape (m, n).
        mu: per-column means to apply; pass an empty sequence to have them
            computed from X.
        sd: per-column standard deviations to apply; pass an empty sequence
            to have them computed from X.
        m: number of rows in X.
        n: number of columns in X.

    Returns:
        A tuple (X_norm, mu, sd). X_norm has shape (m, n + 1) with column 0
        set to 1 and the scaled features in the remaining columns; mu and sd
        are the values that were applied, so the same scaling can be reused
        on other data.
    """
    if len(mu) == 0:
        mu = np.mean(X, axis=0)
    if len(sd) == 0:
        sd = np.std(X, axis=0)

    mu_arr = np.asarray(mu, dtype=float)
    sd_arr = np.asarray(sd, dtype=float)
    scalable = sd_arr != 0
    # Divide by 1 where sd is 0 so no divide-by-zero warning is emitted; those
    # columns are replaced by their raw values in the np.where below.
    safe_sd = np.where(scalable, sd_arr, 1.0)

    X_norm = np.ones((m, n + 1))
    X_norm[:, 1:] = np.where(scalable, (X - mu_arr) / safe_sd, X)
    return X_norm, mu, sd

def graphs(y, X, m, n, label):
    """
    Scatter-plot pairs of features against each other, coloured by class.

    Rows with y == 1 are drawn as green circles and rows with y == 0 as red
    crosses. One figure is shown per pair of columns (j, j2) with
    1 <= j < j2 < n. Does nothing unless the show_graphs global is set.

    Args:
        y: labels, one per row of X.
        X: 2-D array with one row per example and one column per feature.
        m: number of rows to consider.
        n: number of columns in X.
        label: column names used for the axis labels; a generic name is used
            for any column it does not cover.
    """
    if not show_graphs:
        return

    y = np.asarray(y)
    X = np.asarray(X)

    # row indexes of the positive and negative examples
    pos = np.flatnonzero(y[:m] == 1)
    neg = np.flatnonzero(y[:m] == 0)

    def axis_name(col):
        # fall back to a generic name when no header row was supplied
        return label[col] if col < len(label) else 'feature %d' % col

    # NOTE(review): the loop starts at 1 to "miss x0", but feature_prep passes
    # the matrix before the x0 column is added, so the first real feature is
    # never plotted - confirm whether this should start at 0.
    for j in range(1, n):
        for j2 in range(j + 1, n):  # for all combos
            # rows are examples and columns are features, so select the
            # example rows first and then the feature column
            plt.plot(X[pos, j], X[pos, j2], 'go',
                     X[neg, j], X[neg, j2], 'rx')
            plt.xlabel(axis_name(j))
            plt.ylabel(axis_name(j2))
            plt.show()

def write_to_file(data, f_name):
    """
    Write the rows in data to f_name as CSV, replacing any existing file.

    Args:
        data: iterable of rows, each row an iterable of values.
        f_name: path of the file to write.
    """
    # Binary mode is what the Python 2 csv module expects. The with-block
    # guarantees the file is flushed and closed even if writing a row fails.
    with open(f_name, 'wb') as out_file:
        csvwriter(out_file).writerows(data)

def feature_prep(data, heads, stage='training', use_mean=[], use_sd=[],
                 write_output_to_file=False, cur_loop=0):
    """
    use_mean and use_sd should not be specified for initial training data, but
    then should be passed in for cv, test and predictions (to use the same 
    feature scaling as inital training data).
    """

    if cur_loop == 0 and verbose:
    	l_verbose = True
    else:
    	l_verbose = False
    
    # load in training data
    if l_verbose: print '\n', '-'*80, '\n', 'Stage:', stage
    ids = None
    if stage not in ['test', 'predict']:
        feature_cols = [i for i in range(trn_cols)
                        if i not in [trn_id_col, labels_col]+excl_trn_cols]
        if l_verbose:
            print 'feature_cols:', feature_cols
            print 'trn_cols:', trn_cols, '| trn_id_col:', trn_id_col, \
                  '| labels_col:', labels_col, '| excl_trn_cols:', excl_trn_cols
        X, y= data[:, feature_cols], data[:, labels_col]

        # filter heads
        if heads_in_trn_file:
        	heads = [heads[i] for i in range(n) if i in feature_cols]

    else:
        feature_cols = [i for i in range(tst_cols)
                        if i not in [tst_id_col]+excl_tst_cols]
        if l_verbose:
            print 'feature_cols:', feature_cols
            print 'tst_cols:', tst_cols, '| tst_id_col:', tst_id_col, \
                  '| excl_tst_cols:', excl_tst_cols
        X, y = data[:, feature_cols], None

        # filter heads
        if heads_in_tst_file:
        	heads = [heads[i] for i in range(n) if i in feature_cols]

        # record tst id columns if needed
        if tst_id_col >= 0:
        	data[:, tst_id_col]

    m, n = np.size(X, 0), np.size(X, 1) # no of rows and cols
    
    if l_verbose: 
        print 'Heads used:\n', ', '.join(i for i in heads)
        print 'X:', np.shape(X), 'y:', np.shape(y), 'ids:', np.shape(ids), \
               'data:', np.shape(data), 'm:', m, 'n:', n, '\n'

    # fill blanks with averages
    if len(use_mean) > 0:
        col_default = use_mean
    else:
        # calc means of cols if not passed in as args
        col_default = use_mean = nanmean(X, axis=0)
    inds = np.where(np.isnan(X)) # find indicies of empty cells to be replaced
    X[inds] = np.take(col_default, inds[1])
    
    if show_graphs:
        graphs(y, X, m, n, heads)

    #test
    if l_verbose:
        print '\nFirst ten rows before normalisation:'
        np.set_printoptions(precision=4, suppress=True)
        print X[:10, :], '\n'
    
    # scale the features & write output
    X, mu, sd = scale_features(X, use_mean, use_sd, m, n)
    if write_output_to_file:
        write_to_file(X, os.join.path(default_dir, 'X_'+stage+'.csv'))
    
    #test
    if l_verbose:
        print '\nFirst ten rows after normalisation:'
        print X[:10, 1:], '\n'

    return X, y, mu, sd, ids

def conv(val):
    """
    Convert a raw cell value to a float.

    Numeric values are converted directly. Anything float() cannot parse is
    encoded as the sum of the character codes of its string form, so equal
    strings always map to the same number. Empty values (None, '') return
    None.
    """
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        # only the errors float() raises for unparsable input are handled, so
        # things like KeyboardInterrupt are no longer swallowed
        if not val:
            return None
        return float(sum(ord(ch) for ch in str(val)))  # sum of ascii vals

def import_data(mode='training'):
    global trn_cols, tst_cols
    
    # get input file (features and labels)
    if len(sys.argv) > 1:
        if mode == 'training':
            fname = sys.argv[1]
        elif len(sys.argv) > 2:
            fname = sys.argv[2]
    else:
        if mode == 'training':  
            fname = default_trn_path
        else:
            fname = default_tst_path

    if not os.path.exists(fname):
        print "usage:", os.path.split(sys.argv[0])[1], "[default_trn_path]", \
        	  "[default_tst_path]"
        print "Valid file paths must be provided as an arg or global varables"
        sys.exit("invalid input")

    # get heads
    reader = csvreader(open(fname, 'rb'))
    r, start_row, heads = 0, 0, []
    for row in reader:
        if r == 0:
            # get no of cols in data
            if mode == 'training':
                trn_cols = len(row)
                if heads_in_trn_file:
                	heads = row
                	start_row += 1
            else:
                tst_cols = len(row)
                if heads_in_tst_file:
                	heads = row
                	start_row += 1
            r += 1
        else:
            break

    # build a dict to map each col to a conv func (if not excl)
    if mode not in ['test', 'predict']:
        cols = [i for i in range(trn_cols) if i not in excl_trn_cols]
        conv_dict = {c: conv for c in cols}
    else:
        cols = [i for i in range(tst_cols) if i not in excl_tst_cols]
        conv_dict = {c: conv for c in cols}

    if verbose: 
        print '\nData import:', mode, '| cols:', cols, '\n'

    # import data
    #   not excluding unneeded cols, import all, just without conversions
    #   they are exlcuded later in feature_prep
    data = np.genfromtxt(fname, delimiter=delim, converters=conv_dict, 
    					 skip_header=start_row)

    if verbose: 
        print 'all heads:\n',  ', '.join(i for i in heads), '\n'
        print 'shape of data:', np.shape(data)
        print data
    
    return data, heads

def split_trn_data(data):
    """
    Randomly split the rows of data into training and cross-validation sets.

    One random number is drawn per row; rows whose number is at most
    trn_perc / 100 go to the training set and the rest to the
    cross-validation set, so the split sizes vary from call to call.

    Returns:
        A tuple (trn_data, cv_data).
    """
    n_rows = np.size(data, 0)
    draws = np.random.random_sample(n_rows)
    cutoff = trn_perc/100

    # boolean row masks for the two groups
    to_trn = draws <= cutoff
    to_cv = draws > cutoff
    return data[to_trn, :], data[to_cv, :]

def build_classifier(X, y, reg):
    """
    Fit a support vector classifier with an RBF (Gaussian) kernel.

    reg is passed to the classifier as its C parameter. Returns whatever the
    classifier's fit(X, y) call returns.
    """
    model = svm.SVC(C=reg, kernel='rbf', cache_size=1000)
    fitted = model.fit(X, y)
    return fitted

def main():

    global adj_rate
    reg, reg_loop, reg_dir = 1.0, 0, 'up'
    reg_rec, trn_rec, cv_rec = [], [], []
    
    # import training data
    data, heads = import_data('training')

    while reg_loop < auto_find_loops:
        
        trn, cv, = [], []
        for i in range(iterations):

            # split data into training and cross validation groups
            trn_data, cv_data = split_trn_data(data)
            if verbose: 
                print '\nSize of training data:', np.shape(trn_data)
                print 'Size of cross val data:', np.shape(cv_data)
            
            # prep training data and build classifier
            X_trn, y_trn, mu, sd, ids = feature_prep(trn_data, heads, 
                                                     'training', 
                                                     [], [], verbose, i)
            clf = build_classifier(X_trn, y_trn, reg)

            # training accuracy
            trn_pred = clf.predict(X_trn)
            trn_accuracy = 1 - (sum(abs(y_trn - trn_pred)) / len(X_trn))
            trn.append(trn_accuracy)

            # load prepare cv set
            if trn_perc < 100:
                X_cv, y_cv, mu, sd, ids = feature_prep(cv_data, heads, 'cv', 
                                                       mu, sd, verbose, i)

                # cv accuracy
                cv_pred = clf.predict(X_cv)
                cv_accuracy = 1 - (sum(abs(y_cv - cv_pred)) / len(X_cv))
                cv.append(cv_accuracy)

        reg_rec.append(reg)
        trn_rec.append(np.mean(trn))
        if trn_perc < 100:
            cv_rec.append(np.mean(cv))
        else:
            cv_rec.append(0)

        if reg_loop == 0:
            print 'Loop  |  C param  |  Trn accuracy  |  CV accuracy   |  Dir'
            print '-----------------------------------------------------------'

        better = (reg_loop == 0 or cv_rec[reg_loop] > cv_rec[reg_loop-1])

        # switch direction & reduce adj_rate if not getting better
        if not better:
            adj_rate *= 0.95
            if reg_dir == 'up':
                reg_dir = 'down'
            else:
                reg_dir = 'up'

        try:
            print str(reg_loop) + ' ' * (6 - len(str(reg_loop))) + '|' + \
                  '  ' + str(round(reg, 3)) + \
                  ' ' * (9 - len(str(round(reg, 3)))) + '|' + \
                  '  ' + str(round(trn_rec[reg_loop], 9)) + \
                  ' ' * (14 - len(str(round(trn_rec[reg_loop], 9)))) + '|' + \
                  '  ' + str(round(cv_rec[reg_loop], 9)) + \
                  ' ' * (14 - len(str(round(cv_rec[reg_loop], 9)))) + '|' + \
                  '  ' + reg_dir
        except:
            print reg_loop, reg, trn_rec[reg_loop], cv_rec[reg_loop], reg_dir
            pass

        if reg_dir == 'up':
            reg *= adj_rate
        else:
            reg /= adj_rate

        reg_loop += 1

    # load in test data and run through the same prep / normalisation
    t_data, t_heads = import_data('test')
    X, tmp_y, mu, sd, ids = feature_prep(t_data, t_heads, 'test', 
                                         mu, sd, verbose, i)
    
    # get predictions and make each item an int in a sublist (required format)
    y = clf.predict(X)
    print '\nFound', int(sum(y)), 'positive predictions out of', len(y)
    print '(iterations:', iterations, '| trn_perc:', trn_perc, ')'

    if tst_id_col >= 0:
	    predictions = [[int(ids[i]), int(round(y[i],0))] for i in range(len(y))]
    else:
        predictions = [[int(round(y[i],0))] for i in range(len(y))]
    if heads_in_trn_file:
	    predictions.insert(0, [t_heads[tst_id_col], t_heads[labels_col]])

    write_to_file(predictions, 
                  os.path.join(default_dir, 'test_predictions.csv'))    

if __name__ == '__main__':
    main()

Description

For the Kaggle Titanic problem: prepares features (encoding, null handling, normalisation etc.) into a feature matrix ready to train a learning algorithm.
https://snipt.net/embed/378e40d64765338a9346e89bac234e32/
/raw/378e40d64765338a9346e89bac234e32/
378e40d64765338a9346e89bac234e32
python
Python
355
2019-08-23T19:53:17
True
False
False
/api/public/snipt/101518/
feature_prep_v2py
<table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><a href="#L-1"> 1</a> <a href="#L-2"> 2</a> <a href="#L-3"> 3</a> <a href="#L-4"> 4</a> <a href="#L-5"> 5</a> <a href="#L-6"> 6</a> <a href="#L-7"> 7</a> <a href="#L-8"> 8</a> <a href="#L-9"> 9</a> <a href="#L-10"> 10</a> <a href="#L-11"> 11</a> <a href="#L-12"> 12</a> <a href="#L-13"> 13</a> <a href="#L-14"> 14</a> <a href="#L-15"> 15</a> <a href="#L-16"> 16</a> <a href="#L-17"> 17</a> <a href="#L-18"> 18</a> <a href="#L-19"> 19</a> <a href="#L-20"> 20</a> <a href="#L-21"> 21</a> <a href="#L-22"> 22</a> <a href="#L-23"> 23</a> <a href="#L-24"> 24</a> <a href="#L-25"> 25</a> <a href="#L-26"> 26</a> <a href="#L-27"> 27</a> <a href="#L-28"> 28</a> <a href="#L-29"> 29</a> <a href="#L-30"> 30</a> <a href="#L-31"> 31</a> <a href="#L-32"> 32</a> <a href="#L-33"> 33</a> <a href="#L-34"> 34</a> <a href="#L-35"> 35</a> <a href="#L-36"> 36</a> <a href="#L-37"> 37</a> <a href="#L-38"> 38</a> <a href="#L-39"> 39</a> <a href="#L-40"> 40</a> <a href="#L-41"> 41</a> <a href="#L-42"> 42</a> <a href="#L-43"> 43</a> <a href="#L-44"> 44</a> <a href="#L-45"> 45</a> <a href="#L-46"> 46</a> <a href="#L-47"> 47</a> <a href="#L-48"> 48</a> <a href="#L-49"> 49</a> <a href="#L-50"> 50</a> <a href="#L-51"> 51</a> <a href="#L-52"> 52</a> <a href="#L-53"> 53</a> <a href="#L-54"> 54</a> <a href="#L-55"> 55</a> <a href="#L-56"> 56</a> <a href="#L-57"> 57</a> <a href="#L-58"> 58</a> <a href="#L-59"> 59</a> <a href="#L-60"> 60</a> <a href="#L-61"> 61</a> <a href="#L-62"> 62</a> <a href="#L-63"> 63</a> <a href="#L-64"> 64</a> <a href="#L-65"> 65</a> <a href="#L-66"> 66</a> <a href="#L-67"> 67</a> <a href="#L-68"> 68</a> <a href="#L-69"> 69</a> <a href="#L-70"> 70</a> <a href="#L-71"> 71</a> <a href="#L-72"> 72</a> <a href="#L-73"> 73</a> <a href="#L-74"> 74</a> <a href="#L-75"> 75</a> <a href="#L-76"> 76</a> <a href="#L-77"> 77</a> <a href="#L-78"> 78</a> <a href="#L-79"> 79</a> <a href="#L-80"> 80</a> <a 
href="#L-81"> 81</a> <a href="#L-82"> 82</a> <a href="#L-83"> 83</a> <a href="#L-84"> 84</a> <a href="#L-85"> 85</a> <a href="#L-86"> 86</a> <a href="#L-87"> 87</a> <a href="#L-88"> 88</a> <a href="#L-89"> 89</a> <a href="#L-90"> 90</a> <a href="#L-91"> 91</a> <a href="#L-92"> 92</a> <a href="#L-93"> 93</a> <a href="#L-94"> 94</a> <a href="#L-95"> 95</a> <a href="#L-96"> 96</a> <a href="#L-97"> 97</a> <a href="#L-98"> 98</a> <a href="#L-99"> 99</a> <a href="#L-100">100</a> <a href="#L-101">101</a> <a href="#L-102">102</a> <a href="#L-103">103</a> <a href="#L-104">104</a> <a href="#L-105">105</a> <a href="#L-106">106</a> <a href="#L-107">107</a> <a href="#L-108">108</a> <a href="#L-109">109</a> <a href="#L-110">110</a> <a href="#L-111">111</a> <a href="#L-112">112</a> <a href="#L-113">113</a> <a href="#L-114">114</a> <a href="#L-115">115</a> <a href="#L-116">116</a> <a href="#L-117">117</a> <a href="#L-118">118</a> <a href="#L-119">119</a> <a href="#L-120">120</a> <a href="#L-121">121</a> <a href="#L-122">122</a> <a href="#L-123">123</a> <a href="#L-124">124</a> <a href="#L-125">125</a> <a href="#L-126">126</a> <a href="#L-127">127</a> <a href="#L-128">128</a> <a href="#L-129">129</a> <a href="#L-130">130</a> <a href="#L-131">131</a> <a href="#L-132">132</a> <a href="#L-133">133</a> <a href="#L-134">134</a> <a href="#L-135">135</a> <a href="#L-136">136</a> <a href="#L-137">137</a> <a href="#L-138">138</a> <a href="#L-139">139</a> <a href="#L-140">140</a> <a href="#L-141">141</a> <a href="#L-142">142</a> <a href="#L-143">143</a> <a href="#L-144">144</a> <a href="#L-145">145</a> <a href="#L-146">146</a> <a href="#L-147">147</a> <a href="#L-148">148</a> <a href="#L-149">149</a> <a href="#L-150">150</a> <a href="#L-151">151</a> <a href="#L-152">152</a> <a href="#L-153">153</a> <a href="#L-154">154</a> <a href="#L-155">155</a> <a href="#L-156">156</a> <a href="#L-157">157</a> <a href="#L-158">158</a> <a href="#L-159">159</a> <a href="#L-160">160</a> <a 
href="#L-161">161</a> <a href="#L-162">162</a> <a href="#L-163">163</a> <a href="#L-164">164</a> <a href="#L-165">165</a> <a href="#L-166">166</a> <a href="#L-167">167</a> <a href="#L-168">168</a> <a href="#L-169">169</a> <a href="#L-170">170</a> <a href="#L-171">171</a> <a href="#L-172">172</a> <a href="#L-173">173</a> <a href="#L-174">174</a> <a href="#L-175">175</a> <a href="#L-176">176</a> <a href="#L-177">177</a> <a href="#L-178">178</a> <a href="#L-179">179</a> <a href="#L-180">180</a> <a href="#L-181">181</a> <a href="#L-182">182</a> <a href="#L-183">183</a> <a href="#L-184">184</a> <a href="#L-185">185</a> <a href="#L-186">186</a> <a href="#L-187">187</a> <a href="#L-188">188</a> <a href="#L-189">189</a> <a href="#L-190">190</a> <a href="#L-191">191</a> <a href="#L-192">192</a> <a href="#L-193">193</a> <a href="#L-194">194</a> <a href="#L-195">195</a> <a href="#L-196">196</a> <a href="#L-197">197</a> <a href="#L-198">198</a> <a href="#L-199">199</a> <a href="#L-200">200</a> <a href="#L-201">201</a> <a href="#L-202">202</a> <a href="#L-203">203</a> <a href="#L-204">204</a> <a href="#L-205">205</a> <a href="#L-206">206</a> <a href="#L-207">207</a> <a href="#L-208">208</a> <a href="#L-209">209</a> <a href="#L-210">210</a> <a href="#L-211">211</a> <a href="#L-212">212</a> <a href="#L-213">213</a> <a href="#L-214">214</a> <a href="#L-215">215</a> <a href="#L-216">216</a> <a href="#L-217">217</a> <a href="#L-218">218</a> <a href="#L-219">219</a> <a href="#L-220">220</a> <a href="#L-221">221</a> <a href="#L-222">222</a> <a href="#L-223">223</a> <a href="#L-224">224</a> <a href="#L-225">225</a> <a href="#L-226">226</a> <a href="#L-227">227</a> <a href="#L-228">228</a> <a href="#L-229">229</a> <a href="#L-230">230</a> <a href="#L-231">231</a> <a href="#L-232">232</a> <a href="#L-233">233</a> <a href="#L-234">234</a> <a href="#L-235">235</a> <a href="#L-236">236</a> <a href="#L-237">237</a> <a href="#L-238">238</a> <a href="#L-239">239</a> <a href="#L-240">240</a> <a 
href="#L-241">241</a> <a href="#L-242">242</a> <a href="#L-243">243</a> <a href="#L-244">244</a> <a href="#L-245">245</a> <a href="#L-246">246</a> <a href="#L-247">247</a> <a href="#L-248">248</a> <a href="#L-249">249</a> <a href="#L-250">250</a> <a href="#L-251">251</a> <a href="#L-252">252</a> <a href="#L-253">253</a> <a href="#L-254">254</a> <a href="#L-255">255</a> <a href="#L-256">256</a> <a href="#L-257">257</a> <a href="#L-258">258</a> <a href="#L-259">259</a> <a href="#L-260">260</a> <a href="#L-261">261</a> <a href="#L-262">262</a> <a href="#L-263">263</a> <a href="#L-264">264</a> <a href="#L-265">265</a> <a href="#L-266">266</a> <a href="#L-267">267</a> <a href="#L-268">268</a> <a href="#L-269">269</a> <a href="#L-270">270</a> <a href="#L-271">271</a> <a href="#L-272">272</a> <a href="#L-273">273</a> <a href="#L-274">274</a> <a href="#L-275">275</a> <a href="#L-276">276</a> <a href="#L-277">277</a> <a href="#L-278">278</a> <a href="#L-279">279</a> <a href="#L-280">280</a> <a href="#L-281">281</a> <a href="#L-282">282</a> <a href="#L-283">283</a> <a href="#L-284">284</a> <a href="#L-285">285</a> <a href="#L-286">286</a> <a href="#L-287">287</a> <a href="#L-288">288</a> <a href="#L-289">289</a> <a href="#L-290">290</a> <a href="#L-291">291</a> <a href="#L-292">292</a> <a href="#L-293">293</a> <a href="#L-294">294</a> <a href="#L-295">295</a> <a href="#L-296">296</a> <a href="#L-297">297</a> <a href="#L-298">298</a> <a href="#L-299">299</a> <a href="#L-300">300</a> <a href="#L-301">301</a> <a href="#L-302">302</a> <a href="#L-303">303</a> <a href="#L-304">304</a> <a href="#L-305">305</a> <a href="#L-306">306</a> <a href="#L-307">307</a> <a href="#L-308">308</a> <a href="#L-309">309</a> <a href="#L-310">310</a> <a href="#L-311">311</a> <a href="#L-312">312</a> <a href="#L-313">313</a> <a href="#L-314">314</a> <a href="#L-315">315</a> <a href="#L-316">316</a> <a href="#L-317">317</a> <a href="#L-318">318</a> <a href="#L-319">319</a> <a href="#L-320">320</a> <a 
href="#L-321">321</a> <a href="#L-322">322</a> <a href="#L-323">323</a> <a href="#L-324">324</a> <a href="#L-325">325</a> <a href="#L-326">326</a> <a href="#L-327">327</a> <a href="#L-328">328</a> <a href="#L-329">329</a> <a href="#L-330">330</a> <a href="#L-331">331</a> <a href="#L-332">332</a> <a href="#L-333">333</a> <a href="#L-334">334</a> <a href="#L-335">335</a> <a href="#L-336">336</a> <a href="#L-337">337</a> <a href="#L-338">338</a> <a href="#L-339">339</a> <a href="#L-340">340</a> <a href="#L-341">341</a> <a href="#L-342">342</a> <a href="#L-343">343</a> <a href="#L-344">344</a> <a href="#L-345">345</a> <a href="#L-346">346</a> <a href="#L-347">347</a> <a href="#L-348">348</a> <a href="#L-349">349</a> <a href="#L-350">350</a> <a href="#L-351">351</a> <a href="#L-352">352</a> <a href="#L-353">353</a> <a href="#L-354">354</a> <a href="#L-355">355</a></pre></div></td><td class="code"><div class="highlight"><pre><span></span><span id="L-1"><a name="L-1"></a><span class="c1">#/usr/bin/python</span> </span><span id="L-2"><a name="L-2"></a><span class="kn">from</span> <span class="nn">__future__</span> <span class="kn">import</span> <span class="n">division</span> </span><span id="L-3"><a name="L-3"></a><span class="kn">import</span> <span class="nn">sys</span><span class="o">,</span> <span class="nn">os</span><span class="o">,</span> <span class="nn">numpy</span> <span class="kn">as</span> <span class="nn">np</span> </span><span id="L-4"><a name="L-4"></a><span class="kn">from</span> <span class="nn">sklearn</span> <span class="kn">import</span> <span class="n">svm</span> </span><span id="L-5"><a name="L-5"></a><span class="kn">from</span> <span class="nn">matplotlib</span> <span class="kn">import</span> <span class="n">pyplot</span> <span class="k">as</span> <span class="n">plt</span> </span><span id="L-6"><a name="L-6"></a><span class="kn">from</span> <span class="nn">csv</span> <span class="kn">import</span> <span class="n">writer</span> <span 
class="k">as</span> <span class="n">csvwriter</span><span class="p">,</span> <span class="n">reader</span> <span class="k">as</span> <span class="n">csvreader</span> </span><span id="L-7"><a name="L-7"></a><span class="kn">from</span> <span class="nn">scipy.stats</span> <span class="kn">import</span> <span class="n">nanmean</span> </span><span id="L-8"><a name="L-8"></a> </span><span id="L-9"><a name="L-9"></a><span class="c1"># globals - you will need to alter these for your problem...</span> </span><span id="L-10"><a name="L-10"></a><span class="n">default_dir</span> <span class="o">=</span> <span class="s1">r&#39;C:\Users\Chris\Dropbox\ProsserSystems\Python\machine_learning\sklearn_data&#39;</span> </span><span id="L-11"><a name="L-11"></a><span class="n">default_trn_path</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">default_dir</span><span class="p">,</span> <span class="s1">&#39;train.csv&#39;</span><span class="p">)</span> </span><span id="L-12"><a name="L-12"></a><span class="n">default_tst_path</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">default_dir</span><span class="p">,</span> <span class="s1">&#39;test.csv&#39;</span><span class="p">)</span> </span><span id="L-13"><a name="L-13"></a><span class="n">delim</span> <span class="o">=</span> <span class="s1">&#39;,&#39;</span> </span><span id="L-14"><a name="L-14"></a><span class="c1"># zero indexed, don&#39;t allow for exclusions, they are handled, -1 means no ids</span> </span><span id="L-15"><a name="L-15"></a><span class="n">trn_id_col</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span> </span><span id="L-16"><a name="L-16"></a><span class="n">tst_id_col</span> <span 
class="o">=</span> <span class="o">-</span><span class="mi">1</span> </span><span id="L-17"><a name="L-17"></a><span class="n">labels_col</span> <span class="o">=</span> <span class="mi">0</span> </span><span id="L-18"><a name="L-18"></a><span class="c1"># col nos to be excluded - zero indexed</span> </span><span id="L-19"><a name="L-19"></a><span class="n">excl_trn_cols</span> <span class="o">=</span> <span class="p">[]</span> <span class="c1">#e.g. [3,8,10]</span> </span><span id="L-20"><a name="L-20"></a><span class="n">excl_tst_cols</span> <span class="o">=</span> <span class="p">[]</span> <span class="c1"># e.g. -1 from above: [i-1 for i in excl_trn_cols] </span> </span><span id="L-21"><a name="L-21"></a><span class="n">heads_in_trn_file</span><span class="p">,</span> <span class="n">heads_in_tst_file</span> <span class="o">=</span> <span class="bp">False</span><span class="p">,</span> <span class="bp">False</span> </span><span id="L-22"><a name="L-22"></a> </span><span id="L-23"><a name="L-23"></a><span class="c1"># regularisation</span> </span><span id="L-24"><a name="L-24"></a><span class="n">auto_find_loops</span> <span class="o">=</span> <span class="mi">1</span> <span class="c1"># WARNING this will run this no * iterations global times.</span> </span><span id="L-25"><a name="L-25"></a> <span class="c1"># set to 1 to turn off and use reg default </span> </span><span id="L-26"><a name="L-26"></a> <span class="c1"># (specified in main)</span> </span><span id="L-27"><a name="L-27"></a><span class="n">adj_rate</span> <span class="o">=</span> <span class="mf">2.0</span> <span class="c1"># size of steps to take in auto_find * or / by this number</span> </span><span id="L-28"><a name="L-28"></a> </span><span id="L-29"><a name="L-29"></a><span class="c1"># These should have reasonable defaults if you are not sure...</span> </span><span id="L-30"><a name="L-30"></a><span class="n">trn_perc</span> <span class="o">=</span> <span class="mi">90</span> </span><span 
id="L-31"><a name="L-31"></a><span class="n">iterations</span> <span class="o">=</span> <span class="mi">1000</span> </span><span id="L-32"><a name="L-32"></a><span class="n">verbose</span> <span class="o">=</span> <span class="bp">False</span> </span><span id="L-33"><a name="L-33"></a><span class="n">show_graphs</span> <span class="o">=</span> <span class="bp">False</span> <span class="c1"># plots data by one feature against another, only useful if</span> </span><span id="L-34"><a name="L-34"></a> <span class="c1"># you have a very small no of features e.g.~ &lt;=8</span> </span><span id="L-35"><a name="L-35"></a> </span><span id="L-36"><a name="L-36"></a><span class="c1">#initialisations</span> </span><span id="L-37"><a name="L-37"></a><span class="n">trn_cols</span><span class="p">,</span> <span class="n">tst_cols</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span> </span><span id="L-38"><a name="L-38"></a> </span><span id="L-39"><a name="L-39"></a><span class="k">def</span> <span class="nf">scale_features</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">mu</span><span class="p">,</span> <span class="n">sd</span><span class="p">,</span> <span class="n">m</span><span class="p">,</span> <span class="n">n</span><span class="p">):</span> </span><span id="L-40"><a name="L-40"></a> <span class="sd">&quot;&quot;&quot;</span> </span><span id="L-41"><a name="L-41"></a><span class="sd"> Call with a numpy array for the values to be scaled down to approx -3 to 3.</span> </span><span id="L-42"><a name="L-42"></a><span class="sd"> This is required for algorithms such as gradient descent so</span> </span><span id="L-43"><a name="L-43"></a><span class="sd"> that the results can converge more efficiently.</span> </span><span id="L-44"><a name="L-44"></a><span class="sd"> m &amp; n args are for no of rows and no of cols respectively.</span> </span><span id="L-45"><a 
name="L-45"></a><span class="sd"> Returns as a Numpy array with a feature added for x0 (a 1 for each row).</span> </span><span id="L-46"><a name="L-46"></a><span class="sd"> &quot;&quot;&quot;</span> </span><span id="L-47"><a name="L-47"></a> <span class="c1"># function for each element (vectorised later)</span> </span><span id="L-48"><a name="L-48"></a> <span class="k">def</span> <span class="nf">scale</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">mu</span><span class="p">,</span> <span class="n">sd</span><span class="p">):</span> </span><span id="L-49"><a name="L-49"></a> <span class="k">return</span> <span class="p">(</span><span class="n">x</span><span class="o">-</span><span class="n">mu</span><span class="p">)</span><span class="o">/</span><span class="n">sd</span> <span class="k">if</span> <span class="n">sd</span> <span class="o">!=</span> <span class="mi">0</span> <span class="k">else</span> <span class="n">x</span> </span><span id="L-50"><a name="L-50"></a> </span><span id="L-51"><a name="L-51"></a> <span class="c1"># vectorise function above</span> </span><span id="L-52"><a name="L-52"></a> <span class="n">scale_vec</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">vectorize</span><span class="p">(</span><span class="n">scale</span><span class="p">,</span> <span class="n">otypes</span><span class="o">=</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">float</span><span class="p">])</span> </span><span id="L-53"><a name="L-53"></a> </span><span id="L-54"><a name="L-54"></a> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">mu</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> </span><span id="L-55"><a name="L-55"></a> <span class="n">mu</span> <span class="o">=</span> <span class="n">np</span><span 
class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span> </span><span id="L-56"><a name="L-56"></a> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">sd</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> </span><span id="L-57"><a name="L-57"></a> <span class="n">sd</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">std</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span> </span><span id="L-58"><a name="L-58"></a> <span class="n">X_norm</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">((</span><span class="n">m</span><span class="p">,</span> <span class="n">n</span><span class="o">+</span><span class="mi">1</span><span class="p">))</span> </span><span id="L-59"><a name="L-59"></a> <span class="n">X_norm</span><span class="p">[:,</span> <span class="mi">1</span><span class="p">:]</span> <span class="o">=</span> <span class="n">scale_vec</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">mu</span><span class="p">,</span> <span class="n">sd</span><span class="p">)</span> </span><span id="L-60"><a name="L-60"></a> <span class="k">return</span> <span class="n">X_norm</span><span class="p">,</span> <span class="n">mu</span><span class="p">,</span> <span class="n">sd</span> </span><span id="L-61"><a name="L-61"></a> </span><span id="L-62"><a name="L-62"></a><span class="k">def</span> <span class="nf">graphs</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">X</span><span 
class="p">,</span> <span class="n">m</span><span class="p">,</span> <span class="n">n</span><span class="p">,</span> <span class="n">label</span><span class="p">):</span> </span><span id="L-63"><a name="L-63"></a> <span class="c1"># get indexes of positive and negative examples</span> </span><span id="L-64"><a name="L-64"></a> <span class="n">pos</span> <span class="o">=</span> <span class="p">[</span><span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">m</span><span class="p">)</span> <span class="k">if</span> <span class="n">y</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="mi">1</span><span class="p">]</span> </span><span id="L-65"><a name="L-65"></a> <span class="n">neg</span> <span class="o">=</span> <span class="p">[</span><span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">m</span><span class="p">)</span> <span class="k">if</span> <span class="n">y</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="mi">0</span><span class="p">]</span> </span><span id="L-66"><a name="L-66"></a> </span><span id="L-67"><a name="L-67"></a> <span class="c1"># plot stuff...</span> </span><span id="L-68"><a name="L-68"></a> <span class="k">if</span> <span class="n">show_graphs</span><span class="p">:</span> </span><span id="L-69"><a name="L-69"></a> <span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">n</span><span class="p">):</span> <span class="c1"># miss x0</span> </span><span id="L-70"><a name="L-70"></a> <span class="k">for</span> <span class="n">j2</span> <span 
class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">j</span><span class="o">+</span><span class="mi">1</span><span class="p">,</span> <span class="n">n</span><span class="p">):</span> <span class="c1"># for all combos</span> </span><span id="L-71"><a name="L-71"></a> <span class="n">x1_p</span><span class="p">,</span> <span class="n">x1_n</span> <span class="o">=</span> <span class="p">[</span><span class="n">X</span><span class="p">[</span><span class="n">j</span><span class="p">][</span><span class="n">i</span><span class="p">]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">pos</span><span class="p">],</span> <span class="p">[</span><span class="n">X</span><span class="p">[</span><span class="n">j</span><span class="p">][</span><span class="n">i</span><span class="p">]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">neg</span><span class="p">]</span> </span><span id="L-72"><a name="L-72"></a> <span class="n">x2_p</span><span class="p">,</span> <span class="n">x2_n</span> <span class="o">=</span> <span class="p">[</span><span class="n">X</span><span class="p">[</span><span class="n">j2</span><span class="p">][</span><span class="n">i</span><span class="p">]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">pos</span><span class="p">],</span> <span class="p">[</span><span class="n">X</span><span class="p">[</span><span class="n">j2</span><span class="p">][</span><span class="n">i</span><span class="p">]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">neg</span><span class="p">]</span> </span><span id="L-73"><a name="L-73"></a> <span class="n">fig</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">x1_p</span><span 
class="p">,</span> <span class="n">x2_p</span><span class="p">,</span> <span class="s1">&#39;go&#39;</span><span class="p">,</span> <span class="n">x1_n</span><span class="p">,</span> <span class="n">x2_n</span><span class="p">,</span> <span class="s1">&#39;rx&#39;</span><span class="p">)</span> </span><span id="L-74"><a name="L-74"></a> <span class="n">x_lab</span><span class="p">,</span> <span class="n">y_lab</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">xlabel</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="n">j</span><span class="p">]),</span> <span class="n">plt</span><span class="o">.</span><span class="n">ylabel</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="n">j2</span><span class="p">])</span> </span><span id="L-75"><a name="L-75"></a> <span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> </span><span id="L-76"><a name="L-76"></a> </span><span id="L-77"><a name="L-77"></a><span class="k">def</span> <span class="nf">write_to_file</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">f_name</span><span class="p">):</span> </span><span id="L-78"><a name="L-78"></a> <span class="n">writer</span> <span class="o">=</span> <span class="n">csvwriter</span><span class="p">(</span><span class="nb">open</span><span class="p">(</span><span class="n">f_name</span><span class="p">,</span> <span class="s1">&#39;wb&#39;</span><span class="p">))</span> </span><span id="L-79"><a name="L-79"></a> <span class="n">writer</span><span class="o">.</span><span class="n">writerows</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> </span><span id="L-80"><a name="L-80"></a> </span><span id="L-81"><a name="L-81"></a><span class="k">def</span> <span class="nf">feature_prep</span><span class="p">(</span><span 
class="n">data</span><span class="p">,</span> <span class="n">heads</span><span class="p">,</span> <span class="n">stage</span><span class="o">=</span><span class="s1">&#39;training&#39;</span><span class="p">,</span> <span class="n">use_mean</span><span class="o">=</span><span class="p">[],</span> <span class="n">use_sd</span><span class="o">=</span><span class="p">[],</span> </span><span id="L-82"><a name="L-82"></a> <span class="n">write_output_to_file</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span> <span class="n">cur_loop</span><span class="o">=</span><span class="mi">0</span><span class="p">):</span> </span><span id="L-83"><a name="L-83"></a> <span class="sd">&quot;&quot;&quot;</span> </span><span id="L-84"><a name="L-84"></a><span class="sd"> use_mean and use_sd should not be specified for initial training data, but</span> </span><span id="L-85"><a name="L-85"></a><span class="sd"> then should be passed in for cv, test and predictions (to use the same </span> </span><span id="L-86"><a name="L-86"></a><span class="sd"> feature scaling as inital training data).</span> </span><span id="L-87"><a name="L-87"></a><span class="sd"> &quot;&quot;&quot;</span> </span><span id="L-88"><a name="L-88"></a> </span><span id="L-89"><a name="L-89"></a> <span class="k">if</span> <span class="n">cur_loop</span> <span class="o">==</span> <span class="mi">0</span> <span class="ow">and</span> <span class="n">verbose</span><span class="p">:</span> </span><span id="L-90"><a name="L-90"></a> <span class="n">l_verbose</span> <span class="o">=</span> <span class="bp">True</span> </span><span id="L-91"><a name="L-91"></a> <span class="k">else</span><span class="p">:</span> </span><span id="L-92"><a name="L-92"></a> <span class="n">l_verbose</span> <span class="o">=</span> <span class="bp">False</span> </span><span id="L-93"><a name="L-93"></a> </span><span id="L-94"><a name="L-94"></a> <span class="c1"># load in training data</span> </span><span 
id="L-95"><a name="L-95"></a> <span class="k">if</span> <span class="n">l_verbose</span><span class="p">:</span> <span class="k">print</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">&#39;</span><span class="p">,</span> <span class="s1">&#39;-&#39;</span><span class="o">*</span><span class="mi">80</span><span class="p">,</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">&#39;</span><span class="p">,</span> <span class="s1">&#39;Stage:&#39;</span><span class="p">,</span> <span class="n">stage</span> </span><span id="L-96"><a name="L-96"></a> <span class="n">ids</span> <span class="o">=</span> <span class="bp">None</span> </span><span id="L-97"><a name="L-97"></a> <span class="k">if</span> <span class="n">stage</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s1">&#39;test&#39;</span><span class="p">,</span> <span class="s1">&#39;predict&#39;</span><span class="p">]:</span> </span><span id="L-98"><a name="L-98"></a> <span class="n">feature_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">trn_cols</span><span class="p">)</span> </span><span id="L-99"><a name="L-99"></a> <span class="k">if</span> <span class="n">i</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="n">trn_id_col</span><span class="p">,</span> <span class="n">labels_col</span><span class="p">]</span><span class="o">+</span><span class="n">excl_trn_cols</span><span class="p">]</span> </span><span id="L-100"><a name="L-100"></a> <span class="k">if</span> <span class="n">l_verbose</span><span class="p">:</span> </span><span id="L-101"><a name="L-101"></a> <span class="k">print</span> <span class="s1">&#39;feature_cols:&#39;</span><span class="p">,</span> <span 
class="n">feature_cols</span> </span><span id="L-102"><a name="L-102"></a> <span class="k">print</span> <span class="s1">&#39;trn_cols:&#39;</span><span class="p">,</span> <span class="n">trn_cols</span><span class="p">,</span> <span class="s1">&#39;| trn_id_col:&#39;</span><span class="p">,</span> <span class="n">trn_id_col</span><span class="p">,</span> \ </span><span id="L-103"><a name="L-103"></a> <span class="s1">&#39;| labels_col:&#39;</span><span class="p">,</span> <span class="n">labels_col</span><span class="p">,</span> <span class="s1">&#39;| excl_trn_cols:&#39;</span><span class="p">,</span> <span class="n">excl_trn_cols</span> </span><span id="L-104"><a name="L-104"></a> <span class="n">X</span><span class="p">,</span> <span class="n">y</span><span class="o">=</span> <span class="n">data</span><span class="p">[:,</span> <span class="n">feature_cols</span><span class="p">],</span> <span class="n">data</span><span class="p">[:,</span> <span class="n">labels_col</span><span class="p">]</span> </span><span id="L-105"><a name="L-105"></a> </span><span id="L-106"><a name="L-106"></a> <span class="c1"># filter heads</span> </span><span id="L-107"><a name="L-107"></a> <span class="k">if</span> <span class="n">heads_in_trn_file</span><span class="p">:</span> </span><span id="L-108"><a name="L-108"></a> <span class="n">heads</span> <span class="o">=</span> <span class="p">[</span><span class="n">heads</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">n</span><span class="p">)</span> <span class="k">if</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">feature_cols</span><span class="p">]</span> </span><span id="L-109"><a name="L-109"></a> </span><span id="L-110"><a name="L-110"></a> <span class="k">else</span><span class="p">:</span> </span><span id="L-111"><a 
name="L-111"></a> <span class="n">feature_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">tst_cols</span><span class="p">)</span> </span><span id="L-112"><a name="L-112"></a> <span class="k">if</span> <span class="n">i</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="n">tst_id_col</span><span class="p">]</span><span class="o">+</span><span class="n">excl_tst_cols</span><span class="p">]</span> </span><span id="L-113"><a name="L-113"></a> <span class="k">if</span> <span class="n">l_verbose</span><span class="p">:</span> </span><span id="L-114"><a name="L-114"></a> <span class="k">print</span> <span class="s1">&#39;feature_cols:&#39;</span><span class="p">,</span> <span class="n">feature_cols</span> </span><span id="L-115"><a name="L-115"></a> <span class="k">print</span> <span class="s1">&#39;tst_cols:&#39;</span><span class="p">,</span> <span class="n">tst_cols</span><span class="p">,</span> <span class="s1">&#39;| tst_id_col:&#39;</span><span class="p">,</span> <span class="n">tst_id_col</span><span class="p">,</span> \ </span><span id="L-116"><a name="L-116"></a> <span class="s1">&#39;| excl_tst_cols:&#39;</span><span class="p">,</span> <span class="n">excl_tst_cols</span> </span><span id="L-117"><a name="L-117"></a> <span class="n">X</span><span class="p">,</span> <span class="n">y</span> <span class="o">=</span> <span class="n">data</span><span class="p">[:,</span> <span class="n">feature_cols</span><span class="p">],</span> <span class="bp">None</span> </span><span id="L-118"><a name="L-118"></a> </span><span id="L-119"><a name="L-119"></a> <span class="c1"># filter heads</span> </span><span id="L-120"><a name="L-120"></a> <span class="k">if</span> <span class="n">heads_in_tst_file</span><span class="p">:</span> </span><span 
id="L-121"><a name="L-121"></a> <span class="n">heads</span> <span class="o">=</span> <span class="p">[</span><span class="n">heads</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">n</span><span class="p">)</span> <span class="k">if</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">feature_cols</span><span class="p">]</span> </span><span id="L-122"><a name="L-122"></a> </span><span id="L-123"><a name="L-123"></a> <span class="c1"># record tst id columns if needed</span> </span><span id="L-124"><a name="L-124"></a> <span class="k">if</span> <span class="n">tst_id_col</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">:</span> </span><span id="L-125"><a name="L-125"></a> <span class="n">data</span><span class="p">[:,</span> <span class="n">tst_id_col</span><span class="p">]</span> </span><span id="L-126"><a name="L-126"></a> </span><span id="L-127"><a name="L-127"></a> <span class="n">m</span><span class="p">,</span> <span class="n">n</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">size</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">np</span><span class="o">.</span><span class="n">size</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="c1"># no of rows and cols</span> </span><span id="L-128"><a name="L-128"></a> </span><span id="L-129"><a name="L-129"></a> <span class="k">if</span> <span class="n">l_verbose</span><span class="p">:</span> </span><span id="L-130"><a name="L-130"></a> <span class="k">print</span> <span class="s1">&#39;Heads used:</span><span class="se">\n</span><span 
class="s1">&#39;</span><span class="p">,</span> <span class="s1">&#39;, &#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">heads</span><span class="p">)</span> </span><span id="L-131"><a name="L-131"></a> <span class="k">print</span> <span class="s1">&#39;X:&#39;</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">shape</span><span class="p">(</span><span class="n">X</span><span class="p">),</span> <span class="s1">&#39;y:&#39;</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">shape</span><span class="p">(</span><span class="n">y</span><span class="p">),</span> <span class="s1">&#39;ids:&#39;</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">shape</span><span class="p">(</span><span class="n">ids</span><span class="p">),</span> \ </span><span id="L-132"><a name="L-132"></a> <span class="s1">&#39;data:&#39;</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">shape</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="s1">&#39;m:&#39;</span><span class="p">,</span> <span class="n">m</span><span class="p">,</span> <span class="s1">&#39;n:&#39;</span><span class="p">,</span> <span class="n">n</span><span class="p">,</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">&#39;</span> </span><span id="L-133"><a name="L-133"></a> </span><span id="L-134"><a name="L-134"></a> <span class="c1"># fill blanks with averages</span> </span><span id="L-135"><a name="L-135"></a> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">use_mean</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span> 
</span><span id="L-136"><a name="L-136"></a> <span class="n">col_default</span> <span class="o">=</span> <span class="n">use_mean</span> </span><span id="L-137"><a name="L-137"></a> <span class="k">else</span><span class="p">:</span> </span><span id="L-138"><a name="L-138"></a> <span class="c1"># calc means of cols if not passed in as args</span> </span><span id="L-139"><a name="L-139"></a> <span class="n">col_default</span> <span class="o">=</span> <span class="n">use_mean</span> <span class="o">=</span> <span class="n">nanmean</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span> </span><span id="L-140"><a name="L-140"></a> <span class="n">inds</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">isnan</span><span class="p">(</span><span class="n">X</span><span class="p">))</span> <span class="c1"># find indicies of empty cells to be replaced</span> </span><span id="L-141"><a name="L-141"></a> <span class="n">X</span><span class="p">[</span><span class="n">inds</span><span class="p">]</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="n">col_default</span><span class="p">,</span> <span class="n">inds</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span> </span><span id="L-142"><a name="L-142"></a> </span><span id="L-143"><a name="L-143"></a> <span class="k">if</span> <span class="n">show_graphs</span><span class="p">:</span> </span><span id="L-144"><a name="L-144"></a> <span class="n">graphs</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">X</span><span class="p">,</span> <span class="n">m</span><span class="p">,</span> <span 
class="n">n</span><span class="p">,</span> <span class="n">heads</span><span class="p">)</span> </span><span id="L-145"><a name="L-145"></a> </span><span id="L-146"><a name="L-146"></a> <span class="c1">#test</span> </span><span id="L-147"><a name="L-147"></a> <span class="k">if</span> <span class="n">l_verbose</span><span class="p">:</span> </span><span id="L-148"><a name="L-148"></a> <span class="k">print</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">First ten rows before normalisation:&#39;</span> </span><span id="L-149"><a name="L-149"></a> <span class="n">np</span><span class="o">.</span><span class="n">set_printoptions</span><span class="p">(</span><span class="n">precision</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">suppress</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span> </span><span id="L-150"><a name="L-150"></a> <span class="k">print</span> <span class="n">X</span><span class="p">[:</span><span class="mi">10</span><span class="p">,</span> <span class="p">:],</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">&#39;</span> </span><span id="L-151"><a name="L-151"></a> </span><span id="L-152"><a name="L-152"></a> <span class="c1"># scale the features &amp; write output</span> </span><span id="L-153"><a name="L-153"></a> <span class="n">X</span><span class="p">,</span> <span class="n">mu</span><span class="p">,</span> <span class="n">sd</span> <span class="o">=</span> <span class="n">scale_features</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">use_mean</span><span class="p">,</span> <span class="n">use_sd</span><span class="p">,</span> <span class="n">m</span><span class="p">,</span> <span class="n">n</span><span class="p">)</span> </span><span id="L-154"><a name="L-154"></a> <span class="k">if</span> <span class="n">write_output_to_file</span><span 
class="p">:</span> </span><span id="L-155"><a name="L-155"></a> <span class="n">write_to_file</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">os</span><span class="o">.</span><span class="n">join</span><span class="o">.</span><span class="n">path</span><span class="p">(</span><span class="n">default_dir</span><span class="p">,</span> <span class="s1">&#39;X_&#39;</span><span class="o">+</span><span class="n">stage</span><span class="o">+</span><span class="s1">&#39;.csv&#39;</span><span class="p">))</span> </span><span id="L-156"><a name="L-156"></a> </span><span id="L-157"><a name="L-157"></a> <span class="c1">#test</span> </span><span id="L-158"><a name="L-158"></a> <span class="k">if</span> <span class="n">l_verbose</span><span class="p">:</span> </span><span id="L-159"><a name="L-159"></a> <span class="k">print</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">First ten rows after normalisation:&#39;</span> </span><span id="L-160"><a name="L-160"></a> <span class="k">print</span> <span class="n">X</span><span class="p">[:</span><span class="mi">10</span><span class="p">,</span> <span class="mi">1</span><span class="p">:],</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">&#39;</span> </span><span id="L-161"><a name="L-161"></a> </span><span id="L-162"><a name="L-162"></a> <span class="k">return</span> <span class="n">X</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">mu</span><span class="p">,</span> <span class="n">sd</span><span class="p">,</span> <span class="n">ids</span> </span><span id="L-163"><a name="L-163"></a> </span><span id="L-164"><a name="L-164"></a><span class="k">def</span> <span class="nf">conv</span><span class="p">(</span><span class="n">val</span><span class="p">):</span> </span><span id="L-165"><a name="L-165"></a> <span class="k">try</span><span class="p">:</span> </span><span id="L-166"><a 
name="L-166"></a> <span class="k">return</span> <span class="nb">float</span><span class="p">(</span><span class="n">val</span><span class="p">)</span> </span><span id="L-167"><a name="L-167"></a> <span class="k">except</span><span class="p">:</span> </span><span id="L-168"><a name="L-168"></a> <span class="k">if</span> <span class="ow">not</span> <span class="n">val</span><span class="p">:</span> </span><span id="L-169"><a name="L-169"></a> <span class="k">return</span> <span class="bp">None</span> </span><span id="L-170"><a name="L-170"></a> <span class="k">return</span> <span class="nb">float</span><span class="p">(</span><span class="nb">sum</span><span class="p">([</span><span class="nb">ord</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">str</span><span class="p">(</span><span class="n">val</span><span class="p">)]))</span> <span class="c1"># sum of ascii vals</span> </span><span id="L-171"><a name="L-171"></a> </span><span id="L-172"><a name="L-172"></a><span class="k">def</span> <span class="nf">import_data</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s1">&#39;training&#39;</span><span class="p">):</span> </span><span id="L-173"><a name="L-173"></a> <span class="k">global</span> <span class="n">trn_cols</span><span class="p">,</span> <span class="n">tst_cols</span> </span><span id="L-174"><a name="L-174"></a> </span><span id="L-175"><a name="L-175"></a> <span class="c1"># get input file (features and labels)</span> </span><span id="L-176"><a name="L-176"></a> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">sys</span><span class="o">.</span><span class="n">argv</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span> </span><span id="L-177"><a name="L-177"></a> <span class="k">if</span> 
<span class="n">mode</span> <span class="o">==</span> <span class="s1">&#39;training&#39;</span><span class="p">:</span> </span><span id="L-178"><a name="L-178"></a> <span class="n">fname</span> <span class="o">=</span> <span class="n">sys</span><span class="o">.</span><span class="n">argv</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> </span><span id="L-179"><a name="L-179"></a> <span class="k">elif</span> <span class="nb">len</span><span class="p">(</span><span class="n">sys</span><span class="o">.</span><span class="n">argv</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">2</span><span class="p">:</span> </span><span id="L-180"><a name="L-180"></a> <span class="n">fname</span> <span class="o">=</span> <span class="n">sys</span><span class="o">.</span><span class="n">argv</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span> </span><span id="L-181"><a name="L-181"></a> <span class="k">else</span><span class="p">:</span> </span><span id="L-182"><a name="L-182"></a> <span class="k">if</span> <span class="n">mode</span> <span class="o">==</span> <span class="s1">&#39;training&#39;</span><span class="p">:</span> </span><span id="L-183"><a name="L-183"></a> <span class="n">fname</span> <span class="o">=</span> <span class="n">default_trn_path</span> </span><span id="L-184"><a name="L-184"></a> <span class="k">else</span><span class="p">:</span> </span><span id="L-185"><a name="L-185"></a> <span class="n">fname</span> <span class="o">=</span> <span class="n">default_tst_path</span> </span><span id="L-186"><a name="L-186"></a> </span><span id="L-187"><a name="L-187"></a> <span class="k">if</span> <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">fname</span><span class="p">):</span> </span><span id="L-188"><a name="L-188"></a> <span 
class="k">print</span> <span class="s2">&quot;usage:&quot;</span><span class="p">,</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">sys</span><span class="o">.</span><span class="n">argv</span><span class="p">[</span><span class="mi">0</span><span class="p">])[</span><span class="mi">1</span><span class="p">],</span> <span class="s2">&quot;[default_trn_path]&quot;</span><span class="p">,</span> \ </span><span id="L-189"><a name="L-189"></a> <span class="s2">&quot;[default_tst_path]&quot;</span> </span><span id="L-190"><a name="L-190"></a> <span class="k">print</span> <span class="s2">&quot;Valid file paths must be provided as an arg or global varables&quot;</span> </span><span id="L-191"><a name="L-191"></a> <span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="s2">&quot;invalid input&quot;</span><span class="p">)</span> </span><span id="L-192"><a name="L-192"></a> </span><span id="L-193"><a name="L-193"></a> <span class="c1"># get heads</span> </span><span id="L-194"><a name="L-194"></a> <span class="n">reader</span> <span class="o">=</span> <span class="n">csvreader</span><span class="p">(</span><span class="nb">open</span><span class="p">(</span><span class="n">fname</span><span class="p">,</span> <span class="s1">&#39;rb&#39;</span><span class="p">))</span> </span><span id="L-195"><a name="L-195"></a> <span class="n">r</span><span class="p">,</span> <span class="n">start_row</span><span class="p">,</span> <span class="n">heads</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="p">[]</span> </span><span id="L-196"><a name="L-196"></a> <span class="k">for</span> <span class="n">row</span> <span class="ow">in</span> <span class="n">reader</span><span class="p">:</span> </span><span 
id="L-197"><a name="L-197"></a> <span class="k">if</span> <span class="n">r</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> </span><span id="L-198"><a name="L-198"></a> <span class="c1"># get no of cols in data</span> </span><span id="L-199"><a name="L-199"></a> <span class="k">if</span> <span class="n">mode</span> <span class="o">==</span> <span class="s1">&#39;training&#39;</span><span class="p">:</span> </span><span id="L-200"><a name="L-200"></a> <span class="n">trn_cols</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">row</span><span class="p">)</span> </span><span id="L-201"><a name="L-201"></a> <span class="k">if</span> <span class="n">heads_in_trn_file</span><span class="p">:</span> </span><span id="L-202"><a name="L-202"></a> <span class="n">heads</span> <span class="o">=</span> <span class="n">row</span> </span><span id="L-203"><a name="L-203"></a> <span class="n">start_row</span> <span class="o">+=</span> <span class="mi">1</span> </span><span id="L-204"><a name="L-204"></a> <span class="k">else</span><span class="p">:</span> </span><span id="L-205"><a name="L-205"></a> <span class="n">tst_cols</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">row</span><span class="p">)</span> </span><span id="L-206"><a name="L-206"></a> <span class="k">if</span> <span class="n">heads_in_tst_file</span><span class="p">:</span> </span><span id="L-207"><a name="L-207"></a> <span class="n">heads</span> <span class="o">=</span> <span class="n">row</span> </span><span id="L-208"><a name="L-208"></a> <span class="n">start_row</span> <span class="o">+=</span> <span class="mi">1</span> </span><span id="L-209"><a name="L-209"></a> <span class="n">r</span> <span class="o">+=</span> <span class="mi">1</span> </span><span id="L-210"><a name="L-210"></a> <span class="k">else</span><span class="p">:</span> </span><span id="L-211"><a name="L-211"></a> 
<span class="k">break</span> </span><span id="L-212"><a name="L-212"></a> </span><span id="L-213"><a name="L-213"></a> <span class="c1"># build a dict to map each col to a conv func (if not excl)</span> </span><span id="L-214"><a name="L-214"></a> <span class="k">if</span> <span class="n">mode</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s1">&#39;test&#39;</span><span class="p">,</span> <span class="s1">&#39;predict&#39;</span><span class="p">]:</span> </span><span id="L-215"><a name="L-215"></a> <span class="n">cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">trn_cols</span><span class="p">)</span> <span class="k">if</span> <span class="n">i</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">excl_trn_cols</span><span class="p">]</span> </span><span id="L-216"><a name="L-216"></a> <span class="n">conv_dict</span> <span class="o">=</span> <span class="p">{</span><span class="n">c</span><span class="p">:</span> <span class="n">conv</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">}</span> </span><span id="L-217"><a name="L-217"></a> <span class="k">else</span><span class="p">:</span> </span><span id="L-218"><a name="L-218"></a> <span class="n">cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">tst_cols</span><span class="p">)</span> <span class="k">if</span> <span class="n">i</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">excl_tst_cols</span><span class="p">]</span> </span><span id="L-219"><a name="L-219"></a> <span 
class="n">conv_dict</span> <span class="o">=</span> <span class="p">{</span><span class="n">c</span><span class="p">:</span> <span class="n">conv</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">}</span> </span><span id="L-220"><a name="L-220"></a> </span><span id="L-221"><a name="L-221"></a> <span class="k">if</span> <span class="n">verbose</span><span class="p">:</span> </span><span id="L-222"><a name="L-222"></a> <span class="k">print</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">Data import:&#39;</span><span class="p">,</span> <span class="n">mode</span><span class="p">,</span> <span class="s1">&#39;| cols:&#39;</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">&#39;</span> </span><span id="L-223"><a name="L-223"></a> </span><span id="L-224"><a name="L-224"></a> <span class="c1"># import data</span> </span><span id="L-225"><a name="L-225"></a> <span class="c1"># not excluding unneeded cols, import all, just without conversions</span> </span><span id="L-226"><a name="L-226"></a> <span class="c1"># they are exlcuded later in feature_prep</span> </span><span id="L-227"><a name="L-227"></a> <span class="n">data</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">genfromtxt</span><span class="p">(</span><span class="n">fname</span><span class="p">,</span> <span class="n">delimiter</span><span class="o">=</span><span class="n">delim</span><span class="p">,</span> <span class="n">converters</span><span class="o">=</span><span class="n">conv_dict</span><span class="p">,</span> </span><span id="L-228"><a name="L-228"></a> <span class="n">skip_header</span><span class="o">=</span><span class="n">start_row</span><span class="p">)</span> </span><span id="L-229"><a name="L-229"></a> </span><span id="L-230"><a 
name="L-230"></a> <span class="k">if</span> <span class="n">verbose</span><span class="p">:</span> </span><span id="L-231"><a name="L-231"></a> <span class="k">print</span> <span class="s1">&#39;all heads:</span><span class="se">\n</span><span class="s1">&#39;</span><span class="p">,</span> <span class="s1">&#39;, &#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">heads</span><span class="p">),</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">&#39;</span> </span><span id="L-232"><a name="L-232"></a> <span class="k">print</span> <span class="s1">&#39;shape of data:&#39;</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">shape</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> </span><span id="L-233"><a name="L-233"></a> <span class="k">print</span> <span class="n">data</span> </span><span id="L-234"><a name="L-234"></a> </span><span id="L-235"><a name="L-235"></a> <span class="k">return</span> <span class="n">data</span><span class="p">,</span> <span class="n">heads</span> </span><span id="L-236"><a name="L-236"></a> </span><span id="L-237"><a name="L-237"></a><span class="k">def</span> <span class="nf">split_trn_data</span><span class="p">(</span><span class="n">data</span><span class="p">):</span> </span><span id="L-238"><a name="L-238"></a> <span class="n">m</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">size</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> </span><span id="L-239"><a name="L-239"></a> <span class="n">rands</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span 
class="n">random_sample</span><span class="p">(</span><span class="n">m</span><span class="p">)</span> </span><span id="L-240"><a name="L-240"></a> </span><span id="L-241"><a name="L-241"></a> <span class="c1"># select cases where random no from above is &lt;= threshold</span> </span><span id="L-242"><a name="L-242"></a> <span class="n">trn_data</span> <span class="o">=</span> <span class="n">data</span><span class="p">[</span><span class="n">rands</span> <span class="o">&lt;=</span> <span class="p">(</span><span class="n">trn_perc</span><span class="o">/</span><span class="mi">100</span><span class="p">),</span> <span class="p">:]</span> </span><span id="L-243"><a name="L-243"></a> <span class="n">cv_data</span> <span class="o">=</span> <span class="n">data</span><span class="p">[</span><span class="n">rands</span> <span class="o">&gt;</span> <span class="p">(</span><span class="n">trn_perc</span><span class="o">/</span><span class="mi">100</span><span class="p">),</span> <span class="p">:]</span> </span><span id="L-244"><a name="L-244"></a> </span><span id="L-245"><a name="L-245"></a> <span class="k">return</span> <span class="n">trn_data</span><span class="p">,</span> <span class="n">cv_data</span> </span><span id="L-246"><a name="L-246"></a> </span><span id="L-247"><a name="L-247"></a><span class="k">def</span> <span class="nf">build_classifier</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">reg</span><span class="p">):</span> </span><span id="L-248"><a name="L-248"></a> <span class="c1"># rbf is guassian kernal</span> </span><span id="L-249"><a name="L-249"></a> <span class="n">clf</span> <span class="o">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">SVC</span><span class="p">(</span><span class="n">kernel</span><span class="o">=</span><span class="s1">&#39;rbf&#39;</span><span class="p">,</span> <span class="n">C</span><span 
class="o">=</span><span class="n">reg</span><span class="p">,</span> <span class="n">cache_size</span><span class="o">=</span><span class="mi">1000</span><span class="p">)</span> </span><span id="L-250"><a name="L-250"></a> <span class="k">return</span> <span class="n">clf</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">y</span><span class="p">)</span> </span><span id="L-251"><a name="L-251"></a> </span><span id="L-252"><a name="L-252"></a><span class="k">def</span> <span class="nf">main</span><span class="p">():</span> </span><span id="L-253"><a name="L-253"></a> </span><span id="L-254"><a name="L-254"></a> <span class="k">global</span> <span class="n">adj_rate</span> </span><span id="L-255"><a name="L-255"></a> <span class="n">reg</span><span class="p">,</span> <span class="n">reg_loop</span><span class="p">,</span> <span class="n">reg_dir</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="s1">&#39;up&#39;</span> </span><span id="L-256"><a name="L-256"></a> <span class="n">reg_rec</span><span class="p">,</span> <span class="n">trn_rec</span><span class="p">,</span> <span class="n">cv_rec</span> <span class="o">=</span> <span class="p">[],</span> <span class="p">[],</span> <span class="p">[]</span> </span><span id="L-257"><a name="L-257"></a> </span><span id="L-258"><a name="L-258"></a> <span class="c1"># import training data</span> </span><span id="L-259"><a name="L-259"></a> <span class="n">data</span><span class="p">,</span> <span class="n">heads</span> <span class="o">=</span> <span class="n">import_data</span><span class="p">(</span><span class="s1">&#39;training&#39;</span><span class="p">)</span> </span><span id="L-260"><a name="L-260"></a> </span><span id="L-261"><a name="L-261"></a> <span class="k">while</span> <span class="n">reg_loop</span> <span 
class="o">&lt;</span> <span class="n">auto_find_loops</span><span class="p">:</span> </span><span id="L-262"><a name="L-262"></a> </span><span id="L-263"><a name="L-263"></a> <span class="n">trn</span><span class="p">,</span> <span class="n">cv</span><span class="p">,</span> <span class="o">=</span> <span class="p">[],</span> <span class="p">[]</span> </span><span id="L-264"><a name="L-264"></a> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">iterations</span><span class="p">):</span> </span><span id="L-265"><a name="L-265"></a> </span><span id="L-266"><a name="L-266"></a> <span class="c1"># split data into training and cross validation groups</span> </span><span id="L-267"><a name="L-267"></a> <span class="n">trn_data</span><span class="p">,</span> <span class="n">cv_data</span> <span class="o">=</span> <span class="n">split_trn_data</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> </span><span id="L-268"><a name="L-268"></a> <span class="k">if</span> <span class="n">verbose</span><span class="p">:</span> </span><span id="L-269"><a name="L-269"></a> <span class="k">print</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">Size of training data:&#39;</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">shape</span><span class="p">(</span><span class="n">trn_data</span><span class="p">)</span> </span><span id="L-270"><a name="L-270"></a> <span class="k">print</span> <span class="s1">&#39;Size of cross val data:&#39;</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">shape</span><span class="p">(</span><span class="n">cv_data</span><span class="p">)</span> </span><span id="L-271"><a name="L-271"></a> </span><span id="L-272"><a name="L-272"></a> <span class="c1"># prep training data and build classifier</span> </span><span 
id="L-273"><a name="L-273"></a> <span class="n">X_trn</span><span class="p">,</span> <span class="n">y_trn</span><span class="p">,</span> <span class="n">mu</span><span class="p">,</span> <span class="n">sd</span><span class="p">,</span> <span class="n">ids</span> <span class="o">=</span> <span class="n">feature_prep</span><span class="p">(</span><span class="n">trn_data</span><span class="p">,</span> <span class="n">heads</span><span class="p">,</span> </span><span id="L-274"><a name="L-274"></a> <span class="s1">&#39;training&#39;</span><span class="p">,</span> </span><span id="L-275"><a name="L-275"></a> <span class="p">[],</span> <span class="p">[],</span> <span class="n">verbose</span><span class="p">,</span> <span class="n">i</span><span class="p">)</span> </span><span id="L-276"><a name="L-276"></a> <span class="n">clf</span> <span class="o">=</span> <span class="n">build_classifier</span><span class="p">(</span><span class="n">X_trn</span><span class="p">,</span> <span class="n">y_trn</span><span class="p">,</span> <span class="n">reg</span><span class="p">)</span> </span><span id="L-277"><a name="L-277"></a> </span><span id="L-278"><a name="L-278"></a> <span class="c1"># training accuracy</span> </span><span id="L-279"><a name="L-279"></a> <span class="n">trn_pred</span> <span class="o">=</span> <span class="n">clf</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">X_trn</span><span class="p">)</span> </span><span id="L-280"><a name="L-280"></a> <span class="n">trn_accuracy</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">-</span> <span class="p">(</span><span class="nb">sum</span><span class="p">(</span><span class="nb">abs</span><span class="p">(</span><span class="n">y_trn</span> <span class="o">-</span> <span class="n">trn_pred</span><span class="p">))</span> <span class="o">/</span> <span class="nb">len</span><span class="p">(</span><span class="n">X_trn</span><span 
class="p">))</span> </span><span id="L-281"><a name="L-281"></a> <span class="n">trn</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">trn_accuracy</span><span class="p">)</span> </span><span id="L-282"><a name="L-282"></a> </span><span id="L-283"><a name="L-283"></a> <span class="c1"># load prepare cv set</span> </span><span id="L-284"><a name="L-284"></a> <span class="k">if</span> <span class="n">trn_perc</span> <span class="o">&lt;</span> <span class="mi">100</span><span class="p">:</span> </span><span id="L-285"><a name="L-285"></a> <span class="n">X_cv</span><span class="p">,</span> <span class="n">y_cv</span><span class="p">,</span> <span class="n">mu</span><span class="p">,</span> <span class="n">sd</span><span class="p">,</span> <span class="n">ids</span> <span class="o">=</span> <span class="n">feature_prep</span><span class="p">(</span><span class="n">cv_data</span><span class="p">,</span> <span class="n">heads</span><span class="p">,</span> <span class="s1">&#39;cv&#39;</span><span class="p">,</span> </span><span id="L-286"><a name="L-286"></a> <span class="n">mu</span><span class="p">,</span> <span class="n">sd</span><span class="p">,</span> <span class="n">verbose</span><span class="p">,</span> <span class="n">i</span><span class="p">)</span> </span><span id="L-287"><a name="L-287"></a> </span><span id="L-288"><a name="L-288"></a> <span class="c1"># cv accuracy</span> </span><span id="L-289"><a name="L-289"></a> <span class="n">cv_pred</span> <span class="o">=</span> <span class="n">clf</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">X_cv</span><span class="p">)</span> </span><span id="L-290"><a name="L-290"></a> <span class="n">cv_accuracy</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">-</span> <span class="p">(</span><span class="nb">sum</span><span class="p">(</span><span class="nb">abs</span><span class="p">(</span><span 
class="n">y_cv</span> <span class="o">-</span> <span class="n">cv_pred</span><span class="p">))</span> <span class="o">/</span> <span class="nb">len</span><span class="p">(</span><span class="n">X_cv</span><span class="p">))</span> </span><span id="L-291"><a name="L-291"></a> <span class="n">cv</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">cv_accuracy</span><span class="p">)</span> </span><span id="L-292"><a name="L-292"></a> </span><span id="L-293"><a name="L-293"></a> <span class="n">reg_rec</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">reg</span><span class="p">)</span> </span><span id="L-294"><a name="L-294"></a> <span class="n">trn_rec</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">trn</span><span class="p">))</span> </span><span id="L-295"><a name="L-295"></a> <span class="k">if</span> <span class="n">trn_perc</span> <span class="o">&lt;</span> <span class="mi">100</span><span class="p">:</span> </span><span id="L-296"><a name="L-296"></a> <span class="n">cv_rec</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">cv</span><span class="p">))</span> </span><span id="L-297"><a name="L-297"></a> <span class="k">else</span><span class="p">:</span> </span><span id="L-298"><a name="L-298"></a> <span class="n">cv_rec</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> </span><span id="L-299"><a name="L-299"></a> </span><span id="L-300"><a name="L-300"></a> <span class="k">if</span> <span class="n">reg_loop</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> 
</span><span id="L-301"><a name="L-301"></a> <span class="k">print</span> <span class="s1">&#39;Loop | C param | Trn accuracy | CV accuracy | Dir&#39;</span> </span><span id="L-302"><a name="L-302"></a> <span class="k">print</span> <span class="s1">&#39;-----------------------------------------------------------&#39;</span> </span><span id="L-303"><a name="L-303"></a> </span><span id="L-304"><a name="L-304"></a> <span class="n">better</span> <span class="o">=</span> <span class="p">(</span><span class="n">reg_loop</span> <span class="o">==</span> <span class="mi">0</span> <span class="ow">or</span> <span class="n">cv_rec</span><span class="p">[</span><span class="n">reg_loop</span><span class="p">]</span> <span class="o">&gt;</span> <span class="n">cv_rec</span><span class="p">[</span><span class="n">reg_loop</span><span class="o">-</span><span class="mi">1</span><span class="p">])</span> </span><span id="L-305"><a name="L-305"></a> </span><span id="L-306"><a name="L-306"></a> <span class="c1"># switch direction &amp; reduce adj_rate if not getting better</span> </span><span id="L-307"><a name="L-307"></a> <span class="k">if</span> <span class="ow">not</span> <span class="n">better</span><span class="p">:</span> </span><span id="L-308"><a name="L-308"></a> <span class="n">adj_rate</span> <span class="o">*=</span> <span class="mf">0.95</span> </span><span id="L-309"><a name="L-309"></a> <span class="k">if</span> <span class="n">reg_dir</span> <span class="o">==</span> <span class="s1">&#39;up&#39;</span><span class="p">:</span> </span><span id="L-310"><a name="L-310"></a> <span class="n">reg_dir</span> <span class="o">=</span> <span class="s1">&#39;down&#39;</span> </span><span id="L-311"><a name="L-311"></a> <span class="k">else</span><span class="p">:</span> </span><span id="L-312"><a name="L-312"></a> <span class="n">reg_dir</span> <span class="o">=</span> <span class="s1">&#39;up&#39;</span> </span><span id="L-313"><a name="L-313"></a> </span><span id="L-314"><a 
name="L-314"></a> <span class="k">try</span><span class="p">:</span> </span><span id="L-315"><a name="L-315"></a> <span class="k">print</span> <span class="nb">str</span><span class="p">(</span><span class="n">reg_loop</span><span class="p">)</span> <span class="o">+</span> <span class="s1">&#39; &#39;</span> <span class="o">*</span> <span class="p">(</span><span class="mi">6</span> <span class="o">-</span> <span class="nb">len</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">reg_loop</span><span class="p">)))</span> <span class="o">+</span> <span class="s1">&#39;|&#39;</span> <span class="o">+</span> \ </span><span id="L-316"><a name="L-316"></a> <span class="s1">&#39; &#39;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="nb">round</span><span class="p">(</span><span class="n">reg</span><span class="p">,</span> <span class="mi">3</span><span class="p">))</span> <span class="o">+</span> \ </span><span id="L-317"><a name="L-317"></a> <span class="s1">&#39; &#39;</span> <span class="o">*</span> <span class="p">(</span><span class="mi">9</span> <span class="o">-</span> <span class="nb">len</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="nb">round</span><span class="p">(</span><span class="n">reg</span><span class="p">,</span> <span class="mi">3</span><span class="p">))))</span> <span class="o">+</span> <span class="s1">&#39;|&#39;</span> <span class="o">+</span> \ </span><span id="L-318"><a name="L-318"></a> <span class="s1">&#39; &#39;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="nb">round</span><span class="p">(</span><span class="n">trn_rec</span><span class="p">[</span><span class="n">reg_loop</span><span class="p">],</span> <span class="mi">9</span><span class="p">))</span> <span class="o">+</span> \ </span><span id="L-319"><a name="L-319"></a> <span class="s1">&#39; 
&#39;</span> <span class="o">*</span> <span class="p">(</span><span class="mi">14</span> <span class="o">-</span> <span class="nb">len</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="nb">round</span><span class="p">(</span><span class="n">trn_rec</span><span class="p">[</span><span class="n">reg_loop</span><span class="p">],</span> <span class="mi">9</span><span class="p">))))</span> <span class="o">+</span> <span class="s1">&#39;|&#39;</span> <span class="o">+</span> \ </span><span id="L-320"><a name="L-320"></a> <span class="s1">&#39; &#39;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="nb">round</span><span class="p">(</span><span class="n">cv_rec</span><span class="p">[</span><span class="n">reg_loop</span><span class="p">],</span> <span class="mi">9</span><span class="p">))</span> <span class="o">+</span> \ </span><span id="L-321"><a name="L-321"></a> <span class="s1">&#39; &#39;</span> <span class="o">*</span> <span class="p">(</span><span class="mi">14</span> <span class="o">-</span> <span class="nb">len</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="nb">round</span><span class="p">(</span><span class="n">cv_rec</span><span class="p">[</span><span class="n">reg_loop</span><span class="p">],</span> <span class="mi">9</span><span class="p">))))</span> <span class="o">+</span> <span class="s1">&#39;|&#39;</span> <span class="o">+</span> \ </span><span id="L-322"><a name="L-322"></a> <span class="s1">&#39; &#39;</span> <span class="o">+</span> <span class="n">reg_dir</span> </span><span id="L-323"><a name="L-323"></a> <span class="k">except</span><span class="p">:</span> </span><span id="L-324"><a name="L-324"></a> <span class="k">print</span> <span class="n">reg_loop</span><span class="p">,</span> <span class="n">reg</span><span class="p">,</span> <span class="n">trn_rec</span><span class="p">[</span><span 
class="n">reg_loop</span><span class="p">],</span> <span class="n">cv_rec</span><span class="p">[</span><span class="n">reg_loop</span><span class="p">],</span> <span class="n">reg_dir</span> </span><span id="L-325"><a name="L-325"></a> <span class="k">pass</span> </span><span id="L-326"><a name="L-326"></a> </span><span id="L-327"><a name="L-327"></a> <span class="k">if</span> <span class="n">reg_dir</span> <span class="o">==</span> <span class="s1">&#39;up&#39;</span><span class="p">:</span> </span><span id="L-328"><a name="L-328"></a> <span class="n">reg</span> <span class="o">*=</span> <span class="n">adj_rate</span> </span><span id="L-329"><a name="L-329"></a> <span class="k">else</span><span class="p">:</span> </span><span id="L-330"><a name="L-330"></a> <span class="n">reg</span> <span class="o">/=</span> <span class="n">adj_rate</span> </span><span id="L-331"><a name="L-331"></a> </span><span id="L-332"><a name="L-332"></a> <span class="n">reg_loop</span> <span class="o">+=</span> <span class="mi">1</span> </span><span id="L-333"><a name="L-333"></a> </span><span id="L-334"><a name="L-334"></a> <span class="c1"># load in test data and run through the same prep / normalisation</span> </span><span id="L-335"><a name="L-335"></a> <span class="n">t_data</span><span class="p">,</span> <span class="n">t_heads</span> <span class="o">=</span> <span class="n">import_data</span><span class="p">(</span><span class="s1">&#39;test&#39;</span><span class="p">)</span> </span><span id="L-336"><a name="L-336"></a> <span class="n">X</span><span class="p">,</span> <span class="n">tmp_y</span><span class="p">,</span> <span class="n">mu</span><span class="p">,</span> <span class="n">sd</span><span class="p">,</span> <span class="n">ids</span> <span class="o">=</span> <span class="n">feature_prep</span><span class="p">(</span><span class="n">t_data</span><span class="p">,</span> <span class="n">t_heads</span><span class="p">,</span> <span class="s1">&#39;test&#39;</span><span 
class="p">,</span> </span><span id="L-337"><a name="L-337"></a> <span class="n">mu</span><span class="p">,</span> <span class="n">sd</span><span class="p">,</span> <span class="n">verbose</span><span class="p">,</span> <span class="n">i</span><span class="p">)</span> </span><span id="L-338"><a name="L-338"></a> </span><span id="L-339"><a name="L-339"></a> <span class="c1"># get predictions and make each item an int in a sublist (required format)</span> </span><span id="L-340"><a name="L-340"></a> <span class="n">y</span> <span class="o">=</span> <span class="n">clf</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">X</span><span class="p">)</span> </span><span id="L-341"><a name="L-341"></a> <span class="k">print</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">Found&#39;</span><span class="p">,</span> <span class="nb">int</span><span class="p">(</span><span class="nb">sum</span><span class="p">(</span><span class="n">y</span><span class="p">)),</span> <span class="s1">&#39;positive predictions out of&#39;</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">y</span><span class="p">)</span> </span><span id="L-342"><a name="L-342"></a> <span class="k">print</span> <span class="s1">&#39;(iterations:&#39;</span><span class="p">,</span> <span class="n">iterations</span><span class="p">,</span> <span class="s1">&#39;| trn_perc:&#39;</span><span class="p">,</span> <span class="n">trn_perc</span><span class="p">,</span> <span class="s1">&#39;)&#39;</span> </span><span id="L-343"><a name="L-343"></a> </span><span id="L-344"><a name="L-344"></a> <span class="k">if</span> <span class="n">tst_id_col</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">:</span> </span><span id="L-345"><a name="L-345"></a> <span class="n">predictions</span> <span class="o">=</span> <span class="p">[[</span><span class="nb">int</span><span 
class="p">(</span><span class="n">ids</span><span class="p">[</span><span class="n">i</span><span class="p">]),</span> <span class="nb">int</span><span class="p">(</span><span class="nb">round</span><span class="p">(</span><span class="n">y</span><span class="p">[</span><span class="n">i</span><span class="p">],</span><span class="mi">0</span><span class="p">))]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">y</span><span class="p">))]</span> </span><span id="L-346"><a name="L-346"></a> <span class="k">else</span><span class="p">:</span> </span><span id="L-347"><a name="L-347"></a> <span class="n">predictions</span> <span class="o">=</span> <span class="p">[[</span><span class="nb">int</span><span class="p">(</span><span class="nb">round</span><span class="p">(</span><span class="n">y</span><span class="p">[</span><span class="n">i</span><span class="p">],</span><span class="mi">0</span><span class="p">))]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">y</span><span class="p">))]</span> </span><span id="L-348"><a name="L-348"></a> <span class="k">if</span> <span class="n">heads_in_trn_file</span><span class="p">:</span> </span><span id="L-349"><a name="L-349"></a> <span class="n">predictions</span><span class="o">.</span><span class="n">insert</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="n">t_heads</span><span class="p">[</span><span class="n">tst_id_col</span><span class="p">],</span> <span class="n">t_heads</span><span class="p">[</span><span class="n">labels_col</span><span class="p">]])</span> </span><span id="L-350"><a name="L-350"></a> </span><span id="L-351"><a 
name="L-351"></a> <span class="n">write_to_file</span><span class="p">(</span><span class="n">predictions</span><span class="p">,</span> </span><span id="L-352"><a name="L-352"></a> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">default_dir</span><span class="p">,</span> <span class="s1">&#39;test_predictions.csv&#39;</span><span class="p">))</span> </span><span id="L-353"><a name="L-353"></a> </span><span id="L-354"><a name="L-354"></a><span class="k">if</span> <span class="n">__name__</span> <span class="o">==</span> <span class="s1">&#39;__main__&#39;</span><span class="p">:</span> </span><span id="L-355"><a name="L-355"></a> <span class="n">main</span><span class="p">()</span> </span></pre></div> </td></tr></table>
machine-learning, prep, python, svm