Diana Lange | 258518 | 5. Sem. Bachelor IMIT | Exercise Sheet 3

Exercise Sheet 3

Datasets

Wine Quality: http://archive.ics.uci.edu/ml/datasets/Wine+Quality

Exercise 2: Linear Regression with Gradient Descent

You are required to pre-process the given datasets.

Part A: Implement Linear Regression with Gradient Descent

In this part you are required to implement the linear regression algorithm with gradient descent. Reference: lecture slides https://www.ismll.uni-hildesheim.de/lehre/ml-16w/script/ml-02-A1-linear-regression.pdf

For each dataset given above

Task 1

A set of training data is given as Dtrain = {(x(1), y(1)), (x(2), y(2)), ..., (x(N), y(N))}, where x ∈ R^M, y ∈ R, N is the number of training examples and M is the number of features.

In [235]:
%matplotlib inline
In [236]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
In [237]:
#read data
redwine = pd.read_csv("winequality-red.csv", sep=";")
whitewine = pd.read_csv("winequality-white.csv", sep=";")
In [238]:
redwine.head()
Out[238]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
In [239]:
whitewine.head()
Out[239]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.0 0.27 0.36 20.7 0.045 45.0 170.0 1.0010 3.00 0.45 8.8 6
1 6.3 0.30 0.34 1.6 0.049 14.0 132.0 0.9940 3.30 0.49 9.5 6
2 8.1 0.28 0.40 6.9 0.050 30.0 97.0 0.9951 3.26 0.44 10.1 6
3 7.2 0.23 0.32 8.5 0.058 47.0 186.0 0.9956 3.19 0.40 9.9 6
4 7.2 0.23 0.32 8.5 0.058 47.0 186.0 0.9956 3.19 0.40 9.9 6
In [240]:
# prepare data:
# the datasets contain a lot of rows and the gradient descent algorithm
# runs very slowly if there is too much data,
# so the learning is done on a randomly chosen subset
# of the original data

# split dataset into X and y, y ('quality') will be the value which will be predicted
trainD_redwine = {"X" : redwine[list(redwine.keys())[:-1]], "y" : redwine['quality']}
trainD_whitewine = {"X" : whitewine[list(whitewine.keys())[:-1]], "y" : whitewine['quality'] }

# splits the data into train and test sets
# maxData is the number of examples the train set should contain
def splitData(data, maxData):
    if (maxData < len(data['X'].index) and maxData > 0):
        sampler = np.random.permutation(len(data['X'].index))

        Xrandom = data['X'].take(sampler)
        yrandom = data['y'].take(sampler)

        p = float(maxData) / len(sampler)
        limitTrain = dict()
        limitTest = dict()
        limitTrain["X"] = Xrandom[:int(len(sampler) * p)]
        limitTrain["y"] = yrandom[:int(len(sampler) * p)]
        limitTest["X"] = Xrandom[int(len(sampler) * p):]
        limitTest["y"] = yrandom[int(len(sampler) * p):]
        
        return [limitTrain, limitTest]
        
    else:
        return [data, data]

# converts the X, y attributes of data, which are DataFrames, to matrices
def toMatrix(data):
    return {
            "X" : data['X'].values,
            "y" : data['y'].values
    }

# limit and convert data
trainD_redwine = toMatrix(splitData(trainD_redwine, 700)[0])
trainD_whitewine = toMatrix(splitData(trainD_whitewine, 700)[0])
In [241]:
print(len(trainD_redwine['X']))
print(len(trainD_redwine['y']))

print(len(trainD_whitewine['X']))
print(len(trainD_whitewine['y']))
700
700
700
700

Task 2

The linear regression model is given as ŷ_n = SUM(m=1 to M) β_m * x_nm
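The per-example sum over the features is just a matrix-vector product, so the predictions for all examples can be computed at once. A minimal sketch with made-up numbers (X and beta here are illustrative, not taken from the wine data):

```python
import numpy as np

# Hypothetical 3-example, 2-feature design matrix and parameter vector
X = np.array([[1.0, 2.0],
              [3.0, 4.0],
              [5.0, 6.0]])
beta = np.array([0.5, -1.0])

# y_hat[n] = sum over m of beta[m] * X[n, m] -- one dot product per row
y_hat = X.dot(beta)
print(y_hat)  # [-1.5 -2.5 -3.5]
```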

In [242]:
# calculate the ^y with given x and beta 
def linreg(x, beta):
    y = 0
    for i in range(0, len(x)):
        y += beta[i] * x[i]
    return y
    
# calculates the prediction values (ŷ[]) for the given
# X = Xtest (matrix) and beta parameters (vector)
def getPredictetValues(X, beta):
    predictY = list()
    for x in X:
        predictY.append(linreg(x, beta))
    return predictY

# calculates the RMSE value with given values for prediction and true data
def rmse(predictions, trueValues):
    return np.sqrt(np.mean((predictions-trueValues)**2))

# helper function that prints a subset of a number array in a given format
def printVals(vals, maxPrint, fmt="{:2.4f}"):
    listlength = len(vals)
    if (maxPrint > listlength or maxPrint < 0):
        maxPrint = listlength
    print("(printing " + str(maxPrint) + " of " + str(listlength) + " values)")
    print(",".join(fmt.format(v) for v in vals[:maxPrint]))

Task 3

The least squares loss function is given as l(x, y) = SUM(n=1 to N) (y_n − ŷ_n)^2
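A tiny numeric sketch of this loss (the values are made up): each residual is squared, so over- and under-predictions both add to the loss.

```python
import numpy as np

# Made-up observed targets and predictions
y_true = np.array([5.0, 6.0, 5.0])
y_pred = np.array([5.5, 5.0, 5.0])

# l = sum_n (y_n - y_hat_n)^2 = (-0.5)^2 + (1.0)^2 + (0.0)^2
loss_val = np.sum((y_true - y_pred) ** 2)
print(loss_val)  # 1.25
```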

In [243]:
# calculates the loss of a model with given prediction and true values
def loss(predictY, observedY):
    return np.sum((observedY - predictY)**2)

Task 4

Minimize the loss function l(x, y) using the gradient descent algorithm (implement the learn-linreg-GD and minimize-GD algorithms given in the lecture slides). Choose imax between 100 and 1000.
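Before applying it to the wine data, the core of minimize-GD can be sketched on a simple one-dimensional function; the names and the quadratic test function are illustrative, not taken from the lecture slides:

```python
# Minimal gradient-descent sketch on f(x) = (x - 3)^2, whose minimum is at x = 3.
# Constant step length; stops when the decrease of f falls below epsilon.
def minimize_gd_sketch(f, grad, x0, steplength=0.1, i_max=1000, epsilon=1e-8):
    x, fx = x0, f(x0)
    for _ in range(i_max):
        x_new = x - steplength * grad(x)   # step against the gradient
        f_new = f(x_new)
        if abs(fx - f_new) < epsilon:      # converged
            return x_new
        x, fx = x_new, f_new
    return x

x_min = minimize_gd_sketch(lambda x: (x - 3) ** 2, lambda x: 2 * (x - 3), x0=0.0)
print(round(x_min, 3))  # 3.0
```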

In [244]:
# f : function to minimize
# derivation : derivative of f
# error : function for calculating the error (e.g. RMSE)
# alpha : dictionary that determines how the step length alpha is calculated. Options:
#        constant step:  a := 'c', steplength := float > 0
#        armijo :        a := 'a', delta := float in (0, 1)
#        bolddriver :    a := 'b', increase := float > 1, decrease := float in (0, 1)
# returns: [beta, |f(xi-1) - f(xi)|, RMSE, alpha]
def minimizeGD(f, derivation, error, xStart, alpha, iMax, epsilon=1):
    
    xLast = xStart # xi-1
    fLast = f(xLast) # f(xi-1)
  
    diffRecord = list()
    rmseRecord = list()
    aRecord = list()
    a = 1
    
    for i in range(iMax):
        # descent direction (negative gradient)
        d = -1 * derivation(xLast)
        
        # calculating the alpha:
        # constant steps
        if (alpha['a'] == 'c'): 
            a = alpha['steplength'] 
        # armijo
        elif (alpha['a'] == 'a'): 
            a = 1
            while (fLast - f(xLast + a * d) < a * alpha['delta'] * np.dot(d.T, d)):
                a /= 2
        #bolddriver
        elif (alpha['a'] == 'b'): 
            newA = a * alpha['increase']
            while (fLast - f(xLast + newA * d) <= 0):
                 newA = newA * alpha['decrease']
            a = newA
                
        xCurrent = xLast + a * d #xi
        fCurrent = f(xCurrent) # f(xi)
        diff = fLast - fCurrent #f(xi-1) - f(xi)
        
        # record all |f(xi-1) - f(xi)|, RMSE, ... with current value for beta / xCurrent
        diffRecord.append(diff if diff >= 0 else diff * -1)
        rmseRecord.append(error(xCurrent))
        aRecord.append(a)
        
        # stop iteration when a good value for beta / xCurrent is found
        if diff < epsilon:
            return [xCurrent, diffRecord, rmseRecord, aRecord]
        
        # updating past values for new iteration
        xLast = xCurrent 
        fLast = fCurrent
    
    # after iMax iterations, the last values are returned
    # this only occurs if the epsilon threshold is not reached within iMax iterations
    return [xLast, diffRecord, rmseRecord, aRecord]

# learns the model with the given train dataset
# trainD : consists of keys X (matrix) and y (vector)
# alpha: sets the method for calculating the alpha value (constant steplength, armijo or bolddriver)
# iMax: maximum number of iterations
# epsilon: quality of the result (> 0)
def learnLinregGD(trainD, alpha, iMax, epsilon=1):
    
    X = trainD['X']
    y = trainD['y']
    
    # start point for GD algorithm (vector with zeroes)
    beta0 = np.zeros(len(X.T))
 
    # functions for calculating the function value, the gradient (using the derivative of f), and the RMSE
    def f(beta):
        b = y - np.dot(X, beta)  # residual vector
        return np.dot(b, b)      # squared L2 norm of the residual

    def derivative(beta):
        return -2 * np.dot(X.T, y - np.dot(X, beta))

    def tempRMSE(beta):
        return rmse(getPredictetValues(X, beta), y)

    beta = minimizeGD(f, derivative, tempRMSE, beta0, alpha, iMax, epsilon)
    return beta

Task 5

Choose three suitable values of step length α > 0. For each value of the step length perform the learning and record the results.

In [245]:
alphas_redwine = [0.00000008, 0.00000006, 0.00000003]
alphas_whitewine = [5e-10, 3e-10, 1e-10]

model_redwine = [
    learnLinregGD(trainD_redwine, {"a" : "c", "steplength" : alphas_redwine[0]}, 1000, 0.1),
    learnLinregGD(trainD_redwine, {"a" : "c", "steplength" : alphas_redwine[1]}, 1000, 0.1),
    learnLinregGD(trainD_redwine, {"a" : "c", "steplength" : alphas_redwine[2]}, 1000, 0.5)
]

model_whitewine = [
    learnLinregGD(trainD_whitewine, {"a" : "c", "steplength" : alphas_whitewine[0]}, 1000, 3),
    learnLinregGD(trainD_whitewine, {"a" : "c", "steplength" : alphas_whitewine[1]}, 1000, 3),
    learnLinregGD(trainD_whitewine, {"a" : "c", "steplength" : alphas_whitewine[2]}, 1000, 3)
]

# the values of alpha were chosen by testing the quality of the outcome, with a little help
# from the following (commented) lines
'''
a = 0.0000000009
while (a > 0.0000000001):
    model_whitewine.append(learnLinregGD(trainD_whitewine, {"a" : "c", "steplength" : a}, 1000, 3))
    a *= 0.99
'''

# check results 
print('Betas for redwine')
for m in model_redwine:
    printVals(m[0], -1)
        
print('\nBetas for whitewine')
for m in model_whitewine:
    printVals(m[0], -1)
Betas for redwine
(printing 11 of 11 values)
0.2178,0.0122,0.0069,0.0464,0.0021,0.0212,-0.0038,0.0273,0.0918,0.0187,0.3005
(printing 11 of 11 values)
0.2176,0.0122,0.0069,0.0467,0.0021,0.0232,-0.0039,0.0270,0.0909,0.0185,0.2977
(printing 11 of 11 values)
0.2034,0.0114,0.0065,0.0457,0.0019,0.0442,-0.0032,0.0242,0.0810,0.0163,0.2655

Betas for whitewine
(printing 11 of 11 values)
0.0019,0.0001,0.0001,0.0015,0.0000,0.0087,0.0333,0.0003,0.0009,0.0001,0.0031
(printing 11 of 11 values)
0.0019,0.0001,0.0001,0.0015,0.0000,0.0085,0.0326,0.0003,0.0009,0.0001,0.0030
(printing 11 of 11 values)
0.0017,0.0001,0.0001,0.0014,0.0000,0.0079,0.0303,0.0002,0.0008,0.0001,0.0027

(a) In each iteration of the minimize-GD algorithm calculate |f(xi−1)−f(xi)| and at the end of learning, plot it against iteration number i. Explain the graph.

In [246]:
for i in range(len(model_redwine)):
    plt.figure()
    plt.title('| f(xi-1) - f(xi) | Record Redwine, alpha=' + str(alphas_redwine[i]))
    plt.ylim([0,300])
    plt.xlim([0,1000])
    plt.grid()
    plt.ylabel('epsilon := | f(xi-1) - f(xi) |')
    plt.xlabel('i := iteration')
    # label for minimum
    plt.annotate("i=" + str(len(model_redwine[i][1])) + ' | epsilon='+ '%0.2f' % model_redwine[i][1][-1], xy=(1, model_redwine[i][1][-1]), xytext=(8, 0), 
                 xycoords=('axes fraction', 'data'), textcoords='offset points')
    plt.plot(range(len(model_redwine[i][1])), model_redwine[i][1])
    
for i in range(len(model_redwine)):
    plt.figure()
    plt.title('| f(xi-1) - f(xi) | Record, Whitewine, alpha=' + str(alphas_whitewine[i]))
    plt.ylim([0,300])
    plt.xlim([0,1000])
    plt.grid()
    plt.ylabel('epsilon := | f(xi-1) - f(xi) |')
    plt.xlabel('i := iteration')
    # label for minimum
    plt.annotate("i=" + str(len(model_whitewine[i][1])) + ' | epsilon='+ '%0.2f' % model_whitewine[i][1][-1], xy=(1, model_whitewine[i][1][-1]), xytext=(8, 0), 
                 xycoords=('axes fraction', 'data'), textcoords='offset points')
    plt.plot(range(len(model_whitewine[i][1])), model_whitewine[i][1])


plt.show()

Graph explanation | f(xi-1) - f(xi) | Record

The graph shows, for each iteration, the difference between the current function value and the function value of the previous iteration. The goal is to minimize the function, i.e. the algorithm tries to find a global minimum. Around an extremum the value of | f(xi-1) - f(xi) | should be very small, because the gradient in that area is low. The graph shows exactly this behaviour: in the beginning | f(xi-1) - f(xi) | is very large (not close to the minimum, so no good beta parameters yet); in the end it is quite small (close to the minimum). What the graph does not show is how large the steps between xi-1 and xi are, which makes it hard to judge the quality of | f(xi-1) - f(xi) | on its own (although gradient descent takes smaller steps as the gradient shrinks, so the steps from xi-1 to xi should not be very large). The curves tend to have the same shape: first there is a fast reduction of | f(xi-1) - f(xi) | per iteration (a reasonably good model can be obtained in few iterations), but reaching a really good model takes many iterations (at least with a constant step length for alpha). Finally, the graph shows how many iterations were needed to obtain the result.

(b) In each iteration step also calculate RMSE on test set and at the end of learning, plot it against iteration number i. Explain the graph.

In [247]:
plt.figure()
plt.title('RMSE Record Redwine')
plt.ylim([0,6.0])
plt.xlim([0,1000])
plt.ylabel('RMSE')
plt.xlabel('i := iteration')
plt.grid()
for i in range(len(model_redwine)):
    plt.plot(range(len(model_redwine[i][2])), model_redwine[i][2], label="alpha=" + str(alphas_redwine[i]))
plt.legend()

plt.figure()
plt.title('RMSE Record Whitewine')
plt.ylim([0,6.0])
plt.xlim([0,1000])
plt.ylabel('RMSE')
plt.xlabel('i := iteration')
plt.grid()
for i in range(len(model_whitewine)):
    plt.plot(range(len(model_whitewine[i][2])), model_whitewine[i][2], label="alpha=" + str(alphas_whitewine[i]))
plt.legend()

plt.show()

Graph explanation RMSE Record

The graph shows the RMSE (root mean squared error) for each iteration, which indicates the error of the model. As you can see, the error decreases quickly at the beginning of the algorithm, but the improvement slows down and the error hardly changes towards the end. The RMSE is measured in the same units as the target, but it is an average error, not a guaranteed bound, so you cannot read it as "true value = predicted value +/- RMSE". It is, however, well suited for comparing the quality of different models trained on the same data (less error = better model).

Part B Step Length for Gradient Descent

This task is based on Part A. You have to implement the two algorithms steplength-armijo and steplength-bolddriver given in the lecture slides.
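The backtracking idea behind steplength-armijo can be sketched in isolation (a simplified version of what minimizeGD does internally; the function and variable names here are illustrative):

```python
import numpy as np

# Armijo backtracking sketch: start with alpha = 1 and halve it until the
# achieved decrease of f is at least delta * alpha * ||gradient||^2.
def armijo_steplength(f, grad_x, x, delta=0.5, max_halvings=50):
    d = -grad_x                # descent direction (negative gradient)
    alpha = 1.0
    fx = f(x)
    for _ in range(max_halvings):
        if fx - f(x + alpha * d) >= delta * alpha * np.dot(d, d):
            break              # sufficient decrease reached
        alpha /= 2
    return alpha

# Example on f(x) = x^2 at x = 4 (gradient = 8): alpha = 1 overshoots,
# alpha = 0.5 jumps straight to the minimum and is accepted.
f = lambda x: x ** 2
alpha = armijo_steplength(f, grad_x=8.0, x=4.0)
print(alpha)  # 0.5
```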

In [248]:
# implementation of Armijo and Bolddriver in function minimizeGD
alphas_redwine.append("Armijo")
alphas_redwine.append("Bolddriver")

alphas_whitewine.append("Armijo")
alphas_whitewine.append("Bolddriver")

model_redwine.append(learnLinregGD(trainD_redwine, {"a" : "a", "delta" : 0.8}, 1000, 0.05))
model_redwine.append(learnLinregGD(trainD_redwine, {"a" : "b", "increase" : 1.1, "decrease" : 0.6}, 1000, 0.05))

model_whitewine.append(learnLinregGD(trainD_whitewine, {"a" : "a", "delta" : 0.8}, 1000, 0.05))
model_whitewine.append(learnLinregGD(trainD_whitewine, {"a" : "b", "increase" : 1.1, "decrease" : 0.6}, 1000, 0.05))

Task 1

In each iteration of the minimize-GD algorithm calculate |f(xi−1)−f(xi)| and at the end of learning, plot it against iteration number i. Explain the graph.

nice tutorial for matplot: https://www.labri.fr/perso/nrougier/teaching/matplotlib/

In [249]:
for i in range(3, len(model_whitewine)):
    plt.figure()
    plt.title('| f(xi-1) - f(xi) | Record Whitewine ' + alphas_whitewine[i])
    plt.ylim([0,50])
    plt.xlim([0,600])
    plt.ylabel('epsilon := | f(xi-1) - f(xi) |')
    plt.xlabel('i := iteration')
    plt.grid()
    plt.plot(range(len(model_whitewine[i][1])), model_whitewine[i][1], 'x', c='#0C3E56')

for i in range(3, len(model_redwine)):
    plt.figure()
    plt.title('| f(xi-1) - f(xi) | Record Redwine ' +  alphas_redwine[i])
    plt.ylim([0,50])
    plt.xlim([0,600])
    plt.ylabel('epsilon := | f(xi-1) - f(xi) |')
    plt.xlabel('i := iteration')
    plt.grid()
    plt.plot(range(len(model_redwine[i][1])), model_redwine[i][1], 'x', c='#0C3E56')

plt.show()

Graph explanation | f(xi−1)−f(xi) | Record

As before, the graph shows, for each iteration, the difference between the current function value and the function value of the previous iteration. The difference to the graphs with constant step length is quite obvious: | f(xi−1)−f(xi) | does not shrink in every iteration, but it does over time. The reason for these jumps is that the 'Armijo' and 'Bolddriver' algorithms adapt the step length in each iteration (e.g. 'Armijo' looks one step ahead and tries to find a good value for alpha) in order to reach the minimum quickly; if the step from xi-1 to xi is large, the difference of the function values will be large too. In general, the two algorithms achieve good models with fewer iterations.

Task 2

In each iteration step also calculate RMSE on test set and at the end of learning, plot it against iteration number i. Explain the graph.

In [250]:
plt.figure()
plt.title('RMSE Record Whitewine')
plt.ylim([0,6])
plt.xlim([0,600])
plt.ylabel('RMSE')
plt.xlabel('i := iteration')
plt.grid()
for i in range(3, len(model_whitewine)):  
    plt.plot(range(len(model_whitewine[i][2])), model_whitewine[i][2], label=alphas_whitewine[i])
plt.legend()

plt.figure()
plt.title('RMSE Record Redwine')
plt.ylim([0,6])
plt.xlim([0,600])
plt.ylabel('RMSE')
plt.xlabel('i := iteration')
plt.grid()
for i in range(3, len(model_redwine)):
    plt.plot(range(len(model_redwine[i][2])), model_redwine[i][2], label=alphas_redwine[i])
plt.legend()

plt.show()

Graph explanation RMSE Record

As before, the graph shows the RMSE (root mean squared error) for each iteration, which indicates the error of the model. And as before, the error drops quickly at first and the improvement slows down towards the end. The Armijo algorithm produces good results (small RMSE) with relatively few iterations, but it does not decrease the RMSE in every iteration (sometimes it stays constant). Bolddriver seems to have some trouble finding a good model depending on the data (and on the values chosen for "increase"/"decrease"), but sometimes it is also faster than Armijo. The advantage of both algorithms is that no value for alpha has to be tuned by hand, and acceptable models are always found.

Task 3

Compare different step-length algorithms: compare the RMSE graphs of steplength-armijo, steplength-bolddriver and the three fixed step lengths. Explain your graph.

In [251]:
plt.figure()
plt.title('| f(xi-1) - f(xi) | Record Whitewine')
plt.ylim([0,50])
plt.xlim([0,1000])
plt.ylabel('epsilon := | f(xi-1) - f(xi) |')
plt.xlabel('i := iteration')
plt.grid()
for i in range(len(model_whitewine)):
    if (type(alphas_whitewine[i]) is str):
         plt.plot(range(len(model_whitewine[i][1])), model_whitewine[i][1], 'x', label="alpha=" + alphas_whitewine[i])
    else:
        plt.plot(range(len(model_whitewine[i][1])), model_whitewine[i][1], label="alpha=" + str(alphas_whitewine[i]))
plt.legend()

plt.figure()
plt.title('| f(xi-1) - f(xi) | Record Redwine')
plt.ylim([0,50])
plt.xlim([0,1000])
plt.ylabel('epsilon := | f(xi-1) - f(xi) |')
plt.xlabel('i := iteration')
plt.grid()
for i in range(len(model_redwine)):
    if (type(alphas_redwine[i]) is str):
         plt.plot(range(len(model_redwine[i][1])), model_redwine[i][1], 'x', label="alpha=" + alphas_redwine[i])
    else:
        plt.plot(range(len(model_redwine[i][1])), model_redwine[i][1], label="alpha=" + str(alphas_redwine[i]))
plt.legend()

plt.figure()
plt.title('RMSE Record Whitewine')
plt.ylim([0,6.0])
plt.xlim([0,1000])
plt.ylabel('RMSE')
plt.xlabel('i := iteration')
plt.grid()
for i in range(len(model_whitewine)):
    plt.plot(range(len(model_whitewine[i][2])), model_whitewine[i][2], label="alpha=" + str(alphas_whitewine[i]))
plt.legend()

plt.figure()
plt.title('RMSE Record Redwine')
plt.ylim([0,6.0])
plt.xlim([0,1000])
plt.ylabel('RMSE')
plt.xlabel('i := iteration')
plt.grid()
for i in range(len(model_redwine)):
    plt.plot(range(len(model_redwine[i][2])), model_redwine[i][2], label="alpha=" + str(alphas_redwine[i]))
plt.legend()


plt.show()

Graph explanation

The final comparison of all alpha strategies shows that Armijo / Bolddriver produce better results than a constant step length in several ways:

  • okay or good models are found in fewer iterations
  • most of the time a very good model (with small RMSE) is found faster
  • the best values for | f(xi-1) - f(xi) | are smaller, which indicates that these models are closer to a true minimum of the function
  • the best values for RMSE are smaller, which shows that these models have less loss
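As an additional sanity check (not part of the exercise), the least-squares optimum can also be computed in closed form; with a well-chosen step length, gradient descent should approach this solution. A sketch on made-up data using NumPy's solver:

```python
import numpy as np

# Closed-form least squares: beta* = argmin ||y - X beta||^2
X = np.array([[1.0, 0.0],
              [0.0, 1.0],
              [1.0, 1.0]])
y = np.array([1.0, 2.0, 3.0])

beta_star, residuals, rank, sv = np.linalg.lstsq(X, y, rcond=None)
print(beta_star)  # [1. 2.] -- here the fit happens to be exact
```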

Additional : Loss comparison

In [252]:
print('Loss redwine')
for i in range(len(model_redwine)):
    s = "alpha=" + str(alphas_redwine[i]) + ": "
    predict = getPredictetValues(trainD_redwine['X'], model_redwine[i][0])
    s += str(loss(predict, trainD_redwine['y']))
    print(s)
    
print('\nLoss whitewine')
for i in range(len(model_whitewine)):
    s = "alpha=" + str(alphas_whitewine[i]) + ": "
    predict = getPredictetValues(trainD_whitewine['X'], model_whitewine[i][0])
    s += str(loss(predict, trainD_whitewine['y']))
    print(s)
Loss redwine
alpha=8e-08: 398.668340454
alpha=6e-08: 402.896208141
alpha=3e-08: 513.916814058
alpha=Armijo: 378.97744767
alpha=Bolddriver: 395.718907249

Loss whitewine
alpha=5e-10: 3059.20151904
alpha=3e-10: 3122.86131943
alpha=1e-10: 3442.61246289
alpha=Armijo: 445.795287696
alpha=Bolddriver: 604.493911276

Additional: Alpha values over time

In [253]:
plt.figure()
plt.title('Alpha Record Whitewine')
plt.ylim([0,0.00000015])
plt.xlim([0,600])
plt.ylabel('alpha')
plt.xlabel('i := iteration')
for i in range(len(model_whitewine)):
    plt.plot(range(len(model_whitewine[i][3])), model_whitewine[i][3], label="alpha=" + str(alphas_whitewine[i]))
plt.legend()
    
plt.figure()
plt.title('Alpha Record Redwine')
plt.ylim([0,0.000001])
plt.xlim([0,600])
plt.ylabel('alpha')
plt.xlabel('i := iteration')
for i in range(len(model_redwine)):
    plt.plot(range(len(model_redwine[i][3])), model_redwine[i][3],label="alpha=" + str(alphas_redwine[i]))
plt.legend()
    
plt.show()