python如何实现梯度下降求解逻辑回归

线性回归
1. 线性回归函数
似然函数的定义：给定联合样本值x下关于(未知)参数的函数
似然函数：什么样的参数跟我们的数据组合后恰好是真实值
2.线性回归似然函数
对数似然：
3. 线性回归目标函数（误差的表达式，我们的目的就是使得真实值与预测值之间的误差最小）
（导数为0取得极值，得到函数的参数）
逻辑回归
逻辑回归是在线性回归的结果外加一层sigmoid函数。
1.逻辑回归函数
2. 逻辑回归似然函数（前提：数据服从伯努利分布）
对数似然：
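引入 $J(\theta) = -\frac{1}{m}\,l(\theta)$（$l(\theta)$ 为对数似然），把最大化对数似然转变为梯度下降（最小化）任务，得到逻辑回归目标函数：$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y_i\log h_\theta(x_i) + (1-y_i)\log\bigl(1-h_\theta(x_i)\bigr)\right]$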
梯度下降法求解：我的理解就是对目标函数求导、沿负梯度方向更新参数，达到一定条件后停止，得到近似最优解。
代码实现：sigmoid函数
def sigmoid(z):
    """Apply the logistic function ``1 / (1 + exp(-z))`` to ``z``.

    The computation goes through ``np.exp``, so ``z`` may be a scalar or a
    NumPy array (evaluated element-wise).
    """
    exp_neg_z = np.exp(-z)
    return 1 / (1 + exp_neg_z)
预测函数
def model(x, theta):
    """Return the sigmoid of the linear score ``x . theta^T``.

    Args:
        x: Feature rows; presumably shape (m, n_features) -- confirm
            against the caller.
        theta: Parameters. It is transposed before the product, so it is
            expected to be a 2-D row vector such as ``np.zeros([1, 3])``.

    Returns:
        ``sigmoid(np.dot(x, theta.T))``.
    """
    # NumPy arrays expose the transpose as ``.T``; there is no lowercase
    # ``.t`` attribute on an ndarray.
    return sigmoid(np.dot(x, theta.T))
目标函数
def cost(x, y, theta):
    """Return the mean negative log-likelihood of ``model`` on ``(x, y)``.

    Args:
        x: Feature rows, passed straight to ``model``.
        y: Labels aligned with the rows of ``x``.
        theta: Parameters, passed straight to ``model``.

    Returns:
        ``sum(-y * log(h) - (1 - y) * log(1 - h)) / len(x)`` with
        ``h = model(x, theta)``.
    """
    # Evaluate the model once; both log terms use the same probabilities.
    h = model(x, theta)
    left = np.multiply(-y, np.log(h))
    right = np.multiply(1 - y, np.log(1 - h))
    return np.sum(left - right) / len(x)
梯度
def gradient(x, y, theta):
    """Return the gradient of ``cost`` with respect to ``theta``.

    Args:
        x: Feature rows of the current batch.
        y: Labels aligned with the rows of ``x``.
        theta: Current parameters; the result has the same shape.

    Returns:
        Array shaped like ``theta`` whose entry ``j`` equals
        ``sum(error * x[:, j]) / len(x)``, where
        ``error = model(x, theta) - y`` flattened to 1-D.
    """
    error = (model(x, theta) - y).ravel()
    # One vector-matrix product gives sum(error * x[:, j]) for every
    # parameter j, replacing the per-column Python loop.
    grad = np.dot(error, x) / len(x)
    return grad.reshape(theta.shape)
梯度下降停止策略
# Identifiers for the three supported stopping strategies.
stop_iter = 0  # stop on iteration count
stop_cost = 1  # stop when the cost barely changes
stop_grad = 2  # stop when the gradient norm is small


def stopcriterion(type, value, threshold):
    """Decide whether gradient descent should stop.

    Args:
        type: One of ``stop_iter``, ``stop_cost`` or ``stop_grad``. (The
            name shadows the builtin but is kept for caller compatibility.)
        value: The quantity to test -- an iteration count for
            ``stop_iter``, a sequence of costs (at least two entries) for
            ``stop_cost``, or a gradient array for ``stop_grad``.
        threshold: Limit that ``value`` is compared against.

    Returns:
        A truthy value when the selected criterion is met.

    Raises:
        ValueError: If ``type`` is not one of the three known strategies.
    """
    if type == stop_iter:
        # Iteration budget exceeded.
        return value > threshold
    if type == stop_cost:
        # Change between the two most recent costs is below the threshold.
        return abs(value[-1] - value[-2]) < threshold
    if type == stop_grad:
        # Gradient norm is below the threshold.
        return np.linalg.norm(value) < threshold
    # Falling through would return None, which a ``while True`` caller
    # treats as "keep going" forever.
    raise ValueError("unknown stop type: {!r}".format(type))
样本重新洗牌
import numpy.random


def shuffledata(data):
    """Shuffle the rows of ``data`` in place and split off the last column.

    Args:
        data: 2-D NumPy array. Its rows are reordered in place by
            ``np.random.shuffle``, so the caller's array is modified.

    Returns:
        Tuple ``(x, y)``: ``x`` is every column except the last and ``y``
        is the last column kept 2-D (shape ``(rows, 1)``). Both are slices
        of ``data``.
    """
    np.random.shuffle(data)
    cols = data.shape[1]
    x = data[:, 0:cols - 1]
    y = data[:, cols - 1:]
    return x, y
梯度下降求解
def descent(data, theta, batchsize, stoptype, thresh, alpha):
    """Run gradient descent until the selected stopping rule fires.

    Args:
        data: 2-D array whose last column is the label; it is handed to
            ``shuffledata``, which reorders its rows in place.
        theta: Initial parameters.
        batchsize: Number of rows used for each gradient step.
        stoptype: One of ``stop_iter``, ``stop_cost`` or ``stop_grad``.
        thresh: Threshold forwarded to ``stopcriterion``.
        alpha: Learning rate.

    Returns:
        Tuple ``(theta, i - 1, costs, grad, elapsed)``: final parameters,
        the step counter minus one, the list of costs (initial cost first),
        the last gradient, and the wall-clock seconds spent.

    Raises:
        ValueError: If ``stoptype`` is not a known strategy.
    """
    init_time = time.time()
    # Row count comes from the data itself rather than a module-level global.
    n_samples = len(data)
    i = 0  # number of update steps taken
    k = 0  # start row of the current batch
    x, y = shuffledata(data)
    costs = [cost(x, y, theta)]

    while True:
        grad = gradient(x[k:k + batchsize], y[k:k + batchsize], theta)
        k += batchsize
        if k >= n_samples:
            # Every row has been visited: start a new pass on reshuffled data.
            k = 0
            x, y = shuffledata(data)
        theta = theta - alpha * grad
        costs.append(cost(x, y, theta))
        i += 1

        if stoptype == stop_iter:
            value = i
        elif stoptype == stop_cost:
            value = costs
        elif stoptype == stop_grad:
            value = grad
        else:
            raise ValueError("unknown stop type: {!r}".format(stoptype))
        if stopcriterion(stoptype, value, thresh):
            break

    return theta, i - 1, costs, grad, time.time() - init_time
# 完整代码 -- complete script: logistic regression trained by gradient descent.
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import numpy.random
import pandas as pd
from sklearn import preprocessing as pp


def sigmoid(z):
    """Apply the logistic function ``1 / (1 + exp(-z))`` element-wise."""
    return 1 / (1 + np.exp(-z))


def model(x, theta):
    """Return ``sigmoid(x . theta^T)``; ``theta`` is a 2-D row vector."""
    # The ndarray transpose attribute is ``.T`` (there is no ``.t``).
    return sigmoid(np.dot(x, theta.T))


def cost(x, y, theta):
    """Return the mean negative log-likelihood of ``model`` on ``(x, y)``."""
    h = model(x, theta)
    left = np.multiply(-y, np.log(h))
    right = np.multiply(1 - y, np.log(1 - h))
    return np.sum(left - right) / len(x)


def gradient(x, y, theta):
    """Return the gradient of ``cost`` w.r.t. ``theta`` (same shape)."""
    error = (model(x, theta) - y).ravel()
    # One product yields sum(error * x[:, j]) for every parameter j.
    grad = np.dot(error, x) / len(x)
    return grad.reshape(theta.shape)


# Identifiers for the three supported stopping strategies.
stop_iter = 0
stop_cost = 1
stop_grad = 2


def stopcriterion(type, value, threshold):
    """Return a truthy value when the selected stopping rule is met.

    Raises:
        ValueError: If ``type`` is not one of the three known strategies.
    """
    if type == stop_iter:
        # Iteration budget exceeded.
        return value > threshold
    if type == stop_cost:
        # Change between the two most recent costs is below the threshold.
        return abs(value[-1] - value[-2]) < threshold
    if type == stop_grad:
        # Gradient norm is below the threshold.
        return np.linalg.norm(value) < threshold
    raise ValueError("unknown stop type: {!r}".format(type))


def shuffledata(data):
    """Shuffle rows of ``data`` in place; return (features, last column)."""
    np.random.shuffle(data)
    cols = data.shape[1]
    x = data[:, 0:cols - 1]
    y = data[:, cols - 1:]
    return x, y


def descent(data, theta, batchsize, stoptype, thresh, alpha):
    """Run gradient descent until the selected stopping rule fires.

    Returns:
        Tuple ``(theta, i - 1, costs, grad, elapsed_seconds)``.

    Raises:
        ValueError: If ``stoptype`` is not a known strategy.
    """
    init_time = time.time()
    # Row count comes from the data itself rather than a module-level global.
    n_samples = len(data)
    i = 0  # number of update steps taken
    k = 0  # start row of the current batch
    x, y = shuffledata(data)
    costs = [cost(x, y, theta)]

    while True:
        grad = gradient(x[k:k + batchsize], y[k:k + batchsize], theta)
        k += batchsize
        if k >= n_samples:
            # Every row has been visited: start a new pass on reshuffled data.
            k = 0
            x, y = shuffledata(data)
        theta = theta - alpha * grad
        costs.append(cost(x, y, theta))
        i += 1

        if stoptype == stop_iter:
            value = i
        elif stoptype == stop_cost:
            value = costs
        elif stoptype == stop_grad:
            value = grad
        else:
            raise ValueError("unknown stop type: {!r}".format(stoptype))
        if stopcriterion(stoptype, value, thresh):
            break

    return theta, i - 1, costs, grad, time.time() - init_time


def runexpe(data, theta, batchsize, stoptype, thresh, alpha):
    """Run one descent experiment, print a summary and plot the cost curve.

    Returns:
        The parameters found by ``descent``.
    """
    theta, n_iter, costs, grad, dur = descent(
        data, theta, batchsize, stoptype, thresh, alpha)

    # Label the run: raw feature values above 2 are taken to mean unscaled data.
    name = "original" if (data[:, 1] > 2).sum() > 1 else "scaled"
    name += " data - learning rate: {} - ".format(alpha)
    if batchsize == len(data):
        strdesctype = "gradient"  # batch gradient descent
    elif batchsize == 1:
        strdesctype = "stochastic"  # stochastic gradient descent
    else:
        strdesctype = "mini-batch ({})".format(batchsize)  # mini-batch
    name += strdesctype + " descent - stop: "
    if stoptype == stop_iter:
        strstop = "{} iterations".format(thresh)
    elif stoptype == stop_cost:
        strstop = "costs change < {}".format(thresh)
    else:
        strstop = "gradient norm < {}".format(thresh)
    name += strstop

    print("***{}\ntheta: {} - iter: {} - last cost: {:03.2f} - duration: {:03.2f}s".format(
        name, theta, n_iter, costs[-1], dur))
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.plot(np.arange(len(costs)), costs, 'r')
    ax.set_xlabel('iterations')
    ax.set_ylabel('cost')
    ax.set_title(name.upper() + ' - error vs. iteration')
    return theta


path = 'data' + os.sep + 'logireg_data.txt'
# ``header=None``: the file has no header row, so column names are supplied.
pddata = pd.read_csv(path, header=None, names=['exam 1', 'exam 2', 'admitted'])
positive = pddata[pddata['admitted'] == 1]
negative = pddata[pddata['admitted'] == 0]

# Scatter plot to inspect the samples.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(positive['exam 1'], positive['exam 2'], s=30, c='b', marker='o', label='admitted')
ax.scatter(negative['exam 1'], negative['exam 2'], s=30, c='r', marker='x', label='not admitted')
ax.legend()
ax.set_xlabel('exam 1 score')
ax.set_ylabel('exam 2 score')

# Prepend a constant column so theta[0] acts as the intercept.
pddata.insert(0, 'ones', 1)

# Split training data into features and labels.
orig_data = pddata.values
cols = orig_data.shape[1]
x = orig_data[:, 0:cols - 1]
y = orig_data[:, cols - 1:cols]
# Initial parameters: all zeros.
theta = np.zeros([1, 3])

# Full-batch size: use every row rather than a hard-coded count.
n = orig_data.shape[0]
runexpe(orig_data, theta, n, stop_iter, thresh=5000, alpha=0.000001)
runexpe(orig_data, theta, n, stop_cost, thresh=0.000001, alpha=0.001)
runexpe(orig_data, theta, n, stop_grad, thresh=0.05, alpha=0.001)
runexpe(orig_data, theta, 1, stop_iter, thresh=5000, alpha=0.001)
runexpe(orig_data, theta, 1, stop_iter, thresh=15000, alpha=0.000002)
runexpe(orig_data, theta, 16, stop_iter, thresh=15000, alpha=0.001)

# Preprocessing: standardise the two feature columns.
scaled_data = orig_data.copy()
scaled_data[:, 1:3] = pp.scale(orig_data[:, 1:3])

runexpe(scaled_data, theta, n, stop_iter, thresh=5000, alpha=0.001)
runexpe(scaled_data, theta, n, stop_grad, thresh=0.02, alpha=0.001)
theta = runexpe(scaled_data, theta, 1, stop_grad, thresh=0.002 / 5, alpha=0.001)
runexpe(scaled_data, theta, 16, stop_grad, thresh=0.002 * 2, alpha=0.001)


def predict(x, theta):
    """Return 0/1 predictions using a 0.5 probability cut-off."""
    return [1 if p >= 0.5 else 0 for p in model(x, theta).ravel()]


# Accuracy on the (scaled) training data.
scaled_x = scaled_data[:, :3]
y = scaled_data[:, 3]
predictions = predict(scaled_x, theta)
correct = [int(a == b) for (a, b) in zip(predictions, y)]
# Percentage of correct predictions; a modulo here would only coincide with
# the percentage when there are exactly 100 samples.
accuracy = 100 * sum(correct) / len(correct)
print('accuracy = {0}%'.format(accuracy))
逻辑回归的优缺点
优点：
形式简单，模型的可解释性非常好。从特征的权重可以看到不同的特征对最后结果的影响，某个特征的权重值比较高，那么这个特征最后对结果的影响会比较大。
模型效果不错。在工程上是可以接受的（作为baseline)，如果特征工程做的好，效果不会太差，并且特征工程可以大家并行开发，大大加快开发的速度。
训练速度较快。分类的时候，计算量仅仅只和特征的数目相关。并且逻辑回归的分布式优化sgd发展比较成熟，训练的速度可以通过堆机器进一步提高，这样我们可以在短时间内迭代好几个版本的模型。
资源占用小,尤其是内存。因为只需要存储各个维度的特征值。
方便输出结果调整。逻辑回归可以很方便的得到最后的分类结果，因为输出的是每个样本的概率分数，我们可以很容易的对这些概率分数进行cutoff，也就是划分阈值(大于某个阈值的是一类，小于某个阈值的是一类)。
缺点：
准确率并不是很高。因为形式非常的简单(非常类似线性模型)，很难去拟合数据的真实分布。
很难处理数据不平衡的问题。举个例子：对于一个正负样本非常不平衡的问题，比如正负样本比为 10000:1，我们把所有样本都预测为正也能使损失函数的值比较小。但是作为一个分类器，它对正负样本的区分能力不会很好。
处理非线性数据较麻烦。逻辑回归在不引入其他方法的情况下，只能处理线性可分的数据，或者进一步说，处理二分类的问题。
逻辑回归本身无法筛选特征。有时候，我们会用gbdt来筛选特征，然后再上逻辑回归。
以上就是python如何实现梯度下降求解逻辑回归的详细内容。

python如何实现梯度下降求解逻辑回归

VIP推荐