# 梯度下降 (Gradient Descent)
import numpy as np
import matplotlib.pyplot as plt
"""
seed( ) 用于指定随机数生成时所用算法开始的整数值,如果使用相同的seed( )值,则每次生成的随即数都相同,
如果不设置这个值,则系统根据时间来自己选择这个值,此时每次生成的随机数因时间差异而不同
"""
# Generate N noisy observations of the true line y = 9.81 * x on [0, 5).
np.random.seed(42)  # fixed seed for reproducible random draws
N = 100
x = np.random.rand(N, 1) * 5  # inputs, shape (N, 1), uniform on [0, 5)
y = 9.81 * x  # noise-free targets
noise = 2 * np.random.randn(N, 1)  # Gaussian noise with standard deviation 2
y_obs = y + noise  # observed (noisy) targets
plt.scatter(x, y_obs, label="Observations")
plt.plot(x, y, c='r', label="True function")
plt.legend()
plt.show()
def f(w):
    # Linear model prediction: w * x.
    # NOTE(review): reads the module-level array `x` (the training inputs).
    return w * x
def loss_function(e):
    """Mean squared error of the residuals.

    :param e: array of residuals (observed - predicted)
    :return: mean of the squared residuals (scalar)
    """
    # Use np.mean instead of dividing by the module-level constant N, so the
    # function is correct for residual vectors of any length, not only N.
    return np.mean(np.square(e))
def dL_dw(e, w):
    # Gradient of the mean-squared-error loss with respect to w:
    # dL/dw = -2/N * sum(e * x).
    # NOTE(review): reads the module-level array `x`; parameter `w` is unused.
    return -2 * np.mean(e * x)
def gradient_descent(iter=100, gamma=0.1):
    """Run plain gradient descent on the scalar weight w.

    :param iter: number of iterations
    :param gamma: learning rate (step size)
    :return: history of w values, history of loss values
    """
    w = 10 * np.random.random()  # random positive starting weight
    params = []
    loss = np.zeros((iter, 1))
    for step in range(iter):
        params.append(w)
        residual = y_obs - f(w)
        loss[step] = loss_function(residual)
        # Take one step against the gradient.
        w = w - gamma * dL_dw(residual, w)
    return params, loss
# Run gradient descent via the function above, then repeat the same loop
# inline so the final gradient and parameter trace can be printed.
params, loss = gradient_descent()
iter = 100   # number of iterations
gamma = 0.1  # learning rate
# NOTE(review): this inline version initialises with randn() (may be
# negative), while gradient_descent() uses random() — confirm intentional.
w = 10 * np.random.randn()
params = []
loss = np.zeros((iter, 1))
for i in range(iter):
    params.append(w)
    e = y_obs - f(w)            # residuals under the current w
    loss[i] = loss_function(e)
    w_news = w - gamma * dL_dw(e, w)  # gradient step
    w = w_news
print(dL_dw(e, w))  # gradient at the last iterate (should be near zero)
plt.plot(loss)
plt.show()
params = np.array(params)
plt.plot(params)
plt.title('Gradient Descent')
plt.xlabel('w')
plt.show()
print(params[-1])  # final estimate of w (true coefficient is 9.81)
# k-means
import matplotlib.pyplot as plt
import numpy as np
# Sample K Gaussian clusters in D dimensions for the k-means demo.
np.random.seed(42)
K = 3
D = 2
N = 1000  # NOTE(review): unused — the cluster sizes come from Ns below
Ns = [300, 400, 300]  # points per cluster
means = 5 * np.random.randn(K, D)  # true cluster centers
print(means)
x = []
for n, m in zip(Ns, means):
    print(n, m)
    x.append(np.random.randn(n, D) + m)  # unit-variance cluster around m
print([x_.shape for x_ in x])
for x_, m in zip(x, means):
    print(x_.shape, m.shape)
    plt.scatter(x_[:, 0], x_[:, 1])
    plt.plot(m[0], m[1], 'kx')  # mark the true center
plt.title('True Clusters')
plt.show()
data = np.vstack(x)  # combined dataset, shape (sum(Ns), D)
print(data.shape)
plt.scatter(data[:, 0], data[:, 1])
plt.show()
def dist(x1, x2, axis=None):
    """
    Euclidean distance between two points.

    :param x1: first point, e.g. (x1, y1)
    :param x2: second point, e.g. (x2, y2)
    :param axis: axis along which the squared differences are summed
                 (None sums over all elements)
    :return: the Euclidean distance
    """
    diff = x1 - x2
    return np.sqrt(np.square(diff).sum(axis))
def distance_matrix(x, m):
    """
    Calculates the distance from each element of x to each element of m.

    :param x: data points, shape (n, d)
    :param m: possible means, shape (k, d)
    :return: (n, k) distance matrix where entry [i, j] is ||x[i] - m[j]||
    """
    # Broadcast to (n, k, d) pairwise differences and reduce the feature
    # axis in one vectorized pass instead of an O(n*k) Python double loop;
    # this also drops the dependence on the module-level `dist` helper.
    diff = np.asarray(x)[:, None, :] - np.asarray(m)[None, :, :]
    return np.sqrt(np.sum(np.square(diff), axis=-1))
# Sanity check: the distance matrix of a point set with itself has a zero diagonal.
x_test = np.array([[0, 1], [1, 0], [0, 0], [1, 1]])
print(distance_matrix(x_test, x_test))
def dist(x1, x2, axis=-1):
    """
    Euclidean distance between two points.

    Redefines the earlier `dist`: the default reduction axis is now -1
    (the last axis) so calls broadcast row-wise over arrays of points.

    :param x1: first point or array of points
    :param x2: second point or array of points
    :param axis: axis along which the squared differences are summed
    :return: the Euclidean distance(s)
    """
    delta = np.subtract(x1, x2)
    return np.sqrt(np.square(delta).sum(axis=axis))
def distance_matrix_broadcasting(x, m):
    """
    Calculates the distance from each element of x to each element of m.

    :param x: data points, shape (n, d)
    :param m: possible means, shape (k, d)
    :return: (n, k) distance matrix where entry [i, j] is ||x[i] - m[j]||
    """
    # Fully vectorized: broadcast (n, 1, d) - (1, k, d) and reduce the
    # feature axis, removing both the Python loop over the means and the
    # dependence on the module-level `dist` helper.
    diff = np.asarray(x)[:, None, :] - np.asarray(m)[None, :, :]
    return np.sqrt(np.sum(np.square(diff), axis=-1))
# Sanity check of the broadcasting version on the same test points.
x_test = np.array([[0, 1], [1, 0], [0, 0], [1, 1]])
print(distance_matrix_broadcasting(x_test, x_test))
# ---- Lloyd's algorithm (k-means) on the generated data ----
k = 3
iters = 10
print(data.shape)
print(data)
# Random initial means.
# NOTE(review): if a cluster ends up empty, data[idx].mean(axis=0) yields
# NaN and that mean never recovers — confirm this is acceptable for a demo.
means = np.random.randn(k, data.shape[1])
for i in range(iters):
    # Assignment step: distance from every point to every current mean.
    d = distance_matrix_broadcasting(data, means)
    '''
    array([[0, 4, 2],
    [3, 1, 5]]) # 3*2
    >>> np.argmin(a, axis=0)
    映射到行x
    找出每列最小元素的下标
    array([0 1 0]) # (3,)
    >>> np.argmin(a, axis=1)
    映射到列y
    找出每行最小元素的下标
    array([0 1]) #(2,)
    '''
    cluster = d.argmin(axis=-1)  # index of the nearest mean for each point
    # Update step: move each mean to the centroid of its assigned points.
    for j in range(k):
        idx = cluster == j
        plt.plot(means[j, 0], means[j, 1], 'rx')  # current mean position
        plt.scatter(data[idx, 0], data[idx, 1])   # points assigned to it
        means[j, :] = data[idx].mean(axis=0)
    # NOTE(review): indentation was lost in this copy — plt.show() is
    # presumably called once per iteration; confirm against the original.
    plt.show()
# KNN
# Make every expression in a notebook cell display its value, not just the last.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams['axes.unicode_minus'] = False    # render minus signs correctly with a CJK font
plt.rcParams['font.sans-serif'] = ['Simhei']  # font that can display Chinese labels
plt.style.use('ggplot')
# Ten labelled wine samples: colour depth, alcohol content and variety
# (plotted below: label 1 as 赤霞珠, label 0 as 黑皮诺).
rowdata = {'颜色深度': [14.13, 13.2, 13.16, 14.27, 13.24, 12.07, 12.43, 11.79, 12.37, 12.04],
           '酒精浓度': [5.64, 4.28, 5.68, 4.80, 4.22, 2.76, 3.94, 3.1, 2.12, 2.6],
           '品种': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]}
wine_data = pd.DataFrame(rowdata)
print(wine_data)
X = np.array(wine_data.iloc[:, 0:2])  # features: first two columns
y = np.array(wine_data.iloc[:, -1])   # class labels: last column
new_data = np.array([12.03, 4.1])     # the unlabelled sample to classify
plt.scatter(X[y == 1, 0], X[y == 1, 1], color='red', label='赤霞珠')
plt.scatter(X[y == 0, 0], X[y == 0, 1], color='purple', label='黑皮诺')
plt.scatter(new_data[0], new_data[1], color='yellow')
print(new_data)
# NOTE(review): the horizontal axis plots column 0 (颜色深度) but is labelled
# 酒精浓度 — the axis labels appear swapped; confirm against the intended plot.
plt.xlabel('酒精浓度')
plt.ylabel('颜色深度')
plt.legend(loc='lower right')
plt.savefig('葡萄酒样本.png')
plt.show()
from math import sqrt
# Euclidean distance from every training sample to the new point.
distance = [sqrt(np.sum((x - new_data) ** 2)) for x in X]
print(distance)
sort_dist = np.argsort(distance)  # sample indices ordered by increasing distance
print('sort_dist:\n', sort_dist)
k = 3
topK = [y[i] for i in sort_dist[:k]]  # labels of the k nearest neighbours
print(topK)
print(pd.Series(topK).value_counts().index[0])  # majority vote -> predicted label
def KNN(new_data, dataSet, k):
    '''
    KNN classifier.

    Parameters:
        new_data: the sample to classify (1-D array of feature values)
        dataSet: DataFrame whose last column is the class label and all
                 preceding columns are features
        k: number of nearest neighbours used in the majority vote
    Returns:
        result: single-element list containing the predicted label
    '''
    from math import sqrt
    from collections import Counter
    import numpy as np
    import pandas as pd
    result = []
    # BUGFIX: the feature slice previously stopped at len(columns) - 2,
    # silently dropping the last feature column (cf. the inline demo which
    # uses iloc[:, 0:2] on the 3-column frame). Every column except the
    # final label column is a feature.
    features = np.array(dataSet.iloc[:, 0:len(dataSet.columns) - 1])
    # Euclidean distance from every training sample to the new point.
    distance = [sqrt(np.sum((x - new_data) ** 2)) for x in features]
    sort_dist = np.argsort(distance)
    # Labels of the k nearest neighbours; majority vote decides the class.
    topK = [dataSet.iloc[:, -1][i] for i in sort_dist[:k]]
    result.append(pd.Series(topK).value_counts().index[0])
    return result
# Classify the new sample with the reusable KNN function.
new_data = np.array([12.03, 4.1])
k = 3
print(KNN(new_data, wine_data, k))
# 决策树 (Decision Tree)
# 香农熵 (Shannon Entropy)
def calEnt(dataSet):
    """Shannon entropy of the label column (the last column) of dataSet."""
    total = dataSet.shape[0]
    # Class frequencies -> probabilities -> entropy = -sum(p * log2(p)).
    counts = dataSet.iloc[:, -1].value_counts()
    probs = counts / total
    return (-probs * np.log2(probs)).sum()
# NOTE(review): `dataSet` is defined further down the file — running the file
# top-to-bottom raises NameError here (notebook cell-order artifact).
calEnt(dataSet)
# 数据集最佳切分函数 (best-split function for the dataset)
import numpy as np
import pandas as pd
# Toy dataset for the decision-tree demo: two binary features and a label.
row_data = {'是否陪伴': [0, 0, 0, 1, 1],
            '是否玩游戏': [1, 1, 0, 1, 1],
            '渣男': ['是', '是', '不是', '不是', '不是']}
dataSet = pd.DataFrame(row_data)
def calEnt(dataSet):
    """Shannon entropy of the label column (last column) of dataSet."""
    # value_counts(normalize=True) yields the class probabilities directly.
    p = dataSet.iloc[:, -1].value_counts(normalize=True)
    return (-p * np.log2(p)).sum()
def bestSplit(dataSet):
    """Pick the feature column whose split yields the largest information gain.

    :param dataSet: DataFrame whose last column is the class label
    :return: index of the best feature column (-1 if no split improves)
    """
    baseEnt = calEnt(dataSet)  # entropy before any split
    bestGain = 0
    axis = -1
    for col in range(dataSet.shape[1] - 1):
        # Weighted entropy of the children produced by splitting on `col`.
        ents = 0
        for value in dataSet.iloc[:, col].value_counts().index:
            subset = dataSet[dataSet.iloc[:, col] == value]
            ents += (subset.shape[0] / dataSet.shape[0]) * calEnt(subset)
        print('第{}列的信息熵为{}'.format(col, ents))
        infoGain = baseEnt - ents
        print('第{}列的信息增益为{}\n'.format(col, infoGain))
        if infoGain > bestGain:
            bestGain = infoGain
            axis = col
    print("第{}列为最优切分列".format(axis))
    return axis
# Report the best column to split the toy dataset on.
print(bestSplit(dataSet))
# 按照给定列切分数据集 (split the dataset by a given column)
def mySplit(dataSet, axis, value):
    """
    Split the dataset on a given column.

    Parameters:
        dataSet: original DataFrame
        axis: index of the column to split on
        value: attribute value to keep
    Returns:
        redataSet: the rows where column `axis` equals `value`,
        with that column removed
    """
    key = dataSet.columns[axis]
    mask = dataSet[key] == value
    return dataSet.loc[mask, :].drop(key, axis=1)
# Keep the rows where the first feature equals 1, dropping that column.
print(mySplit(dataSet, 0, 1))