Summary of Jupyter notebook problems

https://github.com/ryan403/reactor-sh/blob/336c5ee685f5f70146157e4e326c2affebac72f5/linear_regression/linear_regression_sample.py

Reading spreadsheets from a folder

import pandas as pd
import os
# load the packages first
filedir = os.listdir('project1')  # 'project1' must sit in the same directory as the notebook; write only the folder name
filedir  # shows the file names contained in the folder

pd.read_excel('project1/excel.xlsx')  # read a spreadsheet, file extension included; use pd.read_csv for CSV files

Printing strings

list_test = ['zhangsan', '1', 13]
print(list_test[1])           # prints the string '1'
print(list_test[0] + 'asd')   # prints zhangsanasd; note a string can only be concatenated with another string
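To concatenate the number stored in the list, convert it first or use an f-string; a minimal sketch:

print(list_test[0] + str(list_test[2]))     # zhangsan13, str() converts the integer before concatenation
print(f"{list_test[0]} is {list_test[2]}")  # f-strings do the conversion automatically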

Merging the spreadsheets in one folder into a single list

data_raw_list = []
for i in range(1, len(filedir) + 1):  # filedir = os.listdir('project1'), i.e. every file in that folder
    data_raw_list.append(pd.read_excel('project1/进度分析' + str(i) + '.xlsx'))  # the spreadsheets to merge are all named 进度分析1, 2, 3, ...
    print('project1/进度分析' + str(i) + '.xlsx')
data_raw_list[0]  # show the first spreadsheet
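If the goal is one combined table rather than a list of tables, pandas.concat can stack the list; a minimal sketch, assuming the spreadsheets share the same columns:

data_all = pd.concat(data_raw_list, ignore_index=True)  # stack every sheet into one DataFrame and renumber the index
data_all.shape  # rows and columns after merging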

Sorting data

mammals['bb-ratio'] = mammals['brain'] / mammals['body']  # add a new column
mammals.sort_values(by='bb-ratio', ascending=False).head(10)  # sort by bb-ratio and show the top ten
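sort_values also accepts several columns with a direction per column; a minimal sketch reusing the body and brain columns from above:

mammals.sort_values(by=['body', 'brain'], ascending=[False, True]).head()  # body descending, ties broken by brain ascending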

Scatter plots

mammals.describe()  # summary statistics such as the mean

mammals.plot.scatter(x='金額', y='金額')  # scatter plot
lynx.plot(x='國文', y='數學')  # line plot

mammals.plot.scatter(x='金額', y='金額', loglog=True)  # scatter plot with both axes on a log scale

Linear regression and prediction

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

mammals['body_log'] = np.log(mammals['body'])
mammals['brain_log'] = np.log(mammals['brain'])

X = mammals[['body_log']]
y = mammals['brain_log']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)  # 30% of the rows for testing, 70% for training
reg = LinearRegression()
reg.fit(X_train, y_train)
r2_score(y_test, reg.predict(X_test))  # check how good the predictions are

plt.scatter(y_test, reg.predict(X_test))
plt.plot(y_test, y_test, color='red')  # reference line: perfect predictions would fall on it
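The fitted slope and intercept are stored on the model, and predict works on new values in the same log scale; a minimal sketch with a hypothetical body weight of 50:

print(reg.coef_, reg.intercept_)  # slope and intercept of the log-log fit
new_X = pd.DataFrame({'body_log': [np.log(50.0)]})  # hypothetical new animal with body = 50
np.exp(reg.predict(new_X))  # convert the prediction back from the log scale to a brain size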

K-means Clustering

from sklearn.datasets import load_iris

iris = load_iris()
iris_df = pd.DataFrame(data=np.c_[iris['data'], iris['target']], columns=iris['feature_names'] + ['class'])  # assemble the data
iris_df
X_iris = iris_df[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']]  # the four feature columns
y_iris = iris_df['class']
X_iris_train, X_iris_test, y_iris_train, y_iris_test = train_test_split(X_iris, y_iris, test_size=0.3, random_state=0)  # split the data for prediction
-----------
from sklearn.cluster import KMeans
from sklearn import metrics

this_KMeans = KMeans(n_clusters=3, random_state=0)  # split into 3 clusters
this_km = this_KMeans.fit(X_iris_train)  # fit
y_pred = this_km.predict(X_iris_test)  # predict

metrics.accuracy_score(y_iris_test, y_pred)  # score
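K-means cluster ids are arbitrary (cluster 0 need not correspond to class 0), so comparing them directly with accuracy_score can understate the result; a minimal sketch, not part of the original notes, that first maps each cluster to the majority true class seen in training:

train_clusters = this_km.predict(X_iris_train)  # cluster id of every training point
mapping = {c: y_iris_train[train_clusters == c].mode().iloc[0] for c in np.unique(train_clusters)}  # cluster id -> most common true class
y_pred_mapped = pd.Series(y_pred).map(mapping)  # relabel the test predictions
metrics.accuracy_score(y_iris_test, y_pred_mapped)  # accuracy after aligning the labels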
-----------
X_iris_train, X_iris_test, y_iris_train, y_iris_test = train_test_split(X_iris, y_iris, test_size=0.3, random_state=0)  # split the data again for prediction

from sklearn.neighbors import KNeighborsClassifier

this_KNC = KNeighborsClassifier(n_neighbors=5)
this_KNC_model = this_KNC.fit(X_iris_train, y_iris_train)
y_knc_pred = this_KNC_model.predict(X_iris_test)

metrics.accuracy_score(y_iris_test,y_knc_pred)

------------
from sklearn.model_selection import cross_val_score

scores = cross_val_score(this_KNC_model, X_iris, y_iris, cv=10, scoring='accuracy')  # 10-fold cross-validated accuracy
scores.mean()

-------------find out which value of n_neighbors works best
k_range = list(range(1, 26))
k_dict = {}
for k in k_range:
    this_KNC = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(this_KNC, X_iris, y_iris, cv=10, scoring='accuracy')
    k_dict[k] = scores.mean()  # mean cross-validated accuracy for this k

scores_max = max(k_dict, key=k_dict.get)  # the k with the highest mean accuracy
print(scores_max, k_dict[scores_max])
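To see how the accuracy changes with k rather than only the best value, the dictionary can be plotted directly; a minimal sketch:

plt.plot(list(k_dict.keys()), list(k_dict.values()))  # mean cross-validated accuracy for each n_neighbors
plt.xlabel('n_neighbors')
plt.ylabel('mean accuracy')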

-------------PCA transform: reduce the 4-dimensional data to 3 dimensions for plotting
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

X_iris_reduced = PCA(n_components=3).fit_transform(X_iris)

fig = plt.figure(1, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=110)
ax.scatter(X_iris_reduced[:, 0], X_iris_reduced[:, 1], X_iris_reduced[:, 2], c=y_iris, cmap='viridis', edgecolors='k', s=40)
# cmap maps colour to value, edgecolors is the marker outline colour (black here), s is the marker size,
# elev is the elevation angle (default 30), azim is the azimuth angle (default -60)
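To check how much information the three components keep, the fitted PCA object exposes explained_variance_ratio_; a minimal sketch:

pca = PCA(n_components=3).fit(X_iris)
print(pca.explained_variance_ratio_)        # share of the variance captured by each component
print(pca.explained_variance_ratio_.sum())  # total variance retained after reducing to 3 dimensions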

Filtering rows and columns that meet a condition

data11 = data.loc[data['Stock'] == 'A']  # keep only the rows where the Stock column equals 'A'
y = data.iloc[:, 5:]  # select every column from column index 5 onwards
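Several conditions can be combined with & and |, each wrapped in parentheses, and rows and columns can be picked in one loc call; a minimal sketch where Price is a hypothetical column used only for illustration:

subset = data.loc[(data['Stock'] == 'A') & (data['Price'] > 100), ['Stock', 'Price']]  # rows meeting both conditions, two columns kept (Price is hypothetical)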