Understanding the data
Data analysis
For CV tasks, heavy tabular-style data preprocessing is generally unnecessary
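Before modeling, a quick look at the CSV confirms the layout and class balance. A minimal EDA sketch, assuming the standard digit-recognizer format (a "label" column plus 784 pixel columns in [0, 255]):
import pandas as pd
train = pd.read_csv('data/train.csv')
print(train.shape)                    # (42000, 785): label + 784 pixels per image
print(train['label'].value_counts())  # class balance across digits 0-9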
Model building
Use a Kaggle kernel
- Code → New Notebook
Deep-learning models
PyTorch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from torch.nn import Module, Linear, Conv2d, MaxPool2d, LogSoftmax, ReLU, Sequential, AdaptiveAvgPool2d, Dropout
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch import flatten
import os
import torchvision
import gc  # garbage-collection module
class dataset(Dataset):
    """
    Dataset wrapping the Kaggle MNIST-style CSV.

    Parameters:
        path: path to a CSV file with a "label" column plus 784 pixel columns
    """
    def __init__(self, path):
        _dataset = pd.read_csv(path)
        self.data = _dataset.drop("label", axis=1)
        self.labels = _dataset["label"]

    def __len__(self):
        return len(self.labels.index)

    # Convert one row into a (1, 28, 28) float tensor plus its label
    def __getitem__(self, i):
        device = 'cuda' if torch.cuda.is_available() else 'cpu'  # is_available is a function: note the ()
        x = torch.as_tensor(np.array(self.data.loc[i, :]).reshape((28, 28)),
                            dtype=torch.float32, device=device).unsqueeze(0)
        y = torch.as_tensor(np.array(self.labels[i]), dtype=torch.long, device=device)
        return x, y
# Load the data
train_path = 'data/train.csv'
test_path = 'data/test.csv'
train_set = dataset(train_path)
test_set = pd.read_csv(test_path)
# Convert the test set into an (N, 1, 28, 28) tensor
test_set = test_set.values.reshape(-1, 1, 28, 28)
test_set.shape
test_set = torch.as_tensor(np.array(test_set), dtype=torch.float32,
                           device='cuda' if torch.cuda.is_available() else 'cpu')
test_set.shape
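A quick sanity check that one training sample has the shape and dtype the model below expects:
sample_x, sample_y = train_set[0]
print(sample_x.shape, sample_x.dtype)  # torch.Size([1, 28, 28]) torch.float32
print(sample_y.shape, sample_y.dtype)  # torch.Size([]) torch.int64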
class CNN(Module):
    def __init__(self, output_size=10, in_size=1):
        super(CNN, self).__init__()
        # Three conv blocks: 1 -> 64 -> 128 -> 256 channels
        self.conv = Sequential(
            Conv2d(in_channels=in_size, out_channels=64, kernel_size=3),
            ReLU(inplace=True),
            MaxPool2d(kernel_size=2, stride=2),
            Conv2d(in_channels=64, out_channels=128, kernel_size=3),
            ReLU(inplace=True),
            MaxPool2d(kernel_size=2, stride=2),
            Dropout(0.5),
            Conv2d(in_channels=128, out_channels=256, kernel_size=3),
            ReLU(inplace=True)
        )
        # Adaptive pooling fixes the feature map at 6x6 regardless of input size
        self.adap = AdaptiveAvgPool2d(output_size=(6, 6))
        self.fc = Sequential(
            Linear(256 * 6 * 6, 120),
            ReLU(inplace=True),
            Linear(120, 84),
            ReLU(inplace=True),
            Linear(84, output_size),
            LogSoftmax(dim=1),
        )

    def forward(self, x):
        x = self.conv(x)
        x = self.adap(x)
        x = flatten(x, 1)  # flatten everything except the batch dimension
        x = self.fc(x)
        return x
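A quick forward pass on a dummy batch confirms the output shape (10 log-probabilities per image):
with torch.no_grad():
    out = CNN()(torch.randn(2, 1, 28, 28))  # dummy batch of two 28x28 images
print(out.shape)  # torch.Size([2, 10])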
BATCH_SIZE = 8
EPOCHS = 50
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CNN().to(device=device)
# Data loading (the test tensor is fed to the model directly later, so it needs no loader)
train_data_loader = DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True)
# The model already ends in LogSoftmax, so pair it with NLLLoss
# (LogSoftmax + NLLLoss is equivalent to CrossEntropyLoss on raw logits)
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters())
steps_per_epoch = len(train_data_loader)
for epoch in range(0, EPOCHS):
    num_samples = 0
    num_correct = 0
    running_loss = 0
    for (X, y) in tqdm(train_data_loader, desc="Steps per epoch", unit="steps"):
        X = X.to(device)
        y = y.to(device)
        scores = model(X)
        _, predictions = scores.max(1)
        num_correct += (predictions == y).sum()
        num_samples += predictions.size(0)
        loss = criterion(scores, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch + 1} / {EPOCHS} - loss: {running_loss / steps_per_epoch:.4f} - accuracy {float(num_correct) / float(num_samples) * 100:.2f}')
print("Training finished")
gc.collect()  # reclaim memory
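Unlike the Keras pipeline below, this loop has no held-out validation set. torch.utils.data.random_split can carve one out of train_set before training; a sketch (the 90/10 ratio is illustrative):
from torch.utils.data import random_split
n_val = len(train_set) // 10  # hold out 10% for validation (illustrative)
train_subset, val_subset = random_split(train_set, [len(train_set) - n_val, n_val])
train_data_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True)
val_data_loader = DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=False)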
def predict(model, input_data):
    model.eval()  # switch off dropout for inference
    with torch.no_grad():
        predictions = model(input_data)
    return predictions
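This runs all 28,000 test images in one forward pass; if that doesn't fit in GPU memory, the same prediction can run in chunks (a sketch, batch size illustrative):
def predict_batched(model, input_data, batch_size=256):
    # Memory-friendly alternative: predict in chunks instead of one giant batch
    model.eval()
    outputs = []
    with torch.no_grad():
        for start in range(0, len(input_data), batch_size):
            outputs.append(model(input_data[start:start + batch_size]))
    return torch.cat(outputs)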
predictions = predict(model, test_set)
predictions = predictions.cpu()  # move the results back to the CPU
new_pred = torch.argmax(predictions, dim=-1).numpy()
df_test = pd.read_csv(test_path)
sub = {'ImageId': df_test.index + 1, 'Label': new_pred}
basic_sub = pd.DataFrame(data=sub)
basic_sub.to_csv("submission.csv", index=False)
basic_sub.head()
TensorFlow
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import keras
from tensorflow.keras.utils import to_categorical  # one-hot encoding
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential  # the sequential model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D  # conv-net building blocks: convolution, pooling, dense, flatten
from tensorflow.keras.optimizers import RMSprop  # optimizer
%matplotlib inline
import os
os.listdir("/kaggle/input/digit-recognizer")
# Load the data
train = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
test = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')
# Split features from labels, then visualize the class balance
y_train = train['label']
X_train = train.drop('label', axis=1)
sns.countplot(x=y_train)
y_train.value_counts()
# Normalize the pixel values to [0, 1]
X_train = X_train / 255.0
test = test / 255.0
X_train.isnull().any().describe()  # check for missing values
# Reshape the data:
# turn each 784-pixel row vector into a 28x28x1 3D matrix.
X_train = X_train.values.reshape(-1, 28, 28, 1)  # -1 lets numpy infer the first dimension
test = test.values.reshape(-1, 28, 28, 1)
# A tiny demo of how reshape(-1, ...) infers a dimension:
z = np.array([[1, 2, 3, 4],
              [5, 6, 7, 8],
              [9, 10, 11, 12],
              [13, 14, 15, 16]])
z.reshape(-1, 2, 2).shape  # (4, 2, 2): the -1 axis is inferred as 16 / (2*2) = 4
# One-hot encode the labels
y_train = to_categorical(y_train, num_classes=10)
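For example, a single digit becomes a 10-dimensional indicator vector:
to_categorical([3], num_classes=10)
# array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]], dtype=float32)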
# Split off a validation set
X_train, X_val, Y_train, Y_val = train_test_split(X_train, y_train, test_size=0.1)
# Visualize one sample
plt.imshow(X_train[0][:, :, 0])
# Define the CNN by hand
# my CNN architecture is In -> [[Conv2D->relu]*2 -> MaxPool2D -> Dropout]*2 -> Flatten -> Dense -> Dropout -> Out
model = Sequential()
model.add(Conv2D(filters=32, kernel_size=(5, 5), padding='same',
                 activation='relu', input_shape=(28, 28, 1)))
model.add(Conv2D(filters=32, kernel_size=(5, 5), padding='same',
                 activation='relu'))
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Conv2D(filters=64, kernel_size=(3, 3), padding='same',
                 activation='relu'))
model.add(Conv2D(filters=64, kernel_size=(3, 3), padding='same',
                 activation='relu'))
model.add(MaxPool2D(pool_size=(2, 2), strides=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(10, activation="softmax"))
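Printing the summary is a quick way to check layer output shapes and parameter counts before training:
model.summary()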
# Configure the optimizer (the legacy `decay` argument was removed from recent
# tf.keras optimizers, so it is omitted here)
optimizer = RMSprop(learning_rate=0.001, rho=0.9, epsilon=1e-08)
# Compile the model
model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])
# Train
history = model.fit(X_train, Y_train, epochs=10, batch_size=80, validation_data=(X_val, Y_val))
# Plot the training/validation loss and accuracy curves
fig, ax = plt.subplots(2, 1)
ax[0].plot(history.history['loss'], color='b', label="Training loss")
ax[0].plot(history.history['val_loss'], color='r', label="Validation loss")
legend = ax[0].legend(loc='best', shadow=True)
ax[1].plot(history.history['accuracy'], color='b', label="Training accuracy")
ax[1].plot(history.history['val_accuracy'], color='r', label="Validation accuracy")
legend = ax[1].legend(loc='best', shadow=True)
# Predict on the test set
results = model.predict(test)
# Take the index of the highest-probability class for each row
results = np.argmax(results, axis=1)
results = pd.Series(results, name="Label")
results
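The PyTorch and random-forest pipelines both write a submission file; a matching sketch for these predictions (ImageId simply runs 1..N over the test rows):
submission = pd.concat([pd.Series(range(1, len(results) + 1), name="ImageId"), results], axis=1)
submission.to_csv("submission.csv", index=False)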
Traditional models
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
train = pd.read_csv("data/train.csv")
train.info()
test = pd.read_csv("data/test.csv")
sub = pd.read_csv("data/sample_submission.csv")
X_train = train.drop('label', axis=1)
y_train = train['label']
estimator = RandomForestClassifier()
param_grid = {'n_estimators':[50,120,160,200,250]}
grid = GridSearchCV(estimator, param_grid, cv=10, scoring='accuracy', return_train_score=False,verbose=1)
grid_search=grid.fit(X_train, y_train)
## Think about what to do when there are many hyperparameters (i.e., how should you tune them?)
## https://blog.csdn.net/qq_35040963/article/details/88832030
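One common answer is to sample the joint space instead of exhausting it; a sketch using sklearn's RandomizedSearchCV (the parameter ranges are illustrative):
from sklearn.model_selection import RandomizedSearchCV
param_dist = {
    'n_estimators': [50, 120, 160, 200, 250],  # illustrative ranges
    'max_depth': [None, 10, 20, 40],
    'min_samples_split': [2, 5, 10],
}
rand_search = RandomizedSearchCV(RandomForestClassifier(), param_dist,
                                 n_iter=10, cv=3, scoring='accuracy', verbose=1)
# rand_search.fit(X_train, y_train)  # same interface as GridSearchCV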
print(grid_search.best_params_)
print("Training accuracy after optimization : {}".format(grid_search.best_score_) )
# Refit a forest with the best n_estimators found by the grid search
estimator = RandomForestClassifier(n_estimators=grid_search.best_params_['n_estimators'])
estimator.fit(X_train, y_train)
pred_test = estimator.predict(test)
sub['Label']= np.array(pred_test)
sub.to_csv('data/submission.csv',index=False)
sub