# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import binarize
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score,recall_score,average_precision_score,auc
# In[2]:
data = pd.read_csv("D:/Users/SGG91044/Desktop/MEP_no_defect_data_pivot_test.csv")
# In[3]:
data.head()
# In[4]:
data.drop(columns=["lotid","waferid","defect_count","eqpid","Chamber","Step","Recipie_Name"],inplace=True)
data
# In[5]:
data.iloc[:,0:17] = data.iloc[:,0:17].apply(pd.to_numeric,errors='coerce')
# In[6]:
for i in range(0,17):
med = np.median(data.iloc[:,i][data.iloc[:,i].isna() == False])
data.iloc[:,i] = data.iloc[:,i].fillna(med)
# In[10]:
nz = Normalizer()
X=data.iloc[:,0:19]=pd.DataFrame(nz.fit_transform(data.iloc[:,0:17]),columns=data.iloc[:,0:17].columns)
# In[11]:
X
# In[12]:
X_train, X_test = train_test_split(
X, test_size=0.3, random_state=8)
# In[30]:
# fit the model
clf = IsolationForest( max_samples=10000,random_state=10 )
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
# In[35]:
scores_pred = clf.decision_function(X_train.values)
scores_pred
# In[36]:
clf.decision_function(X_test)