dummy_variable

扫码查看
有时需要把分类数据(categorical data)转换成数值,这时就要用到虚拟变量(dummy variable)。我发现做这件事的方法很多:可以用 pd.Series.map,可以用 pd.DataFrame.applymap,也可以用 sklearn.preprocessing 里提供的各种 Encoder。
例如:

点击(此处)折叠或打开

  1. from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
代码如下

点击(此处)折叠或打开

  1. #import package
  2. import statsmodels.api as sm
  3. import statsmodels.formula.api as smf
  4. import seaborn as sns
  5. import matplotlib.pyplot as plt
  6. import pandas as pd
  7. import numpy as np
  8. from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

  9. sns.set()

  10. data=pd.read_csv('data-analysis/python-jupyter/1.03. Dummies.csv')
  11. # %load 11 12 15 30
  12. print(data.shape)
  13. new_data=data.copy()
  14. #new_data['Attendance']=new_data['Attendance'].map({'Yes':1, 'No':0})

  15. y=new_data['GPA']
  16. X1=new_data[['SAT','Attendance']]

  17. '''
  18. or do this
  19. '''
  20. label_encoder=LabelEncoder()
  21. new_data['Attendance']=label_encoder.fit_transform(new_data['Attendance'])


  22. results=smf.ols(formula='GPA ~ SAT + Attendance', data=new_data).fit()
  23. '''
  24. or do this
  25. x=sm.add_constant(X1)
  26. results=sm.OLS(y,x).fit()
  27. '''
  28. print(results.summary())
  29. # %load 23-28

  30. plt.scatter(new_data['SAT'], y)
  31. yhat_no=0.6439 + 0.0014 * new_data['SAT']
  32. yhat_yes = 0.8665 + 0.0014*new_data['SAT']
  33. fig = plt.plot(new_data['SAT'], yhat_no, lw=2, c='#006837')
  34. fig2= plt.plot(new_data['SAT'], yhat_yes, lw=2, c='#050026')
  35. plt.xlabel('SAT', fontsize=20)
  36. plt.ylabel('GPA', fontsize=20)
  37. plt.show()


10-01 05:15
查看更多