In [1]:
import pandas as pd gl=pd.read_csv('./Titanic_Data-master/Titanic_Data-master/train.csv') gl.head()
Out[1]:
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
In [2]:
gl.shape#查看大小
Out[2]:
(891, 12)
1.查看基本数据信息
In [3]:
gl.info(memory_usage='deep')#查看基本信息
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): PassengerId 891 non-null int64 Survived 891 non-null int64 Pclass 891 non-null int64 Name 891 non-null object Sex 891 non-null object Age 714 non-null float64 SibSp 891 non-null int64 Parch 891 non-null int64 Ticket 891 non-null object Fare 891 non-null float64 Cabin 204 non-null object Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 318.5 KB
2.查看不同数据类型的占用空间
In [4]:
for dtype in['float64','int64','object']: selevtrd_dtype=gl.select_dtypes(include=[dtype])#塞出不同数据类型 mean_usage_b=selevtrd_dtype.memory_usage(deep=True).mean()#求对应的数据类型的内存平均值 mean_usage_mb=mean_usage_b/1024**2 print('平均内存占用',dtype,mean_usage_mb)
平均内存占用 float64 0.004572550455729167 平均内存占用 int64 0.005685170491536458 平均内存占用 object 0.043910980224609375
In [5]:
import numpy as np int_types=['uint8','int8','int16','int32','int64'] for it in int_types: print(np.iinfo(it))#简化版info,查看每种数据类型的值范围
Machine parameters for uint8 --------------------------------------------------------------- min = 0 max = 255 --------------------------------------------------------------- Machine parameters for int8 --------------------------------------------------------------- min = -128 max = 127 --------------------------------------------------------------- Machine parameters for int16 --------------------------------------------------------------- min = -32768 max = 32767 --------------------------------------------------------------- Machine parameters for int32 --------------------------------------------------------------- min = -2147483648 max = 2147483647 --------------------------------------------------------------- Machine parameters for int64 --------------------------------------------------------------- min = -9223372036854775808 max = 9223372036854775807 ---------------------------------------------------------------
3.通过转换数据类型来减少数据占用内存
In [6]:
def menu_usage(pandas_obj): if isinstance(pandas_obj,pd.DataFrame):#isinstance() 函数来判断一个对象是否是一个已知的类型,类似 type()。 usage_b=pandas_obj.memory_usage(deep=True).sum()#求内存占用量的总和 else: usage_b=pandas_obj.memory_usage(deep=True) usage_mb=usage_b/1024**2 return'{:03.2f}MB'.format(usage_mb)#规定数据格式是小数点后2位 gl_int=gl.select_dtypes(include=['int64'])#去除int64的数据 coverter_int=gl_int.apply(pd.to_numeric,downcast='unsigned')#pd.to_numeric数据转换;downcast='unsigned'向下转换成无符号 print(menu_usage(gl_int)) print(menu_usage(coverter_int))
0.03MB 0.01MB
In [7]:
gl_float=gl.select_dtypes(include=['float64'])#去除int64的数据 coverter_float=gl_int.apply(pd.to_numeric,downcast='float')#pd.to_numeric数据转换;downcast='unsigned'向下转换成无符号 print(menu_usage(gl_float)) print(menu_usage(coverter_float))
0.01MB 0.02MB
4.把所有数据类型转换成对应的不同的数据类型
In [8]:
optimized_gl=gl.copy() optimized_gl[coverter_int.columns]=coverter_int optimized_gl[coverter_float.columns]=coverter_float print(menu_usage(gl)) print(menu_usage(optimized_gl))
0.31MB 0.29MB
5.describe():统计各项lable的属性指标
In [9]:
gl_obj=gl.select_dtypes(include=['object']).copy() gl_obj.describe()
Out[9]:
count | 891 | 891 | 891 | 204 | 889 |
unique | 891 | 2 | 681 | 147 | 3 |
top | Turkula, Mrs. (Hedwig) | male | 347082 | C23 C25 C27 | S |
freq | 1 | 577 | 7 | 4 | 644 |
6.把重复的lable放在一个空间里:即转换成category类型
In [10]:
dow=gl_obj.Sex#抽出对应lable的数据 dow.head()
Out[10]:
0 male 1 female 2 female 3 female 4 male Name: Sex, dtype: object
7.通过把数据类型object=>category,来减少占用的空间
In [11]:
dow_cat=dow.astype('category')#把上面的object转成category类型 dow_cat.head()
Out[11]:
0 male 1 female 2 female 3 female 4 male Name: Sex, dtype: category Categories (2, object): [female, male]
In [12]:
dow_cat.head(10).cat.codes#连接字符串,查出不同类
Out[12]:
0 1 1 0 2 0 3 0 4 1 5 1 6 1 7 1 8 0 9 0 dtype: int8
In [13]:
print(menu_usage(dow))#object类型 print(menu_usage(dow_cat))#category类型,占空间减少了
0.05MB 0.00MB
8.计算整个表通过转换成category类型后的内存
In [14]:
converted_obj=pd.DataFrame()#定义成空的DataFrame for col in gl_obj.columns: num_unique_values=len(gl_obj[col].unique()) num_total_values=len(gl_obj[col]) if num_unique_values/num_total_values<0.5:#寻找重复量最大的一列 converted_obj.loc[:,col]=gl_obj[col].astype('category')#astype转换类型 else: converted_obj.loc[:,col]=gl_obj[col]
In [15]:
print(menu_usage(gl_obj)) print(menu_usage(converted_obj))
0.26MB 0.14MB