# 用不同的方法处理 pandas 数据
import pandas as pd
def printf(t):
    """Print ``t`` followed by an 80-character dashed separator line.

    Args:
        t: Any object; it is rendered with the built-in ``print``.
    """
    print(t)
    # Divider so consecutive outputs are easy to tell apart.
    print("-" * 80)
# Raw string so the backslashes of the Windows path are never read as escape
# sequences ("\A" and "\I" are invalid escapes in a normal string literal).
path = r"F:\All date\IMDB-Movie-Data.csv"
date = pd.read_csv(path)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly instead of printing its return value.
date.info()
print("-" * 80)
#-------------------------------------------------------------
# 1. Mean of the "Rating" column
printf(date["Rating"].mean())
#-------------------------------------------------------------
# 2. Number of distinct directors
date1 = date["Director"].values
printf(type(date))
printf(type(date["Director"]))
printf(type(date1))
# Type chain printed above:
#   DataFrame --[column label]--> Series --[.values]--> numpy.ndarray
printf(len(set(date1)))
#-------------------------------------------------------------
# 3. Number of distinct actors
date2 = date["Actors"].values
# NOTE(review): assumes each "Actors" cell is a comma-separated list of names
# (the same layout the "Genre" column is split on elsewhere in this file) --
# confirm against the CSV. Splitting on whitespace would count first/last
# name fragments instead of actors.
ls = [name.strip() for str1 in date2 for name in str1.split(",")]
printf(len(set(ls)))
#-------------------------------------------------------------
# 用 pandas 处理整型数据并作直方图
"""
之前处理数据都是手动造数据 , 无法处理大规模数据
用 numpy (loadtxt) 处理数据无法处理非数值数据
而用 pandas 就很好的解决了这个问题 , pandas 处理文件很方便
"""
from matplotlib import pyplot as plt
import matplotlib as mtb
import pandas as pd
def printf(t):
    """Print ``t`` followed by an 80-character dashed separator line.

    Args:
        t: Any object; it is rendered with the built-in ``print``.
    """
    print(t)
    # Divider so consecutive outputs are easy to tell apart.
    print("-" * 80)
# Enable Chinese text in figures (font family plus the minus-sign setting).
mtb.rcParams['font.sans-serif'] = ["SimHei"]
mtb.rcParams["axes.unicode_minus"] = False
# Raw string so the backslashes of the Windows path are never read as escape
# sequences ("\A" and "\I" are invalid escapes in a normal string literal).
path = r"F:\All date\IMDB-Movie-Data.csv"
date = pd.read_csv(path)
#----------------------------------------------------------------------
# Take the column's underlying NumPy array; the min/max arithmetic below
# operates on that array.
dateruntime = date['Runtime (Minutes)'].values
#----------------------------------------------------------------------
# Figure size and resolution
plt.figure(figsize=(15, 5), dpi=80)
d = 10  # bin width
# Plain int so it can be handed to range() regardless of the array's dtype.
num = int((dateruntime.max() - dateruntime.min()) // d)
# num + 2 edges: the last edge is min + (num + 1) * d, which is >= max, so
# every value falls inside a bin.
bin_edges = [dateruntime.min() + i * d for i in range(num + 2)]
plt.hist(dateruntime, bin_edges, color="#FF7F50")
# Put the x ticks exactly on the bin edges so ticks and bars line up
# (a separately built range could add a tick beyond the last bin).
plt.xticks(bin_edges)
plt.yticks(range(1, 300, 25))
# Axis labels and title
plt.xlabel("电影时长")
plt.ylabel("数量")
plt.title("1000部电影电影时长情况分布统计")
# Grid; alpha controls how opaque the grid lines are
plt.grid(alpha=0.3, color="#000000")
# Show the figure
plt.show()
# 用 pandas 处理浮点型数据并作直方图
from matplotlib import pyplot as plt
import matplotlib as mtb
import pandas as pd
def printf(t):
    """Print ``t`` followed by an 80-character dashed separator line.

    Args:
        t: Any object; it is rendered with the built-in ``print``.
    """
    print(t)
    # Divider so consecutive outputs are easy to tell apart.
    print("-" * 80)
# Enable Chinese text in figures (font family plus the minus-sign setting).
mtb.rcParams['font.sans-serif'] = ["SimHei"]
mtb.rcParams["axes.unicode_minus"] = False
# Raw string so the backslashes of the Windows path are never read as escape
# sequences ("\A" and "\I" are invalid escapes in a normal string literal).
path = r"F:\All date\IMDB-Movie-Data.csv"
date = pd.read_csv(path)
#----------------------------------------------------------------------
# Take the column's underlying NumPy array; the min/max arithmetic below
# operates on that array.
daterate = date["Rating"].values
#----------------------------------------------------------------------
# Figure size and resolution
plt.figure(figsize=(15, 5), dpi=80)
d = 0.7  # bin width
# Floor division on floats yields a float; range() needs an int.
num = int((daterate.max() - daterate.min()) // d)
# range() cannot step by a float, so the edges are built with a list
# comprehension. num + 2 edges: the last edge is min + (num + 1) * d, which
# is >= max. The same list is used for both the bins and the x ticks so
# they always coincide.
_x = [daterate.min() + i * d for i in range(num + 2)]
plt.hist(daterate, _x, color="#FF7F50")
plt.xticks(_x)
plt.yticks(range(1, 300, 25))
# Axis labels and title
plt.xlabel("电影得分")
plt.ylabel("数量")
plt.title("1000部电影得分情况分布统计")
# Grid; alpha controls how opaque the grid lines are
plt.grid(alpha=0.3, color="#000000")
# Show the figure
plt.show()
# 用 pandas 处理字符串 (1)
# import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib as mtb
# Enable Chinese text in figures: set the sans-serif font family and the
# minus-sign option in one call, then open the figure.
mtb.rcParams.update({
    "font.sans-serif": ["SimHei"],
    "axes.unicode_minus": False,
})
plt.figure(dpi=80, figsize=(15, 5))
def printf(t):
    """Print ``t`` followed by an 80-character dashed separator line.

    Args:
        t: Any object; it is rendered with the built-in ``print``.
    """
    print(t)
    # Divider so consecutive outputs are easy to tell apart.
    print("-" * 80)
# Raw string so the backslashes of the Windows path are never read as escape
# sequences ("\A" and "\I" are invalid escapes in a normal string literal).
path = r"F:\All date\IMDB-Movie-Data.csv"
date = pd.read_csv(path)
#------------------------------------------------------------------------------
# Count how many rows carry each genre label. Every "Genre" cell is split on
# commas, and each resulting label increments its counter.
Gdate = date["Genre"].values
book = {}
for str1 in Gdate:
    for j in str1.split(','):
        book[j] = book.get(j, 0) + 1
# Series keyed by label -> count; its index/values feed the bar chart.
datenum = pd.Series(book)
#------------------------------------------------------------------------------
_x = datenum.index
_y = datenum.values
plt.bar(_x, _y, width=0.5, color="#4B0082")
plt.yticks(range(0, 600, 50))
plt.xlabel("标签类别")
plt.ylabel("数量")
plt.title("不同标签电影数量")
plt.grid(alpha=0.3, color="#000000")
plt.show()
#------------------------------------------------------------------------------
# printf(ls)
# 用 pandas 处理字符串 (2)
# import numpy as np
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib as mtb
# Enable Chinese text in figures: set the sans-serif font family and the
# minus-sign option in one call, then open the figure.
mtb.rcParams.update({
    "font.sans-serif": ["SimHei"],
    "axes.unicode_minus": False,
})
plt.figure(dpi=80, figsize=(15, 5))
def printf(t):
    """Print ``t`` followed by an 80-character dashed separator line.

    Args:
        t: Any object; it is rendered with the built-in ``print``.
    """
    print(t)
    # Divider so consecutive outputs are easy to tell apart.
    print("-" * 80)
# Raw string so the backslashes of the Windows path are never read as escape
# sequences ("\A" and "\I" are invalid escapes in a normal string literal).
path = r"F:\All date\IMDB-Movie-Data.csv"
date = pd.read_csv(path)
# One list of genre labels per row (a list of lists).
datelist = date["Genre"].str.split(",").tolist()
# Distinct labels become the columns. They are sorted because set iteration
# order for strings is not stable between interpreter runs, which would
# otherwise make the column order (and the order of tied bars) vary.
ls = sorted({j for i in datelist for j in i})
# All-zero table: one row per movie, one column per genre label.
zero_list = pd.DataFrame(np.zeros((date.shape[0], len(ls))), columns=ls)
# Mark each movie's genres with 1. .loc accepts a list of column labels, so a
# whole row is filled in one assignment.
for i, genres in enumerate(datelist):
    zero_list.loc[i, genres] = 1
# Column sums = number of movies per genre, largest first.
datesum = zero_list.sum(axis=0)
datesum = datesum.sort_values(ascending=False)
_x = datesum.index
_y = datesum.values
plt.bar(_x, _y, width=0.5, color="#4B0082")
plt.yticks(range(0, 600, 50))
plt.xlabel("标签类别")
plt.ylabel("数量")
plt.title("不同标签电影数量")
plt.grid(alpha=0.3, color="#000000")
plt.show()
# pandas 数据合并
"""
数据的合并
"""
import pandas as pd
import numpy as np
def printf(t):
    """Print ``t`` followed by an 80-character dashed separator line.

    Args:
        t: Any object; it is rendered with the built-in ``print``.
    """
    print(t)
    # Divider so consecutive outputs are easy to tell apart.
    print('-' * 80)
#-----------------------join (row index)----------------------------------------------------------------------------
# join combines two frames by aligning their row index. The two frames must
# not share column names (join raises in that case unless suffixes are
# supplied); cells with no matching row are filled with NaN.
d1 = pd.DataFrame(np.array(range(12)).reshape(3, 4), index=['F', 'D', 'B'], columns=list("abcd"))
d2 = pd.DataFrame(np.array(range(8, 16)).reshape(2, 4), index=['A', 'B'], columns=list("sxhj"))
print(d1, d2, sep='\n')
printf(d1.join(d2))
printf(d2.join(d1))
#-------------------------------------------------------------------------------------------------------------------
#----------------------------merge (column values)------------------------------------------------------------------
# Key selection: on / left_on / right_on
#   on is used when both frames have the key column under the same name;
#   left_on / right_on name the key column of each side when they differ.
# Join type (how): inner (default) keeps the intersection of keys,
#   outer keeps the union.
d3 = pd.DataFrame(np.array([1.0 for i in range(12)]).reshape(3, 4), index=['A', 'B', 'C'], columns=list("MNOP"))
# Assign the whole column at once. Chained indexing (d3['O']['A'] = 'a')
# writes through an intermediate Series, which is not guaranteed to update the
# frame and does nothing under pandas Copy-on-Write; it would also put strings
# into a float column one cell at a time.
d3['O'] = ['a', 'b', 'c']
d4 = pd.DataFrame(np.array([1.0 for i in range(10)]).reshape(2, 5), index=['A', 'B'], columns=list("VWXYZ"))
d4['X'] = ['c', 'd']
printf(d3.merge(d4, left_on='O', right_on='X'))               # default: inner, intersection of keys
printf(d3.merge(d4, left_on='O', right_on='X', how='inner'))  # inner: intersection of keys
printf(d3.merge(d4, left_on='O', right_on='X', how='outer'))  # outer: union of keys
printf(d3.merge(d4, left_on='O', right_on='X', how='left'))   # left: every row of the left frame is kept
printf(d3.merge(d4, left_on='O', right_on='X', how='right'))  # right: every row of the right frame is kept
#------------------------------------------------------------------------------------------------------------------