将鱼图像数据进行操作，使用numpy知识

import numpy as np

import matplotlib.pyplot as plt

%matplotlib inline

#使用了 %matplotlib inline 之后，咱们可以不用调用show方法，图像会直接嵌入显示在代码下面

fish = plt.imread('./fish.png')

plt.imshow(fish)

<matplotlib.image.AxesImage at 0x8e231d0>

（一）pandas的两种对象-LMLPHP

fish.shape

(243, 326, 3)

#把图片变成灰色的（这里只取第0个通道，即R通道，作为简单近似；严格的灰度化需要对R、G、B三个通道做加权平均）

fish1 = fish[::,::,0]

fish1.shape

(243, 326)

plt.imshow(fish1,cmap = "gray")

#灰度化处理   （本质  就是降维）  黑白照片就是二维数据，彩色照片三维或者多维的！！！

<matplotlib.image.AxesImage at 0x9783c88>

（一）pandas的两种对象-LMLPHP

#人脸识别的

#使用一个叫opencv的扩展库，它是一个计算机视觉库（有专门介绍opencv的书）

import cv2

sanpang = cv2.imread("./cv2_change_head/j.jpg")

#matplotlib显示图片时使用的通道顺序是RGB，而cv2在读数据的时候，通道顺序是BGR，所以显示之前要把最后一个维度反转

plt.imshow(sanpang[::,::,::-1])

#专门用来识别人脸部位的一个对象

cascade = cv2.CascadeClassifier()

#加载现成的算法

cascade.load("./cv2_change_head/haarcascade_frontalface_default.xml")

#使用人脸识别的类进行识别

face = cascade.detectMultiScale(sanpang)

face

array([[225,  76,  72,  72]], dtype=int32)

（一）pandas的两种对象-LMLPHP

dog = cv2.imread("./cv2_change_head/dog.jpg")

# detectMultiScale returns one rectangle per detected face as (x, y, w, h):
# x, y are the column/row of the top-left corner, w, h the width/height.
for (x, y, w, h) in face:

    # Resize the dog picture to the size of this particular face instead of a
    # hard-coded 72x72, so the assignment below works for any detection.
    # cv2.resize expects the target size as (width, height).
    small_dog = cv2.resize(dog, (int(w), int(h)))

    # Image arrays are indexed [row, column], i.e. [y, x].
    sanpang[y:y+h, x:x+w] = small_dog

# cv2 stores channels as BGR; reverse the last axis to display as RGB.
plt.imshow(sanpang[::,::,::-1])

<matplotlib.image.AxesImage at 0xb385630>

（一）pandas的两种对象-LMLPHP

Pandas的数据结构

导入pandas：

三剑客

import pandas as pd

from pandas import Series,DataFrame

import matplotlib.pyplot as plt

import numpy as np

1、Series

Series是一种类似于一维数组的对象，由下面两个部分组成：

values：一组数据（ndarray类型）
index：相关的数据索引标签

#Series 其实是对ndarray的一个封装（包装）

#index: 索引

#values：值，是一个（一维的ndarray）

1）Series的创建

两种创建方式：

(1) 由列表或numpy数组创建

默认索引为0到N-1的整数型索引

nd = np.array([1,2,3])

nd

array([1, 2, 3])

s = Series([1,2,3])

s

0    1

1    2

2    3

dtype: int64

s = Series(nd, index=list("abc"))

s[0]

#注意index 索引传值的时候是一个list

s = Series(nd, index = ["a","b","c"])

s

a    1

b    2

c    3

dtype: int32

#咱们的索引值可不可以相同

s = Series(nd, index = list("AAA"))

s

A    1

A    2

A    3

dtype: int32

s["A"]

A    1

A    2

A    3

dtype: int32

#当自定义的索引值有重复的时候，使用默认的整数索引（例如 s[0]）拿数据会报错！！！！！    如果自己定义的索引互不相同，是可以使用默认的整数索引的！！！！！

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)

   2559             return self._engine.get_value(s, k,

-> 2560                                           tz=getattr(series.dtype, 'tz', None))

   2561         except KeyError as e1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine._get_loc_duplicates()

TypeError: '<' not supported between instances of 'str' and 'int'

During handling of the above exception, another exception occurred:

IndexError                                Traceback (most recent call last)

<ipython-input-31-c9c96910e542> in <module>()

----> 1 s[0]

C:\anaconda\lib\site-packages\pandas\core\series.py in __getitem__(self, key)

    621         key = com._apply_if_callable(key, self)

    622         try:

--> 623             result = self.index.get_value(self, key)

    624

    625             if not is_scalar(result):

C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)

   2578             # python 3

   2579             if is_scalar(key):  # pragma: no cover

-> 2580                 raise IndexError(key)

   2581             raise InvalidIndexError(key)

   2582

IndexError: 0

s = Series(data = np.random.randint(0,100,size = 10), index = list("abcdefghtq"))

s

---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-38-bb7520ab96cf> in <module>()

----> 1 s = Series(data = np.random.randint(0,100,size = 10), index = list("abcdefghtqw"))

      2 s

C:\anaconda\lib\site-packages\pandas\core\series.py in __init__(self, data, index, dtype, name, copy, fastpath)

    264                                        raise_cast_failure=True)

    265

--> 266                 data = SingleBlockManager(data, index, fastpath=True)

    267

    268         generic.NDFrame.__init__(self, data, fastpath=True)

C:\anaconda\lib\site-packages\pandas\core\internals.py in __init__(self, block, axis, do_integrity_check, fastpath)

   4400         if not isinstance(block, Block):

   4401             block = make_block(block, placement=slice(0, len(axis)), ndim=1,

-> 4402                                fastpath=True)

   4403

   4404         self.blocks = [block]

C:\anaconda\lib\site-packages\pandas\core\internals.py in make_block(values, placement, klass, ndim, dtype, fastpath)

   2955                      placement=placement, dtype=dtype)

   2956

-> 2957     return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)

   2958

   2959 # TODO: flexible with index=None and/or items=None

C:\anaconda\lib\site-packages\pandas\core\internals.py in __init__(self, values, placement, ndim, fastpath)

    118             raise ValueError('Wrong number of items passed %d, placement '

    119                              'implies %d' % (len(self.values),

--> 120                                              len(self.mgr_locs)))

    121

    122     @property

ValueError: Wrong number of items passed 10, placement implies 11

还可以通过设置index参数指定索引

a    32

b    11

c    73

d    13

e    34

f     4

g    67

h    76

t    62

q    76

dtype: int32

s.index = list("ABCDEFGHTQ")

s

A    32

B    11

C    73

D    13

E    34

F     4

G    67

H    76

T    62

Q    76

dtype: int32

#可以不可以单个的去修改索引

s.index[0] = ["Y"]

s

#Series的索引值（Index对象）是不可变的，不能对它单个修改，只能整体重新赋值

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-42-22ddc72ada94> in <module>()

      1 #可以不可以单个的去修改索引

----> 2 s.index[0] = ["Y"]

      3 s

      4 #Series 索引值，不能是对他单个修改的

C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in __setitem__(self, key, value)

   1722

   1723     def __setitem__(self, key, value):

-> 1724         raise TypeError("Index does not support mutable operations")

   1725

   1726     def __getitem__(self, key):

TypeError: Index does not support mutable operations

特别地，由ndarray创建的是引用，而不是副本。对Series元素的改变也会改变原来的ndarray对象中的元素。（列表没有这种情况）

nd = np.array([0,2,4,6])

s = Series(nd, index = list("ABCD"))

s

A    0

B    2

C    4

D    6

dtype: int32

s['C'] = 16

s

A     0

B     2

C    16

D     6

dtype: int32

nd

array([ 0,  2, 16,  6])

(2) 由字典创建

s = Series(data  = {"a":10,"pi":3.14,"e":2.713,"g":0.618}, index =["a","pi","e","g","kk"])

s

#假如使用字典创建Series的时候，index可以比字典的键多，多出来的索引对应的数据会补上NaN

a     10.000

pi     3.140

e      2.713

g      0.618

kk       NaN

dtype: float64

============================================

练习1：

使用多种方法创建以下Series，命名为s1：

语文 150

数学 150

英语 150

理综 300

============================================

s1 = Series(data = {"语文":93,"数学":79,"英语":120,"理综":20})

s1

数学     79

理综     20

英语    120

语文     93

dtype: int64

2）Series的索引和切片

可以使用中括号取单个索引（此时返回的是元素类型），或者中括号里一个列表取多个索引（此时返回的仍然是一个Series类型）。分为显式索引和隐式索引：

(1) 显式索引：

- 使用index中的元素作为索引值

- 使用.loc[]（推荐）

注意，此时是闭区间

a     10.000

pi     3.140

e      2.713

g      0.618

kk       NaN

dtype: float64

s["e"]

#返回的是float

2.713

s["a","g"]

#这种写法是不对的！！！

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)

   2565             try:

-> 2566                 return libts.get_value_box(s, key)

   2567             except IndexError:

pandas/_libs/tslib.pyx in pandas._libs.tslib.get_value_box()

pandas/_libs/tslib.pyx in pandas._libs.tslib.get_value_box()

TypeError: 'tuple' object cannot be interpreted as an integer

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)

<ipython-input-54-caa8c0183215> in <module>()

----> 1 s["a","g"]

C:\anaconda\lib\site-packages\pandas\core\series.py in __getitem__(self, key)

    621         key = com._apply_if_callable(key, self)

    622         try:

--> 623             result = self.index.get_value(self, key)

    624

    625             if not is_scalar(result):

C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)

   2572                     raise InvalidIndexError(key)

   2573                 else:

-> 2574                     raise e1

   2575             except Exception:  # pragma: no cover

   2576                 raise e1

C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)

   2558         try:

   2559             return self._engine.get_value(s, k,

-> 2560                                           tz=getattr(series.dtype, 'tz', None))

   2561         except KeyError as e1:

   2562             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: ('a', 'g')

s[["a","g"]]

#Series

a    10.000

g     0.618

dtype: float64

s1 = s.loc[["a","g"]]

s1

#使用loc取多个的值

a    10.000

g     0.618

dtype: float64

#取单个的值

s.loc["a"]

#float

10.0

type(s.loc[["a"]])

#Series

pandas.core.series.Series

(2) 隐式索引：

- 使用整数作为索引值

- 使用.iloc[]（推荐）

注意，此时是半开区间

a     10.000

pi     3.140

e      2.713

g      0.618

kk       NaN

dtype: float64

s[0]

10.0

s.iloc[0]

#在使用iloc的时候，必须传的值是隐藏起来的索引值（也就是整型的位置）

10.0

s.iloc[[0,1,2]]

#取多个值的时候，加两个中括号

a     10.000

pi     3.140

e      2.713

dtype: float64

#切片

s["a":"g"]

#左闭右闭

a     10.000

pi     3.140

e      2.713

g      0.618

dtype: float64

s.loc["a":"g"]

a     10.000

pi     3.140

e      2.713

g      0.618

dtype: float64

s.iloc[0:3]

#在使用iloc的时候，左闭右开

a     10.000

pi     3.140

e      2.713

dtype: float64

============================================

练习2：

使用多种方法对练习1创建的Series s1进行索引和切片：

索引：

数学 150

切片：

语文 150

数学 150

英语 150

============================================

3）Series的基本概念

可以把Series看成一个定长的有序字典

可以通过shape，size，index,values等得到series的属性

a     10.000

pi     3.140

e      2.713

g      0.618

kk       NaN

dtype: float64

s.shape

(5,)

s.size

s.index

Index(['a', 'pi', 'e', 'g', 'kk'], dtype='object')

type(s.values)

# The values of a Series are stored as a numpy ndarray; the recorded output
# (numpy.ndarray) is the type of s.values, so the expression asks for the type.

numpy.ndarray

可以通过head(),tail()快速查看Series对象的样式

#扩展

data = pd.read_csv("./president_heights.csv")

type(data)

pandas.core.frame.DataFrame

data

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

0	1	George Washington	189
1	2	John Adams	170
2	3	Thomas Jefferson	189
3	4	James Madison	163
4	5	James Monroe	183
5	6	John Quincy Adams	171
6	7	Andrew Jackson	185
7	8	Martin Van Buren	168
8	9	William Henry Harrison	173
9	10	John Tyler	183
10	11	James K. Polk	173
11	12	Zachary Taylor	173
12	13	Millard Fillmore	175
13	14	Franklin Pierce	178
14	15	James Buchanan	183
15	16	Abraham Lincoln	193
16	17	Andrew Johnson	178
17	18	Ulysses S. Grant	173
18	19	Rutherford B. Hayes	174
19	20	James A. Garfield	183
20	21	Chester A. Arthur	183
21	23	Benjamin Harrison	168
22	25	William McKinley	170
23	26	Theodore Roosevelt	178
24	27	William Howard Taft	182
25	28	Woodrow Wilson	180
26	29	Warren G. Harding	183
27	30	Calvin Coolidge	178
28	31	Herbert Hoover	182
29	32	Franklin D. Roosevelt	188
30	33	Harry S. Truman	175
31	34	Dwight D. Eisenhower	179
32	35	John F. Kennedy	183
33	36	Lyndon B. Johnson	193
34	37	Richard Nixon	182
35	38	Gerald Ford	183
36	39	Jimmy Carter	177
37	40	Ronald Reagan	185
38	41	George H. W. Bush	188
39	42	Bill Clinton	188
40	43	George W. Bush	182
41	44	Barack Obama	185

#一个DataFrame就是由多个Series组成的！！！

s_height = data['height(cm)']

type(s_height)

pandas.core.series.Series

s_height.head(2)

#head方法默认取数据的前五条，而且还可以传参自定义取出来的数据条数

0    189

1    170

Name: height(cm), dtype: int64

s_height.tail()

#tail方法，取最后的五条数据

37    185

38    188

39    188

40    182

41    185

Name: height(cm), dtype: int64

当索引没有对应的值时，可能出现缺失数据显示NaN（not a number）的情况

s = Series(data = {"a":10,"b":20,"c":30}, index  =list("abcd"))

s

a    10.0

b    20.0

c    30.0

d     NaN

dtype: float64

可以使用pd.isnull()，pd.notnull()，或自带isnull(),notnull()函数检测缺失数据

#后面会用到

pd.isnull(s)

a    False

b    False

c    False

d     True

dtype: bool

ind = s.isnull()

ind

a    False

b    False

c    False

d     True

dtype: bool

#使用ind给空值赋值,后面会用到

s[ind] = 1000

a      10.0

b      20.0

c      30.0

d    1000.0

dtype: float64

pd.notnull(s)

a    True

b    True

c    True

d    True

dtype: bool

s.notnull()

a    True

b    True

c    True

d    True

dtype: bool

Series对象本身及其实例都有一个name属性

s  =Series(data = np.random.randint(0,150,size = 5), index = ["张三","李四","Lisa","Sara","Jack"])

s

张三       36

李四       83

Lisa     67

Sara    110

Jack     58

dtype: int32

s.name = "Python"

张三       36

李四       83

Lisa     67

Sara    110

Jack     58

Name: Python, dtype: int32

s  =Series(data = np.random.randint(0,150,size = 5), index = ["张三","李四","Lisa","Sara","Jack"], name = "Math")

s

张三      72

李四      40

Lisa    69

Sara    27

Jack     8

Name: Math, dtype: int32

#扩展

df = pd.read_csv("./president_heights.csv")

s2 = df["order"]

s2

0      1

1      2

2      3

3      4

4      5

5      6

6      7

7      8

8      9

9     10

10    11

11    12

12    13

13    14

14    15

15    16

16    17

17    18

18    19

19    20

20    21

21    23

22    25

23    26

24    27

25    28

26    29

27    30

28    31

29    32

30    33

31    34

32    35

33    36

34    37

35    38

36    39

37    40

38    41

39    42

40    43

41    44

Name: order, dtype: int64

4）Series的运算

(1) 适用于numpy的数组运算也适用于Series

张三      72

李四      40

Lisa    69

Sara    27

Jack     8

Name: Math, dtype: int32

s2 = s + 50

s2

张三      122

李四       90

Lisa    119

Sara     77

Jack     58

Name: Math, dtype: int32

s.add(20)

张三      92

李四      60

Lisa    89

Sara    47

Jack    28

Name: Math, dtype: int32

(2) Series之间的运算

在运算中自动对齐不同索引的数据
如果索引不对应，则补NaN
注意：要想保留所有的index，则需要使用.add()函数

s3 = s1.add(s2,fill_value = 1)

s3

A       114.0

B        48.0

C        26.0

Jack     59.0

Lisa    120.0

Sara    103.0

张三      123.0

李四       91.0

dtype: float64

s1 = Series(np.random.randint(0,150,size  =4), index = ["A","B","C","Sara"], name = "数学")

s1

A       113

B        47

C        25

Sara     26

Name: 数学, dtype: int32

s2

张三      122

李四       90

Lisa    119

Sara     77

Jack     58

Name: Math, dtype: int32

np.nan

nan

113 + np.nan

nan

s1 + s2

#s1里面有A（值为113），s2里面没有A，所以相加之后A对应的结果是NaN

A         NaN

B         NaN

C         NaN

Jack      NaN

Lisa      NaN

Sara    103.0

张三        NaN

李四        NaN

dtype: float64

============================================

练习3：

想一想Series运算和ndarray运算的规则有什么不同？
新建另一个索引包含“文综”的Series s2，并与s1进行多种算术操作。思考如何保存所有数据。

============================================

nd1 = np.array([0,1,2])

nd2 = np.array([4,5,6])

nd1 + nd2

array([4, 6, 8])

2、DataFrame

DataFrame是一个【表格型】的数据结构，可以看做是【由Series组成的字典】（共用同一个索引）。DataFrame由按一定顺序排列的多列数据组成。设计初衷是将Series的使用场景从一维拓展到多维。DataFrame既有行索引，也有列索引。

行索引：index
列索引：columns
值：values（numpy的二维数组）

#重点

1）DataFrame的创建

最常用的方法是传递一个字典来创建。DataFrame以字典的键作为每一【列】的名称，以字典的值（一个数组）作为每一列。

此外，DataFrame会自动加上每一行的索引（和Series一样）。

同Series一样，若传入的列与字典的键不匹配，则相应的值为NaN。

import pandas as pd

from pandas import Series,DataFrame

#创建  第一种写法

df1 = DataFrame(data = {"Python":[99,101,120,98], "数学":[120,136,141,123],"语文":[98,78,99,101]}, index = list("abcd"))

df1

#这种情况下，如果行索引的个数比数据的行数多的话会报错

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	99	120	98
b	101	136	78
c	120	141	99
d	98	123	101

df1 = DataFrame(data = {"Python":[99,101,120,98], "数学":[120,136,141,123],"语文":[98,78,99,101]},index = list("abcd"),

                columns = ["Python","数学","语文","英语"])

df1

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	99	120	98	NaN
b	101	136	78	NaN
c	120	141	99	NaN
d	98	123	101	NaN

#列更加重要点

DataFrame属性：values、columns、index、shape

df1.values

array([[99, 120, 98, nan],

       [101, 136, 78, nan],

       [120, 141, 99, nan],

       [98, 123, 101, nan]], dtype=object)

df1.columns

#列索引

Index(['Python', '数学', '语文', '英语'], dtype='object')

df1.index

Index(['a', 'b', 'c', 'd'], dtype='object')

df1.shape

(4, 4)

import numpy as np

#第二种写法

df2 = DataFrame(data = np.random.randint(0,150,size = (4,4)), index = list("abcd"), columns = ["Python","Java","PHP","Html"])

df2

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	48	98	37	124
b	28	71	23	148
c	68	66	127	13
d	70	28	74	83

============================================

练习4：

根据以下考试成绩表，创建一个DataFrame，命名为df：

    张三  李四

语文 150  0

数学 150  0

英语 150  0

理综 300  0

============================================

2）DataFrame的索引

(1) 对列进行索引

- 通过类似字典的方式

- 通过属性的方式

可以将DataFrame的列获取为一个Series。返回的Series拥有原DataFrame相同的索引，且name属性也已经设置好了，就是相应的列名。

df2

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	48	98	37	124
b	28	71	23	148
c	68	66	127	13
d	70	28	74	83

df2["Python"]

a    48

b    28

c    68

d    70

Name: Python, dtype: int32

df2.Python

#columns 列名  属性名

a    48

b    28

c    68

d    70

Name: Python, dtype: int32

df2[["Python","Java"]]

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	48	98
b	28	71
c	68	66
d	70	28

#让你拿出来一行数据 a  DataFrame 是无法通过中括号来直接获取行数据的

df2["a"]

---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)

   2524             try:

-> 2525                 return self._engine.get_loc(key)

   2526             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'a'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)

<ipython-input-22-326aeb23cee0> in <module>()

      1 #让你拿出来一行数据 a

----> 2 df2["a"]

C:\anaconda\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)

   2137             return self._getitem_multilevel(key)

   2138         else:

-> 2139             return self._getitem_column(key)

   2140

   2141     def _getitem_column(self, key):

C:\anaconda\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)

   2144         # get column

   2145         if self.columns.is_unique:

-> 2146             return self._get_item_cache(key)

   2147

   2148         # duplicate columns & possible reduce dimensionality

C:\anaconda\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)

   1840         res = cache.get(item)

   1841         if res is None:

-> 1842             values = self._data.get(item)

   1843             res = self._box_item_values(item, values)

   1844             cache[item] = res

C:\anaconda\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)

   3841

   3842             if not isna(item):

-> 3843                 loc = self.items.get_loc(item)

   3844             else:

   3845                 indexer = np.arange(len(self.items))[isna(self.items)]

C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)

   2525                 return self._engine.get_loc(key)

   2526             except KeyError:

-> 2527                 return self._engine.get_loc(self._maybe_cast_indexer(key))

   2528

   2529         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'a'

#切片

df2["a":"c"]

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	48	98	37	124
b	28	71	23	148
c	68	66	127	13

(2) 对行进行索引

- 使用.loc[]加index来进行行索引

- 使用.iloc[]加整数来进行行索引

同样返回一个Series，index为原来的columns。

df2.loc["a"]

#Series

Python     48

Java       98

PHP        37

Html      124

Name: a, dtype: int32

df2.loc[["a"]]

#DataFrame数据

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	48	98	37	124

df2.iloc[0]

Python     48

Java       98

PHP        37

Html      124

Name: a, dtype: int32

df2.iloc[[1]]

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

b	28	71	23	148

df2.iloc[[1,2]]

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

b	28	71	23	148
c	68	66	127	13

df2.iloc[0:3]

#左闭右开

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	48	98	37	124
b	28	71	23	148
c	68	66	127	13

df2.loc["a"]["Java"]

df2.loc['a',"Java"]

#忘记这种情况吧！！！！！！

df2["Java"]["a"]

df2["Java","a"]

#总结统一一下：直接使用中括号（不通过loc/iloc）取单个数据的时候，行和列不能写在同一个中括号里面

---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)

   2524             try:

-> 2525                 return self._engine.get_loc(key)

   2526             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: ('Java', 'a')

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)

<ipython-input-36-ca977865b75c> in <module>()

----> 1 df2["Java","a"]

C:\anaconda\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)

   2137             return self._getitem_multilevel(key)

   2138         else:

-> 2139             return self._getitem_column(key)

   2140

   2141     def _getitem_column(self, key):

C:\anaconda\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)

   2144         # get column

   2145         if self.columns.is_unique:

-> 2146             return self._get_item_cache(key)

   2147

   2148         # duplicate columns & possible reduce dimensionality

C:\anaconda\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)

   1840         res = cache.get(item)

   1841         if res is None:

-> 1842             values = self._data.get(item)

   1843             res = self._box_item_values(item, values)

   1844             cache[item] = res

C:\anaconda\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)

   3841

   3842             if not isna(item):

-> 3843                 loc = self.items.get_loc(item)

   3844             else:

   3845                 indexer = np.arange(len(self.items))[isna(self.items)]

C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)

   2525                 return self._engine.get_loc(key)

   2526             except KeyError:

-> 2527                 return self._engine.get_loc(self._maybe_cast_indexer(key))

   2528

   2529         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: ('Java', 'a')

(3) 对元素索引的方法

- 使用列索引

- 使用行索引(iloc[3,1]相当于两个参数;iloc[[3,3]] 里面的[3,3]看做一个参数)

- 使用values属性（二维numpy数组）

df2["Java"]["a":"c"]

#左闭右闭

a    98

b    71

c    66

Name: Java, dtype: int32

df2.iloc[1:3]["Html"]

#左闭右开

b    148

c     13

Name: Html, dtype: int32

df2.loc["a","Python"]

df2.loc[["a","b"], "Python"]

#loc是一个非常特殊的方法

a    48

b    28

Name: Python, dtype: int32

df2.loc['a':"b", "Python"]

a    48

b    28

Name: Python, dtype: int32

df2.iloc[0:2,"Python"]

#不行啦，和loc不一样：iloc只能使用整数位置，不能使用列名！！！！！

---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-43-56bffcb627a4> in <module>()

----> 1 df2.iloc[0:2,"Python"]

C:\anaconda\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)

   1365             except (KeyError, IndexError):

   1366                 pass

-> 1367             return self._getitem_tuple(key)

   1368         else:

   1369             # we by definition only have the 0th axis

C:\anaconda\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)

   1735     def _getitem_tuple(self, tup):

   1736

-> 1737         self._has_valid_tuple(tup)

   1738         try:

   1739             return self._getitem_lowerdim(tup)

C:\anaconda\lib\site-packages\pandas\core\indexing.py in _has_valid_tuple(self, key)

    205                 raise ValueError("Location based indexing can only have "

    206                                  "[{types}] types"

--> 207                                  .format(types=self._valid_types))

    208

    209     def _should_validate_iterable(self, axis=None):

ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types

# Assignment: add 50 to the cell at row "b", column "Python".
# Use one .loc[row, column] call. Chained indexing such as
# df2["Python"]["b"] += 50 may write to a temporary object
# (SettingWithCopyWarning) and leaves df2 unchanged under copy-on-write.

df2.loc["b", "Python"] += 50

df2

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	48	98	37	124
b	78	71	23	148
c	68	66	127	13
d	70	28	74	83

df2.loc["a":"c", "Java"] += 20

df2

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	48	138	37	124
b	78	111	23	148
c	68	106	127	13
d	70	28	74	83

【注意】

直接用中括号时：

索引表示的是列索引
切片表示的是行切片

df2["a":"b"]

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	48	138	37	124
b	78	111	23	148

df2["Python"]

a    48

b    78

c    68

d    70

Name: Python, dtype: int32

============================================

练习5：

使用多种方法对ddd进行索引和切片，并比较其中的区别

============================================

3）DataFrame的运算

（1） DataFrame之间的运算

同Series一样：

在运算中自动对齐不同索引的数据
如果索引不对应，则补NaN

df1

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	99	120	98	NaN
b	101	136	78	NaN
c	120	141	99	NaN
d	98	123	101	NaN

df2

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	48	138	37	124
b	78	111	23	148
c	68	106	127	13
d	70	28	74	83

df1 + df2

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	NaN	NaN	NaN	147	NaN	NaN	NaN
b	NaN	NaN	NaN	179	NaN	NaN	NaN
c	NaN	NaN	NaN	188	NaN	NaN	NaN
d	NaN	NaN	NaN	168	NaN	NaN	NaN

df1.add(df2, fill_value = 0)

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	125.0	139.0	38.0	147	121.0	NaN	99.0
b	149.0	112.0	24.0	179	137.0	NaN	79.0
c	14.0	107.0	128.0	188	142.0	NaN	100.0
d	84.0	29.0	75.0	168	124.0	NaN	102.0

df1 = DataFrame(np.random.randint(0,150,size = (4,2)),

                index = list("cdef"),

                columns = ["Python","Java"])

df1

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

c	82	122
d	128	130
e	15	126
f	74	133

df2

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	48	138	37	124
b	78	111	23	148
c	68	106	127	13
d	70	28	74	83

df1.add(df2, axis = "index", fill_value = 0)

#注意：DataFrame与DataFrame运算时，行索引和列索引都会自动对齐，axis参数在这个地方不起作用（看不出效果）；axis只有在DataFrame与Series运算时才有意义！！！！！

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	124.0	138.0	37.0	48.0
b	148.0	111.0	23.0	78.0
c	13.0	228.0	127.0	150.0
d	83.0	158.0	74.0	198.0
e	NaN	126.0	NaN	15.0
f	NaN	133.0	NaN	74.0

创建DataFrame df1 不同人员的各科目成绩，月考一

创建DataFrame df2 不同人员的各科目成绩，月考二

有新学生转入

下面是Python 操作符与pandas操作函数的对应表：

`+`	`add()`
`-`	`sub()`, `subtract()`
`*`	`mul()`, `multiply()`
`/`	`truediv()`, `div()`, `divide()`
`//`	`floordiv()`
`%`	`mod()`
`**`	`pow()`

（2） Series与DataFrame之间的运算

【重要】

使用Python操作符：以行为单位操作（参数必须是行），对所有行都有效。（类似于numpy中二维数组与一维数组的运算，但可能出现NaN）

使用pandas操作函数：

axis=0：以列为单位操作（参数必须是列），对所有列都有效。

axis=1：以行为单位操作（参数必须是行），对所有行都有效。

df2

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	48	138	37	124
b	78	111	23	148
c	68	106	127	13
d	70	28	74	83

s_row = df2.loc['c']

s_row

#Series

Python     68

Java      106

PHP       127

Html       13

Name: c, dtype: int32

s_columns = df2["Python"]

s_columns

a    48

b    78

c    68

d    70

Name: Python, dtype: int32

df2

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	48	138	37	124
b	78	111	23	148
c	68	106	127	13
d	70	28	74	83

df2.add(s_columns,axis = 0)

#s_columns

#a    48

# b    78

# c    68

# d    70

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	96	186	85	172
b	156	189	101	226
c	136	174	195	81
d	140	98	144	153

df2.add(s_columns,axis = "index")

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	96	186	85	172
b	156	189	101	226
c	136	174	195	81
d	140	98	144	153

df2.add(s_row,axis = "columns")

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	116	244	164	137
b	146	217	150	161
c	136	212	254	26
d	138	134	201	96

df2 + s_row

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

a	116	244	164	137
b	146	217	150	161
c	136	212	254	26
d	138	134	201	96

#DataFrame和Series进行运算的时候要严格注意 axis

#Series是一个一维的数据，DataFrame是一个二维的数据；索引不对齐的话不会报错，但是结果中对应的位置会出现NaN

============================================

练习6：

假设ddd是期中考试成绩，ddd2是期末考试成绩，请自由创建ddd2，并将其与ddd相加，求期中期末平均值。
假设张三期中考试数学被发现作弊，要记为0分，如何实现？
李四因为举报张三作弊立功，期中考试所有科目加100分，如何实现？
后来老师发现有一道题出错了，为了安抚学生情绪，给每位学生每个科目都加10分，如何实现？

============================================

a	NaN	NaN	NaN	147	NaN	NaN	NaN
b	NaN	NaN	NaN	179	NaN	NaN	NaN
c	NaN	NaN	NaN	188	NaN	NaN	NaN
d	NaN	NaN	NaN	168	NaN	NaN	NaN

a	NaN	NaN	NaN	147	NaN	NaN	NaN
b	NaN	NaN	NaN	179	NaN	NaN	NaN
c	NaN	NaN	NaN	188	NaN	NaN	NaN
d	NaN	NaN	NaN	168	NaN	NaN	NaN

a	NaN	NaN	NaN	147	NaN	NaN	NaN
b	NaN	NaN	NaN	179	NaN	NaN	NaN
c	NaN	NaN	NaN	188	NaN	NaN	NaN
d	NaN	NaN	NaN	168	NaN	NaN	NaN