csv文件读取¶
In [1]:
import pandas as pd import sys
In [2]:
%cat examples/ex2.csv
1,2,3,4,hello 5,6,7,8,world 9,10,11,12,foo
In [3]:
#文件没有标签数据 pd.read_csv('examples/ex2.csv',header=None)
Out[3]:
0 | 1 | 2 | 3 | 4 | hello |
---|---|---|---|---|---|
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
In [4]:
pd.read_csv('examples/ex2.csv',names=['a','b','c','d','massage'])
Out[4]:
0 | 1 | 2 | 3 | 4 | hello |
---|---|---|---|---|---|
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
In [5]:
#将其中的一列设为索引列 pd.read_csv('examples/ex2.csv',names=['a','b','c','d','massage'],index_col='massage')
Out[5]:
hello | 1 | 2 | 3 | 4 |
---|---|---|---|---|
world | 5 | 6 | 7 | 8 |
foo | 9 | 10 | 11 | 12 |
In [6]:
list(open('examples/ex3.txt'))
Out[6]:
[' A B C\n', 'aaa -0.264438 -1.026059 -0.619500\n', 'bbb 0.927272 0.302904 -0.032399\n', 'ccc -0.264273 -0.386314 -0.217601\n', 'ddd -0.871858 -0.348382 1.100491\n']
In [7]:
#多出的一列数据自动识别为索引,分隔符不同使用正则表达式 pd.read_csv('examples/ex3.csv',sep='\s+')
Out[7]:
aaa | -0.264438 | -1.026059 | -0.619500 |
---|---|---|---|
bbb | 0.927272 | 0.302904 | -0.032399 |
ccc | -0.264273 | -0.386314 | -0.217601 |
ddd | -0.871858 | -0.348382 | 1.100491 |
将读取的非空数据设为NaN¶
In [8]:
%cat examples/ex5.csv
something,a,b,c,d,message one,1,2,3,4,NA two,5,6,,8,world three,9,10,11,12,foo
In [9]:
pd.read_csv('examples/ex5.csv',na_values={'something':'two','massage':['NA','foo']})
Out[9]:
0 | one | 1 | 2 | 3.0 | 4 | NaN |
---|---|---|---|---|---|---|
1 | NaN | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11.0 | 12 | foo |
In [10]:
#只读取一部分数据 pd.read_csv('examples/ex6.csv',nrows=10)
Out[10]:
0 | 0.467976 | -0.038649 | -0.295344 | -1.824726 | L |
---|---|---|---|---|---|
1 | -0.358893 | 1.404453 | 0.704965 | -0.200638 | B |
2 | -0.501840 | 0.659254 | -0.421691 | -0.057688 | G |
3 | 0.204886 | 1.074134 | 1.388361 | -0.982404 | R |
4 | 0.354628 | -0.133116 | 0.283763 | -0.837063 | Q |
5 | 1.817480 | 0.742273 | 0.419395 | -2.251035 | Q |
6 | -0.776764 | 0.935518 | -0.332872 | -1.875641 | U |
7 | -0.913135 | 1.530624 | -0.572657 | 0.477252 | K |
8 | 0.358480 | -0.497572 | -0.367016 | 0.507702 | S |
9 | -1.740877 | -1.160417 | -1.637830 | 2.172201 | G |
In [11]:
#将数据分块读取 chunker = pd.read_csv('examples/ex6.csv',chunksize=1000) for piece in chunker: print(piece.iloc[0])
one 0.467976 two -0.0386485 three -0.295344 four -1.82473 key L Name: 0, dtype: object one 0.467976 two -0.0386485 three -0.295344 four -1.82473 key T Name: 1000, dtype: object one 0.467976 two -0.0386485 three -0.295344 four -1.82473 key 1 Name: 2000, dtype: object one 0.467976 two -0.0386485 three -0.295344 four -1.82473 key H Name: 3000, dtype: object one 0.467976 two -0.0386485 three -0.295344 four -1.82473 key H Name: 4000, dtype: object one 0.467976 two -0.0386485 three -0.295344 four -1.82473 key 1 Name: 5000, dtype: object one 0.467976 two -0.0386485 three -0.295344 four -1.82473 key I Name: 6000, dtype: object one 0.467976 two -0.0386485 three -0.295344 four -1.82473 key 1 Name: 7000, dtype: object one 0.467976 two -0.0386485 three -0.295344 four -1.82473 key 7 Name: 8000, dtype: object one 0.467976 two -0.0386485 three -0.295344 four -1.82473 key B Name: 9000, dtype: object
写入csv文件¶
In [12]:
data = pd.read_csv('examples/ex5.csv') data.to_csv(sys.stdout)
,something,a,b,c,d,message 0,one,1,2,3.0,4, 1,two,5,6,,8,world 2,three,9,10,11.0,12,foo
In [13]:
data.to_csv(sys.stdout,sep='|')
|something|a|b|c|d|message 0|one|1|2|3.0|4| 1|two|5|6||8|world 2|three|9|10|11.0|12|foo
In [14]:
#对缺失值进行标识 data.to_csv(sys.stdout,na_rep='NULL')
,something,a,b,c,d,message 0,one,1,2,3.0,4,NULL 1,two,5,6,NULL,8,world 2,three,9,10,11.0,12,foo
In [15]:
data.to_csv(sys.stdout,index=False,header=False)
one,1,2,3.0,4, two,5,6,,8,world three,9,10,11.0,12,foo
In [16]:
#只写入子集 data.to_csv(sys.stdout,index=False,columns=['a','b','c'])
a,b,c 1,2,3.0 5,6, 9,10,11.0
json文件¶
In [17]:
%cat examples/example.json
[{"a": 1, "b": 2, "c": 3}, {"a": 4, "b": 5, "c": 6}, {"a": 7, "b": 8, "c": 9}]
In [18]:
data = pd.read_json('examples/example.json') data
Out[18]:
0 | 1 | 2 | 3 |
---|---|---|---|
1 | 4 | 5 | 6 |
2 | 7 | 8 | 9 |
In [19]:
data.to_json(sys.stdout)
{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}
In [20]:
#按行写入 data.to_json(sys.stdout,orient='records')
[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]
HTML¶
In [21]:
#搜索并解析包含在table标签中的数据 tables = pd.read_html('examples/fdic_failed_bank_list.html') #只有一张表格 len(tables)
Out[21]:
1
In [22]:
data = tables[0] data.head()
Out[22]:
0 | Allied Bank | Mulberry | AR | 91 | Today's Bank | September 23, 2016 | November 17, 2016 |
---|---|---|---|---|---|---|---|
1 | The Woodbury Banking Company | Woodbury | GA | 11297 | United Bank | August 19, 2016 | November 17, 2016 |
2 | First CornerStone Bank | King of Prussia | PA | 35312 | First-Citizens Bank & Trust Company | May 6, 2016 | September 6, 2016 |
3 | Trust Company Bank | Memphis | TN | 9956 | The Bank of Fayette County | April 29, 2016 | September 6, 2016 |
4 | North Milwaukee State Bank | Milwaukee | WI | 20364 | First-Citizens Bank & Trust Company | March 11, 2016 | June 16, 2016 |
excel¶
In [23]:
data = pd.read_excel('examples/ex1.xlsx','Sheet1') data
Out[23]:
0 | 0 | 1 | 2 | 3 | 4 | hello |
---|---|---|---|---|---|---|
1 | 1 | 5 | 6 | 7 | 8 | world |
2 | 2 | 9 | 10 | 11 | 12 | foo |
In [24]:
data.to_excel('examples/ex2.xlsx')
Web API¶
In [25]:
import requests url = 'https://api.github.com/repos/pandas-dev/pandas/issues' resp = requests.get(url) resp
Out[25]:
<Response [200]>
In [26]:
data = resp.json()#data为字典数组 issues = pd.DataFrame(data,columns=['title','url','state','labels'])#提取部分字段 issues.head()
Out[26]:
0 | Resample on MultiIndex level is much longer th... | https://api.github.com/repos/pandas-dev/pandas... | open | [] |
---|---|---|---|---|
1 | BUG: value_counts can handle the case even wit... | https://api.github.com/repos/pandas-dev/pandas... | open | [] |
2 | Build error on Arch linux | https://api.github.com/repos/pandas-dev/pandas... | open | [] |
3 | BUG: DataFrame.to_html validates formatters ha... | https://api.github.com/repos/pandas-dev/pandas... | open | [] |
4 | BUG DataFream.rindex方法参数的错误 | https://api.github.com/repos/pandas-dev/pandas... | open | [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=... |