Problem description
I have been playing with Pandas to get HTTP logs into it for analysis, as they are a good source of large volumes of data and will let me learn Pandas.
The logs are streamed to me one line at a time, so I cannot import from CSV and need to 'pump' them into a Pandas DataFrame, which I will then persist to an HDFStore file.
The code I have written at the moment reads from a GZIP file just so I can get the process going, but once the Pandas part is done I will modify it to be event-driven with a pubsub-style coroutine.
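For the persistence step, a minimal sketch of what writing such a frame to an HDFStore could look like (the file name, table key and indexed columns below are assumptions for illustration, not part of the original code; HDFStore needs PyTables installed):

import pandas as pd

# Hypothetical store path and table key; 'df' stands for a DataFrame
# of parsed log lines, built as shown further below.
store = pd.HDFStore('elb_logs.h5')

# append() writes the frame as a growable table; data_columns makes
# these columns queryable when reading back selectively.
store.append('elb', df, data_columns=['timestamp', 'client_ip', 'backend_ip'])
store.close()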
Here is my code so far:
import os
import gzip, io

import pandas as pd
import numpy as np


def read_gzip(file):
    if os.path.isdir(os.path.dirname(file)):
        if os.path.isfile(file):
            while True:
                for line in io.BufferedReader(gzip.open(file)):
                    yield line


class ElbParser(object):

    def __init__(self, file):
        self.file = file

    def get_log_lines(self):
        return read_gzip(self.file)

    def build_series(self, log_line):

        def normalise_ip(input_string):
            try:
                text = input_string.split(':')[0]
            except:
                text = np.NaN
            finally:
                return text

        def normalise_time(input_string):
            try:
                text = float(input_string)
            except:
                text = np.NaN
            finally:
                return text

        def normalise_ints(input_string):
            try:
                text = int(input_string)
            except:
                text = np.NaN
            finally:
                return text

        log_list = log_line.split()

        elb_series = pd.Series({
            'timestamp' : np.datetime64(log_list[0]),
            'elb_name' : log_list[1],
            'client_ip' : normalise_ip(log_list[2]),
            'backend_ip' : normalise_ip(log_list[3]),
            'request_processing_time' : normalise_time(log_list[4]),
            'backend_processing_time' : normalise_time(log_list[5]),
            'response_processing_time' : normalise_time(log_list[6]),
            'elb_status_code' : normalise_ints(log_list[7]),
            'backend_status_code' : normalise_ints(log_list[8]),
            'received_bytes' : normalise_ints(log_list[9]),
            'sent_bytes' : normalise_ints(log_list[10]),
            'http_method' : log_list[11].strip("'").strip('"'),
            'request' : log_list[13].strip("'").strip('"')
        })
        return elb_series
The following is the basic test code for exercising the above code:
import os
import sys
import unittest

import numpy
import pandas as pd

from lbreader import ElbParser

test_elb_log_file = 'tests/resources/lb_log.gz'


class ElbTest(unittest.TestCase):
    #
    # Fixture Framework
    #
    def setUp(self):
        print '\nTest Fixture setUp'
        self.elbparser = ElbParser(test_elb_log_file)

    def tearDown(self):
        print '\nTest Fixture tearDown'
        del self.elbparser

    #
    # Tests
    #
    def test_01_get_aws_elb_log_line(self):
        print '\n running get aws elb log line'
        log_lines = self.elbparser.get_log_lines()
        self.failUnlessEqual('2014-05-15T23:00:04.696671Z poundlb 192.168.0.10:53345 - -1 -1 -1 408 408 55 0 "PUT https://localhost:443/private/urls HTTP/1.1"\n'
                             , log_lines.next())

    def test_02_build_series_with_aws_elb_log_line(self):
        print '\n running build series from elb log'
        log_lines = self.elbparser.get_log_lines()
        test = self.elbparser.build_series(log_lines.next())
        test_line_dict = {'backend_ip': '-',
                          'backend_processing_time': -1.0,
                          'backend_status_code': 408,
                          'client_ip': '192.168.0.10',
                          'elb_name': 'poundlb',
                          'elb_status_code': 408,
                          'http_method': 'PUT',
                          'received_bytes': 55,
                          'request': 'HTTP/1.1',
                          'request_processing_time': -1.0,
                          'response_processing_time': -1.0,
                          'sent_bytes': 0,
                          'timestamp': numpy.datetime64('2014-05-16T00:00:04.696671+0100')}
        self.failUnlessEqual(test_line_dict, test.to_dict())


if __name__ == '__main__':
    unittest.main()
Now to where I seem to have managed to get stuck:
I am able to create pandas.Series data, but when I try to push it into a dataframe it takes it in as two columns of 11 rows.
>>> test.values
array(['-', -1.0, 408, '192.168.0.10', 'poundlb', 408, 'PUT', 55,
       'HTTP/1.1', -1.0, -1.0, 0,
       numpy.datetime64('2014-05-16T00:00:04.696671+0100')], dtype=object)
df = pd.DataFrame(test.values.tolist(), column_list)
>>> df
Out[35]:
0
backend_ip -
backend_processing_time -1
backend_status_code 408
client_ip 192.168.0.10
elb_name poundlb
elb_status_code 408
http_method PUT
received_bytes 55
request HTTP/1.1
request_processing_time -1
response_processing_time -1
sent_bytes 0
timestamp 2014-05-16T00:00:04.696671+0100
[13 rows x 1 columns]
This is far from what I was hoping to get; I was hoping for one row per log line, with one column per field:
backend_ip backend_processing_time backend_status_code backend_status_code client_ip elb_name elb_status_code http_method received_bytes request request_processing_time response_processing_time sent_bytes timestamp
0 - -1 408 408 192.168.0.10 poundlb 408 PUT 55 HTTP/1.1 -1 -1 0 2014-05-16T00:00:04
1 - -1 408 408 192.168.0.11 poundlb 408 PUT 55 HTTP/1.1 -1 -1 0 2014-05-16T00:00:05
2 - -1 408 408 192.168.0.12 poundlb 408 PUT 55 HTTP/1.1 -1 -1 0 2014-05-16T00:00:06
Such that if I added another pandas.Series log line, I would get another row, and so on. I also intend to index on the following: timestamp, client_ip, backend_ip.
I would really appreciate some assistance with this as I don't seem to be getting my rows/columns correct.
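A note on why the construction above goes wrong: in pd.DataFrame(test.values.tolist(), column_list) the second positional argument is the row index, not the column names, so the 13 values end up as a single column labelled 0 with the field names down the index. A sketch of one way to get a single row instead, assuming test is the Series returned by build_series (not the original code):

# Wrap the Series (or its dict form) in a list so it is treated as one row;
# the column names are then taken from the Series labels.
df = pd.DataFrame([test])
# equivalently:
df = pd.DataFrame([test.to_dict()])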
After playing around with this a bit I came up with the following, but I still cannot concatenate/append.
>>> test0 = elbparser.build_series(log_lines.next())
>>> test1 = elbparser.build_series(log_lines.next())
>>> test2 = elbparser.build_series(log_lines.next())
>>> test3 = elbparser.build_series(log_lines.next())
>>> test4 = elbparser.build_series(log_lines.next())
>>> test5 = elbparser.build_series(log_lines.next())
>>> test6 = elbparser.build_series(log_lines.next())
>>> test7 = elbparser.build_series(log_lines.next())
>>> test8 = elbparser.build_series(log_lines.next())
>>> test9 = elbparser.build_series(log_lines.next())
>>> test10 = elbparser.build_series(log_lines.next())
>>> test_list = [test.to_dict(), test1.to_dict(), test2.to_dict(), test3.to_dict(), test4.to_dict(), test5.to_dict(), test6.to_dict(), test7.to_dict(), test8.to_dict(), test9.to_dict(), test10.to_dict()]
>>> test_list
[{'backend_ip': '-',
'backend_processing_time': -1.0,
'backend_status_code': 408,
'client_ip': '192.168.0.1',
'elb_name': 'poundlb',
'elb_status_code': 408,
'http_method': 'PUT',
'received_bytes': 55,
'request': 'HTTP/1.1',
'request_processing_time': -1.0,
'response_processing_time': -1.0,
'sent_bytes': 0,
'timestamp': numpy.datetime64('2014-05-16T00:00:04.696671+0100')},
{'backend_ip': '10.0.0.241',
'backend_processing_time': 59.246736,
'backend_status_code': 403,
'client_ip': '192.168.0.2',
'elb_name': 'poundlb',
'elb_status_code': 403,
'http_method': 'PUT',
'received_bytes': 55,
'request': 'HTTP/1.1',
'request_processing_time': 3.4e-05,
'response_processing_time': 2.9e-05,
'sent_bytes': 64,
'timestamp': numpy.datetime64('2014-05-16T00:00:30.980494+0100')},
{'backend_ip': '10.0.0.242',
'backend_processing_time': 59.42053,
'backend_status_code': 200,
'client_ip': '192.168.0.3',
'elb_name': 'poundlb',
'elb_status_code': 200,
'http_method': 'PUT',
'received_bytes': 173,
'request': 'HTTP/1.1',
'request_processing_time': 8.4e-05,
'response_processing_time': 6.9e-05,
'sent_bytes': 149,
'timestamp': numpy.datetime64('2014-05-16T00:00:32.687835+0100')},
{'backend_ip': '10.0.0.39',
'backend_processing_time': 0.016443,
'backend_status_code': 200,
'client_ip': '192.168.0.4',
'elb_name': 'poundlb',
'elb_status_code': 200,
'http_method': 'GET',
'received_bytes': 0,
'request': 'HTTP/1.1',
'request_processing_time': 7.9e-05,
'response_processing_time': 4e-05,
'sent_bytes': 289,
'timestamp': numpy.datetime64('2014-05-16T00:00:38.247760+0100')},
{'backend_ip': '10.0.0.41',
'backend_processing_time': 0.008624,
'backend_status_code': 200,
'client_ip': '192.168.0.5',
'elb_name': 'poundlb',
'elb_status_code': 200,
'http_method': 'GET',
'received_bytes': 0,
'request': 'HTTP/1.1',
'request_processing_time': 5.4e-05,
'response_processing_time': 3.2e-05,
'sent_bytes': 200,
'timestamp': numpy.datetime64('2014-05-16T00:00:38.432535+0100')},
{'backend_ip': '10.0.0.43',
'backend_processing_time': 0.138925,
'backend_status_code': 200,
'client_ip': '192.168.0.6',
'elb_name': 'poundlb',
'elb_status_code': 200,
'http_method': 'GET',
'received_bytes': 0,
'request': 'HTTP/1.1',
'request_processing_time': 4.5e-05,
'response_processing_time': 4.3e-05,
'sent_bytes': 268,
'timestamp': numpy.datetime64('2014-05-16T00:00:38.509598+0100')},
{'backend_ip': '10.0.0.38',
'backend_processing_time': 0.013578,
'backend_status_code': 200,
'client_ip': '192.168.0.7',
'elb_name': 'poundlb',
'elb_status_code': 200,
'http_method': 'POST',
'received_bytes': 291,
'request': 'HTTP/1.1',
'request_processing_time': 4.2e-05,
'response_processing_time': 2.7e-05,
'sent_bytes': 36,
'timestamp': numpy.datetime64('2014-05-16T00:00:38.667479+0100')},
{'backend_ip': '10.0.0.42',
'backend_processing_time': 0.017493,
'backend_status_code': 200,
'client_ip': '192.168.0.8',
'elb_name': 'poundlb',
'elb_status_code': 200,
'http_method': 'GET',
'received_bytes': 0,
'request': 'HTTP/1.1',
'request_processing_time': 3.7e-05,
'response_processing_time': 2.7e-05,
'sent_bytes': 290,
'timestamp': numpy.datetime64('2014-05-16T00:00:38.708697+0100')},
{'backend_ip': '10.0.0.40',
'backend_processing_time': 0.014167,
'backend_status_code': 200,
'client_ip': '192.168.0.9',
'elb_name': 'poundlb',
'elb_status_code': 200,
'http_method': 'POST',
'received_bytes': 297,
'request': 'HTTP/1.1',
'request_processing_time': 3.5e-05,
'response_processing_time': 2.7e-05,
'sent_bytes': 36,
'timestamp': numpy.datetime64('2014-05-16T00:00:38.746867+0100')},
{'backend_ip': '10.0.0.40',
'backend_processing_time': 0.094383,
'backend_status_code': 200,
'client_ip': '192.168.0.10',
'elb_name': 'poundlb',
'elb_status_code': 200,
'http_method': 'PUT',
'received_bytes': 79,
'request': 'HTTP/1.1',
'request_processing_time': 3.4e-05,
'response_processing_time': 3.6e-05,
'sent_bytes': 148,
'timestamp': numpy.datetime64('2014-05-16T00:00:39.333482+0100')},
{'backend_ip': '10.0.0.42',
'backend_processing_time': 0.061355,
'backend_status_code': 200,
'client_ip': '192.168.0,10',
'elb_name': 'poundlb',
'elb_status_code': 200,
'http_method': 'PUT',
'received_bytes': 79,
'request': 'HTTP/1.1',
'request_processing_time': 9.6e-05,
'response_processing_time': 5.8e-05,
'sent_bytes': 148,
'timestamp': numpy.datetime64('2014-05-16T00:00:39.345097+0100')}]
>>> df = pd.DataFrame(test.to_dict(), index_list)
>>> df
Out[45]:
backend_ip backend_processing_time
backend_ip - -1
backend_processing_time - -1
backend_status_code - -1
client_ip - -1
elb_name - -1
elb_status_code - -1
http_method - -1
received_bytes - -1
request - -1
request_processing_time - -1
response_processing_time - -1
sent_bytes - -1
timestamp - -1
backend_status_code client_ip
backend_ip 408 192.168.0.10
backend_processing_time 408 192.168.0.10
backend_status_code 408 192.168.0.10
client_ip 408 192.168.0.10
elb_name 408 192.168.0.10
elb_status_code 408 192.168.0.10
http_method 408 192.168.0.10
received_bytes 408 192.168.0.10
request 408 192.168.0.10
request_processing_time 408 192.168.0.10
response_processing_time 408 192.168.0.10
sent_bytes 408 192.168.0.10
timestamp 408 192.168.0.10
elb_name elb_status_code http_method
backend_ip poundlb 408 PUT
backend_processing_time poundlb 408 PUT
backend_status_code poundlb 408 PUT
client_ip poundlb 408 PUT
elb_name poundlb 408 PUT
elb_status_code poundlb 408 PUT
http_method poundlb 408 PUT
received_bytes poundlb 408 PUT
request poundlb 408 PUT
request_processing_time poundlb 408 PUT
response_processing_time poundlb 408 PUT
sent_bytes poundlb 408 PUT
timestamp poundlb 408 PUT
received_bytes request request_processing_time
backend_ip 55 HTTP/1.1 -1
backend_processing_time 55 HTTP/1.1 -1
backend_status_code 55 HTTP/1.1 -1
client_ip 55 HTTP/1.1 -1
elb_name 55 HTTP/1.1 -1
elb_status_code 55 HTTP/1.1 -1
http_method 55 HTTP/1.1 -1
received_bytes 55 HTTP/1.1 -1
request 55 HTTP/1.1 -1
request_processing_time 55 HTTP/1.1 -1
response_processing_time 55 HTTP/1.1 -1
sent_bytes 55 HTTP/1.1 -1
timestamp 55 HTTP/1.1 -1
response_processing_time sent_bytes
backend_ip -1 0
backend_processing_time -1 0
backend_status_code -1 0
client_ip -1 0
elb_name -1 0
elb_status_code -1 0
http_method -1 0
received_bytes -1 0
request -1 0
request_processing_time -1 0
response_processing_time -1 0
sent_bytes -1 0
timestamp -1 0
timestamp
backend_ip 2014-05-15 23:00:04.696671
backend_processing_time 2014-05-15 23:00:04.696671
backend_status_code 2014-05-15 23:00:04.696671
client_ip 2014-05-15 23:00:04.696671
elb_name 2014-05-15 23:00:04.696671
elb_status_code 2014-05-15 23:00:04.696671
http_method 2014-05-15 23:00:04.696671
received_bytes 2014-05-15 23:00:04.696671
request 2014-05-15 23:00:04.696671
request_processing_time 2014-05-15 23:00:04.696671
response_processing_time 2014-05-15 23:00:04.696671
sent_bytes 2014-05-15 23:00:04.696671
timestamp 2014-05-15 23:00:04.696671
[13 rows x 13 columns]
Which is what I want, but I still seem to have a problem with appending/concatenating after this.
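A note on that 13 x 13 result: constructing a DataFrame from a dict of scalars together with an explicit index makes pandas broadcast every scalar down the whole index, which is why each field is repeated on all 13 rows. For illustration (not from the original session):

# A dict of scalar values plus an explicit index broadcasts each value
# down every row, producing identical rows.
pd.DataFrame({'a': 1, 'b': 2}, index=['x', 'y'])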
I will investigate the solution below:
import pandas as pd
import numpy

# Create a data dict
test_line_dict = {'backend_ip': '-',
                  'backend_processing_time': -1.0,
                  'backend_status_code': 408,
                  'client_ip': '192.168.0.10',
                  'elb_name': 'poundlb',
                  'elb_status_code': 408,
                  'http_method': 'PUT',
                  'received_bytes': 55,
                  'request': 'HTTP/1.1',
                  'request_processing_time': -1.0,
                  'response_processing_time': -1.0,
                  'sent_bytes': 0,
                  'timestamp': numpy.datetime64('2014-05-16T00:00:04.696671+0100')}

# Create the corresponding dataframe, keyed by a row id
idOfTheFirstElement = '1st'
df = pd.DataFrame.from_dict({idOfTheFirstElement: test_line_dict})

# Transpose the result so the row id becomes the index and the fields become columns
df = df.T

# One can dynamically append/add new rows with pd.concat
dfNew = pd.concat([df, df, df])
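Following on from that answer, a sketch of how the per-line dicts from the question (the test_list built earlier) could be turned into one frame, indexed as the question describes, and extended with later batches; the batching variable here is an assumption for illustration:

import pandas as pd

# Each element of test_list is one parsed log line as a dict.
df = pd.DataFrame(test_list)

# Index on the fields the question mentions wanting to index by.
df = df.set_index(['timestamp', 'client_ip', 'backend_ip'])

# A later batch of parsed lines (hypothetical 'new_dicts') is appended the same way:
new_rows = pd.DataFrame(new_dicts).set_index(['timestamp', 'client_ip', 'backend_ip'])
df = pd.concat([df, new_rows])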