如何将值添加到行:
我在数据框中创建了一列,并将值分配为0。
然后我编写了更新该列值的逻辑,但更新结果并没有反映到数据框中。
输入:
>>> parafix_df = main_df[["line_width", "para_num", "bbox" ]]
>>> parafix_df
line_width para_num bbox
0 238.546 NaN (50.0, 579.3, 288.546, 598.022)
1 318 1 (64.0, 564.9, 382.0, 583.622)
2 332 2 (50.0, 550.5, 382.0, 569.222)
3 332 2 (50.0, 536.1, 382.0, 554.822)
4 328.977 2 (50.0, 521.7, 378.977, 540.422)
5 318 3 (64.0, 507.3, 382.0, 526.022)
6 332 3 (50.0, 492.9, 382.0, 511.622)
7 332 3 (50.0, 478.5, 382.0, 497.222)
8 332 3 (50.0, 464.1, 382.0, 482.822)
9 332 3 (50.0, 449.7, 382.0, 468.422)
10 59.04 3 (50.0, 435.3, 109.04, 454.022)
11 304.007 4 (64.0, 420.9, 368.007, 439.622)
12 318 5 (64.0, 406.5, 382.0, 425.222)
13 332 5 (50.0, 392.1, 382.0, 410.822)
14 332 5 (50.0, 377.7, 382.0, 396.422)
15 332 5 (50.0, 363.3, 382.0, 382.022)
16 43.252 5 (50.0, 348.9, 93.252, 367.622)
17 318 6 (64.0, 334.5, 382.0, 353.222)
18 332 6 (50.0, 320.1, 382.0, 338.822)
19 332 6 (50.0, 305.7, 382.0, 324.422)
20 332 6 (50.0, 291.3, 382.0, 310.022)
21 332 6 (50.0, 276.9, 382.0, 295.622)
22 317.02 6 (50.0, 262.5, 367.02, 281.222)
23 318 7 (64.0, 248.1, 382.0, 266.822)
24 332 7 (50.0, 233.7, 382.0, 252.422)
25 47.014 7 (50.0, 219.3, 97.014, 238.022)
26 318 8 (64.0, 204.9, 382.0, 223.622)
27 316.723 8 (50.0, 190.5, 366.723, 209.222)
28 318 9 (64.0, 176.1, 382.0, 194.822)
29 326.766 9 (50.0, 161.7, 376.766, 180.422)
30 318 10 (64.0, 147.3, 382.0, 166.022)
31 332 10 (50.0, 132.9, 382.0, 151.622)
32 332 10 (50.0, 118.5, 382.0, 137.222)
33 305.393 11 (64.0, 104.1, 369.393, 122.822)
34 318 12 (64.0, 89.7, 382.0, 108.422)
35 318 13 (64.0, 75.3, 382.0, 94.022)
36 319.165 13 (50.0, 60.9, 369.165, 79.622)
37 308.165 14 (64.0, 46.5, 372.165, 65.222)
38 318 15 (64.0, 32.1, 382.0, 50.822)
39 329.153 15 (50.0, 17.7, 379.153, 36.422)
40 318 16 (64.0, 3.3, 382.0, 22.022)
41 324.335 16 (50.0, -11.1, 374.335, 7.622)
代码:
# Attempt from the question (Python 2: `print` statement, `izip`): walk the rows
# and bump a paragraph counter whenever a line's left edge (bbox[0]) lies to the
# right of the left edge of the widest line, then store the counter per row.
# NOTE(review): the loop's indentation was lost in this listing, so the exact
# nesting of the statements below cannot be confirmed from here.
# NOTE(review): `izip` is presumably itertools.izip; its import is not shown.
parafix_df = main_df[["line_text", "line_width", "para_num", "bbox" ]]
# Adds the result column, initialised to 0 for every row.
parafix_df["new_para_num"] = 0
max_width = parafix_df['line_width'].max()
# bbox of the first row whose line_width equals the maximum.
# NOTE(review): `selected` is not defined anywhere in this snippet; the mask was
# presumably meant to be built from parafix_df — confirm.
bbox_max_width = parafix_df.loc[selected['line_width'] == max_width].iloc[0]["bbox"]
previous = None
para1 = 1
# Pairs every row with the row after it. izip stops at the shorter iterator, so
# the last row of parafix_df never appears as `current`.
# Each item from iterrows() is an (index, row Series) tuple, hence the `[1]` below.
for current, next in izip(parafix_df.iterrows(), parafix_df.iloc[1:].iterrows()):
if previous==None:
# This assignment targets the row object produced by iterrows(), not
# parafix_df itself; the output shown for this code keeps new_para_num at 0,
# i.e. the write never reaches the DataFrame.
current[1]["new_para_num"] = para1
else:
bbox_current = current[1]["bbox"]
# bbox_next and bbox_previous are assigned but not read again in this snippet.
bbox_next = next[1]["bbox"]
bbox_previous = previous[1]["bbox"]
# A left edge further right than the widest line's left edge is treated as the
# start of a new paragraph.
if bbox_current[0]>bbox_max_width[0]:
para1 += 1
print "para1:", para1
# Same problem as above: the write goes to the iterated row object, not to parafix_df.
current[1]["new_para_num"] = para1
previous = current
上面代码的输出:
bbox new_para_num
0 (50.0, 579.3, 288.546, 598.022) 0
1 (64.0, 564.9, 382.0, 583.622) 0
2 (50.0, 550.5, 382.0, 569.222) 0
3 (50.0, 536.1, 382.0, 554.822) 0
4 (50.0, 521.7, 378.977, 540.422) 0
5 (64.0, 507.3, 382.0, 526.022) 0
6 (50.0, 492.9, 382.0, 511.622) 0
7 (50.0, 478.5, 382.0, 497.222) 0
8 (50.0, 464.1, 382.0, 482.822) 0
9 (50.0, 449.7, 382.0, 468.422) 0
10 (50.0, 435.3, 109.04, 454.022) 0
11 (64.0, 420.9, 368.007, 439.622) 0
12 (64.0, 406.5, 382.0, 425.222) 0
13 (50.0, 392.1, 382.0, 410.822) 0
14 (50.0, 377.7, 382.0, 396.422) 0
15 (50.0, 363.3, 382.0, 382.022) 0
16 (50.0, 348.9, 93.252, 367.622) 0
17 (64.0, 334.5, 382.0, 353.222) 0
18 (50.0, 320.1, 382.0, 338.822) 0
19 (50.0, 305.7, 382.0, 324.422) 0
20 (50.0, 291.3, 382.0, 310.022) 0
21 (50.0, 276.9, 382.0, 295.622) 0
22 (50.0, 262.5, 367.02, 281.222) 0
23 (64.0, 248.1, 382.0, 266.822) 0
24 (50.0, 233.7, 382.0, 252.422) 0
25 (50.0, 219.3, 97.014, 238.022) 0
26 (64.0, 204.9, 382.0, 223.622) 0
27 (50.0, 190.5, 366.723, 209.222) 0
28 (64.0, 176.1, 382.0, 194.822) 0
29 (50.0, 161.7, 376.766, 180.422) 0
30 (64.0, 147.3, 382.0, 166.022) 0
31 (50.0, 132.9, 382.0, 151.622) 0
32 (50.0, 118.5, 382.0, 137.222) 0
33 (64.0, 104.1, 369.393, 122.822) 0
34 (64.0, 89.7, 382.0, 108.422) 0
35 (64.0, 75.3, 382.0, 94.022) 0
36 (50.0, 60.9, 369.165, 79.622) 0
37 (64.0, 46.5, 372.165, 65.222) 0
38 (64.0, 32.1, 382.0, 50.822) 0
39 (50.0, 17.7, 379.153, 36.422) 0
40 (64.0, 3.3, 382.0, 22.022) 0
41 (50.0, -11.1, 374.335, 7.622) 0
但我想要新的para值:
para1: 2
para1: 3
para1: 4
para1: 5
para1: 6
para1: 7
para1: 8
para1: 9
para1: 10
para1: 11
para1: 12
para1: 13
para1: 14
para1: 15
para1: 16
你能帮助我吗?
以下是我的最终工作代码:
# Number the paragraphs of the extracted text lines.
# A line opens a new paragraph when it is indented, i.e. when its left edge
# (bbox[0]) lies to the right of the left edge of the widest line.
# Result: parafix_df["new_para_num"] holds 1 for the first paragraph and
# increases by one at every indented line after the first row.

# .copy() makes parafix_df an independent frame, so adding a column below does
# not write into a slice of main_df (avoids SettingWithCopyWarning).
parafix_df = main_df[["line_text", "line_width", "para_num", "bbox"]].copy()

# Reference left edge: bbox of the first row that has the maximum line width.
# The mask is built from parafix_df itself; the earlier version filtered with
# a name (`selected`) that is never defined in this snippet.
max_width = parafix_df["line_width"].max()
bbox_max_width = parafix_df.loc[parafix_df["line_width"] == max_width].iloc[0]["bbox"]

# True for every indented row. The first row never opens a new paragraph
# (the loop version skipped index 0), so its flag is forced to False.
starts_paragraph = parafix_df["bbox"].map(lambda bbox: bbox[0]) > bbox_max_width[0]
starts_paragraph.iloc[0] = False

# Running count of paragraph starts, offset so numbering begins at 1.
# Replaces the row-by-row loop and the positional write to column 4.
parafix_df["new_para_num"] = starts_paragraph.cumsum() + 1

# Kept for compatibility with the loop version: final paragraph counter.
para1 = int(parafix_df["new_para_num"].iloc[-1])
这段代码还能进一步优化吗?
最佳答案
更新:
IIUC,您可以这样操作:
df.new_para_num = 1
In [210]: df.loc[df.line_width == df.line_width.max(), 'new_para_num'].cumsum() + 1
Out[210]:
2 2
3 3
6 4
7 5
8 6
9 7
13 8
14 9
15 10
18 11
19 12
20 13
21 14
24 15
31 16
32 17
Name: new_para_num, dtype: int64
如果要有条件地更新原始DF中的 new_para_num 列:
In [223]: df.new_para_num = 1
In [224]: selected = df.loc[df.line_width == df.line_width.max()].copy()
In [226]: selected.new_para_num = selected.new_para_num.cumsum() + 1
In [227]: selected
Out[227]:
line_width para_num bbox new_para_num
2 332.0 2.0 [50.0, 550.5, 382.0, 569.222] 2
3 332.0 2.0 [50.0, 536.1, 382.0, 554.822] 3
6 332.0 3.0 [50.0, 492.9, 382.0, 511.622] 4
7 332.0 3.0 [50.0, 478.5, 382.0, 497.222] 5
8 332.0 3.0 [50.0, 464.1, 382.0, 482.822] 6
9 332.0 3.0 [50.0, 449.7, 382.0, 468.422] 7
13 332.0 5.0 [50.0, 392.1, 382.0, 410.822] 8
14 332.0 5.0 [50.0, 377.7, 382.0, 396.422] 9
15 332.0 5.0 [50.0, 363.3, 382.0, 382.022] 10
18 332.0 6.0 [50.0, 320.1, 382.0, 338.822] 11
19 332.0 6.0 [50.0, 305.7, 382.0, 324.422] 12
20 332.0 6.0 [50.0, 291.3, 382.0, 310.022] 13
21 332.0 6.0 [50.0, 276.9, 382.0, 295.622] 14
24 332.0 7.0 [50.0, 233.7, 382.0, 252.422] 15
31 332.0 10.0 [50.0, 132.9, 382.0, 151.622] 16
32 332.0 10.0 [50.0, 118.5, 382.0, 137.222] 17
In [228]: df.loc[df.line_width == df.line_width.max(), 'new_para_num'] = selected
In [229]: df
Out[229]:
line_width para_num bbox new_para_num
0 238.546 NaN [50.0, 579.3, 288.546, 598.022] 1
1 318.000 1.0 [64.0, 564.9, 382.0, 583.622] 1
2 332.000 2.0 [50.0, 550.5, 382.0, 569.222] 2
3 332.000 2.0 [50.0, 536.1, 382.0, 554.822] 3
4 328.977 2.0 [50.0, 521.7, 378.977, 540.422] 1
5 318.000 3.0 [64.0, 507.3, 382.0, 526.022] 1
6 332.000 3.0 [50.0, 492.9, 382.0, 511.622] 4
7 332.000 3.0 [50.0, 478.5, 382.0, 497.222] 5
8 332.000 3.0 [50.0, 464.1, 382.0, 482.822] 6
9 332.000 3.0 [50.0, 449.7, 382.0, 468.422] 7
10 59.040 3.0 [50.0, 435.3, 109.04, 454.022] 1
11 304.007 4.0 [64.0, 420.9, 368.007, 439.622] 1
12 318.000 5.0 [64.0, 406.5, 382.0, 425.222] 1
13 332.000 5.0 [50.0, 392.1, 382.0, 410.822] 8
14 332.000 5.0 [50.0, 377.7, 382.0, 396.422] 9
15 332.000 5.0 [50.0, 363.3, 382.0, 382.022] 10
16 43.252 5.0 [50.0, 348.9, 93.252, 367.622] 1
17 318.000 6.0 [64.0, 334.5, 382.0, 353.222] 1
18 332.000 6.0 [50.0, 320.1, 382.0, 338.822] 11
19 332.000 6.0 [50.0, 305.7, 382.0, 324.422] 12
20 332.000 6.0 [50.0, 291.3, 382.0, 310.022] 13
21 332.000 6.0 [50.0, 276.9, 382.0, 295.622] 14
22 317.020 6.0 [50.0, 262.5, 367.02, 281.222] 1
23 318.000 7.0 [64.0, 248.1, 382.0, 266.822] 1
24 332.000 7.0 [50.0, 233.7, 382.0, 252.422] 15
25 47.014 7.0 [50.0, 219.3, 97.014, 238.022] 1
26 318.000 8.0 [64.0, 204.9, 382.0, 223.622] 1
27 316.723 8.0 [50.0, 190.5, 366.723, 209.222] 1
28 318.000 9.0 [64.0, 176.1, 382.0, 194.822] 1
29 326.766 9.0 [50.0, 161.7, 376.766, 180.422] 1
30 318.000 10.0 [64.0, 147.3, 382.0, 166.022] 1
31 332.000 10.0 [50.0, 132.9, 382.0, 151.622] 16
32 332.000 10.0 [50.0, 118.5, 382.0, 137.222] 17
33 305.393 11.0 [64.0, 104.1, 369.393, 122.822] 1
34 318.000 12.0 [64.0, 89.7, 382.0, 108.422] 1
35 318.000 13.0 [64.0, 75.3, 382.0, 94.022] 1
36 319.165 13.0 [50.0, 60.9, 369.165, 79.622] 1
37 308.165 14.0 [64.0, 46.5, 372.165, 65.222] 1
38 318.000 15.0 [64.0, 32.1, 382.0, 50.822] 1
39 329.153 15.0 [50.0, 17.7, 379.153, 36.422] 1
40 318.000 16.0 [64.0, 3.3, 382.0, 22.022] 1
41 324.335 16.0 [50.0, -11.1, 374.335, 7.622] 1
PS,但我仍然不确定我是否正确理解了您的目标
旧答案:
您可以使用shift函数来访问上一行和下一行:
df.shift(-1) # df will be shifted one row backwards (will show `next` row)
df.shift(1) # df will be shifted one row forwards (will show `prev` row)
例:
In [142]: df
Out[142]:
a b c
0 8 3 0
1 8 3 4
2 9 4 1
3 2 1 8
4 5 6 3
In [147]: df['prev_a'] = df.a.shift(1)
In [148]: df['next_a'] = df.a.shift(-1)
In [149]: df
Out[149]:
a b c prev_a next_a
0 8 3 0 NaN 8.0
1 8 3 4 8.0 9.0
2 9 4 1 8.0 2.0
3 2 1 8 9.0 5.0
4 5 6 3 2.0 NaN