使用Python使用稀疏CSV数据填充Pandas Dataframe
问题描述
我有如下文本文件,如下所示:
foo_14:14896|foo_23:17988|foo_16:1611|foo_5:14729|foo_6:13008|foo_2:19548|foo_8:2565
foo_18:13236|foo_17:6127|foo_14:5996
foo_4:12561|foo_24:20010|foo_15:20227|foo_6:3489|foo_8:19129|foo_1:2589|foo_25:5204|foo_19:18113|foo_3:20030|foo_16:12739|foo_5:5494
foo_23:3606|foo_13:22588|foo_5:15104|foo_12:3307|foo_16:2039|foo_14:21537|foo_1:13178|foo_20:19090|foo_2:1140
foo_5:10620|foo_21:11142|foo_22:13379|foo_1:16132|foo_10:9330|foo_23:13913|foo_6:8314|foo_13:4967
foo_13:4891|foo_5:22715|foo_2:12098|foo_8:18634|foo_23:1341|foo_4:13995|foo_16:2923|foo_10:13663|foo_7:22939|foo_14:16711|foo_25:17824
foo_22:8180|foo_1:11974|foo_7:6660|foo_17:17839|foo_10:3605|foo_8:22439|foo_6:15494|foo_11:17924|foo_18:17351|foo_20:16627|foo_14:9223|foo_13:2538|foo_16:19077
foo_12:11381|foo_24:541|foo_12:13106|foo_7:6459|foo_11:6398|foo_14:23127|foo_9:15701
使用以下代码:
df = pd.read_csv("text.txt",names=['text'],header=None)
data = df['text'].str.split("|")
names=[ y.split(':') for x in data for y in x]
ds=pd.DataFrame(names)
ds = ds.pivot(columns=0).fillna('')
我得到的输出如下所示:
0 foo_1 foo_10 foo_11 foo_12 foo_13 foo_14 foo_15 foo_16 foo_17 foo_18 foo_19 foo_2 foo_20 foo_21 foo_22 foo_23 foo_24 foo_25 foo_3 foo_4 foo_5 foo_6 foo_7 foo_8 foo_9
0 14896
1 17988
2 1611
3 14729
4 13008
.. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
64 13106
65 6459
66 6398
67 23127
68 15701
但是,我希望看到同一行上的每个数据字段。例如:
foo_14, foo_23, foo_16, foo_5, foo_6, foo_2 and foo_8 all appearing in row 0
foo_18, foo_17 and foo_14 should all appear in row 1
etc etc...
而不是每个条目一行,我想将它们批量存储以供阅读
0 foo_1 foo_10 foo_11 foo_12 foo_13 foo_14 foo_15 foo_16 foo_17 foo_18 foo_19 foo_2 foo_20 foo_21 foo_22 foo_23 foo_24 foo_25 foo_3 foo_4 foo_5 foo_6 foo_7 foo_8 foo_9
0 14896 1611 19548 17988 14729 13008 2565
etc
解决方案
如果我理解正确的话,这里有一种方法:
from io import StringIO
import pandas as pd
txtfile = StringIO("""foo_14:14896|foo_23:17988|foo_16:1611|foo_5:14729|foo_6:13008|foo_2:19548|foo_8:2565
foo_18:13236|foo_17:6127|foo_14:5996
foo_4:12561|foo_24:20010|foo_15:20227|foo_6:3489|foo_8:19129|foo_1:2589|foo_25:5204|foo_19:18113|foo_3:20030|foo_16:12739|foo_5:5494
foo_23:3606|foo_13:22588|foo_5:15104|foo_12:3307|foo_16:2039|foo_14:21537|foo_1:13178|foo_20:19090|foo_2:1140
foo_5:10620|foo_21:11142|foo_22:13379|foo_1:16132|foo_10:9330|foo_23:13913|foo_6:8314|foo_13:4967
foo_13:4891|foo_5:22715|foo_2:12098|foo_8:18634|foo_23:1341|foo_4:13995|foo_16:2923|foo_10:13663|foo_7:22939|foo_14:16711|foo_25:17824
foo_22:8180|foo_1:11974|foo_7:6660|foo_17:17839|foo_10:3605|foo_8:22439|foo_6:15494|foo_11:17924|foo_18:17351|foo_20:16627|foo_14:9223|foo_13:2538|foo_16:19077
foo_12:11381|foo_24:541|foo_12:13106|foo_7:6459|foo_11:6398|foo_14:23127|foo_9:15701""")
df = pd.read_csv(txtfile, names=['text'])
#df_out = df['text'].str.split('|', expand=True).apply(lambda x: x.str.replace('(:d+)', '', regex=True))
@chris在下面的评论中改进。
df_out = df['text'].str.split('|', expand=True).replace('(:d+)', '', regex=True)
print(df_out)
输出:
0 1 2 3 4 5 6 7 8 9 10 11 12
0 foo_14 foo_23 foo_16 foo_5 foo_6 foo_2 foo_8 None None None None None None
1 foo_18 foo_17 foo_14 None None None None None None None None None None
2 foo_4 foo_24 foo_15 foo_6 foo_8 foo_1 foo_25 foo_19 foo_3 foo_16 foo_5 None None
3 foo_23 foo_13 foo_5 foo_12 foo_16 foo_14 foo_1 foo_20 foo_2 None None None None
4 foo_5 foo_21 foo_22 foo_1 foo_10 foo_23 foo_6 foo_13 None None None None None
5 foo_13 foo_5 foo_2 foo_8 foo_23 foo_4 foo_16 foo_10 foo_7 foo_14 foo_25 None None
6 foo_22 foo_1 foo_7 foo_17 foo_10 foo_8 foo_6 foo_11 foo_18 foo_20 foo_14 foo_13 foo_16
7 foo_12 foo_24 foo_12 foo_7 foo_11 foo_14 foo_9 None None None None None None
已更改第7行的输入数据,以使Foo_12在此行中唯一:
from io import StringIO
import pandas as pd
txtfile = StringIO("""foo_14:14896|foo_23:17988|foo_16:1611|foo_5:14729|foo_6:13008|foo_2:19548|foo_8:2565
foo_18:13236|foo_17:6127|foo_14:5996
foo_4:12561|foo_24:20010|foo_15:20227|foo_6:3489|foo_8:19129|foo_1:2589|foo_25:5204|foo_19:18113|foo_3:20030|foo_16:12739|foo_5:5494
foo_23:3606|foo_13:22588|foo_5:15104|foo_12:3307|foo_16:2039|foo_14:21537|foo_1:13178|foo_20:19090|foo_2:1140
foo_5:10620|foo_21:11142|foo_22:13379|foo_1:16132|foo_10:9330|foo_23:13913|foo_6:8314|foo_13:4967
foo_13:4891|foo_5:22715|foo_2:12098|foo_8:18634|foo_23:1341|foo_4:13995|foo_16:2923|foo_10:13663|foo_7:22939|foo_14:16711|foo_25:17824
foo_22:8180|foo_1:11974|foo_7:6660|foo_17:17839|foo_10:3605|foo_8:22439|foo_6:15494|foo_11:17924|foo_18:17351|foo_20:16627|foo_14:9223|foo_13:2538|foo_16:19077
foo_12:11381|foo_24:541|foo_13:13106|foo_7:6459|foo_11:6398|foo_14:23127|foo_9:15701""")
df = pd.read_csv(txtfile, names=['text'])
dfs = df['text'].str.split('|', expand=True).stack().str.split(':', expand=True).reset_index()
.set_index(['level_0',0])[1].unstack()
print(dfs)
输出:
0 foo_1 foo_10 foo_11 foo_12 foo_13 foo_14 foo_15 foo_16 foo_17 foo_18 ... foo_23 foo_24 foo_25 foo_3 foo_4 foo_5 foo_6 foo_7 foo_8 foo_9
level_0 ...
0 NaN NaN NaN NaN NaN 14896 NaN 1611 NaN NaN ... 17988 NaN NaN NaN NaN 14729 13008 NaN 2565 NaN
1 NaN NaN NaN NaN NaN 5996 NaN NaN 6127 13236 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 2589 NaN NaN NaN NaN NaN 20227 12739 NaN NaN ... NaN 20010 5204 20030 12561 5494 3489 NaN 19129 NaN
3 13178 NaN NaN 3307 22588 21537 NaN 2039 NaN NaN ... 3606 NaN NaN NaN NaN 15104 NaN NaN NaN NaN
4 16132 9330 NaN NaN 4967 NaN NaN NaN NaN NaN ... 13913 NaN NaN NaN NaN 10620 8314 NaN NaN NaN
5 NaN 13663 NaN NaN 4891 16711 NaN 2923 NaN NaN ... 1341 NaN 17824 NaN 13995 22715 NaN 22939 18634 NaN
6 11974 3605 17924 NaN 2538 9223 NaN 19077 17839 17351 ... NaN NaN NaN NaN NaN NaN 15494 6660 22439 NaN
7 NaN NaN 6398 11381 13106 23127 NaN NaN NaN NaN ... NaN 541 NaN NaN NaN NaN NaN 6459 NaN 15701
相关文章