如何按正确顺序绘制分组条形图

2022-02-26 00:00:00 python pandas matplotlib pandas-groupby bar-chart

问题描述

我正在制作标准化考试的熟练程度分组条形图。以下是我的代码：

# Count the students in each proficiency level at the beginning of the year
# (BOY), then turn the counts into percentages of all non-null BOY entries.
bush_prof_boy = bush.groupby(['BOY Prof'])['BOY Prof'].count()
bush_prof_pct_boy = bush_prof_boy/bush['BOY Prof'].count() * 100
# Same computation for the end of the year (EOY).
bush_prof_eoy = bush.groupby(['EOY Prof'])['EOY Prof'].count()
bush_prof_pct_eoy = bush_prof_eoy/bush['EOY Prof'].count() * 100

# NOTE(review): groupby returns its result sorted by group key, so both
# percentage Series are in alphabetical order (Advanced, Below Proficient,
# Proficient, Remedial), while `labels` lists the levels in a different order.
# The bars are drawn in Series order but the ticks are named from `labels`,
# so tick labels and bar heights do not refer to the same level.
labels = ['Remedial', 'Below Proficient', 'Proficient', 'Advanced']

fig, ax = plt.subplots()
# NOTE(review): `x` and `width` are not defined in this snippet; presumably
# x holds the bar-group positions (one per label) and width is the bar width
# -- confirm against the full script.
rects1 = ax.bar(x - width/2, bush_prof_pct_boy, width, label='BOY', 
color='mediumorchid')
rects2 = ax.bar(x + width/2, bush_prof_pct_eoy, width, label='EOY', color='teal')

# Axis text, tick labels and legend.
ax.set_ylabel('% of Students at Proficiency Level', fontsize=18)
ax.set_title('Bushwick Middle Change in Proficiency Levels', fontsize=25)
ax.set_xticks(x)
ax.set_xticklabels(labels, fontsize=25)
ax.legend(fontsize=25)
plt.yticks(fontsize=15)


# NOTE(review): plt.figure() opens a new, empty figure here; it does not
# resize the figure created by plt.subplots() above. To size the chart,
# figsize would have to be passed to plt.subplots() instead.
plt.figure(figsize=(5,15))

plt.show()

"BOY"和"EOY"分别代表年初（Beginning of Year）和年终（End of Year），因此条形图旨在显示年初和年终处于各个熟练程度级别的学生百分比。这张图看起来不错，但当我仔细核对数字时，发现EOY的标签是不正确的。这是我的图表：

BOY（年初）的百分比绘制正确，但EOY（年终）的百分比标签错误。以下是实际的百分比，我确信它们是正确的：

BOY %
Advanced            14.0
Below Proficient    38.0
Proficient          34.0
Remedial            14.0

EOY %
Advanced            39.0
Below Proficient    18.0
Proficient          32.0
Remedial            11.0

解决方案

使用来自Kaggle: Brooklyn NY Schools的数据
单独计算条形组可能会有问题。最好在一个数据框内进行计算，对数据框进行整形，然后绘制，因为这将确保条形图绘制在正确的组中。
由于问题中未提供数据，本答案从宽格式的数值数据开始，然后对DataFrame进行清理和整形。
- 使用pandas.cut将数值分箱，并替换为分类标签
- 使用.melt将DataFrame转换为长格式，然后使用.groupby计算每个'x of Year'（即BOY和EOY）的百分比
- 用.pivot整形，用pandas.DataFrame.plot绘图
在python 3.8、pandas 1.3.1和matplotlib 3.4.2中测试

导入、加载和清理DataFrame

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np

# data 
data = {'BOY': [11.0, 11.0, 11.0, 11.0, 11.0, 8.0, 11.0, 14.0, 12.0, 13.0, 11.0, 14.0, 10.0, 9.0, 10.0, 10.0, 10.0, 12.0, 12.0, 13.0, 12.0, 11.0, 9.0, 12.0, 16.0, 12.0, 12.0, 12.0, 15.0, 10.0, 10.0, 10.0, 8.0, 11.0, 12.0, 14.0, 10.0, 8.0, 11.0, 12.0, 14.0, 12.0, 13.0, 15.0, 13.0, 8.0, 8.0, 11.0, 10.0, 11.0, 13.0, 11.0, 13.0, 15.0, 10.0, 8.0, 10.0, 9.0, 8.0, 11.0, 13.0, 11.0, 8.0, 11.0, 15.0, 11.0, 12.0, 17.0, 12.0, 11.0, 18.0, 14.0, 15.0, 16.0, 7.0, 11.0, 15.0, 16.0, 13.0, 13.0, 13.0, 0.0, 11.0, 15.0, 14.0, 11.0, 13.0, 16.0, 14.0, 12.0, 8.0, 13.0, 13.0, 14.0, 7.0, 10.0, 16.0, 10.0, 13.0, 10.0, 14.0, 8.0, 16.0, 13.0, 12.0, 14.0, 12.0, 14.0, 16.0, 15.0, 13.0, 13.0, 10.0, 14.0, 8.0, 10.0, 10.0, 11.0, 12.0, 10.0, 12.0, 14.0, 17.0, 13.0, 14.0, 16.0, 15.0, 13.0, 16.0, 9.0, 16.0, 15.0, 11.0, 11.0, 15.0, 14.0, 12.0, 15.0, 11.0, 16.0, 14.0, 14.0, 15.0, 14.0, 14.0, 14.0, 16.0, 15.0, 12.0, 12.0, 14.0, 15.0, 13.0, 14.0, 13.0, 17.0, 14.0, 13.0, 14.0, 13.0, 13.0, 12.0, 10.0, 15.0, 14.0, 12.0, 12.0, 14.0, 12.0, 14.0, 13.0, 15.0, 13.0, 14.0, 14.0, 12.0, 11.0, 15.0, 14.0, 14.0, 10.0], 'EOY': [16.0, 16.0, 16.0, 14.0, 10.0, 14.0, 16.0, 14.0, 15.0, 15.0, 15.0, 11.0, 11.0, 15.0, 10.0, 14.0, 17.0, 14.0, 9.0, 15.0, 14.0, 16.0, 14.0, 13.0, 11.0, 13.0, 12.0, 14.0, 15.0, 13.0, 14.0, 15.0, 12.0, 19.0, 9.0, 13.0, 11.0, 14.0, 17.0, 17.0, 14.0, 13.0, 14.0, 10.0, 16.0, 15.0, 12.0, 11.0, 12.0, 14.0, 15.0, 10.0, 15.0, 14.0, 14.0, 15.0, 18.0, 15.0, 10.0, 10.0, 15.0, 15.0, 13.0, 15.0, 19.0, 13.0, 18.0, 20.0, 21.0, 17.0, 18.0, 17.0, 18.0, 17.0, 12.0, 16.0, 15.0, 18.0, 19.0, 17.0, 20.0, 11.0, 18.0, 19.0, 11.0, 12.0, 17.0, 20.0, 17.0, 15.0, 13.0, 18.0, 14.0, 17.0, 12.0, 12.0, 16.0, 12.0, 14.0, 15.0, 14.0, 10.0, 20.0, 13.0, 18.0, 20.0, 11.0, 20.0, 17.0, 20.0, 13.0, 17.0, 15.0, 18.0, 14.0, 13.0, 13.0, 18.0, 10.0, 13.0, 12.0, 18.0, 20.0, 20.0, 16.0, 18.0, 15.0, 20.0, 22.0, 18.0, 21.0, 18.0, 18.0, 18.0, 17.0, 16.0, 19.0, 16.0, 20.0, 19.0, 19.0, 20.0, 20.0, 14.0, 18.0, 20.0, 20.0, 18.0, 16.0, 21.0, 20.0, 
18.0, 15.0, 14.0, 17.0, 19.0, 21.0, 14.0, 18.0, 15.0, 18.0, 21.0, 19.0, 17.0, 16.0, 16.0, 15.0, 20.0, 19.0, 16.0, 21.0, 17.0, 19.0, 15.0, 18.0, 20.0, 18.0, 20.0, 18.0, 16.0, 16.0]}
# build the wide-form frame: one column of raw scores per test window
df = pd.DataFrame(data)

# bin the raw scores into proficiency levels, overwriting the numbers in place
# (new columns could be created instead); the intervals are right-closed
labels = ['Remedial', 'Below Proficient', 'Proficient', 'Advanced']
bins = [1, 11, 13, 15, np.inf]
for window in ('BOY', 'EOY'):
    df[window] = pd.cut(x=df[window], bins=bins, labels=labels, right=True)

# reshape to long form: the column name goes to 'Tested', the level to 'Proficiency'
dfm = df.melt(var_name='Tested', value_name='Proficiency')

# pin the category order so the x-axis labels are drawn in this order
dfm['Proficiency'] = pd.Categorical(dfm['Proficiency'], categories=labels, ordered=True)

分组、计算百分比并整形以便绘图

# count how many rows fall in each proficiency level, per test window
level_counts = dfm.groupby('Tested')['Proficiency'].value_counts()

# move the proficiency level out of the index into a column, keeping 'Tested'
# as the index; the counts land in a column called 'Size'
dfg = level_counts.reset_index(level=1, name='Size')
dfg = dfg.rename(columns={'level_1': 'Proficiency'})

# percent = level count / number of rows for that test window (aligned on the
# 'Tested' index), rounded to one decimal place
rows_per_window = dfm['Tested'].value_counts()
dfg['percent'] = (dfg['Size'] / rows_per_window * 100).round(1)

# one row per proficiency level, one column per test window, ready to plot
dfp = dfg.reset_index().pivot(index='Proficiency', columns='Tested', values='percent')

# display(dfp)
Tested             BOY   EOY
Proficiency                 
Remedial          34.8   9.9
Below Proficient  28.7  12.7
Proficient        27.1  25.4
Advanced           8.8  51.9

绘图

# draw the grouped bars directly from the reshaped frame: one group per index
# row, one bar per column
ax = dfp.plot(kind='bar', figsize=(15, 5), rot=0, color=['orchid', 'teal'])

# title and axis text
ax.set_title('Bushwick Middle Change in Proficiency Levels', fontsize=25)
ax.set_ylabel('Students at Proficiency Level', fontsize=18)
ax.set_xlabel('')

# tick formatting: y values shown as percentages, larger tick fonts
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_xticklabels(ax.get_xticklabels(), fontsize=25)
ax.legend(fontsize=25)
_ = plt.yticks(fontsize=15)

# annotate every bar with its value, one container per plotted column
for bar_group in ax.containers:
    ax.bar_label(bar_group, fmt='%.1f%%', label_type='edge', fontsize=12)

# leave headroom above the tallest bar so its label is not clipped
ax.margins(y=0.2)

可以看到，条形图上的标签与dfp中的数值一致

相关文章