# Python练手,pandas
'''
http://pandas.pydata.org/pandas-docs/stable/10min.html
numpy的主要数据结构是ndarray
pandas的主要数据结构是Series、DataFrame
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Build a 6x4 integer table holding 101..124, with row labels 0..5 and
# column labels A..D, then display it.
df1 = pd.DataFrame(
    np.arange(101, 125).reshape(6, 4),
    index=range(6),
    columns=['A', 'B', 'C', 'D'],
)
print(df1)
# Sample output:
# A B C D
# 0 101 102 103 104
# 1 105 106 107 108
# 2 109 110 111 112
# 3 113 114 115 116
# 4 117 118 119 120
# 5 121 122 123 124
# Build the demo "transactions" table used by the rest of the script.
#
# Each dict entry becomes one column. The Series columns carry the same
# index labels (0..5) as the DataFrame itself: pandas aligns Series data to
# the frame's index by label, so mismatched labels would misalign the rows.
# The `index=` argument sets the row *labels*; positions always start at 0.
#
# `columns=` pins the column order to the one shown in the sample output, so
# the positional (iloc/iat) examples elsewhere in this script refer to the
# same columns regardless of how a given pandas version orders dict keys.
df2 = pd.DataFrame(
    {
        'custID': ['C0001', 'C0002', 'C0004', 'C0004', 'C0004', 'C0003'],
        'accountID': pd.Series(['6214C000101',
                                '6214C000201',
                                '6214C000401',
                                '6214C000403',
                                '6214C000402',
                                '6214C000301'], index=range(6), dtype='str'),
        'tradeDate': pd.Series(['2018-01-18 14:00:00',
                                '2018-01-18 14:00:00',
                                '2018-01-18 14:00:01',
                                '2018-01-18 14:00:03',
                                '2018-01-18 14:00:02',
                                '2018-01-18 14:00:00'], index=range(6), dtype='str'),
        'tradeAmt': pd.Series([100.0,
                               100.0,
                               101.0,
                               103.0,
                               102.0,
                               100.0], index=range(6), dtype='float'),
        # A scalar value is repeated on every row (see sample output).
        'tradeDesc': 'xxxxxx',
        # Fixed: the class is `pd.Categorical` (the original spelling
        # `pd.CateGorical` does not exist and raised AttributeError).
        'mark': pd.Categorical(["row1", "row2", "row3", "row4", "row5", "row6"]),
    },
    index=range(6),
    columns=['accountID', 'custID', 'mark', 'tradeAmt', 'tradeDate', 'tradeDesc'],
)
print(df2)
# Sample output:
# accountID custID mark tradeAmt tradeDate tradeDesc
# 0 6214C000101 C0001 row1 100.0 2018-01-18 14:00:00 xxxxxx
# 1 6214C000201 C0002 row2 100.0 2018-01-18 14:00:00 xxxxxx
# 2 6214C000401 C0004 row3 101.0 2018-01-18 14:00:01 xxxxxx
# 3 6214C000403 C0004 row4 103.0 2018-01-18 14:00:03 xxxxxx
# 4 6214C000402 C0004 row5 102.0 2018-01-18 14:00:02 xxxxxx
# 5 6214C000301 C0003 row6 100.0 2018-01-18 14:00:00 xxxxxx
# Inspect the structure of df2: dtypes, axes, raw values, first/last rows,
# summary statistics and the transpose.
print(df2.dtypes)
# Sample output:
# accountID object
# custID object
# mark category
# tradeAmt float64
# tradeDate object
# tradeDesc object
# dtype: object
print(df2.index)
# RangeIndex(start=0, stop=6, step=1)
print(df2.columns)
# Index(['accountID', 'custID', 'mark', 'tradeAmt', 'tradeDate', 'tradeDesc'], dtype='object')
print(df2.values)
# [['6214C000101' 'C0001' 'row1' 100.0 '2018-01-18 14:00:00' 'xxxxxx']
# ['6214C000201' 'C0002' 'row2' 100.0 '2018-01-18 14:00:00' 'xxxxxx']
# ['6214C000401' 'C0004' 'row3' 101.0 '2018-01-18 14:00:01' 'xxxxxx']
# ['6214C000403' 'C0004' 'row4' 103.0 '2018-01-18 14:00:03' 'xxxxxx']
# ['6214C000402' 'C0004' 'row5' 102.0 '2018-01-18 14:00:02' 'xxxxxx']
# ['6214C000301' 'C0003' 'row6' 100.0 '2018-01-18 14:00:00' 'xxxxxx']]
print(df2.head(n=2))  # first two rows
# accountID custID mark tradeAmt tradeDate tradeDesc
# 0 6214C000101 C0001 row1 100.0 2018-01-18 14:00:00 xxxxxx
# 1 6214C000201 C0002 row2 100.0 2018-01-18 14:00:00 xxxxxx
print(df2.tail(n=2))  # last two rows
# accountID custID mark tradeAmt tradeDate tradeDesc
# 4 6214C000402 C0004 row5 102.0 2018-01-18 14:00:02 xxxxxx
# 5 6214C000301 C0003 row6 100.0 2018-01-18 14:00:00 xxxxxx
print(df2.describe())  # summary statistics; only the numeric column appears in the sample output
# tradeAmt
# count 6.000000
# mean 101.000000
# std 1.264911
# min 100.000000
# 25% 100.000000
# 50% 100.500000
# 75% 101.750000
# max 103.000000
print(df2.transpose())  # rows become columns and vice versa
# 0 1 2 \
# accountID 6214C000101 6214C000201 6214C000401
# custID C0001 C0002 C0004
# mark row1 row2 row3
# tradeAmt 100 100 101
# tradeDate 2018-01-18 14:00:00 2018-01-18 14:00:00 2018-01-18 14:00:01
# tradeDesc xxxxxx xxxxxx xxxxxx
#
# 3 4 5
# accountID 6214C000403 6214C000402 6214C000301
# custID C0004 C0004 C0003
# mark row4 row5 row6
# tradeAmt 103 102 100
# tradeDate 2018-01-18 14:00:03 2018-01-18 14:00:02 2018-01-18 14:00:00
# tradeDesc xxxxxx xxxxxx xxxxxx
print('------------------------------------------------------------------------------------')
# Sorting: by the values of one column, by several columns, and by the
# row / column labels.
print(df2.sort_values(by='tradeDate', ascending=False))  # one column, descending
# Sample output:
# accountID custID mark tradeAmt tradeDate tradeDesc
# 3 6214C000403 C0004 row4 103.0 2018-01-18 14:00:03 xxxxxx
# 4 6214C000402 C0004 row5 102.0 2018-01-18 14:00:02 xxxxxx
# 2 6214C000401 C0004 row3 101.0 2018-01-18 14:00:01 xxxxxx
# 0 6214C000101 C0001 row1 100.0 2018-01-18 14:00:00 xxxxxx
# 1 6214C000201 C0002 row2 100.0 2018-01-18 14:00:00 xxxxxx
# 5 6214C000301 C0003 row6 100.0 2018-01-18 14:00:00 xxxxxx
print(df2.sort_values(by=['custID', 'tradeDate'],
                      ascending=[True, False]))  # custID ascending, then tradeDate descending
# accountID custID mark tradeAmt tradeDate tradeDesc
# 0 6214C000101 C0001 row1 100.0 2018-01-18 14:00:00 xxxxxx
# 1 6214C000201 C0002 row2 100.0 2018-01-18 14:00:00 xxxxxx
# 5 6214C000301 C0003 row6 100.0 2018-01-18 14:00:00 xxxxxx
# 3 6214C000403 C0004 row4 103.0 2018-01-18 14:00:03 xxxxxx
# 4 6214C000402 C0004 row5 102.0 2018-01-18 14:00:02 xxxxxx
# 2 6214C000401 C0004 row3 101.0 2018-01-18 14:00:01 xxxxxx
print(df2.sort_index(axis='index', ascending=False))  # by row labels, descending
# accountID custID mark tradeAmt tradeDate tradeDesc
# 5 6214C000301 C0003 row6 100.0 2018-01-18 14:00:00 xxxxxx
# 4 6214C000402 C0004 row5 102.0 2018-01-18 14:00:02 xxxxxx
# 3 6214C000403 C0004 row4 103.0 2018-01-18 14:00:03 xxxxxx
# 2 6214C000401 C0004 row3 101.0 2018-01-18 14:00:01 xxxxxx
# 1 6214C000201 C0002 row2 100.0 2018-01-18 14:00:00 xxxxxx
# 0 6214C000101 C0001 row1 100.0 2018-01-18 14:00:00 xxxxxx
print(df2.sort_index(axis='columns', ascending=True))  # by column labels, ascending
# accountID custID mark tradeAmt tradeDate tradeDesc
# 0 6214C000101 C0001 row1 100.0 2018-01-18 14:00:00 xxxxxx
# 1 6214C000201 C0002 row2 100.0 2018-01-18 14:00:00 xxxxxx
# 2 6214C000401 C0004 row3 101.0 2018-01-18 14:00:01 xxxxxx
# 3 6214C000403 C0004 row4 103.0 2018-01-18 14:00:03 xxxxxx
# 4 6214C000402 C0004 row5 102.0 2018-01-18 14:00:02 xxxxxx
# 5 6214C000301 C0003 row6 100.0 2018-01-18 14:00:00 xxxxxx
print('------------------------------------------------------------------------------------')
# Selection.
#   iloc selects by integer position, loc selects by label.
#   iat reads a single value by integer position, at reads it by label.
# The positional examples below assume the column order shown in the sample
# output (accountID, custID, mark, tradeAmt, tradeDate, tradeDesc).
# NOTE(review): the original author relied on dict columns being sorted by
# name; other pandas versions may order them differently - verify.
print(df2.custID)  # a single column
# Sample output:
# 0 C0001
# 1 C0002
# 2 C0004
# 3 C0004
# 4 C0004
# 5 C0003
# Name: custID, dtype: object
print(df2[:4])  # row slice: first four rows
# accountID custID mark tradeAmt tradeDate tradeDesc
# 0 6214C000101 C0001 row1 100.0 2018-01-18 14:00:00 xxxxxx
# 1 6214C000201 C0002 row2 100.0 2018-01-18 14:00:00 xxxxxx
# 2 6214C000401 C0004 row3 101.0 2018-01-18 14:00:01 xxxxxx
# 3 6214C000403 C0004 row4 103.0 2018-01-18 14:00:03 xxxxxx
print(df2[1:4])  # row slice: rows 1 through 3
# accountID custID mark tradeAmt tradeDate tradeDesc
# 1 6214C000201 C0002 row2 100.0 2018-01-18 14:00:00 xxxxxx
# 2 6214C000401 C0004 row3 101.0 2018-01-18 14:00:01 xxxxxx
# 3 6214C000403 C0004 row4 103.0 2018-01-18 14:00:03 xxxxxx
print(df2.loc[1, 'accountID'])  # lookup by row/column *labels*, not positions
# 6214C000201
print(df2.iloc[3])  # 4th row
# accountID 6214C000403
# custID C0004
# mark row4
# tradeAmt 103
# tradeDate 2018-01-18 14:00:03
# tradeDesc xxxxxx
# Name: 3, dtype: object
print(df2.iloc[3, 4])  # 4th row, 5th column
# 2018-01-18 14:00:03
print(df2.iloc[3:4])  # rows 4 up to (not including) 5
# accountID custID mark tradeAmt tradeDate tradeDesc
# 3 6214C000403 C0004 row4 103.0 2018-01-18 14:00:03 xxxxxx
print(df2.iloc[3:5, 1:3])  # 4th-5th rows, 2nd-3rd columns (slices)
# custID mark
# 3 C0004 row4
# 4 C0004 row5
print(df2.iloc[[3, 4], [1, 2]])  # same cells, selected with explicit position lists
# custID mark
# 3 C0004 row4
# 4 C0004 row5
print(df2.iloc[3:5])  # 4th-5th rows, all columns
# accountID custID mark tradeAmt tradeDate tradeDesc
# 3 6214C000403 C0004 row4 103.0 2018-01-18 14:00:03 xxxxxx
# 4 6214C000402 C0004 row5 102.0 2018-01-18 14:00:02 xxxxxx
print(df2.iloc[:, 1:3])  # all rows, 2nd-3rd columns
# custID mark
# 0 C0001 row1
# 1 C0002 row2
# 2 C0004 row3
# 3 C0004 row4
# 4 C0004 row5
# 5 C0003 row6
print(df2[df2['tradeAmt'] > 101.0])  # boolean filter on a column
# accountID custID mark tradeAmt tradeDate tradeDesc
# 3 6214C000403 C0004 row4 103.0 2018-01-18 14:00:03 xxxxxx
# 4 6214C000402 C0004 row5 102.0 2018-01-18 14:00:02 xxxxxx
print('------------------------------------------------------------------------------------')
# Updating values, demonstrated on a copy so df2 itself stays untouched.
df3 = df2.copy()
df3["custID"] = ["NEW"] * 6                 # replace a whole column
df3.loc[:, 'tradeAmt'] = range(len(df3))    # assign via row/column labels
df3.at[1, 'accountID'] = '==========='      # single cell, addressed by labels
df3.iat[0, 0] = '+++++++++++'               # single cell, addressed by positions
# Not executed (original author's note): assigning `-df3` to the rows where
# tradeDate == '2018-01-18 14:00:03' would negate those rows, which is only
# possible when every column is numeric.
print(df3)
# Sample output (assumes accountID is the first column):
# accountID custID mark tradeAmt tradeDate tradeDesc
# 0 +++++++++++ NEW row1 0 2018-01-18 14:00:00 xxxxxx
# 1 =========== NEW row2 1 2018-01-18 14:00:00 xxxxxx
# 2 6214C000401 NEW row3 2 2018-01-18 14:00:01 xxxxxx
# 3 6214C000403 NEW row4 3 2018-01-18 14:00:03 xxxxxx
# 4 6214C000402 NEW row5 4 2018-01-18 14:00:02 xxxxxx
# 5 6214C000301 NEW row6 5 2018-01-18 14:00:00 xxxxxx
print('------------------------------------------------------------------------------------')
# Reindexing and missing data.
# Extract the first four rows and three chosen columns into a new frame.
df4 = df2.reindex(index=range(4), columns=['custID', 'accountID', 'tradeAmt'])
df4.loc[:1, 'tradeAmt'] = 200   # column exists -> rows labelled 0..1 are updated
df4.loc[:1, 'newColumn'] = 1    # column missing -> it is created; other rows are NaN
print(df4)
# Sample output:
# custID accountID tradeAmt newColumn
# 0 C0001 6214C000101 200.0 1.0
# 1 C0002 6214C000201 200.0 1.0
# 2 C0004 6214C000401 101.0 NaN
# 3 C0004 6214C000403 103.0 NaN
print(df4.dropna())  # drop every row that contains a missing value
# custID accountID tradeAmt newColumn
# 0 C0001 6214C000101 200.0 1.0
# 1 C0002 6214C000201 200.0 1.0
print(df4.fillna(999))  # replace missing values with 999
# custID accountID tradeAmt newColumn
# 0 C0001 6214C000101 200.0 1.0
# 1 C0002 6214C000201 200.0 1.0
# 2 C0004 6214C000401 101.0 999.0
# 3 C0004 6214C000403 103.0 999.0
print(df4.isnull())  # boolean mask marking the missing values
# custID accountID tradeAmt newColumn
# 0 False False False False
# 1 False False False False
# 2 False False False True
# 3 False False False True
print('------------------------------------------------------------------------------------')
# Column means of df2.
print(df2)
# Sample output:
# accountID custID mark tradeAmt tradeDate tradeDesc
# 0 6214C000101 C0001 row1 100.0 2018-01-18 14:00:00 xxxxxx
# 1 6214C000201 C0002 row2 100.0 2018-01-18 14:00:00 xxxxxx
# 2 6214C000401 C0004 row3 101.0 2018-01-18 14:00:01 xxxxxx
# 3 6214C000403 C0004 row4 103.0 2018-01-18 14:00:03 xxxxxx
# 4 6214C000402 C0004 row5 102.0 2018-01-18 14:00:02 xxxxxx
# 5 6214C000301 C0003 row6 100.0 2018-01-18 14:00:00 xxxxxx
# df2 holds string/categorical columns; request numeric columns explicitly so
# the call does not depend on pandas silently dropping non-numeric data
# (newer pandas raises TypeError instead).
print(df2.mean(numeric_only=True))
# tradeAmt 101.0
# dtype: float64
# Shift a Series down by two rows: the first two slots become NaN and the
# last two original values drop off the end.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=range(6))
s = s.shift(periods=2)
print(s)
# Expected output for a shift of 2:
# 0 NaN
# 1 NaN
# 2 1.0
# 3 3.0
# 4 5.0
# 5 NaN
# dtype: float64
# Shift the whole table down by two rows; the first two rows become NaN.
print(df2.shift(periods=2))
# Sample output:
# accountID custID mark tradeAmt tradeDate tradeDesc
# 0 NaN NaN NaN NaN NaN NaN
# 1 NaN NaN NaN NaN NaN NaN
# 2 6214C000101 C0001 row1 100.0 2018-01-18 14:00:00 xxxxxx
# 3 6214C000201 C0002 row2 100.0 2018-01-18 14:00:00 xxxxxx
# 4 6214C000401 C0004 row3 101.0 2018-01-18 14:00:01 xxxxxx
# 5 6214C000403 C0004 row4 103.0 2018-01-18 14:00:03 xxxxxx
print('------------------------------------------------------------------------------------')


def _column_max(column):
    """Return the largest value of one column (a lambda works equally well)."""
    return max(column)


print(df2.apply(_column_max))  # apply a function to every column
# accountID 6214C000403
# custID C0004
# mark row6
# tradeAmt 103
# tradeDate 2018-01-18 14:00:03
# tradeDesc xxxxxx
# dtype: object
print('------------------------------------------------------------------------------------')
print(df2.custID.value_counts())  # occurrences per value, like GROUP BY + COUNT
# C0004 3
# C0001 1
# C0002 1
# C0003 1
# Name: custID, dtype: int64
print('------------------------------------------------------------------------------------')
upper_marks = df2["mark"].str.upper()  # string methods via the .str accessor: upper-case
print(upper_marks)
# 0 ROW1
# 1 ROW2
# 2 ROW3
# 3 ROW4
# 4 ROW5
# 5 ROW6
# Name: mark, dtype: object
print('------------------------------------------------------------------------------------')
# Concatenation: cut a random 9x3 frame into pieces and glue them together.
df5 = pd.DataFrame(np.random.randn(9, 3))
print(df5)
# Sample output (values are random):
# 0 1 2
# 0 1.303158 -0.125934 -0.205285
# 1 0.760388 -1.004298 1.143800
# 2 2.063722 0.229955 0.020368
# 3 -2.024974 0.307957 -0.579090
# 4 -1.571883 0.260561 -0.884209
# 5 2.465572 -1.001873 1.243028
# 6 0.025388 -0.372608 1.431214
# 7 -0.079416 -0.401075 -0.973337
# 8 -1.088755 -1.947188 -1.100827
# Head (rows 0-1), a middle row (5) and the tail (rows 7-8).
pieces = [df5.iloc[:2], df5.iloc[5:6], df5.iloc[7:]]
print(pieces)
# [ 0 1 2
# 0 1.303158 -0.125934 -0.205285
# 1 0.760388 -1.004298 1.143800, 0 1 2
# 5 2.465572 -1.001873 1.243028, 0 1 2   <- the column header is printed once per piece
# 7 -0.079416 -0.401075 -0.973337
# 8 -1.088755 -1.947188 -1.100827]
print(pd.concat(pieces))  # stack the pieces vertically, keeping their row labels
# 0 1 2
# 0 1.303158 -0.125934 -0.205285
# 1 0.760388 -1.004298 1.143800
# 5 2.465572 -1.001873 1.243028
# 7 -0.079416 -0.401075 -0.973337
# 8 -1.088755 -1.947188 -1.100827
print('------------------------------------------------------------------------------------')
# SQL-style joins between two small tables.
df_left = pd.DataFrame({'key': ['001', '002', '007'],
                        'val': ['999', '1', '2']})
df_right = pd.DataFrame({'key': ['001', '002', '009'],
                         'key2': ['001', '002', '009'],
                         'val': ['999', '3', '4']})
print(df_left)
# Sample output:
# key val
# 0 001 999
# 1 002 1
# 2 007 2
print(df_right)
# key key2 val
# 0 001 001 999
# 1 002 002 3
# 2 009 009 4
print(df_left.merge(df_right, how='inner', on='key'))  # inner join on a shared column
# key val_x key2 val_y
# 0 001 999 001 999
# 1 002 1 002 3
print(df_left.merge(df_right, how='inner', left_on='key', right_on='key2'))  # inner join, differently named keys
# key_x val_x key_y key2 val_y
# 0 001 999 001 001 999
# 1 002 1 002 002 3
print(df_left.merge(df_right, how='inner', on=['key', 'val']))  # inner join on several columns
# key val key2
# 0 001 999 001
print(df_left.merge(df_right, how='left', on='key'))  # left outer join
# key val_x key2 val_y
# 0 001 999 001 999
# 1 002 1 002 3
# 2 007 2 NaN NaN
print(df_left.merge(df_right, how='right', on='key'))  # right outer join
# key val_x key2 val_y
# 0 001 999 001 999
# 1 002 1 002 3
# 2 009 NaN 009 4
print('------------------------------------------------------------------------------------')
# Appending rows. `DataFrame.append` is no longer available in current pandas,
# so the rows are appended with `pd.concat`, which produces the same result.
# Take the first three rows of df2 and append them to df2 itself;
# ignore_index=True discards the slice's row labels and renumbers all rows.
print(pd.concat([df2, df2[:3]], ignore_index=True))
# Sample output:
# accountID custID mark tradeAmt tradeDate tradeDesc
# 0 6214C000101 C0001 row1 100.0 2018-01-18 14:00:00 xxxxxx
# 1 6214C000201 C0002 row2 100.0 2018-01-18 14:00:00 xxxxxx
# 2 6214C000401 C0004 row3 101.0 2018-01-18 14:00:01 xxxxxx
# 3 6214C000403 C0004 row4 103.0 2018-01-18 14:00:03 xxxxxx
# 4 6214C000402 C0004 row5 102.0 2018-01-18 14:00:02 xxxxxx
# 5 6214C000301 C0003 row6 100.0 2018-01-18 14:00:00 xxxxxx
# 6 6214C000101 C0001 row1 100.0 2018-01-18 14:00:00 xxxxxx (appended row)
# 7 6214C000201 C0002 row2 100.0 2018-01-18 14:00:00 xxxxxx (appended row)
# 8 6214C000401 C0004 row3 101.0 2018-01-18 14:00:01 xxxxxx (appended row)
# With ignore_index=False the appended rows keep their original labels, which
# shows that row labels are allowed to repeat.
print(pd.concat([df2, df2[:3]], ignore_index=False))
# accountID custID mark tradeAmt tradeDate tradeDesc
# 0 6214C000101 C0001 row1 100.0 2018-01-18 14:00:00 xxxxxx
# 1 6214C000201 C0002 row2 100.0 2018-01-18 14:00:00 xxxxxx
# 2 6214C000401 C0004 row3 101.0 2018-01-18 14:00:01 xxxxxx
# 3 6214C000403 C0004 row4 103.0 2018-01-18 14:00:03 xxxxxx
# 4 6214C000402 C0004 row5 102.0 2018-01-18 14:00:02 xxxxxx
# 5 6214C000301 C0003 row6 100.0 2018-01-18 14:00:00 xxxxxx
# 0 6214C000101 C0001 row1 100.0 2018-01-18 14:00:00 xxxxxx (appended row)
# 1 6214C000201 C0002 row2 100.0 2018-01-18 14:00:00 xxxxxx (appended row)
# 2 6214C000401 C0004 row3 101.0 2018-01-18 14:00:01 xxxxxx (appended row)
print('------------------------------------------------------------------------------------')
# Reshaping with a two-level row index: stack / unstack.
# zip() pairs up the corresponding elements of its arguments into tuples,
# giving one (first, second) label pair per row.
tuples = list(zip(
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])  # multi-level row labels
df6 = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print(df6)
# Sample output (values are random):
# A B
# first second
# bar one -0.101234 -0.956210
# two -0.480354 1.308950
# baz one 0.943706 0.976480
# two -0.788852 -1.556547
# foo one 0.997527 -0.337391
# two -0.191448 -0.083129
# qux one -0.919527 -0.414051
# two -0.579727 1.595290
# stack() turns the table into a "stacked" Series: the column labels are
# appended to the row labels as an extra, innermost level.
stacked = df6.stack()
print(stacked)
# first second
# bar one A -0.101234
# B -0.956210
# two A -0.480354
# B 1.308950
# baz one A 0.943706
# B 0.976480
# two A -0.788852
# B -1.556547
# foo one A 0.997527
# B -0.337391
# two A -0.191448
# B -0.083129
# qux one A -0.919527
# B -0.414051
# two A -0.579727
# B 1.595290
# The stacked form can be addressed level by level, like nested arrays.
print(stacked["bar"]["one"]["A"])
# -0.101233870095
# unstack() reverses the operation: the innermost row level becomes columns again.
unstacked = stacked.unstack()
print(unstacked)
# A B
# first second
# bar one -0.101234 -0.956210
# two -0.480354 1.308950
# baz one 0.943706 0.976480
# two -0.788852 -1.556547
# foo one 0.997527 -0.337391
# two -0.191448 -0.083129
# qux one -0.919527 -0.414051
# two -0.579727 1.595290
# Further row levels can be moved to the columns too: level 0 ("first") ...
unstacked_unstacked_0 = unstacked.unstack(level=0)
print(unstacked_unstacked_0)
# A B
# first bar baz foo qux bar baz foo qux
# second
# one -0.101234 0.943706 0.997527 -0.919527 -0.95621 0.976480 -0.337391 -0.414051
# two -0.480354 -0.788852 -0.191448 -0.579727 1.30895 -1.556547 -0.083129 1.595290
# ... or level 1 ("second").
unstacked_unstacked_1 = unstacked.unstack(level=1)
print(unstacked_unstacked_1)
# A B
# second one two one two
# first
# bar -0.101234 -0.480354 -0.956210 1.308950
# baz 0.943706 -0.788852 0.976480 -1.556547
# foo 0.997527 -0.191448 -0.337391 -0.083129
# qux -0.919527 -0.579727 -0.414051 1.595290
print('------------------------------------------------------------------------------------')
# Pivot table over a 12-row frame with three label columns (A, B, C) and two
# random value columns (D, E).
df7 = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12),
})
print(df7)
# Sample output (values are random):
# A B C D E
# 0 one A foo -0.516297 -0.860641
# 1 one B foo -1.560483 -1.647366
# 2 two C foo 1.124756 0.329971
# 3 three A bar -0.312954 0.040263
# 4 one B bar -1.355079 0.358829
# 5 one C bar 0.749617 0.978513
# 6 two A foo -2.173830 0.434789
# 7 three B foo -1.070213 0.641253
# 8 one C foo -0.515032 0.127273
# 9 one A bar -1.408970 0.025128
# 10 two B bar -0.390044 0.060392
# 11 three C bar 0.067667 0.676595
# Rows are keyed by (A, B), columns by the values of C, cells hold D.
# Combinations that never occur in df7 show up as NaN.
print(df7.pivot_table(values='D', index=['A', 'B'], columns=['C']))
# C bar foo
# A B
# one A -1.408970 -0.516297
# B -1.355079 -1.560483
# C 0.749617 -0.515032
# three A -0.312954 NaN
# B NaN -1.070213
# C 0.067667 NaN
# two A NaN -2.173830
# B -0.390044 NaN
# C NaN 1.124756
print('------------------------------------------------------------------------------------')
# Time series: a minute-frequency index, resampling, time zones and periods.
rng = pd.date_range(start='1/1/2012', periods=10, freq='min')  # yields a DatetimeIndex
print(rng)
# Sample output:
# DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:01:00',
# '2012-01-01 00:02:00', '2012-01-01 00:03:00',
# '2012-01-01 00:04:00', '2012-01-01 00:05:00',
# '2012-01-01 00:06:00', '2012-01-01 00:07:00',
# '2012-01-01 00:08:00', '2012-01-01 00:09:00'],
# dtype='datetime64[ns]', freq='T')
ts = pd.Series(data=range(10), index=rng)  # the values 0..9, one per minute
print(ts)
# 2012-01-01 00:00:00 0
# 2012-01-01 00:01:00 1
# 2012-01-01 00:02:00 2
# 2012-01-01 00:03:00 3
# 2012-01-01 00:04:00 4
# 2012-01-01 00:05:00 5
# 2012-01-01 00:06:00 6
# 2012-01-01 00:07:00 7
# 2012-01-01 00:08:00 8
# 2012-01-01 00:09:00 9
# Freq: T, dtype: int32
# resample() regroups the series into new time buckets; here 5-minute sums.
print(ts.resample('5Min').sum())
# 2012-01-01 00:00:00 10
# 2012-01-01 00:05:00 35
# Freq: 5T, dtype: int32
ts_utc = ts.tz_localize(tz='UTC')  # mark the timestamps as UTC
print(ts_utc)
# 2012-01-01 00:00:00+00:00 0
# 2012-01-01 00:01:00+00:00 1
# 2012-01-01 00:02:00+00:00 2
# 2012-01-01 00:03:00+00:00 3
# 2012-01-01 00:04:00+00:00 4
# 2012-01-01 00:05:00+00:00 5
# 2012-01-01 00:06:00+00:00 6
# 2012-01-01 00:07:00+00:00 7
# 2012-01-01 00:08:00+00:00 8
# 2012-01-01 00:09:00+00:00 9
# Freq: T, dtype: int32
print(ts_utc.tz_convert(tz='US/Eastern'))  # convert to another time zone
# 2011-12-31 19:00:00-05:00 0
# 2011-12-31 19:01:00-05:00 1
# 2011-12-31 19:02:00-05:00 2
# 2011-12-31 19:03:00-05:00 3
# 2011-12-31 19:04:00-05:00 4
# 2011-12-31 19:05:00-05:00 5
# 2011-12-31 19:06:00-05:00 6
# 2011-12-31 19:07:00-05:00 7
# 2011-12-31 19:08:00-05:00 8
# 2011-12-31 19:09:00-05:00 9
# Freq: T, dtype: int32
print(ts.to_period())  # period representation: shown only down to the series' unit (minutes)
# 2012-01-01 00:00 0
# 2012-01-01 00:01 1
# 2012-01-01 00:02 2
# 2012-01-01 00:03 3
# 2012-01-01 00:04 4
# 2012-01-01 00:05 5
# 2012-01-01 00:06 6
# 2012-01-01 00:07 7
# 2012-01-01 00:08 8
# 2012-01-01 00:09 9
# Freq: T, dtype: int32
print(ts.to_period().to_timestamp())  # and back to full timestamps
# 2012-01-01 00:00:00 0
# 2012-01-01 00:01:00 1
# 2012-01-01 00:02:00 2
# 2012-01-01 00:03:00 3
# 2012-01-01 00:04:00 4
# 2012-01-01 00:05:00 5
# 2012-01-01 00:06:00 6
# 2012-01-01 00:07:00 7
# 2012-01-01 00:08:00 8
# 2012-01-01 00:09:00 9
# Freq: T, dtype: int32
print('------------------------------------------------------------------------------------')
# Categorical data: convert raw grades into a category column, rename the
# categories, then extend/reorder the category set and count per category.
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                   "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
df["grade"] = df["raw_grade"].astype("category")  # new column holding category labels
print(df["grade"])
# Sample output:
# 0 a
# 1 b
# 2 b
# 3 a
# 4 a
# 5 e
# Name: grade, dtype: category
# Rename the existing categories positionally (a, b, e -> very good, good,
# very bad). rename_categories() returns a new Series, so assign it back;
# assigning to `.cat.categories` directly is not supported by current pandas.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
# Replace the category set with a larger, reordered one; labels that never
# occur ("bad", "medium") simply have no rows.
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
print(df["grade"])
# 0 very good
# 1 good
# 2 good
# 3 very good
# 4 very good
# 5 very bad
# Name: grade, dtype: category
# Categories (5, object): [very bad, bad, medium, good, very good]
# observed=False keeps the empty categories in the result, as shown below.
print(df.groupby("grade", observed=False).size())  # row count per category
# grade
# very bad 1
# bad 0
# medium 0
# good 2
# very good 3
# dtype: int64
print('------------------------------------------------------------------------------------')
# Plotting. A 1000-day series of random numbers, replaced by its running total.
ts = pd.Series(np.random.randn(1000),
               index=pd.date_range('1/1/2000', periods=1000)).cumsum()
print(ts)
ts.plot()   # original author's note: some environments already display the figure here
plt.show()  # others need matplotlib.pyplot's show() to open the figure
# The figure is a single curve: x axis = the 1000 days, y axis = the running total.
# A four-column table sharing the same date index, each column accumulated
# separately.
df = pd.DataFrame(np.random.randn(1000, 4),
                  index=ts.index,
                  columns=['A', 'B', 'C', 'D']).cumsum()
df.plot()
plt.show()
# The figure has four curves: x axis = the 1000 days, y axis = each column's running total.
# 相关文章