Pandas 的整数索引重采样等价物
•浏览 1
Pandas' equivalent of resample for integer index
我正在寻找 pandas 中与 resample 方法等效的功能,用于索引不是 DatetimeIndex、而是整数数组(甚至可能是浮点数)的数据帧。
我知道在某些情况下(例如这个),resample 方法可以很容易地用 reindex 加插值来代替,但在另一些情况下(我认为)不行。
例如,如果我有
# Build a 10x2 frame of random normal values and index it by ten consecutive
# days starting 2012-01-01.
df = pd.DataFrame(np.random.randn(10, 2))
withdates = df.set_index(pd.date_range('2012-01-01', periods=10))
# Per-column standard deviation over 5-day bins. The aggregation is chained
# onto the resampler because current pandas releases no longer accept it as a
# second argument to resample().
withdates.resample('5D').std()
0 1
2012-01-01 1.184582 0.492113
2012-01-06 0.533134 0.982562
df.resample(5, np.std)
0 1
0 1.184582 0.492113
5 0.533134 0.982562
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so the random frame below is reproducible.
np.random.seed([3,1415])
# 20 rows x 2 columns of random values on the default integer index 0..19.
df = pd.DataFrame(np.random.rand(20, 2), columns=['A', 'B'])
# Group labels: each index value divided by 5 and truncated to an integer, so
# every run of five consecutive rows shares one label.
(df.index.to_series() / 5).astype(int)
# Index values at positions 4, 9, 14, 19 -- the last row of each group of five.
df.index[4::5]
# assign as variable because I'm going to use it more than once.
s = (df.index.to_series() / 5).astype(int)
# Per-group standard deviation, relabelled with the last original index value
# of each group.
df.groupby(s).std().set_index(s.index[4::5])
A B
4 0.198019 0.320451
9 0.329750 0.408232
14 0.293297 0.223991
19 0.095633 0.376390
# assign what we've done above to df_down
df_down = df.groupby(s).std().set_index(s.index[4::5])
# Upsample back to every integer label 0..19: reindex adds NaN rows for the
# labels that are missing from df_down, and bfill fills each of them from the
# next populated row below it.
df_up = df_down.reindex(range(20)).bfill()
A B
0 0.198019 0.320451
1 0.198019 0.320451
2 0.198019 0.320451
3 0.198019 0.320451
4 0.198019 0.320451
5 0.329750 0.408232
6 0.329750 0.408232
7 0.329750 0.408232
8 0.329750 0.408232
9 0.329750 0.408232
10 0.293297 0.223991
11 0.293297 0.223991
12 0.293297 0.223991
13 0.293297 0.223991
14 0.293297 0.223991
15 0.095633 0.376390
16 0.095633 0.376390
17 0.095633 0.376390
18 0.095633 0.376390
19 0.095633 0.376390
def resample(df, rule, how=None, **kwargs):
    """Resample ``df`` by time offset or by fixed-width bins of a numeric index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to aggregate.
    rule : str or number
        An offset alias such as ``'5D'`` when ``df`` has a ``DatetimeIndex``;
        otherwise the positive width of each bin, in index units.
    how : callable or str, optional
        Aggregation applied to each column of each bin. Defaults to the mean.
    **kwargs
        Forwarded to ``DataFrame.resample`` on the datetime path only.

    Returns
    -------
    pandas.DataFrame
        One row per bin. On the numeric path every row is labelled with the
        left edge of its half-open bin ``[edge, edge + rule)`` and bins that
        contain no rows are kept.

    Raises
    ------
    ValueError
        If ``rule`` is not positive on the numeric path.
    """
    import numpy as np
    import pandas as pd

    if how is None:
        how = "mean"

    if isinstance(df.index, pd.DatetimeIndex) and isinstance(rule, str):
        # The aggregation is chained onto the resampler rather than passed as
        # a second positional argument to resample().
        return df.resample(rule, **kwargs).agg(how)

    if rule <= 0:
        raise ValueError("rule must be a positive bin width")
    if len(df.index) == 0:
        return df.copy()

    first, last = df.index.min(), df.index.max()
    # Enough bins that the largest index value falls inside the last one, so
    # a trailing partial bin is not silently dropped.
    n_bins = int(np.floor((last - first) / rule)) + 1
    if first + rule * n_bins <= last:
        # Guard against floating-point rounding in the division above.
        n_bins += 1
    # Arithmetic on an array (instead of range()) also supports float indexes.
    edges = first + rule * np.arange(n_bins + 1)

    labels = pd.cut(df.index, edges, right=False)
    # observed=False keeps empty bins so the result has exactly one row per
    # left edge and lines up with edges[:-1] below.
    aux = df.groupby(labels, observed=False).agg(how)
    return aux.set_index(edges[:-1])


df = pd.DataFrame(index = np.random.rand(20)*30, data=np.random.rand(20, 2), columns=['A', 'B'])
# Name the float index so it appears as a header in the printed output.
df.index.name = 'crazy_index'
# Bucket label per row: the index value divided by 10, truncated to an integer.
s = (df.index.to_series() / 10).astype(int)
# calculate mean() in each group; each group is relabelled with the mean of
# the original index values that fell into it
df.groupby(s).mean().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
A B
crazy_index
3.667539 0.276986 0.317642
14.275074 0.248700 0.372551
25.054042 0.254860 0.297586
# calculate median() in each group; each group is relabelled with the mean of
# the original index values that fell into it
df.groupby(s).median().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
Out[38]:
A B
crazy_index
3.667539 0.454654 0.521649
14.275074 0.451265 0.490125
25.054042 0.489326 0.622781
这给了我
df = pd.DataFrame(np.random.randn(10,2))
withdates = df.set_index(pd.date_range('2012-01-01', periods=10))
withdates.resample('5D', np.std)
0 1
2012-01-01 1.184582 0.492113
2012-01-06 0.533134 0.982562
df.resample(5, np.std)
0 1
0 1.184582 0.492113
5 0.533134 0.982562
import pandas as pd
import numpy as np
np.random.seed([3,1415])
df = pd.DataFrame(np.random.rand(20, 2), columns=['A', 'B'])
(df.index.to_series() / 5).astype(int)
df.index[4::5]
# assign as variable because I'm going to use it more than once.
s = (df.index.to_series() / 5).astype(int)
df.groupby(s).std().set_index(s.index[4::5])
A B
4 0.198019 0.320451
9 0.329750 0.408232
14 0.293297 0.223991
19 0.095633 0.376390
# assign what we've done above to df_down
df_down = df.groupby(s).std().set_index(s.index[4::5])
df_up = df_down.reindex(range(20)).bfill()
A B
0 0.198019 0.320451
1 0.198019 0.320451
2 0.198019 0.320451
3 0.198019 0.320451
4 0.198019 0.320451
5 0.329750 0.408232
6 0.329750 0.408232
7 0.329750 0.408232
8 0.329750 0.408232
9 0.329750 0.408232
10 0.293297 0.223991
11 0.293297 0.223991
12 0.293297 0.223991
13 0.293297 0.223991
14 0.293297 0.223991
15 0.095633 0.376390
16 0.095633 0.376390
17 0.095633 0.376390
18 0.095633 0.376390
19 0.095633 0.376390
def resample(df, rule, how=None, **kwargs):
import pandas as pd
if how==None:
import numpy as np
how = np.mean
if isinstance(df.index, pd.DatetimeIndex) and isinstance(rule, str):
return df.resample(rule, how, **kwargs)
else:
idx, bins = pd.cut(df.index, range(df.index[0], df.index[-1]+2, rule), right=False, retbins=True)
aux = df.groupby(idx).apply(how)
aux = aux.set_index(bins[:-1])
return auxdf = pd.DataFrame(index = np.random.rand(20)*30, data=np.random.rand(20, 2), columns=['A', 'B'])
df.index.name = 'crazy_index'
s = (df.index.to_series() / 10).astype(int)
# calculate mean() in each group
df.groupby(s).mean().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
A B
crazy_index
3.667539 0.276986 0.317642
14.275074 0.248700 0.372551
25.054042 0.254860 0.297586
# calculate median() in each group
df.groupby(s).median().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
Out[38]:
A B
crazy_index
3.667539 0.454654 0.521649
14.275074 0.451265 0.490125
25.054042 0.489326 0.622781
但我无法对 df 使用 resample 得到相同的结果。所以我正在寻找可以像下面这样使用的方法:
df = pd.DataFrame(np.random.randn(10,2))
withdates = df.set_index(pd.date_range('2012-01-01', periods=10))
withdates.resample('5D', np.std)
0 1
2012-01-01 1.184582 0.492113
2012-01-06 0.533134 0.982562
df.resample(5, np.std)
0 1
0 1.184582 0.492113
5 0.533134 0.982562
import pandas as pd
import numpy as np
np.random.seed([3,1415])
df = pd.DataFrame(np.random.rand(20, 2), columns=['A', 'B'])
(df.index.to_series() / 5).astype(int)
df.index[4::5]
# assign as variable because I'm going to use it more than once.
s = (df.index.to_series() / 5).astype(int)
df.groupby(s).std().set_index(s.index[4::5])
A B
4 0.198019 0.320451
9 0.329750 0.408232
14 0.293297 0.223991
19 0.095633 0.376390
# assign what we've done above to df_down
df_down = df.groupby(s).std().set_index(s.index[4::5])
df_up = df_down.reindex(range(20)).bfill()
A B
0 0.198019 0.320451
1 0.198019 0.320451
2 0.198019 0.320451
3 0.198019 0.320451
4 0.198019 0.320451
5 0.329750 0.408232
6 0.329750 0.408232
7 0.329750 0.408232
8 0.329750 0.408232
9 0.329750 0.408232
10 0.293297 0.223991
11 0.293297 0.223991
12 0.293297 0.223991
13 0.293297 0.223991
14 0.293297 0.223991
15 0.095633 0.376390
16 0.095633 0.376390
17 0.095633 0.376390
18 0.095633 0.376390
19 0.095633 0.376390
def resample(df, rule, how=None, **kwargs):
import pandas as pd
if how==None:
import numpy as np
how = np.mean
if isinstance(df.index, pd.DatetimeIndex) and isinstance(rule, str):
return df.resample(rule, how, **kwargs)
else:
idx, bins = pd.cut(df.index, range(df.index[0], df.index[-1]+2, rule), right=False, retbins=True)
aux = df.groupby(idx).apply(how)
aux = aux.set_index(bins[:-1])
return auxdf = pd.DataFrame(index = np.random.rand(20)*30, data=np.random.rand(20, 2), columns=['A', 'B'])
df.index.name = 'crazy_index'
s = (df.index.to_series() / 10).astype(int)
# calculate mean() in each group
df.groupby(s).mean().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
A B
crazy_index
3.667539 0.276986 0.317642
14.275074 0.248700 0.372551
25.054042 0.254860 0.297586
# calculate median() in each group
df.groupby(s).median().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
Out[38]:
A B
crazy_index
3.667539 0.454654 0.521649
14.275074 0.451265 0.490125
25.054042 0.489326 0.622781
那会给我
df = pd.DataFrame(np.random.randn(10,2))
withdates = df.set_index(pd.date_range('2012-01-01', periods=10))
withdates.resample('5D', np.std)
0 1
2012-01-01 1.184582 0.492113
2012-01-06 0.533134 0.982562
df.resample(5, np.std)
0 1
0 1.184582 0.492113
5 0.533134 0.982562
import pandas as pd
import numpy as np
np.random.seed([3,1415])
df = pd.DataFrame(np.random.rand(20, 2), columns=['A', 'B'])
(df.index.to_series() / 5).astype(int)
df.index[4::5]
# assign as variable because I'm going to use it more than once.
s = (df.index.to_series() / 5).astype(int)
df.groupby(s).std().set_index(s.index[4::5])
A B
4 0.198019 0.320451
9 0.329750 0.408232
14 0.293297 0.223991
19 0.095633 0.376390
# assign what we've done above to df_down
df_down = df.groupby(s).std().set_index(s.index[4::5])
df_up = df_down.reindex(range(20)).bfill()
A B
0 0.198019 0.320451
1 0.198019 0.320451
2 0.198019 0.320451
3 0.198019 0.320451
4 0.198019 0.320451
5 0.329750 0.408232
6 0.329750 0.408232
7 0.329750 0.408232
8 0.329750 0.408232
9 0.329750 0.408232
10 0.293297 0.223991
11 0.293297 0.223991
12 0.293297 0.223991
13 0.293297 0.223991
14 0.293297 0.223991
15 0.095633 0.376390
16 0.095633 0.376390
17 0.095633 0.376390
18 0.095633 0.376390
19 0.095633 0.376390
def resample(df, rule, how=None, **kwargs):
import pandas as pd
if how==None:
import numpy as np
how = np.mean
if isinstance(df.index, pd.DatetimeIndex) and isinstance(rule, str):
return df.resample(rule, how, **kwargs)
else:
idx, bins = pd.cut(df.index, range(df.index[0], df.index[-1]+2, rule), right=False, retbins=True)
aux = df.groupby(idx).apply(how)
aux = aux.set_index(bins[:-1])
return auxdf = pd.DataFrame(index = np.random.rand(20)*30, data=np.random.rand(20, 2), columns=['A', 'B'])
df.index.name = 'crazy_index'
s = (df.index.to_series() / 10).astype(int)
# calculate mean() in each group
df.groupby(s).mean().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
A B
crazy_index
3.667539 0.276986 0.317642
14.275074 0.248700 0.372551
25.054042 0.254860 0.297586
# calculate median() in each group
df.groupby(s).median().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
Out[38]:
A B
crazy_index
3.667539 0.454654 0.521649
14.275074 0.451265 0.490125
25.054042 0.489326 0.622781
这样的方法存在吗?我能实现这一功能的唯一办法是手动将 df 拆分为较小的数据帧,分别应用 np.std,然后再把所有结果拼接回去,我觉得这样既慢又不够聪明。
干杯
设置
df = pd.DataFrame(np.random.randn(10,2))
withdates = df.set_index(pd.date_range('2012-01-01', periods=10))
withdates.resample('5D', np.std)
0 1
2012-01-01 1.184582 0.492113
2012-01-06 0.533134 0.982562
df.resample(5, np.std)
0 1
0 1.184582 0.492113
5 0.533134 0.982562
import pandas as pd
import numpy as np
np.random.seed([3,1415])
df = pd.DataFrame(np.random.rand(20, 2), columns=['A', 'B'])
(df.index.to_series() / 5).astype(int)
df.index[4::5]
# assign as variable because I'm going to use it more than once.
s = (df.index.to_series() / 5).astype(int)
df.groupby(s).std().set_index(s.index[4::5])
A B
4 0.198019 0.320451
9 0.329750 0.408232
14 0.293297 0.223991
19 0.095633 0.376390
# assign what we've done above to df_down
df_down = df.groupby(s).std().set_index(s.index[4::5])
df_up = df_down.reindex(range(20)).bfill()
A B
0 0.198019 0.320451
1 0.198019 0.320451
2 0.198019 0.320451
3 0.198019 0.320451
4 0.198019 0.320451
5 0.329750 0.408232
6 0.329750 0.408232
7 0.329750 0.408232
8 0.329750 0.408232
9 0.329750 0.408232
10 0.293297 0.223991
11 0.293297 0.223991
12 0.293297 0.223991
13 0.293297 0.223991
14 0.293297 0.223991
15 0.095633 0.376390
16 0.095633 0.376390
17 0.095633 0.376390
18 0.095633 0.376390
19 0.095633 0.376390
def resample(df, rule, how=None, **kwargs):
import pandas as pd
if how==None:
import numpy as np
how = np.mean
if isinstance(df.index, pd.DatetimeIndex) and isinstance(rule, str):
return df.resample(rule, how, **kwargs)
else:
idx, bins = pd.cut(df.index, range(df.index[0], df.index[-1]+2, rule), right=False, retbins=True)
aux = df.groupby(idx).apply(how)
aux = aux.set_index(bins[:-1])
return auxdf = pd.DataFrame(index = np.random.rand(20)*30, data=np.random.rand(20, 2), columns=['A', 'B'])
df.index.name = 'crazy_index'
s = (df.index.to_series() / 10).astype(int)
# calculate mean() in each group
df.groupby(s).mean().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
A B
crazy_index
3.667539 0.276986 0.317642
14.275074 0.248700 0.372551
25.054042 0.254860 0.297586
# calculate median() in each group
df.groupby(s).median().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
Out[38]:
A B
crazy_index
3.667539 0.454654 0.521649
14.275074 0.451265 0.490125
25.054042 0.489326 0.622781
您需要自己创建标签进行分组。我会使用:
df = pd.DataFrame(np.random.randn(10,2))
withdates = df.set_index(pd.date_range('2012-01-01', periods=10))
withdates.resample('5D', np.std)
0 1
2012-01-01 1.184582 0.492113
2012-01-06 0.533134 0.982562
df.resample(5, np.std)
0 1
0 1.184582 0.492113
5 0.533134 0.982562
import pandas as pd
import numpy as np
np.random.seed([3,1415])
df = pd.DataFrame(np.random.rand(20, 2), columns=['A', 'B'])
(df.index.to_series() / 5).astype(int)
df.index[4::5]
# assign as variable because I'm going to use it more than once.
s = (df.index.to_series() / 5).astype(int)
df.groupby(s).std().set_index(s.index[4::5])
A B
4 0.198019 0.320451
9 0.329750 0.408232
14 0.293297 0.223991
19 0.095633 0.376390
# assign what we've done above to df_down
df_down = df.groupby(s).std().set_index(s.index[4::5])
df_up = df_down.reindex(range(20)).bfill()
A B
0 0.198019 0.320451
1 0.198019 0.320451
2 0.198019 0.320451
3 0.198019 0.320451
4 0.198019 0.320451
5 0.329750 0.408232
6 0.329750 0.408232
7 0.329750 0.408232
8 0.329750 0.408232
9 0.329750 0.408232
10 0.293297 0.223991
11 0.293297 0.223991
12 0.293297 0.223991
13 0.293297 0.223991
14 0.293297 0.223991
15 0.095633 0.376390
16 0.095633 0.376390
17 0.095633 0.376390
18 0.095633 0.376390
19 0.095633 0.376390
def resample(df, rule, how=None, **kwargs):
import pandas as pd
if how==None:
import numpy as np
how = np.mean
if isinstance(df.index, pd.DatetimeIndex) and isinstance(rule, str):
return df.resample(rule, how, **kwargs)
else:
idx, bins = pd.cut(df.index, range(df.index[0], df.index[-1]+2, rule), right=False, retbins=True)
aux = df.groupby(idx).apply(how)
aux = aux.set_index(bins[:-1])
return auxdf = pd.DataFrame(index = np.random.rand(20)*30, data=np.random.rand(20, 2), columns=['A', 'B'])
df.index.name = 'crazy_index'
s = (df.index.to_series() / 10).astype(int)
# calculate mean() in each group
df.groupby(s).mean().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
A B
crazy_index
3.667539 0.276986 0.317642
14.275074 0.248700 0.372551
25.054042 0.254860 0.297586
# calculate median() in each group
df.groupby(s).median().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
Out[38]:
A B
crazy_index
3.667539 0.454654 0.521649
14.275074 0.451265 0.490125
25.054042 0.489326 0.622781
这会得到一个类似 [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ...] 的值序列,然后在 groupby 中使用它。
您还需要为新数据框指定索引。我会使用:
df = pd.DataFrame(np.random.randn(10,2))
withdates = df.set_index(pd.date_range('2012-01-01', periods=10))
withdates.resample('5D', np.std)
0 1
2012-01-01 1.184582 0.492113
2012-01-06 0.533134 0.982562
df.resample(5, np.std)
0 1
0 1.184582 0.492113
5 0.533134 0.982562
import pandas as pd
import numpy as np
np.random.seed([3,1415])
df = pd.DataFrame(np.random.rand(20, 2), columns=['A', 'B'])
(df.index.to_series() / 5).astype(int)
df.index[4::5]
# assign as variable because I'm going to use it more than once.
s = (df.index.to_series() / 5).astype(int)
df.groupby(s).std().set_index(s.index[4::5])
A B
4 0.198019 0.320451
9 0.329750 0.408232
14 0.293297 0.223991
19 0.095633 0.376390
# assign what we've done above to df_down
df_down = df.groupby(s).std().set_index(s.index[4::5])
df_up = df_down.reindex(range(20)).bfill()
A B
0 0.198019 0.320451
1 0.198019 0.320451
2 0.198019 0.320451
3 0.198019 0.320451
4 0.198019 0.320451
5 0.329750 0.408232
6 0.329750 0.408232
7 0.329750 0.408232
8 0.329750 0.408232
9 0.329750 0.408232
10 0.293297 0.223991
11 0.293297 0.223991
12 0.293297 0.223991
13 0.293297 0.223991
14 0.293297 0.223991
15 0.095633 0.376390
16 0.095633 0.376390
17 0.095633 0.376390
18 0.095633 0.376390
19 0.095633 0.376390
def resample(df, rule, how=None, **kwargs):
import pandas as pd
if how==None:
import numpy as np
how = np.mean
if isinstance(df.index, pd.DatetimeIndex) and isinstance(rule, str):
return df.resample(rule, how, **kwargs)
else:
idx, bins = pd.cut(df.index, range(df.index[0], df.index[-1]+2, rule), right=False, retbins=True)
aux = df.groupby(idx).apply(how)
aux = aux.set_index(bins[:-1])
return auxdf = pd.DataFrame(index = np.random.rand(20)*30, data=np.random.rand(20, 2), columns=['A', 'B'])
df.index.name = 'crazy_index'
s = (df.index.to_series() / 10).astype(int)
# calculate mean() in each group
df.groupby(s).mean().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
A B
crazy_index
3.667539 0.276986 0.317642
14.275074 0.248700 0.372551
25.054042 0.254860 0.297586
# calculate median() in each group
df.groupby(s).median().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
Out[38]:
A B
crazy_index
3.667539 0.454654 0.521649
14.275074 0.451265 0.490125
25.054042 0.489326 0.622781
这会从当前索引的第 5 个位置(即位置 4)开始,每隔 5 个位置取一个值,结果类似 [4, 9, 14, 19]。我本可以用 df.index[::5] 取每组的起始位置,但我选择了结束位置。
解决方案
df = pd.DataFrame(np.random.randn(10,2))
withdates = df.set_index(pd.date_range('2012-01-01', periods=10))
withdates.resample('5D', np.std)
0 1
2012-01-01 1.184582 0.492113
2012-01-06 0.533134 0.982562
df.resample(5, np.std)
0 1
0 1.184582 0.492113
5 0.533134 0.982562
import pandas as pd
import numpy as np
np.random.seed([3,1415])
df = pd.DataFrame(np.random.rand(20, 2), columns=['A', 'B'])
(df.index.to_series() / 5).astype(int)
df.index[4::5]
# assign as variable because I'm going to use it more than once.
s = (df.index.to_series() / 5).astype(int)
df.groupby(s).std().set_index(s.index[4::5])
A B
4 0.198019 0.320451
9 0.329750 0.408232
14 0.293297 0.223991
19 0.095633 0.376390
# assign what we've done above to df_down
df_down = df.groupby(s).std().set_index(s.index[4::5])
df_up = df_down.reindex(range(20)).bfill()
A B
0 0.198019 0.320451
1 0.198019 0.320451
2 0.198019 0.320451
3 0.198019 0.320451
4 0.198019 0.320451
5 0.329750 0.408232
6 0.329750 0.408232
7 0.329750 0.408232
8 0.329750 0.408232
9 0.329750 0.408232
10 0.293297 0.223991
11 0.293297 0.223991
12 0.293297 0.223991
13 0.293297 0.223991
14 0.293297 0.223991
15 0.095633 0.376390
16 0.095633 0.376390
17 0.095633 0.376390
18 0.095633 0.376390
19 0.095633 0.376390
def resample(df, rule, how=None, **kwargs):
import pandas as pd
if how==None:
import numpy as np
how = np.mean
if isinstance(df.index, pd.DatetimeIndex) and isinstance(rule, str):
return df.resample(rule, how, **kwargs)
else:
idx, bins = pd.cut(df.index, range(df.index[0], df.index[-1]+2, rule), right=False, retbins=True)
aux = df.groupby(idx).apply(how)
aux = aux.set_index(bins[:-1])
return auxdf = pd.DataFrame(index = np.random.rand(20)*30, data=np.random.rand(20, 2), columns=['A', 'B'])
df.index.name = 'crazy_index'
s = (df.index.to_series() / 10).astype(int)
# calculate mean() in each group
df.groupby(s).mean().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
A B
crazy_index
3.667539 0.276986 0.317642
14.275074 0.248700 0.372551
25.054042 0.254860 0.297586
# calculate median() in each group
df.groupby(s).median().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
Out[38]:
A B
crazy_index
3.667539 0.454654 0.521649
14.275074 0.451265 0.490125
25.054042 0.489326 0.622781
看起来像:
df = pd.DataFrame(np.random.randn(10,2))
withdates = df.set_index(pd.date_range('2012-01-01', periods=10))
withdates.resample('5D', np.std)
0 1
2012-01-01 1.184582 0.492113
2012-01-06 0.533134 0.982562
df.resample(5, np.std)
0 1
0 1.184582 0.492113
5 0.533134 0.982562
import pandas as pd
import numpy as np
np.random.seed([3,1415])
df = pd.DataFrame(np.random.rand(20, 2), columns=['A', 'B'])
(df.index.to_series() / 5).astype(int)
df.index[4::5]
# assign as variable because I'm going to use it more than once.
s = (df.index.to_series() / 5).astype(int)
df.groupby(s).std().set_index(s.index[4::5])
A B
4 0.198019 0.320451
9 0.329750 0.408232
14 0.293297 0.223991
19 0.095633 0.376390
# assign what we've done above to df_down
df_down = df.groupby(s).std().set_index(s.index[4::5])
df_up = df_down.reindex(range(20)).bfill()
A B
0 0.198019 0.320451
1 0.198019 0.320451
2 0.198019 0.320451
3 0.198019 0.320451
4 0.198019 0.320451
5 0.329750 0.408232
6 0.329750 0.408232
7 0.329750 0.408232
8 0.329750 0.408232
9 0.329750 0.408232
10 0.293297 0.223991
11 0.293297 0.223991
12 0.293297 0.223991
13 0.293297 0.223991
14 0.293297 0.223991
15 0.095633 0.376390
16 0.095633 0.376390
17 0.095633 0.376390
18 0.095633 0.376390
19 0.095633 0.376390
def resample(df, rule, how=None, **kwargs):
import pandas as pd
if how==None:
import numpy as np
how = np.mean
if isinstance(df.index, pd.DatetimeIndex) and isinstance(rule, str):
return df.resample(rule, how, **kwargs)
else:
idx, bins = pd.cut(df.index, range(df.index[0], df.index[-1]+2, rule), right=False, retbins=True)
aux = df.groupby(idx).apply(how)
aux = aux.set_index(bins[:-1])
return auxdf = pd.DataFrame(index = np.random.rand(20)*30, data=np.random.rand(20, 2), columns=['A', 'B'])
df.index.name = 'crazy_index'
s = (df.index.to_series() / 10).astype(int)
# calculate mean() in each group
df.groupby(s).mean().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
A B
crazy_index
3.667539 0.276986 0.317642
14.275074 0.248700 0.372551
25.054042 0.254860 0.297586
# calculate median() in each group
df.groupby(s).median().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
Out[38]:
A B
crazy_index
3.667539 0.454654 0.521649
14.275074 0.451265 0.490125
25.054042 0.489326 0.622781
其他注意事项
这相当于下采样。我们尚未处理上采样的问题。
要把我们得到的结果还原到更密集(频率更高)的数据帧索引上,可以像这样使用 reindex:
df = pd.DataFrame(np.random.randn(10,2))
withdates = df.set_index(pd.date_range('2012-01-01', periods=10))
withdates.resample('5D', np.std)
0 1
2012-01-01 1.184582 0.492113
2012-01-06 0.533134 0.982562
df.resample(5, np.std)
0 1
0 1.184582 0.492113
5 0.533134 0.982562
import pandas as pd
import numpy as np
np.random.seed([3,1415])
df = pd.DataFrame(np.random.rand(20, 2), columns=['A', 'B'])
(df.index.to_series() / 5).astype(int)
df.index[4::5]
# assign as variable because I'm going to use it more than once.
s = (df.index.to_series() / 5).astype(int)
df.groupby(s).std().set_index(s.index[4::5])
A B
4 0.198019 0.320451
9 0.329750 0.408232
14 0.293297 0.223991
19 0.095633 0.376390
# assign what we've done above to df_down
df_down = df.groupby(s).std().set_index(s.index[4::5])
df_up = df_down.reindex(range(20)).bfill()
A B
0 0.198019 0.320451
1 0.198019 0.320451
2 0.198019 0.320451
3 0.198019 0.320451
4 0.198019 0.320451
5 0.329750 0.408232
6 0.329750 0.408232
7 0.329750 0.408232
8 0.329750 0.408232
9 0.329750 0.408232
10 0.293297 0.223991
11 0.293297 0.223991
12 0.293297 0.223991
13 0.293297 0.223991
14 0.293297 0.223991
15 0.095633 0.376390
16 0.095633 0.376390
17 0.095633 0.376390
18 0.095633 0.376390
19 0.095633 0.376390
def resample(df, rule, how=None, **kwargs):
import pandas as pd
if how==None:
import numpy as np
how = np.mean
if isinstance(df.index, pd.DatetimeIndex) and isinstance(rule, str):
return df.resample(rule, how, **kwargs)
else:
idx, bins = pd.cut(df.index, range(df.index[0], df.index[-1]+2, rule), right=False, retbins=True)
aux = df.groupby(idx).apply(how)
aux = aux.set_index(bins[:-1])
return auxdf = pd.DataFrame(index = np.random.rand(20)*30, data=np.random.rand(20, 2), columns=['A', 'B'])
df.index.name = 'crazy_index'
s = (df.index.to_series() / 10).astype(int)
# calculate mean() in each group
df.groupby(s).mean().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
A B
crazy_index
3.667539 0.276986 0.317642
14.275074 0.248700 0.372551
25.054042 0.254860 0.297586
# calculate median() in each group
df.groupby(s).median().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
Out[38]:
A B
crazy_index
3.667539 0.454654 0.521649
14.275074 0.451265 0.490125
25.054042 0.489326 0.622781
看起来像:
df = pd.DataFrame(np.random.randn(10,2))
withdates = df.set_index(pd.date_range('2012-01-01', periods=10))
withdates.resample('5D', np.std)
0 1
2012-01-01 1.184582 0.492113
2012-01-06 0.533134 0.982562
df.resample(5, np.std)
0 1
0 1.184582 0.492113
5 0.533134 0.982562
import pandas as pd
import numpy as np
np.random.seed([3,1415])
df = pd.DataFrame(np.random.rand(20, 2), columns=['A', 'B'])
(df.index.to_series() / 5).astype(int)
df.index[4::5]
# assign as variable because I'm going to use it more than once.
s = (df.index.to_series() / 5).astype(int)
df.groupby(s).std().set_index(s.index[4::5])
A B
4 0.198019 0.320451
9 0.329750 0.408232
14 0.293297 0.223991
19 0.095633 0.376390
# assign what we've done above to df_down
df_down = df.groupby(s).std().set_index(s.index[4::5])
df_up = df_down.reindex(range(20)).bfill()
A B
0 0.198019 0.320451
1 0.198019 0.320451
2 0.198019 0.320451
3 0.198019 0.320451
4 0.198019 0.320451
5 0.329750 0.408232
6 0.329750 0.408232
7 0.329750 0.408232
8 0.329750 0.408232
9 0.329750 0.408232
10 0.293297 0.223991
11 0.293297 0.223991
12 0.293297 0.223991
13 0.293297 0.223991
14 0.293297 0.223991
15 0.095633 0.376390
16 0.095633 0.376390
17 0.095633 0.376390
18 0.095633 0.376390
19 0.095633 0.376390
def resample(df, rule, how=None, **kwargs):
import pandas as pd
if how==None:
import numpy as np
how = np.mean
if isinstance(df.index, pd.DatetimeIndex) and isinstance(rule, str):
return df.resample(rule, how, **kwargs)
else:
idx, bins = pd.cut(df.index, range(df.index[0], df.index[-1]+2, rule), right=False, retbins=True)
aux = df.groupby(idx).apply(how)
aux = aux.set_index(bins[:-1])
return auxdf = pd.DataFrame(index = np.random.rand(20)*30, data=np.random.rand(20, 2), columns=['A', 'B'])
df.index.name = 'crazy_index'
s = (df.index.to_series() / 10).astype(int)
# calculate mean() in each group
df.groupby(s).mean().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
A B
crazy_index
3.667539 0.276986 0.317642
14.275074 0.248700 0.372551
25.054042 0.254860 0.297586
# calculate median() in each group
df.groupby(s).median().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
Out[38]:
A B
crazy_index
3.667539 0.454654 0.521649
14.275074 0.451265 0.490125
25.054042 0.489326 0.622781
我们也可以用其他序列来 reindex,比如 range(0, 20, 2),将结果上采样到偶数索引。
另外,也可以这样做:
df = pd.DataFrame(np.random.randn(10,2))
withdates = df.set_index(pd.date_range('2012-01-01', periods=10))
withdates.resample('5D', np.std)
0 1
2012-01-01 1.184582 0.492113
2012-01-06 0.533134 0.982562
df.resample(5, np.std)
0 1
0 1.184582 0.492113
5 0.533134 0.982562
import pandas as pd
import numpy as np
np.random.seed([3,1415])
df = pd.DataFrame(np.random.rand(20, 2), columns=['A', 'B'])
(df.index.to_series() / 5).astype(int)
df.index[4::5]
# assign as variable because I'm going to use it more than once.
s = (df.index.to_series() / 5).astype(int)
df.groupby(s).std().set_index(s.index[4::5])
A B
4 0.198019 0.320451
9 0.329750 0.408232
14 0.293297 0.223991
19 0.095633 0.376390
# assign what we've done above to df_down
df_down = df.groupby(s).std().set_index(s.index[4::5])
df_up = df_down.reindex(range(20)).bfill()
A B
0 0.198019 0.320451
1 0.198019 0.320451
2 0.198019 0.320451
3 0.198019 0.320451
4 0.198019 0.320451
5 0.329750 0.408232
6 0.329750 0.408232
7 0.329750 0.408232
8 0.329750 0.408232
9 0.329750 0.408232
10 0.293297 0.223991
11 0.293297 0.223991
12 0.293297 0.223991
13 0.293297 0.223991
14 0.293297 0.223991
15 0.095633 0.376390
16 0.095633 0.376390
17 0.095633 0.376390
18 0.095633 0.376390
19 0.095633 0.376390
def resample(df, rule, how=None, **kwargs):
import pandas as pd
if how==None:
import numpy as np
how = np.mean
if isinstance(df.index, pd.DatetimeIndex) and isinstance(rule, str):
return df.resample(rule, how, **kwargs)
else:
idx, bins = pd.cut(df.index, range(df.index[0], df.index[-1]+2, rule), right=False, retbins=True)
aux = df.groupby(idx).apply(how)
aux = aux.set_index(bins[:-1])
return auxdf = pd.DataFrame(index = np.random.rand(20)*30, data=np.random.rand(20, 2), columns=['A', 'B'])
df.index.name = 'crazy_index'
s = (df.index.to_series() / 10).astype(int)
# calculate mean() in each group
df.groupby(s).mean().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
A B
crazy_index
3.667539 0.276986 0.317642
14.275074 0.248700 0.372551
25.054042 0.254860 0.297586
# calculate median() in each group
df.groupby(s).median().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
Out[38]:
A B
crazy_index
3.667539 0.454654 0.521649
14.275074 0.451265 0.490125
25.054042 0.489326 0.622781
@piSquared 解决方案非常好,但我不喜欢在重新索引时手动选择索引。
下面的做法适用于各种下采样(包括浮点索引),并自动取每个区间内索引值的平均值作为新索引:
df = pd.DataFrame(np.random.randn(10,2))
withdates = df.set_index(pd.date_range('2012-01-01', periods=10))
withdates.resample('5D', np.std)
0 1
2012-01-01 1.184582 0.492113
2012-01-06 0.533134 0.982562
df.resample(5, np.std)
0 1
0 1.184582 0.492113
5 0.533134 0.982562
import pandas as pd
import numpy as np
np.random.seed([3,1415])
df = pd.DataFrame(np.random.rand(20, 2), columns=['A', 'B'])
(df.index.to_series() / 5).astype(int)
df.index[4::5]
# assign as variable because I'm going to use it more than once.
s = (df.index.to_series() / 5).astype(int)
df.groupby(s).std().set_index(s.index[4::5])
A B
4 0.198019 0.320451
9 0.329750 0.408232
14 0.293297 0.223991
19 0.095633 0.376390
# assign what we've done above to df_down
df_down = df.groupby(s).std().set_index(s.index[4::5])
df_up = df_down.reindex(range(20)).bfill()
A B
0 0.198019 0.320451
1 0.198019 0.320451
2 0.198019 0.320451
3 0.198019 0.320451
4 0.198019 0.320451
5 0.329750 0.408232
6 0.329750 0.408232
7 0.329750 0.408232
8 0.329750 0.408232
9 0.329750 0.408232
10 0.293297 0.223991
11 0.293297 0.223991
12 0.293297 0.223991
13 0.293297 0.223991
14 0.293297 0.223991
15 0.095633 0.376390
16 0.095633 0.376390
17 0.095633 0.376390
18 0.095633 0.376390
19 0.095633 0.376390
def resample(df, rule, how=None, **kwargs):
import pandas as pd
if how==None:
import numpy as np
how = np.mean
if isinstance(df.index, pd.DatetimeIndex) and isinstance(rule, str):
return df.resample(rule, how, **kwargs)
else:
idx, bins = pd.cut(df.index, range(df.index[0], df.index[-1]+2, rule), right=False, retbins=True)
aux = df.groupby(idx).apply(how)
aux = aux.set_index(bins[:-1])
return auxdf = pd.DataFrame(index = np.random.rand(20)*30, data=np.random.rand(20, 2), columns=['A', 'B'])
df.index.name = 'crazy_index'
s = (df.index.to_series() / 10).astype(int)
# calculate mean() in each group
df.groupby(s).mean().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
A B
crazy_index
3.667539 0.276986 0.317642
14.275074 0.248700 0.372551
25.054042 0.254860 0.297586
# calculate median() in each group
df.groupby(s).median().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
Out[38]:
A B
crazy_index
3.667539 0.454654 0.521649
14.275074 0.451265 0.490125
25.054042 0.489326 0.622781
现在您可以随意选择要在每个子组中计算的函数:
df = pd.DataFrame(np.random.randn(10,2))
withdates = df.set_index(pd.date_range('2012-01-01', periods=10))
withdates.resample('5D', np.std)
0 1
2012-01-01 1.184582 0.492113
2012-01-06 0.533134 0.982562
df.resample(5, np.std)
0 1
0 1.184582 0.492113
5 0.533134 0.982562
import pandas as pd
import numpy as np
np.random.seed([3,1415])
df = pd.DataFrame(np.random.rand(20, 2), columns=['A', 'B'])
(df.index.to_series() / 5).astype(int)
df.index[4::5]
# assign as variable because I'm going to use it more than once.
s = (df.index.to_series() / 5).astype(int)
df.groupby(s).std().set_index(s.index[4::5])
A B
4 0.198019 0.320451
9 0.329750 0.408232
14 0.293297 0.223991
19 0.095633 0.376390
# assign what we've done above to df_down
df_down = df.groupby(s).std().set_index(s.index[4::5])
df_up = df_down.reindex(range(20)).bfill()
A B
0 0.198019 0.320451
1 0.198019 0.320451
2 0.198019 0.320451
3 0.198019 0.320451
4 0.198019 0.320451
5 0.329750 0.408232
6 0.329750 0.408232
7 0.329750 0.408232
8 0.329750 0.408232
9 0.329750 0.408232
10 0.293297 0.223991
11 0.293297 0.223991
12 0.293297 0.223991
13 0.293297 0.223991
14 0.293297 0.223991
15 0.095633 0.376390
16 0.095633 0.376390
17 0.095633 0.376390
18 0.095633 0.376390
19 0.095633 0.376390
def resample(df, rule, how=None, **kwargs):
import pandas as pd
if how==None:
import numpy as np
how = np.mean
if isinstance(df.index, pd.DatetimeIndex) and isinstance(rule, str):
return df.resample(rule, how, **kwargs)
else:
idx, bins = pd.cut(df.index, range(df.index[0], df.index[-1]+2, rule), right=False, retbins=True)
aux = df.groupby(idx).apply(how)
aux = aux.set_index(bins[:-1])
return auxdf = pd.DataFrame(index = np.random.rand(20)*30, data=np.random.rand(20, 2), columns=['A', 'B'])
df.index.name = 'crazy_index'
s = (df.index.to_series() / 10).astype(int)
# calculate mean() in each group
df.groupby(s).mean().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
A B
crazy_index
3.667539 0.276986 0.317642
14.275074 0.248700 0.372551
25.054042 0.254860 0.297586
# calculate median() in each group
df.groupby(s).median().set_index( s.groupby(s).apply(lambda x: np.mean(x.index)) )
Out[38]:
A B
crazy_index
3.667539 0.454654 0.521649
14.275074 0.451265 0.490125
25.054042 0.489326 0.622781
编辑:之前 s 的索引有一些错误,现已更正。