這篇文章主要介紹了Pandas中如何使用groupby分組,具有一定借鑒價值,感興趣的朋友可以參考下,希望大家閱讀完這篇文章之后大有收獲,下面讓小編帶著大家一起了解一下。
import pandas as pd
import numpy as np
df=pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                           'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
print(df)
grouped=df.groupby('A')
print('-'*30)
print(grouped.count())
print('-'*30)
grouped=df.groupby(['A','B'])
print(grouped.count())
print('-'*30)
# 通過函數分組
def get_letter_type(letter):
    if letter.lower() in 'aeiou':
        return 'a'
    else:
        return 'b'
grouped=df.groupby(get_letter_type,axis=1)
print(grouped.count())A B C D 0 foo one 1.429387 0.643569 1 bar one -0.858448 -0.213034 2 foo two 0.375644 0.214584 3 bar three 0.042284 -0.330481 4 foo two -1.421967 0.768176 5 bar two 1.293483 -0.399003 6 foo one -1.101385 -0.236341 7 foo three -0.852603 -1.718694 ------------------------------ B C D A bar 3 3 3 foo 5 5 5 ------------------------------ C D A B bar one 1 1 three 1 1 two 1 1 foo one 2 2 three 1 1 two 2 2 ------------------------------ a b 0 1 3 1 1 3 2 1 3 3 1 3 4 1 3 5 1 3 6 1 3 7 1 3
se=pd.Series([1,2,3,4,5],[6,9,8,9,8]) print(se) se.groupby(level=0)
6 1 9 2 8 3 9 4 8 5 dtype: int64 <pandas.core.groupby.generic.SeriesGroupBy object at 0x111b00040>
# 分組求和 grouped=se.groupby(level=0).sum() print(grouped)
6 1 8 8 9 6 dtype: int64
df2=pd.DataFrame({'X':['A','B','A','B'],'Y':[1,2,3,4]})
print(df2)X Y 0 A 1 1 B 2 2 A 3 3 B 4
# 按X分組,并查詢A列的數據
grp=df2.groupby('X').get_group('A')
print(grp)X Y 0 A 1 2 A 3
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] index=pd.MultiIndex.from_arrays(arrays,names=['first','second']) print(index)
MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])s=pd.Series(np.random.randn(8),index=index) print(s)
first second bar one 0.120979 two -0.440384 baz one 0.515106 two -0.019882 foo one 1.149595 two -0.369984 qux one -0.930438 two 0.146044 dtype: float64
# 分組求和 grouped=s.groupby(level='first') print(grouped.sum())
first bar -0.319405 baz 0.495224 foo 0.779611 qux -0.784394 dtype: float64
grouped=df.groupby(['A','B']) print(grouped.size())
A B bar one 1 three 1 two 1 foo one 2 three 1 two 2 dtype: int64
print(df)
A B C D 0 foo one 1.429387 0.643569 1 bar one -0.858448 -0.213034 2 foo two 0.375644 0.214584 3 bar three 0.042284 -0.330481 4 foo two -1.421967 0.768176 5 bar two 1.293483 -0.399003 6 foo one -1.101385 -0.236341 7 foo three -0.852603 -1.718694
print(grouped.describe().head())
C \ count mean std min 25% 50% 75% A B bar one 1.0 -0.858448 NaN -0.858448 -0.858448 -0.858448 -0.858448 three 1.0 0.042284 NaN 0.042284 0.042284 0.042284 0.042284 two 1.0 1.293483 NaN 1.293483 1.293483 1.293483 1.293483 foo one 2.0 0.164001 1.789526 -1.101385 -0.468692 0.164001 0.796694 three 1.0 -0.852603 NaN -0.852603 -0.852603 -0.852603 -0.852603 D \ max count mean std min 25% 50% A B bar one -0.858448 1.0 -0.213034 NaN -0.213034 -0.213034 -0.213034 three 0.042284 1.0 -0.330481 NaN -0.330481 -0.330481 -0.330481 two 1.293483 1.0 -0.399003 NaN -0.399003 -0.399003 -0.399003 foo one 1.429387 2.0 0.203614 0.622191 -0.236341 -0.016364 0.203614 three -0.852603 1.0 -1.718694 NaN -1.718694 -1.718694 -1.718694
 75% max
 A B
 bar one -0.213034 -0.213034
 three -0.330481 -0.330481
 two -0.399003 -0.399003
 foo one 0.423592 0.643569
 three -1.718694 -1.718694
grouped=df.groupby('A')
grouped['C'].agg([np.sum,np.mean,np.std])<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}
.dataframe thead th {
    text-align: right;
}</style> <table border="1" class="dataframe"> <thead> <tr > <th></th> <th>sum</th> <th>mean</th> <th>std</th> </tr> <tr> <th>A</th> <th></th> <th></th> <th></th> </tr> </thead> <tbody> <tr> <th>bar</th> <td>0.477319</td> <td>0.159106</td> <td>1.080712</td> </tr> <tr> <th>foo</th> <td>-1.570925</td> <td>-0.314185</td> <td>1.188767</td> </tr> </tbody> </table> </div>
import pandas as pd import numpy as np s=pd.Series(['A','b','c','D',np.nan]) print(s) # 轉小寫 print(s.str.lower()) # 轉大寫 print(s.str.upper()) # 每個字符的長度 print(s.str.len())
0 A 1 b 2 c 3 D 4 NaN dtype: object 0 a 1 b 2 c 3 d 4 NaN dtype: object 0 A 1 B 2 C 3 D 4 NaN dtype: object 0 1.0 1 1.0 2 1.0 3 1.0 4 NaN dtype: float64
index=pd.Index([' Index','ru ',' men']) # 去掉空格 print(index.str.strip()) # 去掉左邊的空格 print(index.str.lstrip()) # 去掉右邊的空格 print(index.str.rstrip())
Index(['Index', 'ru', 'men'], dtype='object') Index(['Index', 'ru ', 'men'], dtype='object') Index([' Index', 'ru', ' men'], dtype='object')
df=pd.DataFrame(np.random.randn(3,2),columns=['A a','B b'],index=range(3)) print(df)
A a B b 0 3.005273 0.486696 1 1.093889 1.054230 2 -2.846352 0.302465
# 列替換
print(df.columns.str.replace(' ','_'))Index(['A_a', 'B_b'], dtype='object')
s=pd.Series(['a_b_C','c_d_e','f_g_h']) print(s)
0 a_b_C 1 c_d_e 2 f_g_h dtype: object
print(s.str.split('_'))0 [a, b, C] 1 [c, d, e] 2 [f, g, h] dtype: object
print(s.str.split('_',expand=True,n=1))0 1 0 a b_C 1 c d_e 2 f g_h
s = pd.Series(['A','rumen','ru','rumen','xiao','zhan'])
print(s.str.contains('ru'))0 False 1 True 2 True 3 True 4 False 5 False dtype: bool
s=pd.Series(['a','a|b','a|c']) print(s)
0 a 1 a|b 2 a|c dtype: object
print(s.str.get_dummies(sep='|'))
a b c 0 1 0 0 1 1 1 0 2 1 0 1
s=pd.Series(np.arange(5),np.arange(5)[::-1],dtype='int64')
s
4 0 3 1 2 2 1 3 0 4 dtype: int64
print(s[s>2])
1 3 0 4 dtype: int64
# isin查詢索引在某個范圍 print(s.isin([1,3,4]))
4 False 3 True 2 False 1 True 0 True dtype: bool
# 根據索引查詢數據 print(s[s.isin([1,3,4])])
3 1 1 3 0 4 dtype: int64
# 構造一個聯合索引的數據 s=pd.Series(np.arange(6),index=pd.MultiIndex.from_product([[1,2],['a','b','c']]))
print(s)
1 a 0 b 1 c 2 2 a 3 b 4 c 5 dtype: int64
print(s.iloc[s.index.isin([(1,'b'),(2,'c')])])
1 b 1 2 c 5 dtype: int64
# 構造一個時間序列
dates=pd.date_range('20200920',periods=8)
print(dates)DatetimeIndex(['2020-09-20', '2020-09-21', '2020-09-22', '2020-09-23', '2020-09-24', '2020-09-25', '2020-09-26', '2020-09-27'], dtype='datetime64[ns]', freq='D')
df=pd.DataFrame(np.random.randn(8,4),index=dates,columns=['A','B','C','D']) print(df)
A B C D 2020-09-20 -1.218522 2.067088 0.015009 0.158780 2020-09-21 -0.546837 -0.601178 -0.894882 0.172037 2020-09-22 0.189848 -0.910520 0.196186 -0.073495 2020-09-23 -0.566892 0.899193 -0.450925 0.633253 2020-09-24 0.038838 1.577004 0.580927 0.609050 2020-09-25 1.562094 0.020813 -0.618859 -0.515212 2020-09-26 -1.333947 0.275765 0.139325 1.124207 2020-09-27 -1.271748 1.082302 1.036805 -1.041206
# 查詢A列數據 print(df['A'])
2020-09-20 -1.218522 2020-09-21 -0.546837 2020-09-22 0.189848 2020-09-23 -0.566892 2020-09-24 0.038838 2020-09-25 1.562094 2020-09-26 -1.333947 2020-09-27 -1.271748 Freq: D, Name: A, dtype: float64
# 查詢小于0的數字,大于0的值默認被置為NaN df.where(df<0)
<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}
.dataframe thead th {
    text-align: right;
}</style> <table border="1" class="dataframe"> <thead> <tr > <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>2020-09-20</th> <td>-1.218522</td> <td>NaN</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2020-09-21</th> <td>-0.546837</td> <td>-0.601178</td> <td>-0.894882</td> <td>NaN</td> </tr> <tr> <th>2020-09-22</th> <td>NaN</td> <td>-0.910520</td> <td>NaN</td> <td>-0.073495</td> </tr> <tr> <th>2020-09-23</th> <td>-0.566892</td> <td>NaN</td> <td>-0.450925</td> <td>NaN</td> </tr> <tr> <th>2020-09-24</th> <td>NaN</td> <td>NaN</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2020-09-25</th> <td>NaN</td> <td>NaN</td> <td>-0.618859</td> <td>-0.515212</td> </tr> <tr> <th>2020-09-26</th> <td>-1.333947</td> <td>NaN</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2020-09-27</th> <td>-1.271748</td> <td>NaN</td> <td>NaN</td> <td>-1.041206</td> </tr> </tbody> </table> </div>
# 查詢小于0的數字,大于0的值變成負數 print(df.where(df<0,-df))
A B C D 2020-09-20 -1.218522 -2.067088 -0.015009 -0.158780 2020-09-21 -0.546837 -0.601178 -0.894882 -0.172037 2020-09-22 -0.189848 -0.910520 -0.196186 -0.073495 2020-09-23 -0.566892 -0.899193 -0.450925 -0.633253 2020-09-24 -0.038838 -1.577004 -0.580927 -0.609050 2020-09-25 -1.562094 -0.020813 -0.618859 -0.515212 2020-09-26 -1.333947 -0.275765 -0.139325 -1.124207 2020-09-27 -1.271748 -1.082302 -1.036805 -1.041206
# 查詢小于0的數據,大于0的置為1000 print(df.where(df<0,1000))
A B C D 2020-09-20 -1.218522 1000.000000 1000.000000 1000.000000 2020-09-21 -0.546837 -0.601178 -0.894882 1000.000000 2020-09-22 1000.000000 -0.910520 1000.000000 -0.073495 2020-09-23 -0.566892 1000.000000 -0.450925 1000.000000 2020-09-24 1000.000000 1000.000000 1000.000000 1000.000000 2020-09-25 1000.000000 1000.000000 -0.618859 -0.515212 2020-09-26 -1.333947 1000.000000 1000.000000 1000.000000 2020-09-27 -1.271748 1000.000000 1000.000000 -1.041206
# 構造一個10行3列的數據
df=pd.DataFrame(np.random.randn(10,3),columns=list('abc'))
print(df)a b c 0 1.761415 0.528009 -0.347271 1 -0.682149 0.353312 0.337229 2 1.080733 -0.272290 1.020335 3 -0.979681 -1.753745 0.836387 4 0.243748 2.085531 -0.993318 5 -1.041006 1.518130 -0.087383 6 -1.400354 -0.095196 3.043639 7 -0.835144 0.926415 -1.217102 8 0.326098 1.079906 0.156884 9 1.836618 -1.288516 -2.492620
# 查詢a>b的數據
print(df.query('a>b'))a b c 0 1.761415 0.528009 -0.347271 2 1.080733 -0.272290 1.020335 3 -0.979681 -1.753745 0.836387 9 1.836618 -1.288516 -2.492620
# 查詢c>b>a的數據
print(df.query('(c<b) & (b<a)'))a b c 0 1.761415 0.528009 -0.347271 9 1.836618 -1.288516 -2.492620
感謝你能夠認真閱讀完這篇文章,希望小編分享的“Pandas中如何使用groupby分組”這篇文章對大家有幫助,同時也希望大家多多支持億速云,關注億速云行業資訊頻道,更多相關知識等著你來學習!
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。