>>> import pandas as pd
>>> import numpy as np
>>> df = pd.read_csv('tips.csv')
>>> df.head()
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
>>> mask = [False] * 244
>>> mask[1] = True
>>> mask[3] = True
>>> df[mask]
   total_bill   tip   sex smoker  day    time  size
1       10.34  1.66  Male     No  Sun  Dinner     3
3       23.68  3.31  Male     No  Sun  Dinner     2
>>> # 选取性别为男性的行
>>> df[df['sex'] == 'Male'].head()
   total_bill   tip   sex smoker  day    time  size
1       10.34  1.66  Male     No  Sun  Dinner     3
2       21.01  3.50  Male     No  Sun  Dinner     3
3       23.68  3.31  Male     No  Sun  Dinner     2
5       25.29  4.71  Male     No  Sun  Dinner     4
6        8.77  2.00  Male     No  Sun  Dinner     2
>>> # 选取小费超过 2 ,或者性别为女性的行
>>> df[(df['tip']>2) | (df['sex']=='Female')].head()
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
5       25.29  4.71    Male     No  Sun  Dinner     4
>>> # 选取性别为男性的行
>>> df.loc[df['sex'] == 'Male'].head()
   total_bill   tip   sex smoker  day    time  size
1       10.34  1.66  Male     No  Sun  Dinner     3
2       21.01  3.50  Male     No  Sun  Dinner     3
3       23.68  3.31  Male     No  Sun  Dinner     2
5       25.29  4.71  Male     No  Sun  Dinner     4
6        8.77  2.00  Male     No  Sun  Dinner     2
>>> # 选取小费超过 2 ,或者性别为女性的行
>>> df.loc[(df['tip']>2) | (df['sex']=='Female')].head()
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
5       25.29  4.71    Male     No  Sun  Dinner     4
>>> # 选择不是周末,且小费大于 5 的行
>>> df.loc[~df['day'].isin(['Sun', 'Sat']) & (df['tip']>5)]
     total_bill   tip     sex smoker   day   time  size
85        34.83  5.17  Female     No  Thur  Lunch     4
88        24.71  5.85    Male     No  Thur  Lunch     2
141       34.30  6.70    Male     No  Thur  Lunch     6
茴香豆三: 位置索引
位置索引 iloc 也接受布尔数组(列表或 NumPy 数组)作为输入,因此同样可以用来筛选行。注意 iloc 不接受布尔 Series,所以下面先用 list() 把比较结果转换成列表。
>>> mask = list(df['sex'] == 'Male')
>>> df.iloc[mask].head()
   total_bill   tip   sex smoker  day    time  size
1       10.34  1.66  Male     No  Sun  Dinner     3
2       21.01  3.50  Male     No  Sun  Dinner     3
3       23.68  3.31  Male     No  Sun  Dinner     2
5       25.29  4.71  Male     No  Sun  Dinner     4
6        8.77  2.00  Male     No  Sun  Dinner     2
茴香豆四: 调用函数
以上三种索引(df[]、df.loc[]、df.iloc[])都可以使用函数,函数你懂的:函数以 DataFrame 本身为参数,它的返回值会被当作索引来使用。
>>> df[lambda df: df['tip']>5].head()
    total_bill   tip     sex smoker  day    time  size
23       39.42  7.58    Male     No  Sat  Dinner     4
44       30.40  5.60    Male     No  Sun  Dinner     4
47       32.40  6.00    Male     No  Sun  Dinner     4
52       34.81  5.20  Female     No  Sun  Dinner     4
59       48.27  6.73    Male     No  Sat  Dinner     4
茴香豆五: query
这个嘛,熟悉 SQL 的一定喜欢。
>>> # 选取小费超过 2 ,或者性别为女性的行
>>> df.query('tip>2 | sex=="Female"').head()
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
5       25.29  4.71    Male     No  Sun  Dinner     4

>>> # 选择不是周末,且小费大于 5 的行
>>> # @可以引用当前环境中的变量
>>> weekday = ['Sun', 'Sat']
>>> df.query('day not in @weekday & tip>5')
     total_bill   tip     sex smoker   day   time  size
85        34.83  5.17  Female     No  Thur  Lunch     4
88        24.71  5.85    Male     No  Thur  Lunch     2
141       34.30  6.70    Male     No  Thur  Lunch     6
茴香豆六: where
where 可以把不符合条件的行全部变为 NaN,然后来一个 dropna 把这些行删掉吧。需要注意两点:整数列(例如 size)会因为引入 NaN 而变成浮点数;如果原数据本身含有缺失值,这些行也会被 dropna 一并删除。
>>> df.where(df.tip>5).dropna().head()
    total_bill   tip     sex smoker  day    time  size
23       39.42  7.58    Male     No  Sat  Dinner   4.0
44       30.40  5.60    Male     No  Sun  Dinner   4.0
47       32.40  6.00    Male     No  Sun  Dinner   4.0
52       34.81  5.20  Female     No  Sun  Dinner   4.0
59       48.27  6.73    Male     No  Sat  Dinner   4.0