(AI Generated)AI-Generated-Pandas速查指南

2026-02-09

字数统计: 6.8k字 | 阅读时长≈ 36分

pandas速查：全面覆盖核心功能与高级技巧

1. 环境配置与基础

1.1 安装与依赖

# Basic installation
pip install pandas numpy matplotlib scipy

# Full data-science environment
pip install pandas numpy matplotlib seaborn scikit-learn jupyter

# 导入标准约定
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Configure how pandas renders frames
pd.set_option('display.max_columns', None)  # show every column
pd.set_option('display.max_rows', 100)       # show at most 100 rows
pd.set_option('display.float_format', '{:.2f}'.format)  # render floats with two decimals

1.2 核心数据结构详解

1.2.1 Series：带标签的一维数组

# Creating a Series
s1 = pd.Series([1, 3, 5, np.nan, 6, 8])  # from a list
s2 = pd.Series({'a': 1, 'b': 2, 'c': 3})  # from a dict (keys become the index)
s3 = pd.Series(5, index=['a', 'b', 'c'])  # from a scalar repeated for each index label

# Series attributes
s1.values      # underlying numpy array
s1.index       # index object
s1.dtype       # data type
s1.shape       # shape
s1.size        # number of elements
s1.name        # name of the Series
s1.index.name  # name of the index

1.2.2 DataFrame：二维表格型数据结构

# Several ways to build a DataFrame
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})  # from a dict of columns
df2 = pd.DataFrame(np.random.randn(3, 2), columns=['A', 'B'])  # from a numpy array
df3 = pd.DataFrame([{'A': 1, 'B': 2}, {'A': 3, 'B': 4}])  # from a list of row dicts

# Alternative constructors
pd.DataFrame.from_dict({'A': [1, 2], 'B': [3, 4]}, orient='index')  # dict keys become row labels
pd.DataFrame.from_records([(1, 'a'), (2, 'b')], columns=['id', 'name'])  # one tuple per row

1.2.3 Index：不可变的轴标签

# Index types
pd.Index([1, 2, 3])           # basic index
pd.RangeIndex(start=0, stop=5, step=1)  # range index
pd.MultiIndex.from_tuples([('a', 1), ('b', 2)])  # multi-level index
pd.DatetimeIndex(['2023-01-01', '2023-01-02'])  # datetime index

2. 数据创建与读取的高级技巧

2.1 动态数据结构创建

# Build one table from several data sources.
# NOTE(review): the 'key' entry of each config is declared but not used below.
data_sources = {
    'sales': {'file': 'sales.csv', 'key': 'sale_id'},
    'customers': {'file': 'customers.csv', 'key': 'customer_id'},
    'products': {'file': 'products.csv', 'key': 'product_id'}
}

# Read every source and remember where each row came from
dfs = {}
for name, config in data_sources.items():
    df = pd.read_csv(config['file'])
    df['source'] = name  # tag each row with its source name
    dfs[name] = df

# Stack all sources into one master table; the source name becomes the outer
# index level ('source') and the per-file row label the inner level ('row_id')
master_df = pd.concat(dfs.values(), keys=dfs.keys(), names=['source', 'row_id'])

2.2 读取时直接处理管道

# 使用pipe的读取处理链
def create_processed_dataframe(filepath):
    """Read a CSV file and run it through the full cleaning pipeline.

    Steps, in order: substitute a placeholder frame when the file has no
    rows, standardize column names, fill missing values with the median,
    encode categorical variables, normalize numeric columns, stamp the
    processing time, and index the result by ``id`` (duplicates raise).
    """
    frame = pd.read_csv(filepath)
    if frame.empty:
        # Same fallback as the original lambda: a frame with one 'default' column.
        frame = pd.DataFrame(columns=['default'])

    frame = standardize_column_names(frame)
    frame = handle_missing_values(frame, strategy='median')
    frame = encode_categorical_variables(frame)
    frame = normalize_numeric_columns(frame)

    stamped = frame.assign(processing_timestamp=pd.Timestamp.now())
    return stamped.set_index('id', verify_integrity=True)

# 支持函数
def standardize_column_names(df):
    """Return ``df`` with lower-cased column names and spaces turned into underscores.

    The input frame is left untouched; a relabelled frame is returned, which
    keeps the function safe to use inside ``DataFrame.pipe`` chains.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        A DataFrame with the same data and the normalized column labels.
    """
    # regex=False: the space is a literal, not a pattern.
    new_names = df.columns.str.lower().str.replace(' ', '_', regex=False)
    # set_axis returns a new object instead of renaming the caller's frame.
    return df.set_axis(new_names, axis=1)

def handle_missing_values(df, strategy='mean'):
    """Fill missing values in the numeric columns with a per-column statistic.

    Args:
        df: Input DataFrame; it is not modified.
        strategy: ``'mean'`` or ``'median'``.

    Returns:
        A copy of ``df`` whose numeric columns have NaNs replaced. Non-numeric
        columns are returned unchanged.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names
            (previously such a value was silently ignored).
    """
    if strategy not in ('mean', 'median'):
        raise ValueError(
            f"Unsupported strategy {strategy!r}; expected 'mean' or 'median'"
        )

    filled = df.copy()
    numeric_cols = filled.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        return filled

    numeric_part = filled[numeric_cols]
    if strategy == 'mean':
        fill_values = numeric_part.mean()
    else:
        fill_values = numeric_part.median()

    filled[numeric_cols] = numeric_part.fillna(fill_values)
    return filled

2.3 读取不同类型文件的高级参数

# Advanced CSV reading.
# Fixes relative to the earlier version of this snippet:
#   * `skipfooter` was removed: pandas rejects it together with `nrows`
#     (and it needs engine='python'). Use one or the other.
#   * `date_parser` is deprecated since pandas 2.0; `date_format` replaces it.
#   * `verbose` is deprecated since pandas 2.2 and was dropped.
#   * every column named in parse_dates / converters must also be in usecols.
#   * the skiprows callable no longer discards line 0, which is the header.
df = pd.read_csv(
    'data.csv',
    encoding='utf-8-sig',  # strips a leading BOM if present
    sep=',',               # field delimiter
    header=0,              # first retained line holds the column names
    index_col=0,           # position refers to the columns kept by usecols
    usecols=['col1', 'col2', 'date_col', 'col'],  # columns to load
    dtype={'col1': 'int32', 'col2': 'category'},  # per-column dtypes
    parse_dates=['date_col'],  # parse this column as datetime
    date_format='%Y-%m-%d',    # explicit format (requires pandas >= 2.0)
    na_values=['NA', 'NULL', 'NaN', ''],  # extra strings treated as missing
    true_values=['Yes', 'True'],  # strings parsed as boolean True
    false_values=['No', 'False'],
    skiprows=lambda x: x > 0 and x % 2 == 0,  # skip even-numbered data lines, keep the header
    nrows=1000,  # read at most this many data rows
    converters={'col': lambda x: x.strip()},  # per-column value converter
    memory_map=True,  # map the file into memory instead of buffered reads
)

# Working with every sheet of an Excel workbook.
# The context manager closes the underlying file handle once all sheets are parsed.
with pd.ExcelFile('data.xlsx') as excel_file:
    sheet_names = excel_file.sheet_names
    dfs = {sheet: excel_file.parse(sheet) for sheet in sheet_names}

# Alternatively, use the advanced parameters of read_excel.
# sheet_name=None returns a dict {sheet name: DataFrame}, not a single frame,
# so the result is bound to `sheets` rather than `df`.
sheets = pd.read_excel(
    'data.xlsx',
    sheet_name=None,  # load every sheet
    header=[0, 1],    # two header rows -> MultiIndex columns
    skiprows=range(1, 5),  # skip file rows 1-4 (row 0 is kept)
    usecols='A:C,E:G',  # Excel-style column ranges to load
    converters={'Price': lambda x: x * 1.1}  # per-column value converter
)

3. 数据查看与探索的全面方法

3.1 数据概览与统计

# Basic overview
df.info(verbose=True, memory_usage='deep')  # per-column summary with deep memory accounting
df.describe(include='all', percentiles=[.1, .5, .9])  # statistics for every column, custom percentiles
df.describe(exclude=[np.number])  # statistics for the non-numeric columns only

# Memory usage
df.memory_usage(deep=True).sum()  # total bytes used
df.memory_usage(deep=True).sort_values(ascending=False)  # bytes per column, largest first

# Data types
df.dtypes.value_counts()  # number of columns per dtype
df.select_dtypes(include=['object']).nunique()  # distinct values in each object column

3.2 数据质量检查

# 完整性检查
def check_data_quality(df):
    """Build a per-column data-quality report.

    For every column the report lists dtype, null / non-null counts, null
    percentage, number of distinct values, the most frequent value and how
    often it occurs, plus a heuristic ``quality_score`` (0-7):

    * +3 when fewer than 5% of the values are missing
    * +2 when the column has more than one distinct value
    * +2 when fewer than 90% of the rows are distinct

    Columns that contain only missing values report ``None`` / ``0`` for the
    most frequent value and its count instead of raising.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame with one row per input column, sorted by ``quality_score``
        in descending order.
    """
    null_counts = df.isnull().sum()

    most_frequent = []
    most_freq_count = []
    for col in df.columns:
        modes = df[col].mode()
        counts = df[col].value_counts()
        # Both are empty when the column has no non-null values.
        most_frequent.append(modes.iloc[0] if not modes.empty else None)
        most_freq_count.append(counts.iloc[0] if not counts.empty else 0)

    quality_report = pd.DataFrame({
        'column': df.columns,
        'dtype': df.dtypes.values,
        'non_null_count': df.count().values,
        'null_count': null_counts.values,
        'null_percentage': (null_counts / len(df) * 100).values,
        'unique_values': df.nunique().values,
        'most_frequent': most_frequent,
        'most_freq_count': most_freq_count,
    })

    quality_report['quality_score'] = (
        (quality_report['null_percentage'] < 5).astype(int) * 3 +  # few missing values
        (quality_report['unique_values'] > 1).astype(int) * 2 +   # not constant
        (quality_report['unique_values'] < len(df) * 0.9).astype(int) * 2  # not near-unique
    )

    return quality_report.sort_values('quality_score', ascending=False)

# Run the quality check on the current frame
quality_df = check_data_quality(df)

4. 数据选择与切片的深度解析

4.1 选择方法的性能比较

# 各种选择方法的速度比较
import timeit

# 1. Plain bracket indexing
def method1(df):
    """Select the 'column' column with ``df[...]``."""
    selected = df['column']
    return selected

# 2. Label-based selection with loc
def method2(df):
    """Select the 'column' column for all rows through ``df.loc``."""
    all_rows = slice(None)
    return df.loc[all_rows, 'column']

# 3. Position-based selection with iloc
def method3(df):
    """Select the first column for all rows through ``df.iloc``."""
    first_position = 0
    return df.iloc[:, first_position]

# 4. Attribute access (only works when the label is a valid Python identifier)
def method4(df):
    """Select the 'column' column through attribute lookup on the frame."""
    return getattr(df, 'column')

# Benchmark the four selection methods.
# The first column is named 'column' because method1, method2 and method4 look
# it up by that label; method3 reads position 0, so all four return the same data.
df = pd.DataFrame(
    np.random.randn(1000000, 10),
    columns=['column'] + [f'col_{i}' for i in range(1, 10)],
)
for i, method in enumerate([method1, method2, method3, method4], 1):
    time_taken = timeit.timeit(lambda: method(df), number=100)
    print(f"Method {i}: {time_taken:.4f} seconds")

4.2 高级索引技巧

# Filtering on several conditions
df.query('A > 0 and B < 0')  # expression string evaluated against the columns
df.loc[(df['A'] > 0) & (df['B'] < 0)]  # equivalent boolean mask

# Range filters with between
df[df['value'].between(10, 20)]  # both bounds included
df[df['value'].between(10, 20, inclusive='neither')]  # both bounds excluded

# Combining several isin conditions
valid_categories = ['A', 'B', 'C']
valid_statuses = ['active', 'pending']
df[df['category'].isin(valid_categories) & df['status'].isin(valid_statuses)]

# Selecting columns by label pattern
df.filter(regex='^user_')  # labels starting with user_
df.filter(like='_count')   # labels containing _count
df.filter(items=['col1', 'col2'])  # exact labels

# Selecting columns by dtype
df.select_dtypes(include=['int64', 'float64'])  # int64 and float64 columns
df.select_dtypes(exclude=['object', 'category'])  # everything except object and category columns

4.3 多层索引的选择

# Build a two-level index (letter x number)
index = pd.MultiIndex.from_product([['A', 'B'], [1, 2]], names=['letter', 'number'])
df = pd.DataFrame({'value': np.random.randn(4)}, index=index)

# Selecting from a MultiIndex
df.xs('A', level='letter')  # all rows whose letter is A
df.xs(1, level='number')    # all rows whose number is 1
df.loc[('A', 1)]            # one specific (letter, number) combination
df.loc[('A', slice(None))]  # every number under letter A
df.loc[(slice(None), 1), :]  # number 1 under every letter

# The same selections written with IndexSlice
idx = pd.IndexSlice
df.loc[idx['A', :], :]  # everything under letter A
df.loc[idx[:, 1], :]    # number 1 under every letter

5. 数据清洗与预处理的专业技巧

5.1 高级缺失值处理

# 缺失值模式分析
import seaborn as sns
import matplotlib.pyplot as plt

def analyze_missing_patterns(df):
    """Visualize where values are missing and how missingness co-occurs.

    Draws a heatmap of the null mask, computes the correlation between the
    per-column null indicators and, when at least two columns have both
    missing and present values, draws a single-linkage dendrogram of those
    columns.

    Args:
        df: DataFrame to inspect.

    Returns:
        The correlation matrix of the null indicators. Entries involving a
        column that is never (or always) missing are NaN because such an
        indicator has zero variance.
    """
    # Boolean mask: True where a value is missing
    missing_matrix = df.isnull()

    plt.figure(figsize=(10, 6))
    sns.heatmap(missing_matrix, cbar=False, cmap='viridis')
    plt.title('Missing Values Pattern')
    plt.show()

    missing_corr = missing_matrix.corr()

    # Constant indicators produce NaN correlations, which linkage() rejects,
    # so only columns whose missingness actually varies are clustered.
    varying = [col for col in missing_matrix.columns
               if missing_matrix[col].nunique() > 1]

    if len(varying) >= 2:
        from scipy.cluster import hierarchy
        linked = hierarchy.linkage(missing_corr.loc[varying, varying], 'single')

        plt.figure(figsize=(10, 6))
        # A plain list avoids ambiguous truth-value checks on an Index.
        hierarchy.dendrogram(linked, labels=list(varying))
        plt.title('Missing Value Pattern Clustering')
        plt.show()

    return missing_corr

# 高级填充策略
def advanced_fillna(df):
    """Fill missing numeric values using the structure available in ``df``.

    One strategy is chosen for the whole frame:

    * a ``'group'`` column exists: fill with the mean of the row's group;
    * otherwise a ``'date'`` column exists: time-weighted interpolation
      using ``date`` as the time axis;
    * otherwise: KNN imputation over all numeric columns (scikit-learn).

    Only numeric columns are filled; non-numeric columns are returned as they
    are because none of the strategies is defined for them.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A copy of ``df`` with missing numeric values filled where possible.
    """
    df_filled = df.copy()

    numeric_cols = [col for col in df.select_dtypes(include=[np.number]).columns
                    if col not in ('group', 'date')]
    cols_to_fill = [col for col in numeric_cols if df[col].isnull().any()]
    if not cols_to_fill:
        return df_filled

    if 'group' in df.columns:
        for col in cols_to_fill:
            df_filled[col] = df.groupby('group')[col].transform(
                lambda x: x.fillna(x.mean())
            )
    elif 'date' in df.columns:
        # method='time' needs a DatetimeIndex, so interpolate on a temporary
        # series indexed by the date column and copy the values back.
        # NOTE(review): assumes rows are ordered by date — confirm with callers.
        time_index = pd.DatetimeIndex(pd.to_datetime(df['date']))
        for col in cols_to_fill:
            by_time = pd.Series(
                df[col].to_numpy(dtype=float, na_value=np.nan), index=time_index
            )
            df_filled[col] = by_time.interpolate(method='time').to_numpy()
    else:
        from sklearn.impute import KNNImputer
        # Neighbours are found across all numeric features; a column with no
        # observed value at all cannot be imputed and is left out.
        usable = [col for col in numeric_cols if df[col].notna().any()]
        if usable:
            imputed = pd.DataFrame(
                KNNImputer(n_neighbors=5).fit_transform(df[usable]),
                columns=usable,
                index=df.index,
            )
            for col in cols_to_fill:
                if col in usable:
                    df_filled[col] = imputed[col]

    return df_filled

5.2 异常值检测与处理

def _outlier_mask(series, method, threshold):
    """Return a boolean Series aligned with ``series`` marking its outliers."""
    if method == 'iqr':
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        return (series < q1 - threshold * iqr) | (series > q3 + threshold * iqr)

    if method == 'zscore':
        # Population standard deviation (ddof=0), NaNs skipped.
        std = series.std(ddof=0)
        if not std > 0:
            return pd.Series(False, index=series.index)
        return ((series - series.mean()) / std).abs() > threshold

    # 'mad': modified z-score based on the median absolute deviation
    median = series.median()
    mad = (series - median).abs().median()
    if not mad > 0:
        # A zero (or undefined) MAD would divide by zero; flag nothing.
        return pd.Series(False, index=series.index)
    return (0.6745 * (series - median) / mad).abs() > threshold


def detect_and_handle_outliers(df, method='iqr', threshold=1.5):
    """Detect outliers in every numeric column of ``df``.

    Args:
        df: DataFrame to inspect; it is not modified.
        method: ``'iqr'`` (Tukey fences), ``'zscore'`` or ``'mad'``
            (modified z-score).
        threshold: IQR multiplier for ``'iqr'``; score cut-off for the other
            two methods.

    Returns:
        Dict keyed by column name. Each value holds ``count``, ``percentage``
        (of all rows), ``indices`` and ``values`` of the detected outliers.
        Missing values are never reported as outliers.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method not in ('iqr', 'zscore', 'mad'):
        raise ValueError(
            f"Unsupported method {method!r}; expected 'iqr', 'zscore' or 'mad'"
        )

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    total_rows = len(df)

    outliers_info = {}
    for col in numeric_cols:
        # The mask shares df's index, so rows stay aligned even with NaNs.
        outliers = df[_outlier_mask(df[col], method, threshold)]
        outliers_info[col] = {
            'count': len(outliers),
            'percentage': len(outliers) / total_rows * 100 if total_rows else 0.0,
            'indices': outliers.index.tolist(),
            'values': outliers[col].tolist()
        }

    return outliers_info

# 异常值处理策略
def handle_outliers_strategy(df, strategy='cap', threshold=1.5):
    """Treat outliers in every numeric column using one of several strategies.

    Outliers are values outside ``[Q1 - threshold*IQR, Q3 + threshold*IQR]``,
    with the quartiles taken from the original ``df``.

    Args:
        df: Input DataFrame; it is not modified.
        strategy: ``'cap'`` clips values to the bounds; ``'remove'`` drops
            rows holding an out-of-range value (rows whose value is missing
            are kept, since a missing value is not an outlier);
            ``'transform'`` applies a shifted log transform; ``'winsorize'``
            limits the lowest and highest 5% of values.
        threshold: IQR multiplier for the bounds.

    Returns:
        A new DataFrame with the chosen treatment applied.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
    """
    if strategy not in ('cap', 'remove', 'transform', 'winsorize'):
        raise ValueError(
            f"Unsupported strategy {strategy!r}; expected 'cap', 'remove', "
            "'transform' or 'winsorize'"
        )

    df_clean = df.copy()
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        if strategy == 'cap':
            df_clean[col] = df_clean[col].clip(lower_bound, upper_bound)

        elif strategy == 'remove':
            in_range = df_clean[col].between(lower_bound, upper_bound)
            df_clean = df_clean[in_range | df_clean[col].isna()]

        elif strategy == 'transform':
            # Shift so the smallest value maps to log1p(1) before taking the log.
            df_clean[col] = np.log1p(df_clean[col] - df_clean[col].min() + 1)

        else:  # 'winsorize'
            from scipy.stats.mstats import winsorize
            # winsorize returns a masked array; store a plain ndarray.
            df_clean[col] = np.asarray(winsorize(df_clean[col], limits=[0.05, 0.05]))

    return df_clean

5.3 数据标准化与编码

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def prepare_features(df, numeric_cols, categorical_cols):
    """Scale numeric features and encode categorical ones.

    Args:
        df: Input DataFrame; it is not modified.
        numeric_cols: Labels of the columns to standardize (any sequence,
            may be empty or ``None``).
        categorical_cols: Labels of the columns to encode (any sequence,
            may be empty or ``None``).

    Returns:
        A new DataFrame in which

        * numeric columns are standardized with ``StandardScaler``;
        * ordered categorical columns are replaced by their integer category
          codes, which follow the declared category order (missing values
          become ``-1``);
        * all other categorical columns are replaced by one-hot columns
          prefixed with the original name, with the first level dropped.
    """
    df_prepared = df.copy()

    # len() rather than truthiness so that an Index or ndarray is accepted too.
    if numeric_cols is not None and len(numeric_cols) > 0:
        numeric_cols = list(numeric_cols)
        scaler = StandardScaler()  # MinMaxScaler() / RobustScaler() are drop-in alternatives
        df_prepared[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    if categorical_cols is not None and len(categorical_cols) > 0:
        for col in categorical_cols:
            is_ordered = (isinstance(df[col].dtype, pd.CategoricalDtype)
                          and df[col].cat.ordered)
            if is_ordered:
                # LabelEncoder would sort the labels alphabetically and lose
                # the declared order; the category codes keep it.
                df_prepared[col] = df[col].cat.codes
            else:
                dummies = pd.get_dummies(df[col], prefix=col, drop_first=True)
                df_prepared = pd.concat(
                    [df_prepared.drop(columns=col), dummies], axis=1
                )

    return df_prepared

6. 数据变形与重塑：pipe的全面应用

6.1 pipe方法深度解析

# Basic form: df.pipe(func, ...) calls func(df, ...) and returns its result
df.pipe(func, *args, **kwargs)

# Chaining pipe calls; each step receives the output of the previous one.
# NOTE(review): clean_data, transform_data, filter_data, aggregate_data and
# format_output are placeholders that are not defined in this document.
result = (df
          .pipe(clean_data)
          .pipe(transform_data, method='log')
          .pipe(filter_data, min_value=0)
          .pipe(aggregate_data, by='category')
          .pipe(format_output, precision=2))

# 实际应用：创建数据处理管道类
class DataPipeline:
    """Small example pipeline built from ``DataFrame.pipe`` steps.

    The constructor copies the input frame, and every step returns a new
    frame, so the caller's data is never modified.
    """

    def __init__(self, df):
        self.df = df.copy()

    @staticmethod
    def remove_outliers(df, column, threshold=3):
        """Keep the rows whose absolute z-score in ``column`` is below ``threshold``.

        Rows with a missing value in ``column`` are dropped as well, because
        their z-score is NaN and fails the comparison.
        """
        z_scores = np.abs((df[column] - df[column].mean()) / df[column].std())
        return df[z_scores < threshold]

    @staticmethod
    def add_features(df):
        """Add derived features computed from ``value``, ``feature1`` and ``feature2``.

        Adds ``log_value`` (log1p of value), ``scaled_value`` (min-max scaled
        value) and ``interaction`` (feature1 * feature2).
        """
        return df.assign(
            log_value=lambda x: np.log1p(x['value']),
            scaled_value=lambda x: (x['value'] - x['value'].min()) /
                                   (x['value'].max() - x['value'].min()),
            interaction=lambda x: x['feature1'] * x['feature2']
        )

    @staticmethod
    def encode_categorical(df, columns):
        """Replace text columns with integer category codes.

        Works on a copy: ``df`` is often a filtered slice of another frame,
        and assigning into it would modify or warn about the parent.
        Columns that are not text-typed are left as they are.
        """
        encoded = df.copy()
        for col in columns:
            dtype = encoded[col].dtype
            # Covers the classic object dtype and the dedicated string dtypes.
            if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
                encoded[col] = pd.Categorical(encoded[col]).codes
        return encoded

    def run(self):
        """Run all steps: outlier removal, features, encoding, dropna, reindex."""
        return (self.df
                .pipe(self.remove_outliers, 'value', threshold=2.5)
                .pipe(self.add_features)
                .pipe(self.encode_categorical, ['category', 'status'])
                .pipe(lambda x: x.dropna())
                .pipe(lambda x: x.reset_index(drop=True)))

# Using the pipeline
# NOTE(review): raw_df is not defined in this document; it is assumed to be a
# frame with value, feature1, feature2, category and status columns.
pipeline = DataPipeline(raw_df)
processed_df = pipeline.run()

6.2 复杂数据变形操作

# 1. Advanced pivot table: region x month rows, one column block per category
pivot = df.pivot_table(
    values=['sales', 'profit'],
    # 'ME' = month end. The former alias 'M' is deprecated since pandas 2.2;
    # on older pandas versions use freq='M' instead.
    index=['region', pd.Grouper(key='date', freq='ME')],
    columns='product_category',
    aggfunc={'sales': 'sum', 'profit': 'mean'},  # different aggregation per value column
    fill_value=0,          # value used for empty cells
    margins=True,          # add row/column totals
    margins_name='Total',
    observed=True          # for categorical keys, only use observed combinations
)

# 2. Customized cross-tabulation: revenue summed per (year, quarter) x product line
cross_tab = pd.crosstab(
    index=[df['year'], df['quarter']],
    columns=df['product_line'],
    values=df['revenue'],
    aggfunc='sum',
    normalize='index'  # each row sums to 1; alternatives: 'columns', 'all'
)

# 3. Reshaping between wide and long layouts
# Wide to long: one row per (id, name, year)
long_df = df.melt(
    id_vars=['id', 'name'],
    value_vars=['2020', '2021', '2022'],
    var_name='year',
    value_name='value'
)

# Long to wide: one column per year again
wide_df = long_df.pivot(
    index=['id', 'name'],
    columns='year',
    values='value'
).reset_index()

# 4. Stacking / unstacking with a MultiIndex
df_multi = df.set_index(['region', 'product', 'date'])

# Stack: the column labels become an extra index level ('variable'), giving
# one row per (region, product, date, variable) in long form.
stacked = (df_multi.stack()
           .rename_axis(index=['region', 'product', 'date', 'variable'])
           .rename('value')
           .reset_index())

# Unstack: move 'variable' back to the columns, which restores the layout of
# df_multi. (Unstacking without the 'variable' level would pivot the dates
# into columns instead of undoing the stack.)
unstacked = (stacked
             .set_index(['region', 'product', 'date', 'variable'])['value']
             .unstack('variable'))

6.3 分组聚合的高级技巧

# 1. 自定义聚合函数
def weighted_average(df, value_col, weight_col):
    """Return the average of ``df[value_col]`` weighted by ``df[weight_col]``."""
    values = df[value_col]
    weights = df[weight_col]
    return np.average(values, weights=weights)

def top_n_values(series, n=3):
    """Return the ``n`` most frequent values of ``series`` mapped to their counts."""
    frequencies = series.value_counts()
    most_common = frequencies.head(n)
    return most_common.to_dict()

# Apply the custom aggregations per category.
# The weighted-average lambda receives the group's 'value' Series `x`; its
# index selects the matching rows of df, so the weights belong to the same
# group. (Passing the whole df would return the global weighted average for
# every group.)
agg_results = df.groupby('category').agg({
    'value': ['mean', 'std',
              lambda x: weighted_average(df.loc[x.index], 'value', 'weight')],
    'name': lambda x: top_n_values(x, n=2)
})

# 2. Group-wise transformation
def normalize_group(df):
    """Standardize the values of one group: subtract the mean, divide by the std."""
    centered = df - df.mean()
    spread = df.std()
    return centered / spread

# transform returns a result aligned with the original rows
df['normalized'] = df.groupby('group')['value'].transform(normalize_group)

# 3. Filtering whole groups
# Keep only the groups with more than 10 rows
filtered = df.groupby('category').filter(lambda x: len(x) > 10)

# Keep only the groups whose mean value exceeds 50
filtered = df.groupby('category').filter(lambda x: x['value'].mean() > 50)

# 4. Ranking within each group (dense ranks, highest score gets rank 1)
df['rank_in_group'] = df.groupby('group')['score'].rank(method='dense', ascending=False)

7. 时间序列处理的完整指南

7.1 时间数据操作

# 创建时间范围
date_ranges = {
    'daily': pd.date_range('2023-01-01', periods=30, freq='D'),
    'business_daily': pd.date_range('2023-01-01', periods=30, freq='B'),
    'hourly': pd.date_range('2023-01-01', periods=24, freq='H'),
    'monthly': pd.date_range('2023-01-01', periods=12, freq='M'),
    'quarterly': pd.date_range('2023-01-01', periods=4, freq='Q'),
    'yearly': pd.date_range('2020-01-01', '2023-01-01', freq='Y')
}

# Extracting calendar attributes from a datetime column
df['date'] = pd.to_datetime(df['date_string'])
df['year'] = df['date'].dt.year
df['quarter'] = df['date'].dt.quarter
df['month'] = df['date'].dt.month
df['week'] = df['date'].dt.isocalendar().week  # ISO week number
df['day'] = df['date'].dt.day
df['dayofweek'] = df['date'].dt.dayofweek  # Monday=0
df['is_weekend'] = df['date'].dt.dayofweek >= 5  # Saturday=5, Sunday=6
df['hour'] = df['date'].dt.hour
df['is_month_start'] = df['date'].dt.is_month_start
df['is_quarter_end'] = df['date'].dt.is_quarter_end

# Time differences
df['days_since_event'] = (df['date'] - pd.Timestamp('2023-01-01')).dt.days  # whole days since the reference date
df['hours_diff'] = df['end_time'] - df['start_time']  # Timedelta column
df['hours_diff'] = df['hours_diff'].dt.total_seconds() / 3600  # converted to fractional hours

7.2 时间序列分析

# Resampling on the 'date' column.
# Aliases follow pandas >= 2.2 ('ME', 'QE', 'h' replace 'M', 'Q', 'H').
# numeric_only=True keeps the aggregations from failing on text columns.
resample_methods = {
    'daily_sum': df.resample('D', on='date').sum(numeric_only=True),
    # 'W-MON' is weekly frequency anchored on Monday: each bin ENDS on a Monday.
    'weekly_mean': df.resample('W-MON', on='date').mean(numeric_only=True),
    'monthly_std': df.resample('ME', on='date').std(numeric_only=True),
    'quarterly_first': df.resample('QE', on='date').first(),
    'hourly_interpolated': df.resample('h', on='date').interpolate(method='linear')
}

# Rolling-window statistics.
# Offset windows such as '7D' require a sorted DatetimeIndex, so the value
# column is re-indexed by date first. The results are therefore indexed by
# date rather than by the original row labels.
value_by_time = df.set_index('date')['value'].sort_index()

window_stats = {
    '7d_rolling_mean': value_by_time.rolling(window='7D', min_periods=1).mean(),
    '30d_rolling_std': value_by_time.rolling(window='30D', min_periods=7).std(),
    'exp_weighted_mean': value_by_time.ewm(span=30, adjust=False).mean(),
    'expanding_mean': value_by_time.expanding(min_periods=1).mean(),
    'centered_rolling': value_by_time.rolling(window='14D', center=True).mean()
}

# Differencing a series
df['value_diff'] = df['value'].diff()  # first difference
df['value_pct_change'] = df['value'].pct_change()  # relative change from the previous row
df['value_log_diff'] = np.log(df['value']).diff()  # difference of logs

# 季节性分解
from statsmodels.tsa.seasonal import seasonal_decompose

def decompose_timeseries(df, column, freq=365):
    """Split ``df[column]`` into its additive seasonal components.

    ``freq`` is passed to ``seasonal_decompose`` as the seasonal period.
    Returns a dict with the keys ``observed``, ``trend``, ``seasonal`` and
    ``residual``.
    """
    parts = seasonal_decompose(
        df[column],
        model='additive',  # 'multiplicative' is the alternative model
        period=freq,
        extrapolate_trend='freq'
    )

    # (result key, attribute on the decomposition object)
    component_attrs = (
        ('observed', 'observed'),
        ('trend', 'trend'),
        ('seasonal', 'seasonal'),
        ('residual', 'resid'),
    )
    return {key: getattr(parts, attr) for key, attr in component_attrs}

8. 性能优化与内存管理

8.1 内存优化策略

def optimize_memory(df):
    """Return a copy of ``df`` that uses smaller dtypes where this is lossless.

    * Integer columns are downcast to the smallest integer type that holds
      their values (unsigned when no value is negative).
    * Float columns are downcast with ``pd.to_numeric(downcast='float')``,
      which goes no lower than float32. float16 is deliberately not used:
      it keeps only about three significant digits.
    * Object columns whose share of distinct values is below 50% become
      ``category``.

    Datetime columns are left as they are; they already use a fixed-width
    representation.

    Args:
        df: Input DataFrame; it is not modified. May be empty.

    Returns:
        The optimized copy.
    """
    df_optimized = df.copy()

    for col in df.select_dtypes(include=[np.number]).columns:
        series = df[col]
        if pd.api.types.is_integer_dtype(series.dtype):
            # On an empty column min() is NaN, which falls back to 'integer'.
            downcast = 'unsigned' if series.min() >= 0 else 'integer'
        elif pd.api.types.is_float_dtype(series.dtype):
            downcast = 'float'
        else:
            # Other numeric kinds (e.g. complex) have nothing to downcast to.
            continue
        df_optimized[col] = pd.to_numeric(series, downcast=downcast)

    num_total = len(df)
    for col in df.select_dtypes(include=['object']).columns:
        # Guard against division by zero on an empty frame.
        if num_total and df[col].nunique() / num_total < 0.5:
            df_optimized[col] = df[col].astype('category')

    return df_optimized

# Compare memory usage before and after optimization
original_memory = df.memory_usage(deep=True).sum()
optimized_df = optimize_memory(df)
optimized_memory = optimized_df.memory_usage(deep=True).sum()
print(f"内存节省: {(original_memory - optimized_memory) / original_memory * 100:.1f}%")

8.2 性能优化技巧

# 1. Prefer vectorized operations over Python loops
# Avoid: one label lookup and one assignment per row
for i in range(len(df)):
    df.loc[i, 'new_col'] = df.loc[i, 'col1'] * df.loc[i, 'col2']

# Preferred: a single column-wise expression
df['new_col'] = df['col1'] * df['col2']

# 2. Evaluate an expression string with eval (intended for large frames)
result = df.eval('col1 + col2 * col3')

# 3. Filter with a query expression
filtered = df.query('col1 > 0 and col2 < 0 and col3 in [1, 2, 3]')

# 4. Processing a large file in chunks
def process_large_file(filepath, chunk_size=10000):
    """Read a CSV file chunk by chunk, process each chunk and combine the results.

    Args:
        filepath: Path of the CSV file.
        chunk_size: Number of rows per chunk.

    Returns:
        The processed chunks concatenated into one DataFrame with a fresh
        integer index, or an empty DataFrame when the file yields no chunks.
    """
    processed_chunks = []
    # The reader is a context manager; leaving the block closes the file even
    # if processing a chunk raises.
    with pd.read_csv(filepath, chunksize=chunk_size) as reader:
        for chunk in reader:
            processed_chunks.append(process_chunk(chunk))

    if not processed_chunks:
        # pd.concat raises on an empty list.
        return pd.DataFrame()

    return pd.concat(processed_chunks, ignore_index=True)

# 5. Dask for data sets that do not fit the single-frame workflow
import dask.dataframe as dd
dask_df = dd.read_csv('large_dataset.csv')
result = dask_df.groupby('category').mean().compute()  # compute() runs the deferred work and returns the result

9. 数据导出与共享

9.1 多种格式导出

# 1. CSV export
import csv  # provides the QUOTE_* constants used below

df.to_csv(
    'output.csv.gz',            # the file is gzip-compressed, so name it accordingly
    index=False,                # do not write the index
    encoding='utf-8-sig',       # BOM lets Excel detect UTF-8 (after decompressing)
    sep=',',                    # field delimiter
    float_format='%.2f',        # format for floating-point values
    date_format='%Y-%m-%d',     # format for datetime values
    quoting=csv.QUOTE_NONNUMERIC,  # quote every non-numeric field
    compression='gzip'          # compression codec
)

# 2. Excel export with several sheets
with pd.ExcelWriter('output.xlsx', engine='openpyxl') as writer:
    df1.to_excel(writer, sheet_name='Sheet1', index=False)
    df2.to_excel(writer, sheet_name='Sheet2', index=False)

    # Handles for further formatting
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']

    # Fit each column width to its longest cell.
    # The length is taken from the text form of the value, so numeric and
    # date cells are measured too; empty cells are ignored.
    for column in worksheet.columns:
        column_letter = column[0].column_letter
        max_length = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        adjusted_width = (max_length + 2)  # small padding
        worksheet.column_dimensions[column_letter].width = adjusted_width

# 3. Parquet (columnar storage)
df.to_parquet(
    'output.parquet',
    engine='pyarrow',
    compression='snappy',  # alternatives: 'gzip', 'brotli'
    index=False
)

# 4. Database export
# NOTE(review): `engine` and `sqlalchemy` are not defined or imported in this
# document; presumably `engine` is a SQLAlchemy engine created beforehand.
df.to_sql(
    'table_name',
    con=engine,
    if_exists='replace',  # alternatives: 'append', 'fail'
    index=False,
    chunksize=1000,       # rows written per batch
    dtype={'col1': sqlalchemy.types.VARCHAR(255)}
)

# 5. JSON export
df.to_json(
    'output.json',
    orient='records',      # alternatives: 'split', 'index', 'columns', 'values', 'table'
    date_format='iso',
    double_precision=2,
    force_ascii=False      # keep non-ASCII characters unescaped
)

# 6. HTML export (for display on a web page)
df.to_html(
    'output.html',
    index=False,
    classes='table table-striped table-bordered',
    border=0,
    float_format='{:,.2f}'.format,
    encoding='utf-8'
)

10. 实战应用：完整的数据分析管道

10.1 端到端分析项目

class CompleteDataAnalysis:
    """End-to-end analysis pipeline: load, clean, add features, analyze, plot, export.

    Every public step returns ``self`` so the steps can be chained. Findings
    are collected in ``self.results``; the working frame is ``self.df``.

    Several private methods (``_fix_data_types``, ``_remove_outliers``,
    ``_add_aggregate_features``, ``_add_interaction_features``,
    ``_analyze_trends``, ``_generate_report``) are extension hooks with
    neutral default behaviour; override them in a subclass to add
    project-specific logic.
    """

    def __init__(self, data_path):
        self.data_path = data_path
        self.results = {}

    def load_and_explore(self):
        """Load the file at ``data_path`` and record basic facts about it.

        Supports ``.csv``, ``.parquet`` and ``.xlsx`` (first sheet).

        Raises:
            ValueError: If the file extension is not supported.
        """
        path = str(self.data_path)
        if path.endswith('.csv'):
            self.df = pd.read_csv(path, low_memory=False)
        elif path.endswith('.parquet'):
            self.df = pd.read_parquet(path)
        elif path.endswith('.xlsx'):
            # sheet_name=None would return a dict of frames, which the rest
            # of the pipeline cannot handle; load the first sheet instead.
            self.df = pd.read_excel(path, sheet_name=0)
        else:
            raise ValueError(f"Unsupported file type: {path}")

        # Remembered so engineer_features can tell which columns are new.
        self.original_columns = list(self.df.columns)

        self.results['shape'] = self.df.shape
        self.results['dtypes'] = self.df.dtypes.value_counts().to_dict()
        self.results['missing'] = self.df.isnull().sum().to_dict()
        self.results['stats'] = self.df.describe().to_dict()

        return self

    def clean_data(self):
        """Deduplicate, fill missing values, fix dtypes and treat outliers."""
        self.df = (self.df
                   .pipe(self._remove_duplicates)
                   .pipe(self._handle_missing)
                   .pipe(self._fix_data_types)
                   .pipe(self._remove_outliers))

        self.results['cleaning_report'] = {
            'rows_removed': self.results['shape'][0] - len(self.df),
            'columns_cleaned': list(self.df.columns)
        }

        return self

    def engineer_features(self):
        """Add time, aggregate and interaction features and record their names."""
        # Falls back to the current columns when the frame was not loaded
        # through load_and_explore.
        original_columns = getattr(self, 'original_columns', list(self.df.columns))

        if 'date' in self.df.columns:
            self.df = self.df.pipe(self._add_time_features)

        self.df = self.df.pipe(self._add_aggregate_features)
        self.df = self.df.pipe(self._add_interaction_features)

        self.results['features_added'] = [
            col for col in self.df.columns if col not in original_columns
        ]

        return self

    def analyze(self):
        """Compute correlations plus optional per-category and trend summaries."""
        # Correlation is only defined for numeric columns.
        numeric_df = self.df.select_dtypes(include=[np.number])
        self.results['correlation'] = numeric_df.corr().to_dict()

        if 'category' in self.df.columns and 'value' in self.df.columns:
            self.results['group_analysis'] = self.df.groupby('category').agg({
                'value': ['mean', 'std', 'count']
            }).to_dict()

        if 'date' in self.df.columns:
            self.results['trend_analysis'] = self._analyze_trends()

        return self

    def visualize(self):
        """Draw a 2x2 overview figure, save it as ``analysis_results.png`` and show it."""
        import matplotlib.pyplot as plt
        import seaborn as sns

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # 1. Distribution of 'value'
        if 'value' in self.df.columns:
            self.df['value'].hist(ax=axes[0, 0], bins=30)
            axes[0, 0].set_title('Value Distribution')

        # 2. Correlation heatmap of the numeric columns
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 1:
            sns.heatmap(self.df[numeric_cols].corr(),
                        annot=True, fmt='.2f',
                        ax=axes[0, 1], cmap='coolwarm')
            axes[0, 1].set_title('Correlation Heatmap')

        # 3. 'value' over time
        if 'date' in self.df.columns and 'value' in self.df.columns:
            self.df.set_index('date')['value'].plot(ax=axes[1, 0])
            axes[1, 0].set_title('Time Series Plot')

        # 4. 'value' per category
        if 'category' in self.df.columns and 'value' in self.df.columns:
            sns.boxplot(x='category', y='value', data=self.df, ax=axes[1, 1])
            axes[1, 1].set_title('Value by Category')
            axes[1, 1].tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.savefig('analysis_results.png', dpi=300, bbox_inches='tight')
        plt.show()

        return self

    def export_results(self):
        """Write the processed data (CSV), the results (JSON) and the report."""
        self.df.to_csv('processed_data.csv', index=False)

        import json
        with open('analysis_results.json', 'w') as f:
            # Keys such as dtype objects or tuples are not valid JSON keys and
            # are not covered by `default=`, so they are converted first.
            json.dump(self._to_jsonable(self.results), f, indent=2, default=str)

        self._generate_report()

        return self

    # Helpers
    @staticmethod
    def _to_jsonable(value):
        """Recursively turn dict keys that JSON cannot encode into strings."""
        if isinstance(value, dict):
            return {
                (key if key is None or isinstance(key, (str, int, float, bool))
                 else str(key)): CompleteDataAnalysis._to_jsonable(item)
                for key, item in value.items()
            }
        if isinstance(value, (list, tuple)):
            return [CompleteDataAnalysis._to_jsonable(item) for item in value]
        return value

    def _remove_duplicates(self, df):
        """Drop fully duplicated rows."""
        return df.drop_duplicates()

    def _handle_missing(self, df):
        """Fill numeric columns with the median and text columns with the mode."""
        df = df.copy()
        for col in df.columns:
            dtype = df[col].dtype
            if pd.api.types.is_bool_dtype(dtype):
                continue
            if pd.api.types.is_numeric_dtype(dtype):
                df[col] = df[col].fillna(df[col].median())
            elif pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
                modes = df[col].mode()
                df[col] = df[col].fillna(modes.iloc[0] if not modes.empty else 'Unknown')
        return df

    def _fix_data_types(self, df):
        """Hook for dtype corrections; returns the frame unchanged by default."""
        return df

    def _remove_outliers(self, df):
        """Hook for outlier treatment; returns the frame unchanged by default."""
        return df

    def _add_time_features(self, df):
        """Derive calendar features from the 'date' column."""
        df = df.copy()
        df['date'] = pd.to_datetime(df['date'])
        df['year'] = df['date'].dt.year
        df['month'] = df['date'].dt.month
        df['day'] = df['date'].dt.day
        df['dayofweek'] = df['date'].dt.dayofweek
        df['is_weekend'] = df['date'].dt.dayofweek >= 5
        return df

    def _add_aggregate_features(self, df):
        """Hook for aggregate features; returns the frame unchanged by default."""
        return df

    def _add_interaction_features(self, df):
        """Hook for interaction features; returns the frame unchanged by default."""
        return df

    def _analyze_trends(self):
        """Hook for trend analysis; returns an empty dict by default."""
        return {}

    def _generate_report(self):
        """Hook for report generation; does nothing by default."""
        return None

# Run the complete pipeline; every step returns the analysis object itself
analysis = CompleteDataAnalysis('data.csv')
(analysis.load_and_explore()
         .clean_data()
         .engineer_features()
         .analyze()
         .visualize()
         .export_results())

10.2 生产级数据处理框架

from abc import ABC, abstractmethod
from typing import Dict, Any, List, Optional
import logging

class DataProcessor(ABC):
    """Abstract base class for one processing step of a data pipeline.

    Subclasses implement ``process``. Each instance carries its ``config``
    dict, a logger named after the concrete class, and a ``results`` dict
    for whatever the step wants to report.
    """

    def __init__(self, config: Dict[str, Any]):
        self.config = config
        self.logger = logging.getLogger(type(self).__name__)
        self.results = {}

    @abstractmethod
    def process(self, data: pd.DataFrame) -> pd.DataFrame:
        pass

    def validate_input(self, data: pd.DataFrame) -> bool:
        """Return True when ``data`` has every column listed under
        ``config['required_columns']``; otherwise log the missing ones and
        return False."""
        required_columns = self.config.get('required_columns', [])
        present = data.columns
        missing_cols = [name for name in required_columns if name not in present]

        if not missing_cols:
            return True

        self.logger.error(f"Missing required columns: {missing_cols}")
        return False

class DataPipeline:
    """Runs a sequence of processors over a DataFrame and records what each did.

    ``history`` receives one snapshot per executed processor, including a
    processor that failed, so ``get_report`` reflects failures as well.
    """

    def __init__(self):
        # 'DataProcessor' is written as a forward reference so this class does
        # not depend on where the processor base class is defined.
        self.processors: List['DataProcessor'] = []
        self.data: Optional[pd.DataFrame] = None
        self.history: List[Dict[str, Any]] = []
        self.logger = logging.getLogger(self.__class__.__name__)

    def add_processor(self, processor: 'DataProcessor') -> 'DataPipeline':
        """Append a processor and return the pipeline to allow chaining."""
        self.processors.append(processor)
        return self

    def run(self, input_data: pd.DataFrame) -> pd.DataFrame:
        """Apply every processor in order to a copy of ``input_data``.

        Returns:
            The frame produced by the last processor.

        Raises:
            Exception: Whatever a processor raises is logged, recorded in
                ``history`` with ``success=False`` and re-raised.
        """
        self.data = input_data.copy()

        for i, processor in enumerate(self.processors):
            self.logger.info(f"Running processor {i+1}/{len(self.processors)}: {processor.__class__.__name__}")

            # State before the step
            snapshot = {
                'processor': processor.__class__.__name__,
                'rows_before': len(self.data),
                'columns_before': list(self.data.columns),
                'timestamp': pd.Timestamp.now()
            }

            try:
                self.data = processor.process(self.data)
            except Exception as e:
                self.logger.error(f"Processor failed: {e}")
                snapshot['success'] = False
                snapshot['error'] = str(e)
                # Record the failure before propagating it.
                self.history.append(snapshot)
                raise

            # State after the step
            snapshot.update({
                'success': True,
                'rows_after': len(self.data),
                'columns_after': list(self.data.columns),
                # Copied so later runs of the processor do not alter history.
                'processor_results': dict(processor.results)
            })

            self.history.append(snapshot)

        return self.data

    def get_report(self) -> Dict[str, Any]:
        """Summarize the processors, their outcomes and the final frame shape."""
        return {
            'total_processors': len(self.processors),
            'successful_processors': sum(1 for h in self.history if h['success']),
            'final_shape': self.data.shape if self.data is not None else None,
            'processing_history': self.history
        }

# Concrete processors
class MissingValueProcessor(DataProcessor):
    """Fills missing values column by column according to ``config['strategies']``.

    ``strategies`` maps a column name to a dict with a ``'method'`` key:
    ``'mean'``, ``'median'``, ``'mode'``, ``'ffill'``, ``'interpolate'`` or
    ``'constant'`` (the latter also needs a ``'value'`` key). Columns without
    a strategy, and strategies with an unknown method, are left untouched.
    """

    def process(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``data`` with the configured columns filled.

        Records in ``self.results`` how many values were filled per column
        (``missing_values_filled``) and how many are still missing
        (``missing_values_remaining``).
        """
        strategies = self.config.get('strategies', {})

        # Work on a copy so the caller's frame is not modified.
        data = data.copy()
        nulls_before = data.isnull().sum()

        for col, strategy in strategies.items():
            if col not in data.columns:
                continue

            method = strategy['method']
            if method == 'mean':
                data[col] = data[col].fillna(data[col].mean())
            elif method == 'median':
                data[col] = data[col].fillna(data[col].median())
            elif method == 'mode':
                modes = data[col].mode()
                # An all-missing column has no mode; leave it as it is.
                if not modes.empty:
                    data[col] = data[col].fillna(modes.iloc[0])
            elif method == 'ffill':
                # fillna(method='ffill') is deprecated; ffill() is the replacement.
                data[col] = data[col].ffill()
            elif method == 'interpolate':
                data[col] = data[col].interpolate()
            elif method == 'constant':
                data[col] = data[col].fillna(strategy['value'])

        nulls_after = data.isnull().sum()
        self.results['missing_values_filled'] = (nulls_before - nulls_after).to_dict()
        self.results['missing_values_remaining'] = nulls_after.to_dict()

        return data

class OutlierProcessor(DataProcessor):
    """Detects IQR outliers in every numeric column and optionally treats them.

    Config keys: ``method`` (only ``'iqr'`` is implemented), ``threshold``
    (IQR multiplier, default 1.5) and ``action`` (``'remove'``, ``'cap'`` or
    absent for detection only).
    """

    def process(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return a treated copy of ``data``.

        For each numeric column the count and index labels of the outliers
        are stored in ``self.results['<col>_outliers']``. With ``'remove'``,
        rows holding an out-of-range value are dropped while rows with a
        missing value are kept; with ``'cap'``, values are clipped to the
        bounds.
        """
        method = self.config.get('method', 'iqr')
        threshold = self.config.get('threshold', 1.5)
        action = self.config.get('action')

        # Work on a copy so the caller's frame is not modified and column
        # assignment never targets a slice of it.
        data = data.copy()
        numeric_cols = data.select_dtypes(include=[np.number]).columns

        for col in numeric_cols:
            if method == 'iqr':
                Q1 = data[col].quantile(0.25)
                Q3 = data[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - threshold * IQR
                upper_bound = Q3 + threshold * IQR

                outlier_mask = (data[col] < lower_bound) | (data[col] > upper_bound)
                self.results[f'{col}_outliers'] = {
                    'count': int(outlier_mask.sum()),
                    'indices': data.index[outlier_mask].tolist()
                }

                if action == 'remove':
                    # ~mask keeps in-range values and missing values alike.
                    data = data[~outlier_mask].copy()
                elif action == 'cap':
                    data[col] = data[col].clip(lower_bound, upper_bound)

        return data

# Using the production pipeline
pipeline = DataPipeline()

# Processor configuration: one fill strategy per column
missing_config = {
    'strategies': {
        'age': {'method': 'median'},
        'salary': {'method': 'mean'},
        'department': {'method': 'mode'}
    }
}

# Outlier handling: IQR fences with multiplier 3, values clipped to the fences
outlier_config = {
    'method': 'iqr',
    'threshold': 3,
    'action': 'cap'
}

# Register the processors in execution order
pipeline.add_processor(MissingValueProcessor(missing_config))
pipeline.add_processor(OutlierProcessor(outlier_config))

# Run the pipeline and fetch the report
# NOTE(review): raw_data is not defined in this document.
result = pipeline.run(raw_data)
report = pipeline.get_report()

总结

这份全面的pandas速查指南覆盖了从基础到高级的各种功能，特别强调了：

- pipe方法的强大应用：通过管道模式实现清晰的数据处理流程
- 性能优化策略：包括内存优化、向量化操作和分块处理
- 生产级代码模式：使用面向对象设计和错误处理
- 完整的数据分析流程：从数据加载到结果导出的端到端解决方案
- 高级时间序列处理：包括重采样、滚动窗口和季节性分解
- 数据质量保证：全面的数据验证和清洗策略

掌握这些技巧将使你能够高效地处理各种数据分析任务，从简单的数据清洗到复杂的生产级数据处理管道。记住，良好的数据习惯（如数据验证、错误处理和文档记录）与掌握技术技巧同样重要。

本文作者： Kylin
本文链接： https://kylinnnnn.github.io/2026/02/09/AI-Generated-Pandas速查指南/
版权声明： 本博客所有文章除特别声明外，均采用 MIT 许可协议。转载请注明出处！