How to deal with missing values in a Pandas DataFrame in Python

January 9, 2019

Contact Seller

Guest Post Description

How to deal with missing values in a Pandas DataFrame in Python

def Kickstarter_Example_94():
    """Print a walkthrough of common ways to handle missing values in pandas.

    Builds a small DataFrame that contains NaN entries and prints the result
    of each technique in turn: dropping rows or columns, filling with a
    constant, filling with a column mean, filling with a per-group mean, and
    selecting rows through a non-null mask.

    Returns:
        None. Everything is written to stdout.

    Note:
        Calls ``warnings.filterwarnings("ignore")``, which alters the
        process-wide warning filters and is not undone on return.
    """
    import warnings

    def show(frame):
        """Print a blank separator line followed by *frame*."""
        print()
        print(frame)

    title = 'How to deal with missing values in a Pandas DataFrame'
    print()
    print(f"{title:*^82}")

    # Installed before importing pandas/numpy so import-time warnings are
    # silenced as well, exactly as in the original ordering.
    warnings.filterwarnings("ignore")

    # load libraries
    import pandas as pd
    import numpy as np

    # Create dataframe with missing values
    raw_data = {
        'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
        'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
        'age': [42, np.nan, 36, 24, 73],
        'sex': ['m', np.nan, 'f', 'm', 'f'],
        'preTestScore': [4, np.nan, np.nan, 2, 3],
        'postTestScore': [25, np.nan, np.nan, 62, 70],
    }
    df = pd.DataFrame(
        raw_data,
        columns=['first_name', 'last_name', 'age', 'sex',
                 'preTestScore', 'postTestScore'],
    )
    show(df)

    # Drop every row that has at least one missing value
    df_no_missing = df.dropna()
    show(df_no_missing)

    # Drop only the rows in which every cell is NA
    df_cleaned = df.dropna(how='all')
    show(df_cleaned)

    # Create a new column full of missing values
    df['location'] = np.nan
    show(df)

    # Drop columns that contain nothing but missing values
    show(df.dropna(axis=1, how='all'))

    # Keep only rows with at least five non-missing observations
    # (mostly useful for time series)
    show(df.dropna(thresh=5))

    # Fill in missing data with zeros
    show(df.fillna(0))

    # Fill missing preTestScore values with the column mean.
    # Assign the result back to the column: calling fillna(inplace=True) on
    # df["preTestScore"] is chained assignment, which is deprecated and does
    # not update df when pandas Copy-on-Write is active.
    df["preTestScore"] = df["preTestScore"].fillna(df["preTestScore"].mean())
    show(df)

    # Fill missing postTestScore values with the mean postTestScore of the
    # row's sex group (rows whose sex is missing stay NaN).
    sex_means = df.groupby("sex")["postTestScore"].transform("mean")
    df["postTestScore"] = df["postTestScore"].fillna(sex_means)
    show(df)

    # Select the rows of df where age is not NaN and sex is not NaN
    known = df['age'].notnull() & df['sex'].notnull()
    show(df[known])
    show(df[known].fillna(0))
# Run the demonstration; it prints each intermediate DataFrame to stdout.
Kickstarter_Example_94()

How to deal with missing values in a Pandas DataFrame in Python

Add a Review

You must be logged in to post a comment.