Note_Tech

All technological notes.


Project maintained by simonangel-fong Hosted on GitHub Pages — Theme by mattgraham

Pandas - Cleaning

Back


Data Cleaning

import pandas as pd

df = pd.read_csv("data_cleaning.csv")

df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 32 entries, 0 to 31
# Data columns (total 5 columns):
#  #   Column    Non-Null Count  Dtype
# ---  ------    --------------  -----
#  0   Duration  32 non-null     int64
#  1   Date      31 non-null     object
#  2   Pulse     32 non-null     int64
#  3   Maxpulse  32 non-null     int64
#  4   Calories  30 non-null     float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.4+ KB

Cleaning Empty Cells(dropna())


new_df = df.dropna()
new_df.info()
# <class 'pandas.core.frame.DataFrame'>
# Index: 29 entries, 0 to 31
# Data columns (total 5 columns):
#  #   Column    Non-Null Count  Dtype
# ---  ------    --------------  -----
#  0   Duration  29 non-null     int64
#  1   Date      29 non-null     object
#  2   Pulse     29 non-null     int64
#  3   Maxpulse  29 non-null     int64
#  4   Calories  29 non-null     float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.4+ KB

# change the original data
df.dropna(inplace=True)
df.info()
# <class 'pandas.core.frame.DataFrame'>
# Index: 29 entries, 0 to 31
# Data columns (total 5 columns):
#  #   Column    Non-Null Count  Dtype
# ---  ------    --------------  -----
#  0   Duration  29 non-null     int64
#  1   Date      29 non-null     object
#  2   Pulse     29 non-null     int64
#  3   Maxpulse  29 non-null     int64
#  4   Calories  29 non-null     float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.4+ KB

Replace Empty Values(fillna())

import pandas as pd
df = pd.read_csv('data_cleaning.csv')
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 32 entries, 0 to 31
# Data columns (total 5 columns):
#  #   Column    Non-Null Count  Dtype
# ---  ------    --------------  -----
#  0   Duration  32 non-null     int64
#  1   Date      31 non-null     object
#  2   Pulse     32 non-null     int64
#  3   Maxpulse  32 non-null     int64
#  4   Calories  30 non-null     float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.4+ KB

df.fillna(130, inplace = True)
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 32 entries, 0 to 31
# Data columns (total 5 columns):
#  #   Column    Non-Null Count  Dtype
# ---  ------    --------------  -----
#  0   Duration  32 non-null     int64
#  1   Date      32 non-null     object
#  2   Pulse     32 non-null     int64
#  3   Maxpulse  32 non-null     int64
#  4   Calories  32 non-null     float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.4+ KB

Replace Only For Specified Columns

import pandas as pd

df = pd.read_csv('data_cleaning.csv')
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 32 entries, 0 to 31
# Data columns (total 5 columns):
#  #   Column    Non-Null Count  Dtype
# ---  ------    --------------  -----
#  0   Duration  32 non-null     int64
#  1   Date      31 non-null     object
#  2   Pulse     32 non-null     int64
#  3   Maxpulse  32 non-null     int64
#  4   Calories  30 non-null     float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.4+ KB

df["Calories"].fillna(130, inplace=True)
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 32 entries, 0 to 31
# Data columns (total 5 columns):
#  #   Column    Non-Null Count  Dtype
# ---  ------    --------------  -----
#  0   Duration  32 non-null     int64
#  1   Date      31 non-null     object
#  2   Pulse     32 non-null     int64
#  3   Maxpulse  32 non-null     int64
#  4   Calories  32 non-null     float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.4+ KB

Replace Using Mean, Median, or Mode(fillna())


mean()

import pandas as pd

df = pd.read_csv('data_cleaning.csv')
x = df["Calories"].mean()
df["Calories"].fillna(x, inplace = True)

median()

import pandas as pd

df = pd.read_csv('data.csv')

x = df["Calories"].median()

df["Calories"].fillna(x, inplace = True)

mode()

import pandas as pd

df = pd.read_csv('data.csv')

x = df["Calories"].mode()[0]

df["Calories"].fillna(x, inplace = True)

Data of Wrong Format


Convert Into a Correct Format

import pandas as pd

df = pd.read_csv('data.csv')

df['Date'] = pd.to_datetime(df['Date'])

print(df.to_string())

Removing Rows

import pandas as pd

df = pd.read_csv('data_cleaning.csv')
df.dropna(subset=['Date'], inplace = True)
df

Wrong Data

import pandas as pd
df = pd.read_csv('data_cleaning.csv')
df.loc[7,'Duration'] = 45

for x in df.index:
  if df.loc[x, "Duration"] > 120:
    df.loc[x, "Duration"] = 120

print(df.to_string())
for x in df.index:
  if df.loc[x, "Duration"] > 120:
    df.drop(x, inplace = True)

Removing Duplicates

df = pd.read_csv('data_cleaning.csv')
print(df.duplicated())

Removing Duplicates(drop_duplicates())

import pandas as pd

df = pd.read_csv('data_cleaning.csv')
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 32 entries, 0 to 31
# Data columns (total 5 columns):
#  #   Column    Non-Null Count  Dtype
# ---  ------    --------------  -----
#  0   Duration  32 non-null     int64
#  1   Date      31 non-null     object
#  2   Pulse     32 non-null     int64
#  3   Maxpulse  32 non-null     int64
#  4   Calories  30 non-null     float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.4+ KB

df.drop_duplicates(inplace = True)
df.info()
# <class 'pandas.core.frame.DataFrame'>
# Index: 31 entries, 0 to 31
# Data columns (total 5 columns):
#  #   Column    Non-Null Count  Dtype
# ---  ------    --------------  -----
#  0   Duration  31 non-null     int64
#  1   Date      30 non-null     object
#  2   Pulse     31 non-null     int64
#  3   Maxpulse  31 non-null     int64
#  4   Calories  29 non-null     float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.5+ KB

TOP