All technological notes.
Data cleaning means fixing bad data in your data set.
Bad data could be: empty cells, data in the wrong format, wrong data, or duplicates.
Load data
import pandas as pd
# Read the CSV file into a DataFrame and print a per-column summary.
df = pd.read_csv("data_cleaning.csv")
df.info()
# Output of df.info(): Date (31) and Calories (30) have fewer non-null values
# than the 32 rows, so those two columns contain empty cells.
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 32 entries, 0 to 31
# Data columns (total 5 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 Duration 32 non-null int64
# 1 Date 31 non-null object
# 2 Pulse 32 non-null int64
# 3 Maxpulse 32 non-null int64
# 4 Calories 30 non-null float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.4+ KB
Empty Cells: dropna()
One way to deal with empty cells is to remove rows that contain empty cells.
dropna()
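By default, dropna() returns a new DataFrame and does not change the original (to change the original, use the inplace = True argument):
new_df = df.dropna()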
# Summarize the DataFrame returned by dropna(): 29 of the 32 rows remain and
# every column is now fully non-null.
new_df.info()
# <class 'pandas.core.frame.DataFrame'>
# Index: 29 entries, 0 to 31
# Data columns (total 5 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 Duration 29 non-null int64
# 1 Date 29 non-null object
# 2 Pulse 29 non-null int64
# 3 Maxpulse 29 non-null int64
# 4 Calories 29 non-null float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.4+ KB
# Change the original data: with inplace=True the rows are removed from df
# itself instead of being returned as a new DataFrame.
df.dropna(inplace=True)
df.info()
# <class 'pandas.core.frame.DataFrame'>
# Index: 29 entries, 0 to 31
# Data columns (total 5 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 Duration 29 non-null int64
# 1 Date 29 non-null object
# 2 Pulse 29 non-null int64
# 3 Maxpulse 29 non-null int64
# 4 Calories 29 non-null float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.4+ KB
fillna(): replace empty cells with a value instead of removing the rows.
import pandas as pd
# Load the data; the summary below shows 1 empty Date cell and 2 empty
# Calories cells.
df = pd.read_csv('data_cleaning.csv')
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 32 entries, 0 to 31
# Data columns (total 5 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 Duration 32 non-null int64
# 1 Date 31 non-null object
# 2 Pulse 32 non-null int64
# 3 Maxpulse 32 non-null int64
# 4 Calories 30 non-null float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.4+ KB
# Replace every empty cell in the whole DataFrame with 130, in place.
# NOTE: this is not limited to Calories -- the empty Date cell is filled with
# 130 as well (Date goes from 31 to 32 non-null below), which is presumably
# not a meaningful date.
df.fillna(130, inplace = True)
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 32 entries, 0 to 31
# Data columns (total 5 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 Duration 32 non-null int64
# 1 Date 32 non-null object
# 2 Pulse 32 non-null int64
# 3 Maxpulse 32 non-null int64
# 4 Calories 32 non-null float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.4+ KB
import pandas as pd
# Load the data; the summary below shows 1 empty Date cell and 2 empty
# Calories cells.
df = pd.read_csv('data_cleaning.csv')
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 32 entries, 0 to 31
# Data columns (total 5 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 Duration 32 non-null int64
# 1 Date 31 non-null object
# 2 Pulse 32 non-null int64
# 3 Maxpulse 32 non-null int64
# 4 Calories 30 non-null float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.4+ KB
# Replace empty cells in the Calories column only; Date keeps its empty cell.
# The fill goes through the DataFrame with a {column: value} dict rather than
# df["Calories"].fillna(130, inplace=True): an in-place call on a column
# selection is chained assignment, which warns on pandas 2.x and does not
# update df once Copy-on-Write is enabled.
df.fillna({"Calories": 130}, inplace=True)
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 32 entries, 0 to 31
# Data columns (total 5 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 Duration 32 non-null int64
# 1 Date 31 non-null object
# 2 Pulse 32 non-null int64
# 3 Maxpulse 32 non-null int64
# 4 Calories 32 non-null float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.4+ KB
A common way to replace empty cells is to calculate the mean, median or mode value of the column.
Pandas uses the mean(), median() and mode() methods to calculate the respective values for a specified column.
mean()
import pandas as pd
df = pd.read_csv('data_cleaning.csv')
# Mean (average) of the Calories column; empty cells are skipped by default.
x = df["Calories"].mean()
# Fill only the Calories column. The fill goes through the DataFrame with a
# {column: value} dict because an in-place fillna on df["Calories"] is chained
# assignment: it warns on pandas 2.x and does not update df under
# Copy-on-Write.
df.fillna({"Calories": x}, inplace=True)
median()
import pandas as pd
# NOTE(review): this snippet reads 'data.csv' while most other snippets read
# 'data_cleaning.csv' -- confirm which file is intended.
df = pd.read_csv('data.csv')
# Median (middle value once sorted) of the Calories column; empty cells are
# skipped by default.
x = df["Calories"].median()
# Fill only the Calories column. The fill goes through the DataFrame with a
# {column: value} dict because an in-place fillna on df["Calories"] is chained
# assignment: it warns on pandas 2.x and does not update df under
# Copy-on-Write.
df.fillna({"Calories": x}, inplace=True)
mode()
import pandas as pd
# NOTE(review): this snippet reads 'data.csv' while most other snippets read
# 'data_cleaning.csv' -- confirm which file is intended.
df = pd.read_csv('data.csv')
# mode() returns a Series because several values can tie for most frequent;
# [0] takes the first of them.
x = df["Calories"].mode()[0]
# Fill only the Calories column. The fill goes through the DataFrame with a
# {column: value} dict because an in-place fillna on df["Calories"] is chained
# assignment: it warns on pandas 2.x and does not update df under
# Copy-on-Write.
df.fillna({"Calories": x}, inplace=True)
import pandas as pd
# Data in the wrong format: convert the Date column to datetime values.
# NOTE(review): this snippet reads 'data.csv' while most other snippets read
# 'data_cleaning.csv' -- confirm which file is intended.
df = pd.read_csv('data.csv')
# Replace the Date column with its parsed datetime version.
# NOTE(review): if the column mixes several date formats, to_datetime may
# need an explicit format argument -- confirm against the actual data.
df['Date'] = pd.to_datetime(df['Date'])
# Print the DataFrame rendered as a string.
print(df.to_string())
import pandas as pd
df = pd.read_csv('data_cleaning.csv')
# Remove only the rows whose Date cell is empty; rows with empty cells in
# other columns are kept.
df.dropna(subset=['Date'], inplace = True)
# Bare expression: shows the DataFrame in a notebook/REPL cell and has no
# effect when the file is run as a script.
df
import pandas as pd
df = pd.read_csv('data_cleaning.csv')
# Wrong data, fixed by hand: set Duration to 45 in the row with index label 7.
df.loc[7,'Duration'] = 45
# Wrong data, fixed by rule: cap every Duration above 120 at 120.
# A single boolean-mask assignment selects all offending rows at once; it
# replaces the row-by-row loop, whose body had also lost its indentation and
# therefore did not parse.
df.loc[df["Duration"] > 120, "Duration"] = 120
# Print the DataFrame rendered as a string to inspect the result.
print(df.to_string())
# Alternative to capping: remove every row whose Duration is above 120.
# The index labels of the matching rows are collected with a boolean mask and
# dropped in one call; this replaces the row-by-row loop, whose body had also
# lost its indentation and therefore did not parse.
df.drop(df[df["Duration"] > 120].index, inplace=True)
duplicated()
df = pd.read_csv('data_cleaning.csv')
# duplicated() returns one boolean per row: True when the row repeats an
# earlier row, False otherwise.
print(df.duplicated())
drop_duplicates():
import pandas as pd
# Load the data and summarize it before removing duplicates (32 rows).
df = pd.read_csv('data_cleaning.csv')
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 32 entries, 0 to 31
# Data columns (total 5 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 Duration 32 non-null int64
# 1 Date 31 non-null object
# 2 Pulse 32 non-null int64
# 3 Maxpulse 32 non-null int64
# 4 Calories 30 non-null float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.4+ KB
# Remove duplicate rows from df itself. The summary below shows 31 of the 32
# rows remaining, i.e. one row was a duplicate; the empty cells are untouched.
df.drop_duplicates(inplace = True)
df.info()
# <class 'pandas.core.frame.DataFrame'>
# Index: 31 entries, 0 to 31
# Data columns (total 5 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 Duration 31 non-null int64
# 1 Date 30 non-null object
# 2 Pulse 31 non-null int64
# 3 Maxpulse 31 non-null int64
# 4 Calories 29 non-null float64
# dtypes: float64(1), int64(3), object(1)
# memory usage: 1.5+ KB