binary_classification_titanic.ipynb

import os
import urllib.request

# Remote repository root that hosts the raw dataset files
download_url = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"

# Local directory where the Titanic CSV files are stored; create it up front
# so the download step has somewhere to write to
titanic_path = os.path.join("datasets", "titanic")
os.makedirs(titanic_path, exist_ok=True)

def fetch_data(url, path, files=("train.csv", "test.csv")):
    """Download dataset files that are not already present locally.

    Args:
        url: Base URL of the dataset. Each file name is appended to it
            verbatim, so it must end with "/".
        path: Local directory the files are saved into. It is created if
            it does not exist yet.
        files: Names of the files to fetch. Defaults to the Titanic
            train/test CSV files.

    Files that already exist in ``path`` are left untouched, so calling the
    function repeatedly does not download anything twice.

    Raises:
        urllib.error.URLError: If a file cannot be retrieved (propagated
            from ``urllib.request.urlretrieve``).
    """
    os.makedirs(path, exist_ok=True)
    for filename in files:
        target = os.path.join(path, filename)
        if os.path.isfile(target):
            continue
        # Download under a temporary name and rename afterwards: an
        # interrupted transfer must never leave a truncated file that a
        # later run would mistake for a complete download.
        partial = target + ".part"
        try:
            urllib.request.urlretrieve(url + filename, partial)
            os.replace(partial, target)
        finally:
            # Nothing to remove after a successful rename; on failure this
            # cleans up whatever was written so far.
            if os.path.exists(partial):
                os.remove(partial)

# Fetch the Titanic CSV files from the repository's dataset folder
fetch_data(url=download_url + "datasets/titanic/", path=titanic_path)

Data loading

# Load the data
import pandas as pd

# Read the training and test splits from the local dataset directory
train_df, test_df = (
    pd.read_csv(os.path.join(titanic_path, split))
    for split in ("train.csv", "test.csv")
)

# Index the training rows by PassengerId (test_df keeps its default index),
# then print the column dtypes and non-null counts
train_df = train_df.set_index(keys="PassengerId")
train_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
# Show summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
train_df.describe()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

Pre-processing

Numerical attributes

# Pipeline for preprocessing the numerical attributes
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Two steps: fill missing values with the column median, then standardize
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler()),
    ]
)

# Demonstrate the pipeline on the first three "Age" values
print("Before: ", train_df["Age"].iloc[:3])
print("After: ", num_pipeline.fit_transform(train_df[["Age"]]).ravel()[:3])
Before:  PassengerId
1    22.0
2    38.0
3    26.0
Name: Age, dtype: float64
After:  [-0.56573582  0.6638609  -0.25833664]

Categorical attributes

# Pipeline for preprocessing the categorical attributes
from sklearn.preprocessing import OneHotEncoder

# Two steps: fill missing values with the most frequent category, then
# one-hot encode the result
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("cat_encoder", OneHotEncoder()),
    ]
)

# Demonstrate the pipeline on the first three "Embarked" values
print("Before: ", train_df["Embarked"].iloc[:3])
print("After: ", cat_pipeline.fit_transform(train_df[["Embarked"]]).toarray()[:3])
Before:  PassengerId
1    S
2    C
3    S
Name: Embarked, dtype: object
After:  [[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]

Common pipeline

# Pipeline for preprocessing the data
from sklearn.compose import ColumnTransformer

# Columns routed to the numerical and to the categorical pipeline
num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attrib = ["Pclass", "Sex", "Embarked"]

# Apply each sub-pipeline to its own columns; columns that are not listed
# are dropped (ColumnTransformer's default for the remainder)
preprocess_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attrib),
    ]
)

# Demonstrate the combined preprocessing on the first passenger
print("Before: ", train_df.iloc[:1])
print("After: ", preprocess_pipeline.fit_transform(train_df)[0])
Before:               Survived  Pclass                     Name   Sex   Age  SibSp  \
PassengerId                                                                 
1                   0       3  Braund, Mr. Owen Harris  male  22.0      1   

             Parch     Ticket  Fare Cabin Embarked  
PassengerId                                         
1                0  A/5 21171  7.25   NaN        S  
After:  [-0.56573582  0.43279337 -0.47367361 -0.50244517  0.          0.
  1.          0.          1.          0.          0.          1.        ]
# Feature matrix: fit the preprocessing on the selected columns and transform them
X_train = preprocess_pipeline.fit_transform(train_df[[*num_attribs, *cat_attrib]])
# Target vector: the "Survived" column
y_train = train_df.loc[:, "Survived"]

Model

Random Forest Classifier

# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# 100 trees; the fixed seed keeps results reproducible between runs
forest_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=2910,
)
forest_clf.fit(X=X_train, y=y_train)
# Evaluate the model
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the forest on the preprocessed training set.
# NOTE(review): X_train was produced by fitting the preprocessing on the whole
# training set, so imputation/scaling statistics are shared across folds.
scores = cross_val_score(
    estimator=forest_clf,
    X=X_train,
    y=y_train,
    cv=10,
)
print("Scores: ", scores)
Scores:  [0.77777778 0.7752809  0.76404494 0.83146067 0.87640449 0.84269663
 0.80898876 0.7752809  0.83146067 0.84269663]