Skip to content

binary_classification_titanic.ipynb

import os
import urllib.request

# Remote repository root that hosts the dataset files
download_url = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"

# Local folder for the Titanic CSVs; create it up front (no error if it already exists)
titanic_path = os.path.join("datasets", "titanic")
os.makedirs(titanic_path, exist_ok=True)

def fetch_data(url, path):
    """Download ``train.csv`` and ``test.csv`` from *url* into *path*.

    A file that already exists in *path* is left untouched, so repeated
    calls only fetch what is missing.

    Args:
        url: Base URL of the remote folder. The file name is appended
            directly, so it must end with a ``/``.
        path: Local directory that receives the files. It is created
            if it does not exist yet.

    Raises:
        OSError: If the directory cannot be created or a download fails
            (``urllib.error.URLError`` and ``ContentTooShortError`` are
            subclasses of ``OSError``).
    """
    os.makedirs(path, exist_ok=True)
    for filename in ("train.csv", "test.csv"):
        target = os.path.join(path, filename)
        if os.path.isfile(target):
            continue
        # Download next to the final location first: an interrupted or
        # truncated transfer must never leave a file that the isfile()
        # check above would mistake for a complete download.
        partial = target + ".part"
        try:
            urllib.request.urlretrieve(url + filename, partial)
        except BaseException:
            if os.path.exists(partial):
                os.remove(partial)
            raise
        os.replace(partial, target)

# Fetch train.csv and test.csv from the repository's datasets/titanic/ folder
# into titanic_path; fetch_data skips any file that is already on disk.
fetch_data(download_url + "datasets/titanic/", titanic_path)

Data loading

# Read the two downloaded CSV files into DataFrames
import pandas as pd

# Training set: PassengerId becomes the row index right after reading
train_df = pd.read_csv(os.path.join(titanic_path, "train.csv")).set_index("PassengerId")

# Test set: loaded as-is (PassengerId stays a regular column here)
test_df = pd.read_csv(os.path.join(titanic_path, "test.csv"))

# Print column names, non-null counts and dtypes of the training set
train_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
train_df.describe()

SurvivedPclassAgeSibSpParchFare
count891.000000891.000000714.000000891.000000891.000000891.000000
mean0.3838382.30864229.6991130.5230080.38159432.204208
std0.4865920.83607114.5265071.1027430.80605749.693429
min0.0000001.0000000.4167000.0000000.0000000.000000
25%0.0000002.00000020.1250000.0000000.0000007.910400
50%0.0000003.00000028.0000000.0000000.00000014.454200
75%1.0000003.00000038.0000001.0000000.00000031.000000
max1.0000003.00000080.0000008.0000006.000000512.329200

Pre-processing

Numerical attributes

# Numerical preprocessing: impute missing values, then standardise
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline(
    steps=[
        # Missing entries are replaced by the column median
        ("imputer", SimpleImputer(strategy="median")),
        # Features are then standardised with StandardScaler
        ("std_scaler", StandardScaler()),
    ]
)

# Demo on the Age column: first three raw values vs. their transformed values
print("Before: ", train_df["Age"].iloc[:3])
print("After: ", num_pipeline.fit_transform(train_df[["Age"]])[:3, 0])
Before:  PassengerId
1    22.0
2    38.0
3    26.0
Name: Age, dtype: float64
After:  [-0.56573582  0.6638609  -0.25833664]

Categorical attributes

# Categorical preprocessing: impute with the mode, then one-hot encode
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline(
    steps=[
        # Missing entries take the most frequent value of the column
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # Every category becomes its own 0/1 indicator column
        ("cat_encoder", OneHotEncoder()),
    ]
)

# Demo on the Embarked column: first three raw labels vs. their one-hot rows
print("Before: ", train_df["Embarked"].iloc[:3])
print("After: ", cat_pipeline.fit_transform(train_df[["Embarked"]])[:3].toarray())
Before:  PassengerId
1    S
2    C
3    S
Name: Embarked, dtype: object
After:  [[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]

Common pipeline

# Combined preprocessing: route each column group to its own pipeline
from sklearn.compose import ColumnTransformer

# Columns handled by the numerical and the categorical pipeline respectively
num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attrib = ["Pclass", "Sex", "Embarked"]

preprocess_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attrib),
    ]
)

# Demo: the first passenger as a raw row and as a transformed feature vector
print("Before: ", train_df.iloc[:1])
print("After: ", preprocess_pipeline.fit_transform(train_df)[0])
Before:               Survived  Pclass                     Name   Sex   Age  SibSp  \
PassengerId                                                                 
1                   0       3  Braund, Mr. Owen Harris  male  22.0      1   

             Parch     Ticket  Fare Cabin Embarked  
PassengerId                                         
1                0  A/5 21171  7.25   NaN        S  
After:  [-0.56573582  0.43279337 -0.47367361 -0.50244517  0.          0.
  1.          0.          1.          0.          0.          1.        ]
# Target vector: the Survived label of every training passenger
y_train = train_df["Survived"]

# Feature matrix: fit the preprocessing on the selected columns and transform them
X_train = preprocess_pipeline.fit_transform(train_df[[*num_attribs, *cat_attrib]])

Model

Random Forest Classifier

# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# 100 trees with a pinned random_state (presumably so reruns give the same
# forest); trained on the preprocessed features X_train and labels y_train.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=2910)
forest_clf.fit(X_train, y_train)
RandomForestClassifier(random_state=2910)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Evaluate the model
from sklearn.model_selection import cross_val_score

scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
print("Scores: ", scores)
Scores:  [0.77777778 0.7752809  0.76404494 0.83146067 0.87640449 0.84269663
 0.80898876 0.7752809  0.83146067 0.84269663]