binary_classification_titanic.ipynb

import os
import urllib.request

# Remote repository root and local cache folder for the Titanic dataset
download_url = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
titanic_path = os.path.join("datasets", "titanic")

# Create the cache folder up front; exist_ok makes re-running this cell harmless
os.makedirs(titanic_path, exist_ok=True)

def fetch_data(url, path, files=("train.csv", "test.csv")):
    """Download dataset files from ``url`` into ``path``, skipping cached ones.

    Args:
        url: Base URL the file names are appended to (must end with "/").
        path: Local directory that receives the files; created if missing.
        files: Names of the files to fetch. Defaults to the train/test CSVs.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when a download fails.
        In that case no file is left behind for the failed name, so the next
        call retries it instead of treating a truncated file as cached.
    """
    os.makedirs(path, exist_ok=True)
    for file in files:
        target = os.path.join(path, file)
        if os.path.isfile(target):
            continue  # already downloaded
        # Download next to the target and move into place only on success, so
        # an interrupted transfer never looks like a complete cached file.
        partial = target + ".part"
        try:
            urllib.request.urlretrieve(url + file, partial)
            os.replace(partial, target)
        finally:
            if os.path.exists(partial):
                os.remove(partial)

# Fetch the Titanic CSV files into the local cache folder
titanic_url = download_url + "datasets/titanic/"
fetch_data(titanic_url, titanic_path)

Data loading

# Load the data
import pandas as pd

# Use PassengerId as the row index of both frames, so the train and test sets
# share the same layout and the id is not left among the data columns.
train_df = pd.read_csv(os.path.join(titanic_path, "train.csv"), index_col="PassengerId")
test_df = pd.read_csv(os.path.join(titanic_path, "test.csv"), index_col="PassengerId")

# Column dtypes and non-null counts (shows which columns have missing values)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB

# Show summary statistics of the numerical columns
train_df.describe()

	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	0.383838	2.308642	29.699113	0.523008	0.381594	32.204208
std	0.486592	0.836071	14.526507	1.102743	0.806057	49.693429
min	0.000000	1.000000	0.416700	0.000000	0.000000	0.000000
25%	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

Pre-processing

Numerical attributes

# Preprocessing of the numerical attributes: impute first, then scale
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),  # missing values -> column median
    ("std_scaler", StandardScaler()),  # standardize the data
])

# Demonstrate the effect on the first three "Age" values
age_before = train_df["Age"].head(3)
age_after = num_pipeline.fit_transform(train_df[["Age"]]).flatten()[:3]
print("Before: ", age_before)
print("After: ", age_after)

Before:  PassengerId
1    22.0
2    38.0
3    26.0
Name: Age, dtype: float64
After:  [-0.56573582  0.6638609  -0.25833664]

Categorical attributes

# Preprocessing of the categorical attributes: impute first, then one-hot encode
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),  # missing values -> most frequent value
    ("cat_encoder", OneHotEncoder()),  # one column per category
])

# Demonstrate the effect on the first three "Embarked" values
embarked_before = train_df["Embarked"].head(3)
embarked_after = cat_pipeline.fit_transform(train_df[["Embarked"]]).toarray()[:3]
print("Before: ", embarked_before)
print("After: ", embarked_after)

Before:  PassengerId
1    S
2    C
3    S
Name: Embarked, dtype: object
After:  [[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]

Common pipeline

# Combined preprocessing: each column group goes through its own pipeline
from sklearn.compose import ColumnTransformer

num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attrib = ["Pclass", "Sex", "Embarked"]

preprocess_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attrib),
])

# Demonstrate the effect on the first passenger
first_row_raw = train_df.head(1)
first_row_prepared = preprocess_pipeline.fit_transform(train_df)[0]
print("Before: ", first_row_raw)
print("After: ", first_row_prepared)

Before:               Survived  Pclass                     Name   Sex   Age  SibSp  \
PassengerId                                                                 
1                   0       3  Braund, Mr. Owen Harris  male  22.0      1   

             Parch     Ticket  Fare Cabin Embarked  
PassengerId                                         
1                0  A/5 21171  7.25   NaN        S  
After:  [-0.56573582  0.43279337 -0.47367361 -0.50244517  0.          0.
  1.          0.          1.          0.          0.          1.        ]

# Training labels and the preprocessed feature matrix
y_train = train_df["Survived"]
feature_columns = num_attribs + cat_attrib
X_train = preprocess_pipeline.fit_transform(train_df[feature_columns])

Model

Random Forest Classifier

# Fit a random forest on the preprocessed training set
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(
    random_state=2910,  # fixed seed
    n_estimators=100,
)
forest_clf.fit(X_train, y_train)

RandomForestClassifier(random_state=2910)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

# Evaluate the model with 10-fold cross-validation
from sklearn.model_selection import cross_val_score

# Cross-validate preprocessing + classifier together on the raw columns, so the
# imputer, scaler and encoder are re-fitted on each training fold only.
# Scoring the already-transformed X_train would leak statistics of the
# held-out fold (medians, means, variances) into training.
# cross_val_score fits clones of the estimator, so forest_clf and
# preprocess_pipeline themselves are left untouched.
# NOTE: scores saved from earlier runs used the pre-transformed X_train;
# re-run this cell to refresh them.
eval_pipeline = Pipeline([
    ("preprocess", preprocess_pipeline),
    ("forest", forest_clf),
])
scores = cross_val_score(eval_pipeline, train_df[num_attribs + cat_attrib], y_train, cv=10)
print("Scores: ", scores)

Scores:  [0.77777778 0.7752809  0.76404494 0.83146067 0.87640449 0.84269663
 0.80898876 0.7752809  0.83146067 0.84269663]

image_clasification_online.ipynb