binary_classification_titanic.ipynb
import os
import urllib.request
# Remote location of the dataset and the local directory it is stored in
download_url = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
titanic_path = os.path.join("datasets", "titanic")
# Make sure the local dataset directory exists before anything is written to it
os.makedirs(titanic_path, exist_ok=True)
def fetch_data(url, path, files=("train.csv", "test.csv")):
    """Download dataset files from ``url`` into ``path``, skipping existing ones.

    Args:
        url: Base URL. Each file name is appended to it directly, so it must
            include the trailing slash.
        path: Local directory that receives the files; created if missing.
        files: Names of the files to fetch. Defaults to the ``train.csv`` /
            ``test.csv`` pair.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when a download fails
        (for example ``urllib.error.URLError``).
    """
    # Create the target directory here so the function also works on its own.
    os.makedirs(path, exist_ok=True)
    for name in files:
        target = os.path.join(path, name)
        if os.path.isfile(target):
            continue  # Already present locally; do not download it again.
        # Download under a temporary name first: an interrupted transfer must
        # not leave a truncated file that the isfile() check would later accept.
        partial = target + ".part"
        try:
            urllib.request.urlretrieve(url + name, partial)
            os.replace(partial, target)
        finally:
            # After a successful rename the partial file is gone; this only
            # cleans up after a failed or interrupted download.
            if os.path.exists(partial):
                os.remove(partial)
fetch_data(download_url + "datasets/titanic/", titanic_path)
Data loading
import pandas as pd

# Read the training CSV from titanic_path and index it by PassengerId
train_df = pd.read_csv(os.path.join(titanic_path, "train.csv")).set_index("PassengerId")
# The test CSV is read as-is and keeps its default index
test_df = pd.read_csv(os.path.join(titanic_path, "test.csv"))
train_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Survived 891 non-null int64
1 Pclass 891 non-null int64
2 Name 891 non-null object
3 Sex 891 non-null object
4 Age 714 non-null float64
5 SibSp 891 non-null int64
6 Parch 891 non-null int64
7 Ticket 891 non-null object
8 Fare 891 non-null float64
9 Cabin 204 non-null object
10 Embarked 889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
# Show info about the data
train_df.describe()

| | Survived | Pclass | Age | SibSp | Parch | Fare |
|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 0.383838 | 2.308642 | 29.699113 | 0.523008 | 0.381594 | 32.204208 |
| std | 0.486592 | 0.836071 | 14.526507 | 1.102743 | 0.806057 | 49.693429 |
| min | 0.000000 | 1.000000 | 0.416700 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
Pre-processing
Numerical attributes
# Numerical attributes: impute missing values, then scale
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline(
    steps=[
        # Replace missing values with the column median
        ('imputer', SimpleImputer(strategy="median")),
        # Scale the imputed values with StandardScaler
        ('std_scaler', StandardScaler()),
    ]
)

# Print the first Age values so they can be compared with the transformed ones
print("Before: ", train_df["Age"].head(3))
print("After: ", num_pipeline.fit_transform(train_df[["Age"]]).flatten()[:3])
Before: PassengerId
1 22.0
2 38.0
3 26.0
Name: Age, dtype: float64
After: [-0.56573582 0.6638609 -0.25833664]
Categorical attributes
# Categorical attributes: impute missing values, then one-hot encode
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline(
    steps=[
        # Replace missing values with the most frequent value of the column
        ('imputer', SimpleImputer(strategy="most_frequent")),
        # Turn each category into its own indicator column
        ('cat_encoder', OneHotEncoder()),
    ]
)

# Print the first Embarked values so they can be compared with the encoded ones
print("Before: ", train_df["Embarked"].head(3))
print("After: ", cat_pipeline.fit_transform(train_df[["Embarked"]]).toarray()[:3])
Before: PassengerId
1 S
2 C
3 S
Name: Embarked, dtype: object
After: [[0. 0. 1.]
[1. 0. 0.]
[0. 0. 1.]]
Common pipeline
# Combined preprocessing: send each group of columns to its own pipeline
from sklearn.compose import ColumnTransformer

num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attrib = ["Pclass", "Sex", "Embarked"]

preprocess_pipeline = ColumnTransformer(
    transformers=[
        # Numerical columns go through imputation and scaling
        ("num", num_pipeline, num_attribs),
        # Categorical columns go through imputation and one-hot encoding
        ("cat", cat_pipeline, cat_attrib),
    ]
)

# Print the first raw row so it can be compared with its transformed version
print("Before: ", train_df.head(1))
print("After: ", preprocess_pipeline.fit_transform(train_df)[0])
Before: Survived Pclass Name Sex Age SibSp \
PassengerId
1 0 3 Braund, Mr. Owen Harris male 22.0 1
Parch Ticket Fare Cabin Embarked
PassengerId
1 0 A/5 21171 7.25 NaN S
After: [-0.56573582 0.43279337 -0.47367361 -0.50244517 0. 0.
1. 0. 1. 0. 0. 1. ]
# Fit the preprocessing on the selected feature columns and build the training matrix
X_train = preprocess_pipeline.fit_transform(train_df[[*num_attribs, *cat_attrib]])
y_train = train_df["Survived"]
Model
Random Forest Classifier
# Model: a random forest of 100 trees with a fixed random_state
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state=2910, n_estimators=100)
forest_clf.fit(X_train, y_train)
RandomForestClassifier(random_state=2910)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=2910)
# Score the model with 10-fold cross-validation
from sklearn.model_selection import cross_val_score

# NOTE(review): X_train was produced by fitting the preprocessing on every
# training row, so the rows used for validation in each fold already influenced
# the imputation and scaling statistics. Cross-validating a single Pipeline
# (preprocessing + classifier) on the raw columns would avoid that.
scores = cross_val_score(estimator=forest_clf, X=X_train, y=y_train, cv=10)
print("Scores: ", scores)
Scores: [0.77777778 0.7752809 0.76404494 0.83146067 0.87640449 0.84269663
0.80898876 0.7752809 0.83146067 0.84269663]