import os
import urllib.request
# Download the data
titanic_path = os.path.join("datasets", "titanic")
os.makedirs(titanic_path, exist_ok=True)
download_url = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"


def fetch_data(url, path):
    """Download ``train.csv`` and ``test.csv`` from *url* into *path*.

    A file is only fetched when it is not already present in *path*, so
    calling this function again does not download the same file twice.

    Args:
        url: Base URL; each file name is appended to it directly, so it
            must end with a slash.
        path: Existing local directory that receives the files.
    """
    for filename in ["train.csv", "test.csv"]:
        destination = os.path.join(path, filename)
        # Skip files that were already downloaded by a previous run.
        if not os.path.isfile(destination):
            urllib.request.urlretrieve(url + filename, destination)


fetch_data(download_url + "datasets/titanic/", titanic_path)
# Data loading
# Load the data
import pandas as pd

train_df = pd.read_csv(os.path.join(titanic_path, "train.csv"))
test_df = pd.read_csv(os.path.join(titanic_path, "test.csv"))

# Set the id as PassengerId
train_df = train_df.set_index("PassengerId")

# Print a summary of the training frame (columns, non-null counts, dtypes).
train_df.info()
# Pipeline for preprocessing the numerical attributes
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),  # Fill missing values with the median
    ('std_scaler', StandardScaler()),  # Scale the data
])

# Show examples on the data
print("Before: ", train_df["Age"].head(3))
print("After: ", num_pipeline.fit_transform(train_df[["Age"]]).flatten()[:3])
# Pipeline for preprocessing the categorical attributes
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),  # Fill missing values with the most frequent value
    ('cat_encoder', OneHotEncoder()),  # Encode the data using one-hot encoding
])

# Show examples on the data
print("Before: ", train_df["Embarked"].head(3))
# The one-hot encoder output is converted with toarray() so it prints as a dense array.
print("After: ", cat_pipeline.fit_transform(train_df[["Embarked"]]).toarray()[:3])
Before: PassengerId
1 S
2 C
3 S
Name: Embarked, dtype: object
After: [[0. 0. 1.]
[1. 0. 0.]
[0. 0. 1.]]
# Common pipeline
# Pipeline for preprocessing the data
from sklearn.compose import ColumnTransformer

num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attrib = ["Pclass", "Sex", "Embarked"]

# Route the numerical columns through num_pipeline and the categorical
# columns through cat_pipeline.
preprocess_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attrib),
])

# Show examples on the data
print("Before: ", train_df.head(1))
print("After: ", preprocess_pipeline.fit_transform(train_df)[0])
Before: Survived Pclass Name Sex Age SibSp \
PassengerId
1 0 3 Braund, Mr. Owen Harris male 22.0 1
Parch Ticket Fare Cabin Embarked
PassengerId
1 0 A/5 21171 7.25 NaN S
After: [-0.56573582 0.43279337 -0.47367361 -0.50244517 0. 0.
1. 0. 1. 0. 0. 1. ]