I am working on video 5. After following along with the video, I tried to organize everything into a class. Here is what I have so far; it runs in my Jupyter notebook. Please give advice, as this is my first time trying to write data code in a more organized way.
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import tensor
from fastai.data.transforms import RandomSplitter
# print and exit, for debugging purposes
def pe(*args, **kwargs):
    print(*args, **kwargs)
    raise Exception("Stopped for check")
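# Bundles the whole Titanic workflow: download, data cleaning, a simple coefficients-only model, training, and submission output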
class TitanicProj:
    CAT_COLUMNS = 'Sex Pclass Embarked'.split()
    NUM_COLUMNS = 'Age SibSp Parch LogFare'.split()
    DEP_COLUMN = 'Survived'
    PROJ = 'titanic'
    TRAIN_FILE = 'train.csv'
    TEST_FILE = 'test.csv'
    def __init__(self):
        self.path = Path(self.PROJ)
        self.download_project(self.path)
        self.train_df = pd.read_csv(self.path / self.TRAIN_FILE)
        # the mode is the most common value (not the average);
        # it will be used for both the train and test sets
        self.modes = self.train_df.mode().iloc[0]
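    # download and unzip the Kaggle competition data if it isn't already there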
    def download_project(self, path):
        if not path.exists():
            import zipfile, kaggle
            kaggle.api.competition_download_cli(str(path))
            zipfile.ZipFile(f'{path}.zip').extractall(path)
    # clean up a dataframe, return a new dataframe
    def clean_data(self, df_original):
        df = df_original.copy()
        # Fill all missing data with mode values
        df.fillna(self.modes, inplace=True)
        # Fare can be large, so take the log to bring the big values down
        df['LogFare'] = np.log(df['Fare']+1)
        # create one-hot encodings and add them to the df
        df, dummy_columns = self.get_dummy(df, self.CAT_COLUMNS)
        # final independent vars: num cols + one-hot-encoded cat cols
        df = df[self.NUM_COLUMNS + dummy_columns]
        return df
    # return the augmented df and the dummy column names
    def get_dummy(self, df_original, columns):
        df = df_original.copy()
        columns_before_dummies = df.columns.tolist()
        df_with_dummies = pd.get_dummies(df, columns=columns)
        columns_after_dummies = df_with_dummies.columns.tolist()
        dummy_columns = list(set(columns_after_dummies) - set(columns_before_dummies))
        df[dummy_columns] = df_with_dummies[dummy_columns].astype(float)
        return df, dummy_columns
    # return random initial coeffs, one per independent variable
    def gen_coeffs(self, df):
        size = df.shape[1]
        return (torch.rand(size) - 0.5).requires_grad_()
    # normalize the passed-in tensor by each column's max, assuming all values are float already.
    # note this returns a new tensor; it does not modify the input in place
    def normalize(self, t):
        vals, indices = t.max(dim=0)
        t = t / vals
        return t
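    # predictions: sigmoid of the weighted sum of the independent variables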
    def calc_preds(self, t_coeffs, t_indeps):
        return torch.sigmoid((t_coeffs*t_indeps).sum(axis=1))
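    # loss: mean absolute error between predictions and the actual dependent values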
    def calc_loss(self, t_coeffs, t_indeps, t_deps):
        return torch.abs(self.calc_preds(t_coeffs, t_indeps) - t_deps).mean()
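    # split independents/dependents into training and validation sets (fastai's RandomSplitter uses 20% for validation by default)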
    def train_valid_split(self, t_indeps, t_deps):
        trn_split, val_split = RandomSplitter(seed=42)(t_deps)
        return t_indeps[trn_split], t_indeps[val_split], t_deps[trn_split], t_deps[val_split]
    # Init the model, so that train_model can be called multiple times.
    def init_model(self):
        torch.manual_seed(442)
        clean_df = self.clean_data(self.train_df)
        self.clean_columns = clean_df.columns.tolist()
        self.t_indeps = tensor(clean_df.values, dtype=torch.float)
        self.t_indeps = self.normalize(self.t_indeps)
        self.t_deps = tensor(self.train_df[self.DEP_COLUMN].astype(float))
        self.t_coeffs = self.gen_coeffs(self.t_indeps)
        self.trn_indep, self.val_indep, self.trn_dep, self.val_dep = (
            self.train_valid_split(self.t_indeps, self.t_deps)
        )
    # Each call to train_model trains the model further
    def train_model(self, epochs=30, lr=0.01):
        for i in range(epochs):
            self.one_epoch(lr=lr)
        return self.show_coeffs()
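    # map each independent-variable column name to its learned coefficient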
    def show_coeffs(self):
        return dict(zip(self.clean_columns, self.t_coeffs.clone().detach().tolist()))
    # One epoch does 3 things:
    # 1. calc loss
    # 2. backward
    # 3. update coeffs
    def one_epoch(self, lr):
        loss = self.calc_loss(self.t_coeffs, self.trn_indep, self.trn_dep)
        loss.backward()
        with torch.no_grad():
            self.t_coeffs.sub_(self.t_coeffs.grad * lr)
            self.t_coeffs.grad.zero_()
        print(f"{loss:.3f}", end="; ")
    # accuracy on the validation set
    def acc(self):
        preds = self.calc_preds(self.t_coeffs, self.val_indep)
        return (self.val_dep.bool() == (preds > 0.5)).float().mean()
    # write predictions for test.csv to an output file
    def gen_test_result(self, output_csv):
        test_df = pd.read_csv(self.path / self.TEST_FILE)
        clean_df = self.clean_data(test_df)
        t_indeps = tensor(clean_df.values, dtype=torch.float)
        t_indeps = self.normalize(t_indeps)
        preds = self.calc_preds(self.t_coeffs, t_indeps)
        test_df['Survived'] = (preds > 0.5).int()
        sub_df = test_df[['PassengerId', 'Survived']]
        sub_df.to_csv(output_csv, index=False)
proj = TitanicProj()
proj.init_model()
proj.train_model(100, lr=0.3)
proj.train_model(40, lr=50)
print(f"\naccuracy: {proj.acc()}")
print(f"coeffs: {proj.show_coeffs()!r}")
output_csv = 'my_titanic_result.csv'
proj.gen_test_result(output_csv)
print(f"test result successfully written to: {output_csv}")
Here are the results:
0.575; 0.570; 0.564; 0.559; 0.553; 0.547; 0.541; 0.535; 0.529; 0.523; 0.517; 0.511; 0.504; 0.498; 0.492; 0.486; 0.480; 0.474; 0.468; 0.463; 0.458; 0.452; 0.448; 0.443; 0.438; 0.434; 0.430; 0.426; 0.422; 0.418; 0.415; 0.411; 0.408; 0.405; 0.402; 0.399; 0.397; 0.394; 0.392; 0.389; 0.387; 0.385; 0.383; 0.380; 0.378; 0.376; 0.375; 0.373; 0.371; 0.369; 0.368; 0.366; 0.364; 0.363; 0.361; 0.360; 0.358; 0.357; 0.355; 0.354; 0.352; 0.351; 0.350; 0.348; 0.347; 0.346; 0.345; 0.343; 0.342; 0.341; 0.340; 0.338; 0.337; 0.336; 0.335; 0.334; 0.333; 0.331; 0.330; 0.329; 0.328; 0.327; 0.326; 0.325; 0.324; 0.323; 0.322; 0.321; 0.320; 0.319; 0.318; 0.317; 0.316; 0.315; 0.314; 0.313; 0.312; 0.311; 0.310; 0.309; 0.308; 0.231; 0.222; 0.219; 0.216; 0.214; 0.212; 0.210; 0.209; 0.208; 0.207; 0.206; 0.205; 0.204; 0.203; 0.203; 0.202; 0.202; 0.201; 0.201; 0.201; 0.200; 0.200; 0.200; 0.200; 0.199; 0.199; 0.199; 0.199; 0.199; 0.198; 0.198; 0.198; 0.198; 0.198; 0.198; 0.197; 0.197; 0.197; 0.197;
accuracy: 0.8258426785469055
coeffs: {'Age': -1.243803858757019, 'SibSp': -1.6929728984832764, 'Parch': -0.9451086521148682, 'LogFare': 0.0632515624165535, 'Embarked_C': 1.0449289083480835, 'Pclass_2': 2.296539545059204, 'Pclass_3': -4.138270378112793, 'Embarked_Q': 2.134453535079956, 'Pclass_1': 2.275932788848877, 'Sex_female': 5.694452285766602, 'Sex_male': -5.941898822784424, 'Embarked_S': -2.785123348236084}
test result successfully written to: my_titanic_result.csv