Skip to the content.

ML Titanic

sinkin

import seaborn as sns

# Load the titanic dataset once; the import and the load were previously
# repeated back to back, which only loaded the same data a second time.
titanic_data = sns.load_dataset('titanic')

print("Titanic Data")

# List every column available in the dataset
print(titanic_data.columns) # titanic data set

# Show only the columns of interest for this walkthrough
selected_columns = ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch',
                    'class', 'fare', 'embark_town', 'alone']
display(titanic_data[selected_columns]) # look at selected columns
Titanic Data
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')
survived pclass sex age sibsp parch class fare embark_town alone
0 0 3 male 22.0 1 0 Third 7.2500 Southampton False
1 1 1 female 38.0 1 0 First 71.2833 Cherbourg False
2 1 3 female 26.0 0 0 Third 7.9250 Southampton True
3 1 1 female 35.0 1 0 First 53.1000 Southampton False
4 0 3 male 35.0 0 0 Third 8.0500 Southampton True
... ... ... ... ... ... ... ... ... ... ...
886 0 2 male 27.0 0 0 Second 13.0000 Southampton True
887 1 1 female 19.0 0 0 First 30.0000 Southampton True
888 0 3 female NaN 1 2 Third 23.4500 Southampton False
889 1 1 male 26.0 0 0 First 30.0000 Cherbourg True
890 0 3 male 32.0 0 0 Third 7.7500 Queenstown True

891 rows × 10 columns

import pandas as pd
# Preprocess the data
from sklearn.preprocessing import OneHotEncoder

# Work on a copy instead of an alias: the in-place drops used to mutate
# titanic_data itself, so running this cell a second time raised
# KeyError ("... not found in axis") because the columns were already gone.
td = titanic_data.copy()
td = td.drop(columns=['alive', 'who', 'adult_male', 'class', 'embark_town', 'deck'])
td = td.dropna() # drop rows with at least one missing value, after dropping unuseful columns

# Binary encodings: male -> 1 / anything else -> 0, alone True -> 1 / False -> 0
td['sex'] = (td['sex'] == 'male').astype(int)
td['alone'] = td['alone'].astype(int)

# Encode categorical variables
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(td[['embarked']])
onehot = enc.transform(td[['embarked']]).toarray()
cols = ['embarked_' + val for val in enc.categories_[0]]
# Build the one-hot frame on td's own index. Column assignment aligns on index
# labels, and td's index has gaps after dropna(); a default 0..n-1 index put
# the values on the wrong rows and left NaN in the tail rows, which the final
# dropna() then silently discarded.
td[cols] = pd.DataFrame(onehot, columns=cols, index=td.index)
td = td.drop(columns=['embarked'])
td = td.dropna() # drop rows with at least one missing value, after preparing the data

print(td.columns)
display(td)
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'alone',
       'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')
survived pclass sex age sibsp parch fare alone embarked_C embarked_Q embarked_S
0 0 3 1 22.0 1 0 7.2500 0 0.0 0.0 1.0
1 1 1 0 38.0 1 0 71.2833 0 1.0 0.0 0.0
2 1 3 0 26.0 0 0 7.9250 1 0.0 0.0 1.0
3 1 1 0 35.0 1 0 53.1000 0 0.0 0.0 1.0
4 0 3 1 35.0 0 0 8.0500 1 0.0 0.0 1.0
... ... ... ... ... ... ... ... ... ... ... ...
705 0 2 1 39.0 0 0 26.0000 1 0.0 0.0 1.0
706 1 2 0 45.0 0 0 13.5000 1 0.0 0.0 1.0
707 1 1 1 42.0 0 0 26.2875 1 0.0 1.0 0.0
708 1 1 0 22.0 0 0 151.5500 1 0.0 0.0 1.0
710 1 1 0 24.0 0 0 49.5042 1 1.0 0.0 0.0

564 rows × 11 columns

import pandas as pd
import seaborn as sns
# Preprocess the data
from sklearn.preprocessing import OneHotEncoder

# Start from a freshly loaded dataset so this cell is idempotent. Aliasing the
# shared titanic_data and dropping columns in place made every re-run fail with
# KeyError ("... not found in axis") because the columns were already removed.
td = sns.load_dataset('titanic')
td = td.drop(columns=['alive', 'who', 'adult_male', 'class', 'embark_town', 'deck'])
td = td.dropna() # drop rows with at least one missing value, after dropping unuseful columns

# Binary encodings: male -> 1 / anything else -> 0, alone True -> 1 / False -> 0
td['sex'] = (td['sex'] == 'male').astype(int)
td['alone'] = td['alone'].astype(int)

# Encode categorical variables
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(td[['embarked']])
onehot = enc.transform(td[['embarked']]).toarray()
cols = ['embarked_' + val for val in enc.categories_[0]]
# Keep td's index on the one-hot frame: assignment aligns on index labels, and
# td's index is no longer 0..n-1 after dropna(), so a default index would
# attach values to the wrong rows and introduce NaN.
td[cols] = pd.DataFrame(onehot, columns=cols, index=td.index)
td = td.drop(columns=['embarked'])
td = td.dropna() # drop rows with at least one missing value, after preparing the data

print(td.columns)
display(td)
---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

Cell In[15], line 6
      3 from sklearn.preprocessing import OneHotEncoder
      5 td = titanic_data
----> 6 td.drop(['alive', 'who', 'adult_male', 'class', 'embark_town', 'deck'], axis=1, inplace=True)
      7 td.dropna(inplace=True) # drop rows with at least one missing value, after dropping unuseful columns
      8 td['sex'] = td['sex'].apply(lambda x: 1 if x == 'male' else 0)


File ~/nighthawk/dakshaggCSP_2025/venv/lib/python3.12/site-packages/pandas/core/frame.py:5581, in DataFrame.drop(self, labels, axis, index, columns, level, inplace, errors)
   5433 def drop(
   5434     self,
   5435     labels: IndexLabel | None = None,
   (...)
   5442     errors: IgnoreRaise = "raise",
   5443 ) -> DataFrame | None:
   5444     """
   5445     Drop specified labels from rows or columns.
   5446 
   (...)
   5579             weight  1.0     0.8
   5580     """
-> 5581     return super().drop(
   5582         labels=labels,
   5583         axis=axis,
   5584         index=index,
   5585         columns=columns,
   5586         level=level,
   5587         inplace=inplace,
   5588         errors=errors,
   5589     )


File ~/nighthawk/dakshaggCSP_2025/venv/lib/python3.12/site-packages/pandas/core/generic.py:4788, in NDFrame.drop(self, labels, axis, index, columns, level, inplace, errors)
   4786 for axis, labels in axes.items():
   4787     if labels is not None:
-> 4788         obj = obj._drop_axis(labels, axis, level=level, errors=errors)
   4790 if inplace:
   4791     self._update_inplace(obj)


File ~/nighthawk/dakshaggCSP_2025/venv/lib/python3.12/site-packages/pandas/core/generic.py:4830, in NDFrame._drop_axis(self, labels, axis, level, errors, only_slice)
   4828         new_axis = axis.drop(labels, level=level, errors=errors)
   4829     else:
-> 4830         new_axis = axis.drop(labels, errors=errors)
   4831     indexer = axis.get_indexer(new_axis)
   4833 # Case for non-unique axis
   4834 else:


File ~/nighthawk/dakshaggCSP_2025/venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:7070, in Index.drop(self, labels, errors)
   7068 if mask.any():
   7069     if errors != "ignore":
-> 7070         raise KeyError(f"{labels[mask].tolist()} not found in axis")
   7071     indexer = indexer[~mask]
   7072 return self.delete(indexer)


KeyError: "['alive', 'who', 'adult_male', 'class', 'embark_town', 'deck'] not found in axis"