ML Titanic

import seaborn as sns
# Load the 'titanic' dataset through seaborn and keep it in titanic_data,
# the name the later cells read from.
titanic_data = sns.load_dataset('titanic')

import seaborn as sns

# Fetch the titanic dataset through seaborn
titanic_data = sns.load_dataset('titanic')

print("Titanic Data")


# First list every column name the dataset offers ...
print(titanic_data.columns)
# ... then render only the hand-picked columns of interest, all rows.
display(
    titanic_data.loc[
        :,
        [
            'survived', 'pclass', 'sex', 'age', 'sibsp',
            'parch', 'class', 'fare', 'embark_town', 'alone',
        ],
    ]
)

Titanic Data
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

	survived	pclass	sex	age	sibsp	parch	class	fare	embark_town	alone
0	0	3	male	22.0	1	0	Third	7.2500	Southampton	False
1	1	1	female	38.0	1	0	First	71.2833	Cherbourg	False
2	1	3	female	26.0	0	0	Third	7.9250	Southampton	True
3	1	1	female	35.0	1	0	First	53.1000	Southampton	False
4	0	3	male	35.0	0	0	Third	8.0500	Southampton	True
...	...	...	...	...	...	...	...	...	...	...
886	0	2	male	27.0	0	0	Second	13.0000	Southampton	True
887	1	1	female	19.0	0	0	First	30.0000	Southampton	True
888	0	3	female	NaN	1	2	Third	23.4500	Southampton	False
889	1	1	male	26.0	0	0	First	30.0000	Cherbourg	True
890	0	3	male	32.0	0	0	Third	7.7500	Queenstown	True

891 rows × 10 columns

import pandas as pd
# Preprocess the data
from sklearn.preprocessing import OneHotEncoder

# Build the model-ready frame `td` from a derived copy of titanic_data.
# Aliasing titanic_data and dropping columns in place would mutate the source
# frame, so a second run of this cell would fail with KeyError on the columns
# that are already gone.
unused_columns = ['alive', 'who', 'adult_male', 'class', 'embark_town', 'deck']
td = (
    titanic_data
    .drop(columns=unused_columns)  # remove columns not used as features
    .dropna()                      # drop rows with at least one missing value
    .copy()
)
td['sex'] = (td['sex'] == 'male').astype(int)  # 'male' -> 1, anything else -> 0
td['alone'] = td['alone'].astype(int)          # True -> 1, False -> 0

# Encode categorical variables
enc = OneHotEncoder(handle_unknown='ignore')
onehot = enc.fit_transform(td[['embarked']]).toarray()
cols = ['embarked_' + val for val in enc.categories_[0]]
# Give the one-hot frame td's own index. dropna() leaves gaps in td's index,
# and column assignment aligns on index labels: a default 0..n-1 index would
# attach one-hot rows to the wrong passengers and leave NaN where no label
# matches, which the dropna() below would then silently delete.
td[cols] = pd.DataFrame(onehot, columns=cols, index=td.index)
td.drop(['embarked'], axis=1, inplace=True)
td.dropna(inplace=True)  # safety net: no row should still contain a missing value here

print(td.columns)
display(td)

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'alone',
       'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

	survived	pclass	sex	age	sibsp	parch	fare	alone	embarked_C	embarked_Q	embarked_S
0	0	3	1	22.0	1	0	7.2500	0	0.0	0.0	1.0
1	1	1	0	38.0	1	0	71.2833	0	1.0	0.0	0.0
2	1	3	0	26.0	0	0	7.9250	1	0.0	0.0	1.0
3	1	1	0	35.0	1	0	53.1000	0	0.0	0.0	1.0
4	0	3	1	35.0	0	0	8.0500	1	0.0	0.0	1.0
...	...	...	...	...	...	...	...	...	...	...	...
705	0	2	1	39.0	0	0	26.0000	1	0.0	0.0	1.0
706	1	2	0	45.0	0	0	13.5000	1	0.0	0.0	1.0
707	1	1	1	42.0	0	0	26.2875	1	0.0	1.0	0.0
708	1	1	0	22.0	0	0	151.5500	1	0.0	0.0	1.0
710	1	1	0	24.0	0	0	49.5042	1	1.0	0.0	0.0

564 rows × 11 columns

import pandas as pd
# Preprocess the data
from sklearn.preprocessing import OneHotEncoder

# Build the model-ready frame `td` from a freshly loaded dataset.
# titanic_data cannot be relied on here: an earlier cell drops these same
# columns from it in place, which makes a plain drop raise
# KeyError "... not found in axis". Reloading keeps this cell re-runnable.
unused_columns = ['alive', 'who', 'adult_male', 'class', 'embark_town', 'deck']
td = (
    sns.load_dataset('titanic')
    .drop(columns=unused_columns)  # remove columns not used as features
    .dropna()                      # drop rows with at least one missing value
)
td['sex'] = (td['sex'] == 'male').astype(int)  # 'male' -> 1, anything else -> 0
td['alone'] = td['alone'].astype(int)          # True -> 1, False -> 0

# Encode categorical variables
enc = OneHotEncoder(handle_unknown='ignore')
onehot = enc.fit_transform(td[['embarked']]).toarray()
cols = ['embarked_' + val for val in enc.categories_[0]]
# Give the one-hot frame td's own index. dropna() leaves gaps in td's index,
# and column assignment aligns on index labels: a default 0..n-1 index would
# attach one-hot rows to the wrong passengers and leave NaN where no label
# matches, which the dropna() below would then silently delete.
td[cols] = pd.DataFrame(onehot, columns=cols, index=td.index)
td.drop(['embarked'], axis=1, inplace=True)
td.dropna(inplace=True)  # safety net: no row should still contain a missing value here

print(td.columns)
display(td)

---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

Cell In[15], line 6
from sklearn.preprocessing import OneHotEncoder
td = titanic_data
----> 6 td.drop(['alive', 'who', 'adult_male', 'class', 'embark_town', 'deck'], axis=1, inplace=True)
td.dropna(inplace=True) # drop rows with at least one missing value, after dropping unuseful columns
td['sex'] = td['sex'].apply(lambda x: 1 if x == 'male' else 0)


File ~/nighthawk/dakshaggCSP_2025/venv/lib/python3.12/site-packages/pandas/core/frame.py:5581, in DataFrame.drop(self, labels, axis, index, columns, level, inplace, errors)
def drop(
   self,
   labels: IndexLabel | None = None,
   (...)
   errors: IgnoreRaise = "raise",
) -> DataFrame | None:
   """
   Drop specified labels from rows or columns.

   (...)
           weight  1.0     0.8
   """
-> 5581     return super().drop(
       labels=labels,
       axis=axis,
       index=index,
       columns=columns,
       level=level,
       inplace=inplace,
       errors=errors,
   )


File ~/nighthawk/dakshaggCSP_2025/venv/lib/python3.12/site-packages/pandas/core/generic.py:4788, in NDFrame.drop(self, labels, axis, index, columns, level, inplace, errors)
for axis, labels in axes.items():
   if labels is not None:
-> 4788         obj = obj._drop_axis(labels, axis, level=level, errors=errors)
if inplace:
   self._update_inplace(obj)


File ~/nighthawk/dakshaggCSP_2025/venv/lib/python3.12/site-packages/pandas/core/generic.py:4830, in NDFrame._drop_axis(self, labels, axis, level, errors, only_slice)
       new_axis = axis.drop(labels, level=level, errors=errors)
   else:
-> 4830         new_axis = axis.drop(labels, errors=errors)
   indexer = axis.get_indexer(new_axis)
# Case for non-unique axis
else:


File ~/nighthawk/dakshaggCSP_2025/venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:7070, in Index.drop(self, labels, errors)
if mask.any():
   if errors != "ignore":
-> 7070         raise KeyError(f"{labels[mask].tolist()} not found in axis")
   indexer = indexer[~mask]
return self.delete(indexer)


KeyError: "['alive', 'who', 'adult_male', 'class', 'embark_town', 'deck'] not found in axis"