import seaborn as sns
# Load the Titanic passenger table through seaborn's dataset loader.
titanic_data = sns.load_dataset(name="titanic")
import seaborn as sns
# Pull the Titanic passenger table via seaborn's dataset loader
titanic_data = sns.load_dataset('titanic')

print("Titanic Data")
print(titanic_data.columns)  # every column name present in the loaded frame

# Show only a hand-picked subset of columns (all rows) for a first look
display(
    titanic_data.loc[
        :,
        [
            'survived',
            'pclass',
            'sex',
            'age',
            'sibsp',
            'parch',
            'class',
            'fare',
            'embark_town',
            'alone',
        ],
    ]
)
Titanic Data
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
'alive', 'alone'],
dtype='object')
| survived | pclass | sex | age | sibsp | parch | class | fare | embark_town | alone | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | Third | 7.2500 | Southampton | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | First | 71.2833 | Cherbourg | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | Third | 7.9250 | Southampton | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | First | 53.1000 | Southampton | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | Third | 8.0500 | Southampton | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 0 | 2 | male | 27.0 | 0 | 0 | Second | 13.0000 | Southampton | True |
| 887 | 1 | 1 | female | 19.0 | 0 | 0 | First | 30.0000 | Southampton | True |
| 888 | 0 | 3 | female | NaN | 1 | 2 | Third | 23.4500 | Southampton | False |
| 889 | 1 | 1 | male | 26.0 | 0 | 0 | First | 30.0000 | Cherbourg | True |
| 890 | 0 | 3 | male | 32.0 | 0 | 0 | Third | 7.7500 | Queenstown | True |
891 rows × 10 columns
import pandas as pd
# Preprocess the data
from sklearn.preprocessing import OneHotEncoder
# Work on a copy so the shared `titanic_data` frame is not modified in place.
# With a plain alias (`td = titanic_data`) the in-place drops below would strip
# the columns from `titanic_data` itself, and running this cell a second time
# would raise KeyError because the columns are already gone.
td = titanic_data.copy()
td.drop(['alive', 'who', 'adult_male', 'class', 'embark_town', 'deck'], axis=1, inplace=True)
td.dropna(inplace=True)  # drop rows with at least one missing value, after dropping unused columns

# Binary-encode the two-valued columns: 'male' -> 1, anything else -> 0;
# alone True -> 1, False -> 0.
td['sex'] = (td['sex'] == 'male').astype(int)
td['alone'] = td['alone'].astype(int)

# Encode categorical variables: one indicator column per 'embarked' category
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(td[['embarked']])
onehot = enc.transform(td[['embarked']]).toarray()
cols = ['embarked_' + val for val in enc.categories_[0]]

# `onehot` is positional (one row per remaining row of td), but dropna() above
# left gaps in td's index labels. Column assignment aligns on index labels, so
# the encoded frame must reuse td's index; a default 0..n-1 index would attach
# encodings to the wrong passengers and leave NaN for labels beyond n-1.
td[cols] = pd.DataFrame(onehot, columns=cols, index=td.index)
td.drop(['embarked'], axis=1, inplace=True)
td.dropna(inplace=True)  # drop rows with at least one missing value, after preparing the data

print(td.columns)
display(td)
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'alone',
'embarked_C', 'embarked_Q', 'embarked_S'],
dtype='object')
| survived | pclass | sex | age | sibsp | parch | fare | alone | embarked_C | embarked_Q | embarked_S | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 22.0 | 1 | 0 | 7.2500 | 0 | 0.0 | 0.0 | 1.0 |
| 1 | 1 | 1 | 0 | 38.0 | 1 | 0 | 71.2833 | 0 | 1.0 | 0.0 | 0.0 |
| 2 | 1 | 3 | 0 | 26.0 | 0 | 0 | 7.9250 | 1 | 0.0 | 0.0 | 1.0 |
| 3 | 1 | 1 | 0 | 35.0 | 1 | 0 | 53.1000 | 0 | 0.0 | 0.0 | 1.0 |
| 4 | 0 | 3 | 1 | 35.0 | 0 | 0 | 8.0500 | 1 | 0.0 | 0.0 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 705 | 0 | 2 | 1 | 39.0 | 0 | 0 | 26.0000 | 1 | 0.0 | 0.0 | 1.0 |
| 706 | 1 | 2 | 0 | 45.0 | 0 | 0 | 13.5000 | 1 | 0.0 | 0.0 | 1.0 |
| 707 | 1 | 1 | 1 | 42.0 | 0 | 0 | 26.2875 | 1 | 0.0 | 1.0 | 0.0 |
| 708 | 1 | 1 | 0 | 22.0 | 0 | 0 | 151.5500 | 1 | 0.0 | 0.0 | 1.0 |
| 710 | 1 | 1 | 0 | 24.0 | 0 | 0 | 49.5042 | 1 | 1.0 | 0.0 | 0.0 |
564 rows × 11 columns
import pandas as pd
# Preprocess the data
from sklearn.preprocessing import OneHotEncoder
# Start from a freshly loaded frame instead of aliasing `titanic_data`.
# An earlier run of this preprocessing may already have dropped these columns
# from `titanic_data` in place, in which case dropping them again raises
# KeyError ("... not found in axis"). A fresh load makes this cell re-runnable
# regardless of what happened to `titanic_data` before.
td = sns.load_dataset('titanic')
td.drop(['alive', 'who', 'adult_male', 'class', 'embark_town', 'deck'], axis=1, inplace=True)
td.dropna(inplace=True)  # drop rows with at least one missing value, after dropping unused columns

# Binary-encode the two-valued columns: 'male' -> 1, anything else -> 0;
# alone True -> 1, False -> 0.
td['sex'] = (td['sex'] == 'male').astype(int)
td['alone'] = td['alone'].astype(int)

# Encode categorical variables: one indicator column per 'embarked' category
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(td[['embarked']])
onehot = enc.transform(td[['embarked']]).toarray()
cols = ['embarked_' + val for val in enc.categories_[0]]

# `onehot` is positional, while td's index has gaps after dropna(). Reuse
# td's index so each encoded row is attached to the passenger it came from;
# a default 0..n-1 index would misalign rows and introduce NaN values.
td[cols] = pd.DataFrame(onehot, columns=cols, index=td.index)
td.drop(['embarked'], axis=1, inplace=True)
td.dropna(inplace=True)  # drop rows with at least one missing value, after preparing the data

print(td.columns)
display(td)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Cell In[15], line 6
3 from sklearn.preprocessing import OneHotEncoder
5 td = titanic_data
----> 6 td.drop(['alive', 'who', 'adult_male', 'class', 'embark_town', 'deck'], axis=1, inplace=True)
7 td.dropna(inplace=True) # drop rows with at least one missing value, after dropping unuseful columns
8 td['sex'] = td['sex'].apply(lambda x: 1 if x == 'male' else 0)
File ~/nighthawk/dakshaggCSP_2025/venv/lib/python3.12/site-packages/pandas/core/frame.py:5581, in DataFrame.drop(self, labels, axis, index, columns, level, inplace, errors)
5433 def drop(
5434 self,
5435 labels: IndexLabel | None = None,
(...)
5442 errors: IgnoreRaise = "raise",
5443 ) -> DataFrame | None:
5444 """
5445 Drop specified labels from rows or columns.
5446
(...)
5579 weight 1.0 0.8
5580 """
-> 5581 return super().drop(
5582 labels=labels,
5583 axis=axis,
5584 index=index,
5585 columns=columns,
5586 level=level,
5587 inplace=inplace,
5588 errors=errors,
5589 )
File ~/nighthawk/dakshaggCSP_2025/venv/lib/python3.12/site-packages/pandas/core/generic.py:4788, in NDFrame.drop(self, labels, axis, index, columns, level, inplace, errors)
4786 for axis, labels in axes.items():
4787 if labels is not None:
-> 4788 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
4790 if inplace:
4791 self._update_inplace(obj)
File ~/nighthawk/dakshaggCSP_2025/venv/lib/python3.12/site-packages/pandas/core/generic.py:4830, in NDFrame._drop_axis(self, labels, axis, level, errors, only_slice)
4828 new_axis = axis.drop(labels, level=level, errors=errors)
4829 else:
-> 4830 new_axis = axis.drop(labels, errors=errors)
4831 indexer = axis.get_indexer(new_axis)
4833 # Case for non-unique axis
4834 else:
File ~/nighthawk/dakshaggCSP_2025/venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:7070, in Index.drop(self, labels, errors)
7068 if mask.any():
7069 if errors != "ignore":
-> 7070 raise KeyError(f"{labels[mask].tolist()} not found in axis")
7071 indexer = indexer[~mask]
7072 return self.delete(indexer)
KeyError: "['alive', 'who', 'adult_male', 'class', 'embark_town', 'deck'] not found in axis"