Last Updated: January 25, 2021
·
51
· kalinin84

Python: Exploratory data analysis

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# --- Setup ------------------------------------------------------------------
# Apply the 'whitegrid' seaborn style to the figures drawn below.
sns.set_style('whitegrid')

# Read the CSV into a DataFrame; the path is relative to the working directory.
FILE_NAME = '../input/example.csv'
dataset = pd.read_csv(FILE_NAME)

# --- Quick look at the table -------------------------------------------------
# Bare expressions are kept on purpose: in a notebook each one renders its
# value when it is the last statement of a cell.
dataset.shape
dataset.info()
dataset.head(n=4)
dataset.tail(n=4)
dataset.sample(n=4)

# --- Summary statistics ------------------------------------------------------
dataset.describe()
dataset.corr()

# Per-class statistics: one grouping by the 'target' column serves both tables.
by_target = dataset.groupby('target')
by_target.median()
by_target.agg(['mean', 'std'])

# --- Plots -------------------------------------------------------------------
# One histogram per column.
dataset.hist(figsize=(10, 10), bins=20)
plt.show()

# Correlation matrix as an annotated heatmap with square cells.
sns.heatmap(data=dataset.corr(), annot=True, square=True)
plt.show()

# Pairwise plots of the columns, coloured by 'target'.
sns.pairplot(data=dataset, hue='target')
plt.show()

# Bivariate KDE of the 'alpha' and 'beta' columns, one density per 'target'.
sns.displot(data=dataset, x='alpha', y='beta', hue='target', kind='kde')
plt.show()
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
# --- Features / label --------------------------------------------------------
# Select the label by name rather than by position. The exploration code
# earlier in this file already addresses it as the 'target' column
# (groupby('target'), hue='target'), so a positional iloc[:, 0] would silently
# pick the wrong column -- and leave the real label among the features --
# whenever 'target' is not the first column of the CSV. When it is the first
# column, the result is identical to the positional split.
X = dataset.drop(columns='target')
target = dataset['target']

# Hold out part of the rows for the CatBoost evaluation below.
# NOTE(review): no random_state is passed, so the split (and every score
# derived from it) changes from run to run -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, target)

# --- Random forest baseline --------------------------------------------------
# Scored with 5-fold cross-validation on the full data set; the estimator
# itself is left unfitted after this call.
# NOTE(review): scoring='f1' presumably expects a binary target -- confirm.
model = RandomForestClassifier(n_estimators=2000, max_depth=150)
cross_val_score(model, X, target, cv=5, scoring='f1')

# --- CatBoost ----------------------------------------------------------------
# Fit on the training part with logging silenced, then evaluate on the
# held-out part.
cb = CatBoostClassifier(logging_level='Silent').fit(X_train, y_train)
y_predicted = cb.predict(X_test)
f1_score(y_test, y_predicted)
confusion_matrix(y_test, y_predicted)

# Importance of each feature, followed by the position (within X's columns)
# of the feature with the largest importance.
cb.feature_importances_
np.argmax(cb.feature_importances_)