Predicting whether a student will be admitted¶
In [33]:
import pandas as pd
import numpy as np
In [34]:
# Load the raw admissions dataset from the working directory.
# NOTE(review): presumably the UCLA graduate-admissions dataset
# (admit, gre, gpa, rank) — confirm provenance and record it here.
admission = pd.read_csv( "admission.csv" )
In [35]:
# Preview the first few rows to sanity-check the load.
admission.head()
Out[35]:
In [36]:
# Give the columns descriptive names; "ranking" is the prestige tier of the
# applicant's college.  NOTE(review): this assumes the CSV columns arrive in
# exactly this order — verify against the file header.
admission.columns = ["admit", "gre", "gpa", "ranking" ]
Does the ranking of the college impact admissions, and by how much?¶
In [37]:
# Counts of admitted (1) vs not admitted (0) within each ranking tier.
pd.crosstab( admission.admit, admission.ranking )
Out[37]:
In [38]:
# Proportion of admits within each ranking tier.  pd.crosstab supports this
# directly via normalize="columns" (each column divided by its sum), which
# replaces the manual .apply(lambda x: x/x.sum(), axis=0) with the built-in
# and produces an identical result.
admit_by_rankings = pd.crosstab( admission.admit, admission.ranking, normalize = "columns" )
In [39]:
# Display the per-ranking admit proportions (each column sums to 1.0).
admit_by_rankings
Out[39]:
In [40]:
# Reshape the crosstab to long/tidy form: one row per (ranking, admit) pair,
# which is the layout seaborn's barplot expects below.
admit_by_rankings = pd.DataFrame( admit_by_rankings.unstack() ).reset_index()
In [41]:
# Inspect the reshaped frame; the unnamed third column holds the proportions.
admit_by_rankings
Out[41]:
In [42]:
# NOTE(review): "total" actually holds a within-ranking proportion (0-1),
# not a count — a name like "proportion" would be clearer, but the name is
# kept because the plotting cell below references .total.
admit_by_rankings.columns = ["ranking", "admit", "total" ]
In [43]:
import matplotlib as plt
import seaborn as sn
%matplotlib inline
In [44]:
# Grouped bar chart: admit proportion per ranking tier, split by admit class.
# Use explicit keyword arguments — passing x/y positionally to seaborn
# axes-level functions was deprecated in seaborn 0.12 and later removed.
# Trailing ';' suppresses the Axes repr in the output.
sn.barplot( x = "ranking", y = "total", hue = "admit", data = admit_by_rankings );
Out[44]:
Are the mean GRE and GPA scores different for students who were admitted and those who were not?¶
In [45]:
# GRE scores of applicants who were NOT admitted (admit == 0).
gre_0 = admission[admission.admit == 0]["gre"]
In [46]:
# Distribution of GRE scores for the non-admitted group.
# NOTE(review): sn.distplot was deprecated in seaborn 0.11 and later removed;
# prefer sn.histplot(gre_0, kde=True) when upgrading seaborn.
sn.distplot( gre_0 )
Out[46]:
In [47]:
# GRE scores of applicants who WERE admitted (admit == 1).
gre_1 = admission[admission.admit == 1]["gre"]
In [ ]:
# Distribution of GRE scores for the admitted group, for visual comparison
# with the non-admitted distribution above.  (distplot is deprecated —
# see the note on the earlier cell.)
sn.distplot( gre_1 )
In [ ]:
# Side-by-side GRE distributions for not-admitted vs admitted.
# Fix: FacetGrid's `size` argument was renamed to `height` in seaborn 0.9
# and the old name was later removed.
g = sn.FacetGrid(admission, col="admit", height = 10)
g.map(sn.distplot, "gre")
In [ ]:
# Side-by-side GPA distributions for not-admitted vs admitted.
# Fix: FacetGrid's `size` argument was renamed to `height` in seaborn 0.9
# and the old name was later removed.
g = sn.FacetGrid(admission, col="admit", height = 10)
g.map(sn.distplot, "gpa")
In [51]:
# Pairwise view of gre vs gpa (scatter off-diagonal, distribution on-diagonal).
# Fix: pairplot's `size` argument was renamed to `height` in seaborn 0.9 and
# the old name was later removed.  Trailing ';' suppresses the PairGrid repr.
sn.pairplot( admission[["gre", "gpa"]], height = 4 );
Out[51]:
In [ ]:
# GRE distribution per ranking tier, one row of facets per tier.
# Fix: FacetGrid's `size` argument was renamed to `height` in seaborn 0.9
# and the old name was later removed.
g = sn.FacetGrid(admission, row="ranking", height = 6)
g.map(sn.distplot, "gre")
In [53]:
# Compact comparison of GRE by ranking tier in a single boxplot figure.
# Use explicit keyword arguments — positional x/y were deprecated in
# seaborn 0.12 and later removed.  ';' suppresses the Axes repr.
sn.boxplot( x = "ranking", y = "gre", data = admission );
Out[53]:
Building a Classification Model¶
Convert the categorical variables into dummy variables¶
In [54]:
def create_dummies( df, colname ):
    """Replace categorical column `colname` in `df` with k-1 dummy columns.

    The first dummy level is dropped to avoid perfect multicollinearity
    (the "dummy variable trap") in the downstream regression model.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified — a new frame is returned.
    colname : str
        Name of the categorical column to encode; also used as the dummy
        column prefix (e.g. "ranking" -> "ranking_2", "ranking_3", ...).

    Returns
    -------
    pd.DataFrame
        Copy of `df` with `colname` removed and the dummy columns appended.
    """
    # drop_first=True drops the first level directly, replacing the original
    # manual drop of col_dummies.columns[0] and avoiding inplace mutation.
    col_dummies = pd.get_dummies( df[colname], prefix = colname, drop_first = True )
    # Build the result in one expression: original columns minus the encoded
    # one, followed by the dummies — same column order as before.
    return pd.concat( [df.drop( colname, axis = 1 ), col_dummies], axis = 1 )
In [55]:
# Encode the ranking column as dummy variables; ranking_1 is dropped inside
# create_dummies and serves as the baseline category.
admission_new = create_dummies( admission, "ranking" )
In [56]:
# Verify the dummy-encoded frame looks as expected.
admission_new.head()
Out[56]:
In [57]:
from sklearn.linear_model import LogisticRegression
In [58]:
# Logistic regression with scikit-learn defaults.
# NOTE(review): the default includes L2 regularization (C=1.0), which shrinks
# the coefficients — confirm this is intended before interpreting them as
# unregularized logit effects.
logreg = LogisticRegression()
In [59]:
# List the available columns to pick model features from.
admission_new.columns
Out[59]:
In [60]:
# Model features: raw scores plus the ranking dummies (ranking_1 = baseline).
feature_cols = ['gre', 'gpa', 'ranking_2', 'ranking_3', 'ranking_4']
In [61]:
# Fit on the full dataset.
# NOTE(review): there is no train/test split, so every metric computed below
# is an in-sample (training) figure; the cross-validation section at the end
# is the only out-of-sample estimate in this notebook.
logreg.fit( admission_new[feature_cols], admission_new.admit )
Out[61]:
In [62]:
# Pair each feature name with its fitted coefficient.
# Bug fix: logreg.coef_ has shape (1, n_features) for binary classification,
# so zipping feature_cols against it directly produced a single bogus
# (first_feature, whole_row) pair; index row 0 to get per-feature values.
list( zip( feature_cols, logreg.coef_[0] ) )
Out[62]:
In [63]:
# Raw coefficient matrix, shape (1, n_features) for the binary case.
logreg.coef_
Out[63]:
In [64]:
# Model intercept: the log-odds of admission when all features are zero.
logreg.intercept_
Out[64]:
In [65]:
# Hard 0/1 predictions on the training data (0.5 probability threshold).
admission_new["predicted_class"] = logreg.predict( admission_new[feature_cols] )
In [66]:
# Append the class-membership probabilities from predict_proba (one column
# per class, in the order of logreg.classes_).  Improvements over the
# original: the columns are named explicitly instead of the ambiguous
# integer labels 0/1, and the index is aligned with admission_new so the
# concat cannot silently misalign rows.
proba = pd.DataFrame( logreg.predict_proba( admission_new[feature_cols] ),
                      columns = ["prob_0", "prob_1"],
                      index = admission_new.index )
admission_new = pd.concat( [admission_new, proba], axis = 1 )
In [67]:
# Inspect actuals, predicted classes, and predicted probabilities side by side.
admission_new.head()
Out[67]:
Evaluating the model - Confusion Matrix¶
In [70]:
from sklearn import metrics
In [71]:
# Confusion matrix on the training data (rows: actual class, cols: predicted).
cm = metrics.confusion_matrix( admission_new.admit, admission_new.predicted_class )
In [72]:
# Raw confusion-matrix counts.
cm
Out[72]:
In [73]:
# Render the confusion matrix as an annotated heatmap.
# Fix: the cells are integer counts, so format annotations with 'd'; the
# original '.2f' displayed counts as floats (e.g. "253.00").
sn.heatmap(cm, annot=True, fmt='d' );
Accuracy Score¶
In [77]:
# Training-set accuracy: fraction of predictions matching the actual class.
score = metrics.accuracy_score( admission_new.admit, admission_new.predicted_class )
In [79]:
# Display the training accuracy.
score
Out[79]:
In [80]:
# Area under the ROC curve on the training data.
# Bug fix: roc_auc_score expects a continuous score — here the predicted
# probability of the positive class — not hard 0/1 labels.  Feeding labels
# collapses the ROC curve to a single operating point and understates AUC.
auc_score = metrics.roc_auc_score( admission_new.admit,
                                   logreg.predict_proba( admission_new[feature_cols] )[:, 1] )
In [81]:
# Display the AUC.
auc_score
Out[81]:
In [82]:
# ROC curve points (false-positive rate, true-positive rate per threshold).
# Bug fix: roc_curve needs the positive-class probability as the score;
# with hard 0/1 labels there is only one real threshold, so the "curve"
# degenerates to three points.
fpr, tpr, _ = metrics.roc_curve( admission_new.admit,
                                 logreg.predict_proba( admission_new[feature_cols] )[:, 1] )
In [83]:
# Plot the ROC curve with the chance diagonal for reference.
# NOTE(review): matplotlib.pyplot is imported here under a second alias
# (pyplt) mid-notebook; consolidate plotting imports at the top when tidying.
import matplotlib.pyplot as pyplt
pyplt.figure()
pyplt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
# Dashed diagonal: the no-skill classifier (TPR == FPR at every threshold).
pyplt.plot([0, 1], [0, 1], 'k--')
pyplt.xlim([0.0, 1.0])
# Slight y headroom so the top of the curve is not clipped by the frame.
pyplt.ylim([0.0, 1.05])
pyplt.xlabel('False Positive Rate')
pyplt.ylabel('True Positive Rate')
pyplt.title('Receiver operating characteristic example')
pyplt.legend(loc="lower right")
pyplt.show()
In [91]:
# Per-class precision / recall / F1 on the training data.
print( metrics.classification_report( admission_new.admit, admission_new.predicted_class ) )
Cross validating the model¶
In [87]:
# Bug fix: the sklearn.cross_validation module was deprecated in
# scikit-learn 0.18 and removed in 0.20; cross_val_score now lives in
# sklearn.model_selection.
from sklearn.model_selection import cross_val_score
In [88]:
# 10-fold cross-validated accuracy for a fresh (unfitted) model.
# This is the only out-of-sample performance estimate in the notebook, since
# all the metrics above were computed on the training data.
logreg = LogisticRegression()
X = admission_new[feature_cols]
y = admission_new.admit
scores = cross_val_score(logreg, X, y, cv=10, scoring='accuracy')
In [89]:
# Accuracy for each of the 10 folds.
scores
Out[89]:
In [90]:
# Mean cross-validated accuracy — the headline performance number.
scores.mean()
Out[90]:
Comments