In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
In [2]:
# Load the raw student-performance table and preview its first five rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
data = pd.read_csv(
    filepath_or_buffer='D:/data analysis/StudentsPerformance using python/StudentsPerformance.csv'
)
data.head()
Out[2]:
gender race/ethnicity parental level of education lunch test preparation course math score reading score writing score
0 female group B bachelor's degree standard none 72 72 74
1 female group C some college standard completed 69 90 88
2 female group B master's degree standard none 90 95 93
3 male group A associate's degree free/reduced none 47 57 44
4 male group C some college standard none 76 78 75
In [3]:
# Summarise only the object-dtype (text) columns: count, unique, top, freq.
data.describe(include=[object])
Out[3]:
gender race/ethnicity parental level of education lunch test preparation course
count 1000 1000 1000 1000 1000
unique 2 5 6 2 2
top female group C some college standard none
freq 518 319 226 645 642
In [4]:
# (rows, columns) of the loaded frame.
data.shape
Out[4]:
(1000, 8)
In [5]:
# Column names, non-null counts and dtypes of the loaded frame.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
In [6]:
# Number of missing values in each column.
data.isna().sum()
Out[6]:
gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64
In [7]:
# Number of rows that exactly repeat an earlier row (first occurrence is not counted).
data.duplicated(keep='first').sum()
Out[7]:
0
In [8]:
# Pie chart of the gender split, each slice labelled with its percentage.
data['gender'].value_counts().plot.pie(autopct='%.1f%%', shadow=True)
Out[8]:
<AxesSubplot:ylabel='gender'>
In [9]:
# Count students per (race/ethnicity, gender) pair and list them group by group.
GenderByGroup = (
    data[['race/ethnicity', 'gender']]
    .value_counts()
    # Name the column directly instead of renaming the default label 0:
    # pandas < 2.0 labels the to_frame() column 0, pandas >= 2.0 labels it
    # 'count', so a rename keyed on 0 only works on older versions.
    .to_frame(name='count')
    # value_counts() returns rows ordered by descending count; a stable sort
    # on the group level keeps that order within each group.
    .sort_values(by='race/ethnicity', kind='mergesort')
)
GenderByGroup
Out[9]:
count
race/ethnicity gender
group A male 53
female 36
group B female 104
male 86
group C female 180
male 139
group D male 133
female 129
group E male 71
female 69
In [10]:
# Persist the per-group gender counts next to the source data.
# NOTE(review): absolute, machine-specific output path.
GenderByGroup.to_csv(
    path_or_buf='D:/data analysis/StudentsPerformance using python/GenderByGroup.csv'
)
In [11]:
# Bar counts of test-preparation status, split by gender.
sns.countplot(
    x='test preparation course',
    hue='gender',
    palette='cool',
    data=data,
)
Out[11]:
<AxesSubplot:xlabel='test preparation course', ylabel='count'>
In [12]:
# Frequency of each parental education level, most common first.
parental_education = data['parental level of education']
LevelOfEducation = parental_education.value_counts()
LevelOfEducation
Out[12]:
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: parental level of education, dtype: int64
In [13]:
# Bar chart of parental education levels with the count printed above each bar.
fig, ax = plt.subplots(figsize=(20, 5))
sns.barplot(
    x=LevelOfEducation.index,
    y=LevelOfEducation.values,
    palette='CMRmap_r',
    ax=ax,
)
ax.set(xlabel='parental level of education', ylabel='count')
# Categorical bars sit at x = 0, 1, 2, ...; centre each label on its bar and
# anchor it just above the bar top so it does not drift to the right.
for position, count in enumerate(LevelOfEducation.values):
    ax.text(position, count, str(count), ha='center', va='bottom', fontsize=20)
In [14]:
# Same summary as the earlier data.info() cell; no cell in between modifies
# `data`, so this is a redundant re-check before the score analysis.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
In [15]:
# Minimum, maximum and mean of each of the three subject scores.
score_columns = ['math score', 'reading score', 'writing score']
data[score_columns].agg(['min', 'max', 'mean'])
Out[15]:
math score reading score writing score
min 0.000 17.000 10.000
max 100.000 100.000 100.000
mean 66.089 69.169 68.054
In [16]:
# Interactive distribution of math scores.
px.histogram(data_frame=data, x='math score')
In [17]:
# Interactive distribution of writing scores.
px.histogram(data_frame=data, x='writing score')
In [18]:
# Add a combined score: the row-wise sum of the three subject scores.
# This writes a new 'total' column onto `data` itself; later cells read it.
data['total'] = data[['math score', 'writing score', 'reading score']].sum(axis=1)
data
Out[18]:
gender race/ethnicity parental level of education lunch test preparation course math score reading score writing score total
0 female group B bachelor's degree standard none 72 72 74 218
1 female group C some college standard completed 69 90 88 247
2 female group B master's degree standard none 90 95 93 278
3 male group A associate's degree free/reduced none 47 57 44 148
4 male group C some college standard none 76 78 75 229
... ... ... ... ... ... ... ... ... ...
995 female group E master's degree standard completed 88 99 95 282
996 male group C high school free/reduced none 62 55 55 172
997 female group C high school free/reduced completed 59 71 65 195
998 female group D some college standard completed 68 78 77 223
999 female group D some college free/reduced none 77 86 86 249

1000 rows × 9 columns

In [19]:
# Ten students with the highest combined score (ties keep first-seen order).
data.nlargest(n=10, columns='total', keep='first')
Out[19]:
gender race/ethnicity parental level of education lunch test preparation course math score reading score writing score total
458 female group E bachelor's degree standard none 100 100 100 300
916 male group E bachelor's degree standard completed 100 100 100 300
962 female group E associate's degree standard none 100 100 100 300
114 female group E bachelor's degree standard completed 99 100 100 299
179 female group D some high school standard completed 97 100 100 297
712 female group D some college standard none 98 100 99 297
165 female group C bachelor's degree standard completed 96 100 100 296
625 male group D some college standard completed 100 97 99 296
149 male group E associate's degree free/reduced completed 100 100 93 293
685 female group E master's degree standard completed 94 99 100 293
In [20]:
# Ten students with the lowest combined score (ties keep first-seen order).
data.nsmallest(n=10, columns='total', keep='first')
Out[20]:
gender race/ethnicity parental level of education lunch test preparation course math score reading score writing score total
59 female group C some high school free/reduced none 0 17 10 27
980 female group B high school free/reduced none 8 24 23 55
596 male group B high school free/reduced none 30 24 15 69
327 male group A some college free/reduced none 28 23 19 70
17 female group B some high school free/reduced none 18 32 28 78
76 male group E some high school standard none 30 26 22 78
601 female group C high school standard none 29 29 30 88
338 female group B some high school free/reduced none 24 38 27 89
787 female group B some college standard none 19 38 32 89
211 male group C some college free/reduced none 35 28 27 90