import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Load the student-performance CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(
    'D:/data analysis/StudentsPerformance using python/StudentsPerformance.csv'
)
data.head()
| | gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score |
---|---|---|---|---|---|---|---|---|
0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
1 | female | group C | some college | standard | completed | 69 | 90 | 88 |
2 | female | group B | master's degree | standard | none | 90 | 95 | 93 |
3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 |
4 | male | group C | some college | standard | none | 76 | 78 | 75 |
# Summarise only the object-dtype columns: count, number of unique values,
# most frequent value (top) and how often it occurs (freq).
data.describe(include=['object'])
| | gender | race/ethnicity | parental level of education | lunch | test preparation course |
---|---|---|---|---|---|
count | 1000 | 1000 | 1000 | 1000 | 1000 |
unique | 2 | 5 | 6 | 2 | 2 |
top | female | group C | some college | standard | none |
freq | 518 | 319 | 226 | 645 | 642 |
# (number of rows, number of columns) of the DataFrame.
data.shape
(1000, 8)
# Print the index range, per-column non-null counts and dtypes, and memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 1000 non-null object 1 race/ethnicity 1000 non-null object 2 parental level of education 1000 non-null object 3 lunch 1000 non-null object 4 test preparation course 1000 non-null object 5 math score 1000 non-null int64 6 reading score 1000 non-null int64 7 writing score 1000 non-null int64 dtypes: int64(3), object(5) memory usage: 62.6+ KB
# Count missing values in each column (isna is the canonical name of isnull).
data.isna().sum()
gender 0 race/ethnicity 0 parental level of education 0 lunch 0 test preparation course 0 math score 0 reading score 0 writing score 0 dtype: int64
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()
0
# Pie chart of the gender split, labelling each wedge with its percentage share.
data['gender'].value_counts().plot.pie(autopct='%.1f%%', shadow=True)
<AxesSubplot:ylabel='gender'>
# Count students for every (race/ethnicity, gender) pair and order the rows
# by race/ethnicity group.
GenderByGroup = (
    data[['race/ethnicity', 'gender']]
    .value_counts()
    # Name the counts column explicitly: the default label is 0 on pandas 1.x
    # but 'count' on pandas 2.x, so renaming from 0 is version-dependent.
    .to_frame(name='count')
    # A stable sort keeps value_counts' descending-count order inside each group.
    .sort_values(by='race/ethnicity', kind='mergesort')
)
GenderByGroup
race/ethnicity | gender | count |
---|---|---|
group A | male | 53 |
group A | female | 36 |
group B | female | 104 |
group B | male | 86 |
group C | female | 180 |
group C | male | 139 |
group D | male | 133 |
group D | female | 129 |
group E | male | 71 |
group E | female | 69 |
# Save the per-group gender counts to CSV; with the default index=True the
# index levels are written as the leading columns.
GenderByGroup.to_csv('D:/data analysis/StudentsPerformance using python/GenderByGroup.csv')
# Bar chart of student counts per test-preparation status, split by gender.
sns.countplot(
    x='test preparation course',
    hue='gender',
    palette='cool',
    data=data,
)
<AxesSubplot:xlabel='test preparation course', ylabel='count'>
# Frequency of each parental education level, most common first.
LevelOfEducation = data['parental level of education'].value_counts()
LevelOfEducation
some college 226 associate's degree 222 high school 196 some high school 179 bachelor's degree 118 master's degree 59 Name: parental level of education, dtype: int64
# Wide bar chart of parental education levels with the count written at the
# top of each bar.
plt.figure(figsize=(20, 5))
sns.barplot(x=LevelOfEducation.index, y=LevelOfEducation.values, palette='CMRmap_r')
# Bars sit at x = 0, 1, 2, ... in index order, so the bar position doubles as
# the text's x coordinate; the count is both the y coordinate and the label.
for position, count in enumerate(LevelOfEducation.values):
    plt.text(position, count, count, fontsize=20)
# Print the DataFrame summary (index range, non-null counts, dtypes, memory usage).
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 1000 non-null object 1 race/ethnicity 1000 non-null object 2 parental level of education 1000 non-null object 3 lunch 1000 non-null object 4 test preparation course 1000 non-null object 5 math score 1000 non-null int64 6 reading score 1000 non-null int64 7 writing score 1000 non-null int64 dtypes: int64(3), object(5) memory usage: 62.6+ KB
# Minimum, maximum and mean of each of the three score columns.
data[['math score','reading score','writing score']].agg(['min','max','mean'])
| | math score | reading score | writing score |
---|---|---|---|
min | 0.000 | 17.000 | 10.000 |
max | 100.000 | 100.000 | 100.000 |
mean | 66.089 | 69.169 | 68.054 |
# Plotly histograms of the math-score and writing-score distributions.
px.histogram(data_frame=data, x='math score')
px.histogram(data_frame=data, x='writing score')
# Add a per-student total: the sum of the math, writing and reading scores.
data['total'] = (
    data['math score']
    .add(data['writing score'])
    .add(data['reading score'])
)
data
| | gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score | total |
---|---|---|---|---|---|---|---|---|---|
0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 | 218 |
1 | female | group C | some college | standard | completed | 69 | 90 | 88 | 247 |
2 | female | group B | master's degree | standard | none | 90 | 95 | 93 | 278 |
3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 | 148 |
4 | male | group C | some college | standard | none | 76 | 78 | 75 | 229 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
995 | female | group E | master's degree | standard | completed | 88 | 99 | 95 | 282 |
996 | male | group C | high school | free/reduced | none | 62 | 55 | 55 | 172 |
997 | female | group C | high school | free/reduced | completed | 59 | 71 | 65 | 195 |
998 | female | group D | some college | standard | completed | 68 | 78 | 77 | 223 |
999 | female | group D | some college | free/reduced | none | 77 | 86 | 86 | 249 |
1000 rows × 9 columns
# The ten students with the highest total score, best first.
data.nlargest(n=10, columns='total')
| | gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score | total |
---|---|---|---|---|---|---|---|---|---|
458 | female | group E | bachelor's degree | standard | none | 100 | 100 | 100 | 300 |
916 | male | group E | bachelor's degree | standard | completed | 100 | 100 | 100 | 300 |
962 | female | group E | associate's degree | standard | none | 100 | 100 | 100 | 300 |
114 | female | group E | bachelor's degree | standard | completed | 99 | 100 | 100 | 299 |
179 | female | group D | some high school | standard | completed | 97 | 100 | 100 | 297 |
712 | female | group D | some college | standard | none | 98 | 100 | 99 | 297 |
165 | female | group C | bachelor's degree | standard | completed | 96 | 100 | 100 | 296 |
625 | male | group D | some college | standard | completed | 100 | 97 | 99 | 296 |
149 | male | group E | associate's degree | free/reduced | completed | 100 | 100 | 93 | 293 |
685 | female | group E | master's degree | standard | completed | 94 | 99 | 100 | 293 |
# The ten students with the lowest total score, worst first.
data.nsmallest(n=10, columns='total')
| | gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score | total |
---|---|---|---|---|---|---|---|---|---|
59 | female | group C | some high school | free/reduced | none | 0 | 17 | 10 | 27 |
980 | female | group B | high school | free/reduced | none | 8 | 24 | 23 | 55 |
596 | male | group B | high school | free/reduced | none | 30 | 24 | 15 | 69 |
327 | male | group A | some college | free/reduced | none | 28 | 23 | 19 | 70 |
17 | female | group B | some high school | free/reduced | none | 18 | 32 | 28 | 78 |
76 | male | group E | some high school | standard | none | 30 | 26 | 22 | 78 |
601 | female | group C | high school | standard | none | 29 | 29 | 30 | 88 |
338 | female | group B | some high school | free/reduced | none | 24 | 38 | 27 | 89 |
787 | female | group B | some college | standard | none | 19 | 38 | 32 | 89 |
211 | male | group C | some college | free/reduced | none | 35 | 28 | 27 | 90 |