Thursday, October 22, 2015

Data Management and Visualization - Wesleyan University (Coursera) - Week 4 - Ammar Shigri

CODE

import pandas
import numpy
import seaborn
import matplotlib.pyplot as plt

def _plain_float(value):
    """Render a float in fixed-point notation with six decimals."""
    return '%f' % value


# Use the fixed-point formatter for every float that pandas displays.
pandas.set_option('display.float_format', _plain_float)

# Load the CSV dataset. low_memory=False tells pandas not to parse the file
# in chunks, which avoids mixed-type inference within a column.
data = pandas.read_csv('20151009gap.csv', low_memory=False)

n_rows, n_cols = data.shape
print(n_rows)    # number of observations (rows)
print(n_cols)    # number of variables (columns)




# Coerce the analysis columns to a numeric dtype; entries that cannot be
# parsed as numbers become NaN.
# pandas.to_numeric(errors='coerce') is the replacement for
# Series.convert_objects(convert_numeric=True), which was deprecated and
# later removed from pandas.
for column in ('polityscore', 'suicideper100th', 'employrate',
               'incomeperperson', 'co2emissions'):
    data[column] = pandas.to_numeric(data[column], errors='coerce')


# Work on a copy (sub5) so the frame loaded into `data` is left untouched.
sub5 = data.copy()

# Fill missing records (NaN) with the mean of their column, so that no rows
# are lost to missing values. This is done one column at a time and only for
# the numeric columns used in the analysis.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: the in-place form acts on an
# intermediate object (chained assignment) and is not guaranteed to update
# sub5 itself.
for column in ('polityscore', 'suicideper100th', 'employrate',
               'incomeperperson', 'co2emissions'):
    sub5[column] = sub5[column].fillna(sub5[column].mean())



# Categorize the quantitative polityscore variable with customized splits
# using the cut function, stored in a new variable polity4 (4 groups).
# Bins are closed on the right, so without include_lowest=True a score of
# exactly -10 (the lowest edge) would fall outside every bin and be dropped
# from the frequency tables as NaN. With it, the first interval is shown
# with a slightly extended left edge so that -10 is counted.
sub5['polity4'] = pandas.cut(sub5.polityscore, [-10, -5, 0, 5, 10],
                             include_lowest=True)
f1 = sub5['polity4'].value_counts(sort=False)                  # counts per bin
f2 = sub5['polity4'].value_counts(sort=False, normalize=True)  # proportions per bin

print ('\n\n Polity Score divided into 4 parts, frequesty and percentage of each is given \n\n')
print(f1)
print(f2)


# Split suicideper100th into quartiles (qcut with 4 bins) and tabulate the
# bins, first as raw counts and then as proportions.
suicide_labels = ["1=0%tile", "2=25%tile", "3=50%tile", "4=75%tile"]
sub5['suicide4'] = pandas.qcut(sub5['suicideper100th'], q=4, labels=suicide_labels)

suicide_bins = sub5['suicide4']
f3 = suicide_bins.value_counts(sort=False)
f4 = suicide_bins.value_counts(normalize=True, sort=False)

print ('\n\n suicide Score divided into 4 parts, frequesty and percentage of each is given \n\n')
print(f3, f4, sep='\n')



# Split employrate into quartiles (qcut with 4 bins) and tabulate the bins,
# first as raw counts and then as proportions.
employ_labels = ["1=0%tile", "2=25%tile", "3=50%tile", "4=75%tile"]
sub5['employ4'] = pandas.qcut(sub5['employrate'], q=4, labels=employ_labels)

employ_bins = sub5['employ4']
f5 = employ_bins.value_counts(sort=False)
f6 = employ_bins.value_counts(normalize=True, sort=False)

print ('\n\n employrate Score divided into 4 parts, frequesty and percentage of each is given \n\n')
for table in (f5, f6):
    print(table)


# The scatterplot code below is disabled: it sits inside a bare string
# literal, so none of it is executed.
"""
#basic scatterplot:  Q->Q
scat1 = seaborn.regplot(x="polityscore", y="suicideper100th", data=data)
plt.xlabel('polityscore')
plt.ylabel('Suicide rate per 100th')
plt.title('Scatterplot for the Association Between Suicide Rate and polityscore')



scat2 = seaborn.regplot(x="employrate", y="suicideper100th", data=data)
plt.xlabel('employrate')
plt.ylabel('Suicide rate per 100th')
plt.title('Scatterplot for the Association Between Suicide Rate and employrate')
"""

# Quartile split of the employment rate (qcut with 4 groups).
# The bins are computed from employrate itself: the message, the axis label
# and the plot all describe employment rate, so binning incomeperperson here
# would mislabel income quartiles as employment. The result goes into a new
# column (employ4) so the numeric employrate column is not overwritten.
print ('employment - 4 categories - quartiles')
data['employ4'] = pandas.qcut(data.employrate, 4, labels=["1=25th%tile","2=50%tile","3=75%tile","4=100%tile"])
g1 = data['employ4'].value_counts(sort=False, dropna=True)
print (g1)

# Bivariate bar graph: suicide rate (bar height, seaborn's default estimator)
# for each employment-rate quartile.
# Newer seaborn releases renamed factorplot to catplot; use whichever one the
# installed version provides.
plot_bars = getattr(seaborn, 'catplot', None) or seaborn.factorplot
plot_bars(x='employ4', y='suicideper100th', data=data, kind="bar", ci=None)
plt.xlabel('employrate')
plt.ylabel('suicideper100th')

OUTPUT




From the bar chart, and even the scatter plots, I see that there does not appear to be any direct relationship between the researched variables. Further analysis is required to look into other variables, or perhaps the combined effect of several variables will have to be studied.

No comments:

Post a Comment