CODE

import pandas

import numpy

import seaborn

import matplotlib.pyplot as plt

# Print floats in fixed-point notation rather than scientific notation.
pandas.set_option('display.float_format', lambda x: '%f' % x)

# Load the data set used throughout this script.
data = pandas.read_csv('20151009gap.csv', low_memory=False)

print(len(data))  # Number of observations (rows)
print(len(data.columns))  # Number of variables (columns)

# Columns treated as quantitative variables in this analysis.
numeric_columns = [
    'polityscore',
    'suicideper100th',
    'employrate',
    'incomeperperson',
    'co2emissions',
]

# Coerce each column to a numeric dtype; entries that cannot be parsed become
# NaN. pandas.to_numeric(errors='coerce') is the replacement for
# Series.convert_objects(convert_numeric=True), which no longer exists in
# current pandas releases.
for column in numeric_columns:
    data[column] = pandas.to_numeric(data[column], errors='coerce')

# Work on a copy so that the imputation below leaves `data` untouched.
sub5 = data.copy()

# Fill missing values with the column mean. The result is assigned back to the
# frame: calling fillna(inplace=True) on a column selection may operate on a
# temporary object and leave sub5 unchanged.
for column in numeric_columns:
    sub5[column] = sub5[column].fillna(sub5[column].mean())

# Custom split of polityscore into 4 groups, stored as the new variable
# polity4. include_lowest=True keeps a score of exactly -10 in the first
# group; without it the first interval is open on the left and such rows
# would be left uncategorised (NaN).
sub5['polity4'] = pandas.cut(
    sub5['polityscore'], [-10, -5, 0, 5, 10], include_lowest=True)

f1 = sub5['polity4'].value_counts(sort=False)
f2 = sub5['polity4'].value_counts(sort=False, normalize=True)

print ('\n\n Polity Score divided into 4 parts, frequesty and percentage of each is given \n\n')
print(f1)
print(f2)

# Labels shared by both quartile splits below.
quartile_labels = ["1=0%tile", "2=25%tile", "3=50%tile", "4=75%tile"]

# Quartile split of the suicide rate (qcut with 4 groups gives quartiles).
sub5['suicide4'] = pandas.qcut(
    sub5['suicideper100th'], 4, labels=quartile_labels)

f3 = sub5['suicide4'].value_counts(sort=False)
f4 = sub5['suicide4'].value_counts(sort=False, normalize=True)

print ('\n\n suicide Score divided into 4 parts, frequesty and percentage of each is given \n\n')
print(f3)
print(f4)

# Quartile split of the employment rate (qcut with 4 groups gives quartiles).
sub5['employ4'] = pandas.qcut(
    sub5['employrate'], 4, labels=quartile_labels)

f5 = sub5['employ4'].value_counts(sort=False)
f6 = sub5['employ4'].value_counts(sort=False, normalize=True)

print ('\n\n employrate Score divided into 4 parts, frequesty and percentage of each is given \n\n')
print(f5)
print(f6)

import pandas

import numpy

import seaborn

import matplotlib.pyplot as plt

# NOTE(review): everything from here down to the last print(f6) repeats the
# load / clean / bin pass found at the top of this file, so the file is read
# and every table is printed a second time. Presumably a paste artifact --
# confirm before removing. The pass is kept here so the output is unchanged.

# Print floats in fixed-point notation rather than scientific notation.
pandas.set_option('display.float_format', lambda x: '%f' % x)

# Reload the data set.
data = pandas.read_csv('20151009gap.csv', low_memory=False)

print(len(data))  # Number of observations (rows)
print(len(data.columns))  # Number of variables (columns)

# Columns treated as quantitative variables in this analysis.
numeric_columns = [
    'polityscore',
    'suicideper100th',
    'employrate',
    'incomeperperson',
    'co2emissions',
]

# Coerce each column to a numeric dtype; entries that cannot be parsed become
# NaN. pandas.to_numeric(errors='coerce') is the replacement for
# Series.convert_objects(convert_numeric=True), which no longer exists in
# current pandas releases.
for column in numeric_columns:
    data[column] = pandas.to_numeric(data[column], errors='coerce')

# Work on a copy so that the imputation below leaves `data` untouched.
sub5 = data.copy()

# Fill missing values with the column mean. The result is assigned back to the
# frame: calling fillna(inplace=True) on a column selection may operate on a
# temporary object and leave sub5 unchanged.
for column in numeric_columns:
    sub5[column] = sub5[column].fillna(sub5[column].mean())

# Custom split of polityscore into 4 groups, stored as the new variable
# polity4. include_lowest=True keeps a score of exactly -10 in the first
# group; without it the first interval is open on the left and such rows
# would be left uncategorised (NaN).
sub5['polity4'] = pandas.cut(
    sub5['polityscore'], [-10, -5, 0, 5, 10], include_lowest=True)

f1 = sub5['polity4'].value_counts(sort=False)
f2 = sub5['polity4'].value_counts(sort=False, normalize=True)

print ('\n\n Polity Score divided into 4 parts, frequesty and percentage of each is given \n\n')
print(f1)
print(f2)

# Labels shared by both quartile splits below.
quartile_labels = ["1=0%tile", "2=25%tile", "3=50%tile", "4=75%tile"]

# Quartile split of the suicide rate (qcut with 4 groups gives quartiles).
sub5['suicide4'] = pandas.qcut(
    sub5['suicideper100th'], 4, labels=quartile_labels)

f3 = sub5['suicide4'].value_counts(sort=False)
f4 = sub5['suicide4'].value_counts(sort=False, normalize=True)

print ('\n\n suicide Score divided into 4 parts, frequesty and percentage of each is given \n\n')
print(f3)
print(f4)

# Quartile split of the employment rate (qcut with 4 groups gives quartiles).
sub5['employ4'] = pandas.qcut(
    sub5['employrate'], 4, labels=quartile_labels)

f5 = sub5['employ4'].value_counts(sort=False)
f6 = sub5['employ4'].value_counts(sort=False, normalize=True)

print ('\n\n employrate Score divided into 4 parts, frequesty and percentage of each is given \n\n')
print(f5)
print(f6)

# Disabled code: the two scatterplots below sit inside a bare triple-quoted
# string. Python evaluates the string and discards it, so nothing is plotted.
# NOTE(review): if this is re-enabled, the second regplot would presumably
# draw onto the same axes as the first unless a new figure is created in
# between -- confirm before turning it back on.
"""

#basic scatterplot: Q->Q

scat1 = seaborn.regplot(x="polityscore", y="suicideper100th", data=data)

plt.xlabel('polityscore')

plt.ylabel('Suicide rate per 100th')

plt.title('Scatterplot for the Association Between Suicide Rate and polityscore')

scat2 = seaborn.regplot(x="employrate", y="suicideper100th", data=data)

plt.xlabel('employrate')

plt.ylabel('Suicide rate per 100th')

plt.title('Scatterplot for the Association Between Suicide Rate and employrate')

"""

# Quartile split of the employment rate (qcut with 4 groups gives quartiles),
# followed by a bar chart of the mean suicide rate per quartile.
print ('employment - 4 categories - quartiles')

# Bin employrate itself (not incomeperperson), so that the groups match the
# printed heading, the column they are stored in and the x-axis label below.
# The numeric column is overwritten by its quartile label, so this statement
# cannot be run twice on the same frame without reloading `data`.
data['employrate'] = pandas.qcut(
    data['employrate'],
    4,
    labels=["1=25th%tile", "2=50%tile", "3=75%tile", "4=100%tile"],
)

# Frequency of each quartile group; rows with a missing value are not counted.
g1 = data['employrate'].value_counts(sort=False, dropna=True)
print (g1)

# Bivariate bar graph: one bar per employment quartile, bar height is the mean
# of suicideper100th within the quartile. catplot is the current name of the
# function formerly called factorplot; ci=None draws the bars without
# confidence-interval error bars.
seaborn.catplot(x='employrate', y='suicideper100th', data=data, kind="bar", ci=None)

plt.xlabel('employrate')
plt.ylabel('suicideper100th')

OUTPUT

From the bar chart, and even from the scatter plots, I see that there does not appear to be any direct relationship between the variables studied. Further analysis is required to look into other variables, or perhaps the combined effect of several variables will have to be studied.