!pip install pycountry
import pycountry
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder

Requirement already satisfied: pycountry in /opt/conda/lib/python3.8/site-packages (20.7.3)


# Load the raw data set and map the awkward column headers (spaces, slashes,
# currency markers) to identifier-friendly names. Both spellings of the
# per-capita GDP header are listed, so either one ends up as "gdp_per_capita".
column_aliases = {
    "suicides/100k pop": "suicides_pop",
    "HDI for year": "HDI_for_year",
    " gdp_for_year ($) ": "gdp_for_year",
    " gdp_per_capita ($) ": "gdp_per_capita",
    "gdp_per_capita ($)": "gdp_per_capita",
}
df = pd.read_csv("master.csv").rename(columns=column_aliases)
df.head()


# Zero-pad the youngest age bracket ("5-14 years" -> "05-14 years"),
# presumably so that the age labels sort in age order.
# Series.replace matches whole cell values only, so running this cell a second
# time leaves "05-14 years" untouched (a substring replace would turn it into
# "005-14 years"), and the result does not depend on the `regex` default of
# Series.str.replace.
df["age"] = df["age"].replace("5-14 years", "05-14 years")
df.describe(include='all')


# Line plot of the suicide counts over the years, one line per gender.
df_men = df[df.sex == 'male']
df_women = df[df.sex == 'female']

plt.figure(figsize=(9,6))
plt.title('Suicide number by Gender (1985-2016)', fontsize=20)
# x and y are both taken from the same per-gender frame (the y values used to
# come from the unfiltered `df`), and they are passed by keyword because
# seaborn >= 0.12 no longer accepts positional x/y.
sns.lineplot(x="year", y="suicides_no", data=df_men, ci=None, label="male")
sns.lineplot(x="year", y="suicides_no", data=df_women, ci=None, label="female")
# The legend entries come from the `label` of each line, so they cannot get
# out of sync with the plotting order.
plt.legend()
plt.show()

/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(


# Total suicides and population per (year, age group). Several columns are
# selected with a list; the tuple-style form is deprecated in pandas.
df_age = df.groupby(["year", "age"])[["suicides_no", "population"]].sum()
# reset_index() already returns a new frame, so no explicit copy is needed.
df_reset = df_age.reset_index()

# Suicides as a percentage of the population, per year and age group.
suicide_pct = df_reset.suicides_no * 100 / df_reset.population

plt.figure(figsize=(9,6))
plt.title('Suicide number by Age (1985-2016)', fontsize=20)
# x/y are passed by keyword: seaborn >= 0.12 rejects them as positionals.
sns.lineplot(
    x="year",
    y=suicide_pct,
    hue="age",
    style="age",
    data=df_reset,
    linewidth=2.5,
    markers=True,
    dashes=False,
)
plt.xticks(rotation = 80)
plt.show()

<ipython-input-5-d9ecfd90e2ea>:1: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  df_age = df.groupby(["year","age"])["suicides_no","population"].sum()
/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(


# Bar plot of suicides_no per age group. sns.barplot aggregates the rows of
# each group with its default estimator (the mean) and draws an error bar.
plt.figure(figsize=(9, 5))
sns.barplot(x="age", y="suicides_no", data=df)
plt.title('Age Group - Suicide Count Bar Plot')
plt.xlabel('Age Group')
plt.ylabel('Suicide Count')
plt.show()


# Total number of suicides per generation, largest total first.
generation_totals = df.groupby('generation', as_index=False)['suicides_no'].sum()
data_generation = generation_totals.sort_values(by='suicides_no', ascending=False)
data_generation


# Pie chart of each generation's share of the total suicide count.
plt.figure(figsize=(6,6))
plt.title('Suicide number (1985-2016)', fontsize=20)
plt.pie(
    data_generation.suicides_no,
    # One offset per wedge, sized from the data: plt.pie raises when the
    # length of `explode` differs from the number of wedges.
    explode=[0.05] * len(data_generation),
    labels=data_generation.generation,
    autopct='%1.1f%%',
)
plt.show()


# Country names present in the data set (index of the per-country totals).
df1 = df.groupby("country")["suicides_no"].sum()
country_name = list(df1.index)

# Lookup from pycountry's country name to its ISO alpha-3 code.
countries = {entry.name: entry.alpha_3 for entry in pycountry.countries}

# Names used by the data set that pycountry does not know under that spelling.
country_not_in_list = [name for name in country_name if name not in countries]
country_not_in_list

# Align the data set's spellings with the names pycountry uses (presumably
# the entries reported in country_not_in_list -- verify against its output).
# The mapping is applied to the "country" column only, in a single pass,
# instead of scanning every column of the frame once per name.
df["country"] = df["country"].replace({
    "Republic of Korea": "Korea, Republic of",
    "Czech Republic": "Czechia",
    "Macau": "Macao",
    "Saint Vincent and Grenadines": "Saint Vincent and the Grenadines",
})

# Before plotting the suicide rate of each country, compute the rate
# (suicides per 100 inhabitants) for every country and year, and then take
# the average of the yearly rates as the country's rate.
df_suino = df.groupby(["country","year"])["suicides_no"].sum()
df_sum = df_suino.sort_index(ascending=True) * 100

df_pop = df.groupby(["country","year"]).population.sum()
# The division below aligns both series on their (country, year) index, so
# only the labels matter; both operands use the same ordering for clarity.
df_pop_sum = df_pop.sort_index(ascending=True)

df_total = df_sum / df_pop_sum

# Mean yearly rate per country, computed with one grouped aggregation over
# the first index level (country) instead of a Python loop over every row.
country_dict = df_total.groupby(level=0).mean().to_dict()

# (country, rate) pairs ordered from the highest to the lowest rate.
tup = sorted(country_dict.items(), key=lambda pair: pair[1], reverse=True)

country_list = [name for name, _ in tup]
country_suicide = [rate for _, rate in tup]

# Tall figure so that every country gets its own readable row.
plt.figure(figsize=(8,32))
sns.barplot(x=country_suicide, y=country_list)
plt.xlabel("ratio of suicide")
plt.ylabel("country")
plt.title("suicide rate vs country")
plt.show()


# Heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(9,6))
# Select the numeric columns explicitly: DataFrame.corr() raises on text
# columns in pandas >= 2.0, where older versions silently ignored them.
numeric_df = df.select_dtypes(include="number")
sns.heatmap(numeric_df.corr(), annot=True, cmap='Blues')
plt.show()


# Drop the HDI_for_year and country-year columns in a single call, then show
# the shape of the remaining frame.
df = df.drop(columns=['HDI_for_year', 'country-year'])
df.shape

(27820, 10)


# gdp_for_year holds text with thousands separators (e.g. "2,156,624,900").
# Strip the commas as a literal string (regex=False, so the result does not
# depend on the pandas version's `regex` default) and convert to float.
df['gdp_for_year'] = df['gdp_for_year'].str.replace(',', '', regex=False).astype(float)


# Encode the categorical features as integer labels with LabelEncoder.
categorical = ['country', 'year','age', 'sex', 'generation']

# One fitted encoder is kept per column so that every encoding can be
# inverted later (label_encoders[column].inverse_transform(...)). Refitting a
# single shared encoder would discard the mapping of all but the last column.
label_encoders = {}
for column in categorical:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le
# After the loop `le` is the encoder fitted on the last column, 'generation'.


# Scale the numeric columns in place with RobustScaler.
# NOTE(review): the scaler is fitted on every row before the train/test split
# further down, and 'suicides_pop' (the regression target) is scaled as well,
# so the reported RMSE values are in scaled units -- confirm this is intended.
numerical = [
    'suicides_no',
    'population',
    'suicides_pop',
    'gdp_for_year',
    'gdp_per_capita',
]

rc = RobustScaler()
rc.fit(df[numerical])
df[numerical] = rc.transform(df[numerical])


# Separate the frame into the feature matrix X and the target vector y.
# NOTE(review): X keeps 'suicides_no' and 'population'; the target's original
# header is "suicides/100k pop", so it looks derivable from those two
# columns -- confirm they are meant to be features.
X = df.drop(columns='suicides_pop')
y = df['suicides_pop']
X.shape, y.shape

((27820, 9), (27820,))


from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)
X_train.shape, X_test.shape

((22256, 9), (5564, 9))


from sklearn.metrics import mean_squared_error

# Instantiate the linear model and fit it on the training split
# (fit() returns the estimator itself).
lr = LinearRegression().fit(X_train, y_train)

# Predictions for both splits.
y_train_lr = lr.predict(X_train)
y_test_lr = lr.predict(X_test)

# For a regressor, score() returns the coefficient of determination (R^2);
# the "acc" names and the printed "Accuracy" label refer to that value.
acc_train_lr = lr.score(X_train, y_train)
acc_test_lr = lr.score(X_test, y_test)

# Root mean squared error (RMSE) on both splits.
rmse_train_lr = np.sqrt(mean_squared_error(y_train, y_train_lr))
rmse_test_lr = np.sqrt(mean_squared_error(y_test, y_test_lr))

print("Linear Regression: Accuracy on training Data: {:.3f}".format(acc_train_lr))
print("Linear Regression: Accuracy on test Data: {:.3f}".format(acc_test_lr))
print('\nLinear Regression: The RMSE of the training set is:', rmse_train_lr)
print('Linear Regression: The RMSE of the testing set is:', rmse_test_lr)

Linear Regression: Accuracy on training Data: 0.356
Linear Regression: Accuracy on test Data: 0.362

Linear Regression: The RMSE of the training set is: 0.9631210108691918
Linear Regression: The RMSE of the testing set is: 0.9872040149556771


# Distribution of the training residuals (actual minus predicted target).
y_hat = lr.predict(X_train)
# histplot replaces the deprecated distplot; kde=True with stat="density"
# keeps the histogram-plus-density-curve look that distplot drew by default.
sns.histplot(y_train - y_hat, kde=True, stat="density")
plt.title("Residuals", size=18)
# R^2 of the linear model on the training split (shown as the cell result).
lr.score(X_train,y_train)

/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

0.3563802724646521


from sklearn.tree import DecisionTreeRegressor

# Instantiate a depth-limited regression tree and fit it on the training
# split (fit() returns the estimator itself).
# NOTE(review): no random_state is passed here, unlike in train_test_split;
# consider seeding the tree if exact reproducibility matters.
tree = DecisionTreeRegressor(max_depth=9).fit(X_train, y_train)

# Predictions for both splits.
y_train_tree = tree.predict(X_train)
y_test_tree = tree.predict(X_test)

# For a regressor, score() returns the coefficient of determination (R^2);
# the "acc" names and the printed "Accuracy" label refer to that value.
acc_train_tree = tree.score(X_train, y_train)
acc_test_tree = tree.score(X_test, y_test)

# Root mean squared error (RMSE) on both splits.
rmse_train_tree = np.sqrt(mean_squared_error(y_train, y_train_tree))
rmse_test_tree = np.sqrt(mean_squared_error(y_test, y_test_tree))

print("Decision Tree: Accuracy on training Data: {:.3f}".format(acc_train_tree))
print("Decision Tree: Accuracy on test Data: {:.3f}".format(acc_test_tree))
print('\nDecision Tree: The RMSE of the training set is:', rmse_train_tree)
print('Decision Tree: The RMSE of the testing set is:', rmse_test_tree)

Decision Tree: Accuracy on training Data: 0.966
Decision Tree: Accuracy on test Data: 0.952

Decision Tree: The RMSE of the training set is: 0.21989455443994949
Decision Tree: The RMSE of the testing set is: 0.27214664660458954

	country	year	sex	age	suicides_no	population	suicides_pop	country-year	HDI_for_year	gdp_for_year	gdp_per_capita	generation
0	Albania	1987	male	15-24 years	21	312900	6.71	Albania1987	NaN	2,156,624,900	796	Generation X
1	Albania	1987	male	35-54 years	16	308000	5.19	Albania1987	NaN	2,156,624,900	796	Silent
2	Albania	1987	female	15-24 years	14	289700	4.83	Albania1987	NaN	2,156,624,900	796	Generation X
3	Albania	1987	male	75+ years	1	21800	4.59	Albania1987	NaN	2,156,624,900	796	G.I. Generation
4	Albania	1987	male	25-34 years	9	274300	3.28	Albania1987	NaN	2,156,624,900	796	Boomers

	country	year	sex	age	suicides_no	population	suicides_pop	country-year	HDI_for_year	gdp_for_year	gdp_per_capita	generation
count	27820	27820.000000	27820	27820	27820.000000	2.782000e+04	27820.000000	27820	8364.000000	27820	27820.000000	27820
unique	101	NaN	2	6	NaN	NaN	NaN	2321	NaN	2321	NaN	6
top	Netherlands	NaN	female	35-54 years	NaN	NaN	NaN	Malta2014	NaN	5,477,895,475	NaN	Generation X
freq	382	NaN	13910	4642	NaN	NaN	NaN	12	NaN	12	NaN	6408
mean	NaN	2001.258375	NaN	NaN	242.574407	1.844794e+06	12.816097	NaN	0.776601	NaN	16866.464414	NaN
std	NaN	8.469055	NaN	NaN	902.047917	3.911779e+06	18.961511	NaN	0.093367	NaN	18887.576472	NaN
min	NaN	1985.000000	NaN	NaN	0.000000	2.780000e+02	0.000000	NaN	0.483000	NaN	251.000000	NaN
25%	NaN	1995.000000	NaN	NaN	3.000000	9.749850e+04	0.920000	NaN	0.713000	NaN	3447.000000	NaN
50%	NaN	2002.000000	NaN	NaN	25.000000	4.301500e+05	5.990000	NaN	0.779000	NaN	9372.000000	NaN
75%	NaN	2008.000000	NaN	NaN	131.000000	1.486143e+06	16.620000	NaN	0.855000	NaN	24874.000000	NaN
max	NaN	2016.000000	NaN	NaN	22338.000000	4.380521e+07	224.970000	NaN	0.944000	NaN	126352.000000	NaN

	generation	suicides_no
0	Boomers	2284498
5	Silent	1781744
2	Generation X	1532804
4	Millenials	623459
1	G.I. Generation	510009
3	Generation Z	15906

Analyzing Global Suicide Rate from 1985 to 2016¶

Ziyu Wang¶

Outline¶

1. Introduction¶

1.1 Background Information¶

1.2 Library Used¶

2 About the data¶

2.1 Data Source¶

2.2 Data Load and view¶

3. Data Analysis & Visualization¶

4. Machine Learning Algorithm¶

4.1 Dataset Standardization¶

4.2 Splitting Data¶

4.3 Model Building & Training¶

4.3.1 Linear Regression¶

4.3.2 Decision Tree : Regression¶

For more information about machine learning, I found this amazing website that offers tutorials on machine learning skills; you can also download a free ebook about machine learning here¶

5. Conclusion¶

5.1 Tutorial recap¶

5.2 Suicide Prevention¶

For immediate help¶