# Loading required modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#Download the data set
# Location of the UCI "adult" data file
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Column names are supplied explicitly through `names=` when reading the file
columns = [
    "age",
    "work-class",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

# '?' entries are read as missing values; whitespace after each delimiter is skipped
data = pd.read_csv(
    url,
    names=columns,
    sep=',',
    na_values='?',
    skipinitialspace=True,
)


#Print the first values to familiarize yourself with the features

# Preview the first rows (the result is only displayed in an interactive session)
data.head()

# Summary statistics for each numeric feature: describe() output plus the median.
# Each entry is (section title, column name, label printed before the median).
numeric_summaries = [
    ("Age Statistics", "age", "Median Age: "),
    ("Final weight Statistics", "fnlwgt", "Median Final Weight: "),
    ("Education Number Statistics", "education-num", "Median Education Num: "),
    ("Capital Gain Statistics", "capital-gain", "Median Capital Gain: "),
    ("Capital Loss Statistics", "capital-loss", "Median Capital Loss: "),
    ("Hours per week Statistics", "hours-per-week", "Median hours-per-week: "),
]

for title, column, median_label in numeric_summaries:
    print(title)
    print(data[column].describe())
    # describe() reports the median only as the 50% row, so print it explicitly
    print(median_label, data[column].median())

# Can you build a single table summarizing all numeric features?
# Explain how computing the statistics one feature at a time differs from computing them all at once

# Are we missing any statistical information?
# Hint: anything relating to extreme values
# If so, calculate this value and interpret the output

# DATA VISUALIZATION
# Visualize the most important or interesting attributes using appropriate techniques. 
# For each visualization, provide an interpretation explaining why it is appropriate or interesting. 
# What does each visualization tell us?

# Plot histograms of the numerical attributes.
# Keep only the columns of `data` whose dtype is selected by 'int'.
numerical_attributes = data.select_dtypes(include=['int'])

# Draw the histograms of the selected columns on a 12x12 figure.
numerical_attributes.hist(figsize=(12,12))

# Read the documentation and explain in your own words the code: 
# "numerical_attributes = data.select_dtypes(include=['int'])", 
# this should be easy, barely an inconvenience and take less than 50 words but you MUST be very specific.

# Interpret the visualizations (i.e. the histograms) and summarize your observations, BASED ON THE HISTOGRAMS

# Count plots for the categorical features (columns with dtype 'object')
categorical_attributes = data.select_dtypes(include=['object'])

# Number of records per work-class
plt.figure(figsize=(12, 6))
sns.countplot(x="work-class", data=categorical_attributes)

# What do you see from the graph? What does the worker distribution tell you?

# Number of records per marital-status
plt.figure(figsize=(12, 6))
sns.countplot(x="marital-status", data=categorical_attributes)

# Create a histogram for the occupation of surveyed individuals
# Interpret the results in a summarized manner

# Do the same for their relationship status
# Explain the differences or observations

# Do a sex count graph
# Explain your findings, is this expected?

# Finally, do an income count graph
# What do you see?
# What are the implications of your observations?

#DATA QUALITY ASSURANCE
#Verify data quality: explain any missing values, duplicate data, or outliers. 
#What, if anything, do you need to do about these? Be specific. 

# Check missing values: info() prints each column's non-null count and dtype
data.info()

# Identify the features missing values and write code to identify these features and the number of missing values

# Now what... what do you do with the missing data?

# DATA RELATIONSHIPS

# Scatterplots are really good for identifying relationships between features or attributes.
# We can also use correlation heatmaps for this purpose

# Explore the relationships among the attributes, excluding the class attribute. 
# Use scatter plots, correlation matrices, cross-tabulations, group-wise averages, or other appropriate techniques. 
# Explain and interpret any interesting relationships

# Pairwise plots of the features, coloured by income, with kernel density
# estimates on the diagonal
sns.pairplot(data, height=3, diag_kind = 'kde', hue='income')

# Compute the correlation matrix.
# Only numeric columns are used: pandas >= 2.0 no longer drops non-numeric
# columns silently in DataFrame.corr() and raises on string columns instead.
corr = data.select_dtypes(include='number').corr()

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(16, 12))

# Generate a custom diverging colormap
# (not used by the heatmap below, which uses "YlGn"; pass cmap=cmap to try it)
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap, annotating each cell with its correlation value
_ = sns.heatmap(corr, cmap="YlGn", square=True, ax = ax, annot=True, linewidth = 0.1)

plt.title('Pearson Correlation of Features', y=1.05, size=15)

# Compare and contrast how using scatterplots and heatmaps changes data interpretation
# What does the scatterplot tell you and what does the heatmap tell you?
# Explain the observations from the heatmap - use the correlation information!

# We are interested in tabulating occupation / work-class against sex
# Pandas has certain commands to do this; use the correct one
# This process is known as cross-tabulation

# Cross-tabulation of work-class (rows) against sex (columns), with totals
pd.crosstab(index=data['work-class'], columns=data['sex'], margins=True)


# Interpret your data

# Now we visualize the cross-tabulated information
# Box plot of age per work-class, split by sex

plt.figure(figsize=(12, 6))
sns.boxplot(data=data, x="work-class", y="age", hue="sex")

# What did we see?
# It is easier to generate and convey information using graphs than using the table
# Be careful: the survey data is skewed, so do not claim anything the data does not support

# Similarly we are now interested in sex but also in country of origin or native country
# First cross-tabulate, again using Pandas

# This is the cross-tabulate line:
#pd.crosstab(data['native-country'],
#            data['sex'], margins=True)

# Look at the table, and describe what can you see?
# Do tell, if you think the same way I do, then something must be done... perhaps convert the data/feature?
# How? Explain
# Plot and show 


# Cross-tabulate sex and education
# Look at the classes, perhaps you can group?
# Think about it. It seems that the ones that:
# Plot the new grouped data and also cross-tabulate this data set
# Compare the previous and new results. Explain your findings and your grouping method

# Previously you grouped education, and native country
# It seems possible that you can also group data by marital-status
# Create new groups of marital-status using the knowledge you have

# RELATIONSHIP WITH RESPONSE ATTRIBUTE

# OK, up to this point we have looked at the data, its characteristics and trends. We know what is what inside the dataset,
# but how about its relationship to the response variable?

# What is the response variable in our dataset?

# Going back to the introduction, age is numeric, and it is a continuous variable.
# What does this mean?

# Exactly! You must divide age into ranges: cut it into 10 equal-width bins
data['age_bin'] = pd.cut(data['age'], bins=10)

# Count plot of the age bins, split by income
fig = plt.figure(figsize=(20, 5))
sns.countplot(data=data, x="age_bin", hue="income")

# Interpret

# OK, now look at sex and the response variable
# Use the right plot to do this
# Interpret

# Plot education and response variable
# Remember that you have grouped education
# Interpret

# Finally, you grouped people based on their relationship
# Plot response variable and relationship
# Interpret

# Do you think that we could do further groupings?


# DATA PROCESSING STEP

# We have grouped some features and changed the dataset based on the EDA
# Now we need to address 2 things: a) duplicates and b) missing values

# Remove rows with missing values
data = data.dropna()
# Remove duplicate rows
data = data.drop_duplicates()
# Remove extreme values: drop every row whose capital-gain exceeds 80000
outlier_rows = data.index[data['capital-gain'] > 80000]
data = data.drop(index=outlier_rows)
# Remove the column fnlwgt using the drop function, again
data = data.drop(columns='fnlwgt')
# Create a new column by combining capital-gain and capital-loss.
# A *net* gain is the gain minus the loss, so the loss is subtracted.
# The arithmetic is done column-wise rather than with a row-by-row apply().
data['netcapitalgain'] = data['capital-gain'] - data['capital-loss']
# The two source columns are now redundant
data = data.drop(columns=['capital-gain', 'capital-loss'])

data.info()

# Explain what dropping the capital gain outliers did
# Explain what is net-capital-gain

# Summarize or describe the main statistics of the data

# Correlation matrix of the processed data.
# Only numeric columns are used: pandas >= 2.0 no longer drops non-numeric
# columns silently in DataFrame.corr() and raises on string columns instead.
corr = data.select_dtypes(include='number').corr()

# Matplotlib figure
f, ax = plt.subplots(figsize=(16, 12))

# Generate the customized colormap
# (not used by the heatmap below, which uses "YlGn"; pass cmap=cmap to try it)
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap; please read the library's README and experiment with color schemes
_ = sns.heatmap(corr, cmap="YlGn", square=True, ax = ax, annot=True, linewidth = 0.1)

plt.title('Pearson Correlation of Features', y=1.05, size=15)