# Loading required modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Download the data set
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    "age", "work-class", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "income",
]
# skipinitialspace drops blanks that follow the delimiter, so a field written
# as " ?" is still matched by na_values and read as NaN.
data = pd.read_csv(url, names=columns, sep=',', na_values='?', skipinitialspace=True)

# Print the first values to familiarize yourself with the features
# Head data
data.head()

# Per-feature statistics, one numeric feature at a time.
# Each entry is (column name, section heading, label printed before the median).
numeric_summaries = (
    ("age", "Age Statistics", "Median Age: "),
    ("fnlwgt", "Final weight Statistics", "Median Final Weight: "),
    # The label used to read "Education Num Age: ", which mislabelled the median.
    ("education-num", "Education Number Statistics", "Median Education Num: "),
    ("capital-gain", "Capital Gain Statistics", "Median Capital Gain: "),
    ("capital-loss", "Capital Loss Statistics", "Median Capital Loss: "),
    ("hours-per-week", "Hours per week Statistics", "Median hours-per-week: "),
)
for column, heading, median_label in numeric_summaries:
    print(heading)
    print(data[column].describe())
    print(median_label, data[column].median())

# Can you do a table summarizing all numeric features?
# Explain why doing them one by one is different from doing them all in one
# Are we missing any statistical information?
# Clue: anything relating to extreme values
# If so, calculate this value and interpret the output

# DATA VISUALIZATION
# Visualize the most important or interesting attributes using appropriate techniques.
# For each visualization, provide an interpretation explaining why it is appropriate or interesting.
# What does each visualization tell us?

# Plotting histogram for numerical values
numerical_attributes = data.select_dtypes(include='int')
numerical_attributes.hist(figsize=(12, 12))
# Read the documentation and explain in your own words the code:
# "numerical_attributes = data.select_dtypes(include=['int'])",
# this should be easy, barely an inconvenience and take less than 50 words but you MUST be very specific.
# Interpret the visualizations, aka the histograms, and summarize your observations, BASED ON THE HISTOGRAMS

# Plotting count plot for categorical values
categorical_attributes = data.select_dtypes(include='object')


def _count_plot(feature):
    """Open a new 12x6 figure and draw a count plot of one categorical feature."""
    plt.figure(figsize=(12, 6))
    return sns.countplot(x=feature, data=categorical_attributes)


# Work-class Count plot
_count_plot("work-class")
# What do you see from the graph? What does the worker distribution tell you?

# Marital-Status Count plot
_count_plot("marital-status")

# Create a histogram for the occupation of surveyed individuals
# Interpret the results in a summarized manner
# Do the same for their relationship status
# Explain the differences or observations
# Do a sex count graph
# Explain your findings, is this expected?
# Finally do an income count graph
# What do you see?
# What are the implications of your observations?

# DATA QUALITY ASSURANCE
# Verify data quality: explain any missing values, duplicate data, or outliers.
# What, if anything, do you need to do about these? Be specific.
# Check missing values
data.info()
# Identify the features with missing values and write code to identify these features and the number of missing values
# Now what... what do you do with the missing data?

# DATA RELATIONSHIPS
# Scatterplots are really good to identify data relationships between features or attributes.
# We can also use correlation heatmaps for this purpose
# Explore the relationships among the attributes, excluding the class attribute.
# Use scatter plots, correlation matrices, cross-tabulations, group-wise averages, or other appropriate techniques.
# Explain and interpret any interesting relationships
sns.pairplot(data, height=3, diag_kind='kde', hue='income')

# Compute the correlation matrix
# Pearson correlation is only defined for numeric columns. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises on
# the string features instead, so the numeric columns are selected explicitly.
corr = data.select_dtypes(include='number').corr()

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(16, 12))

# Generate a custom diverging colormap
# (not passed to the heatmap below, which uses "YlGn"; use cmap=cmap to try it)
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap; `linewidths` is seaborn's documented name for the cell-border width
_ = sns.heatmap(corr, cmap="YlGn", square=True, ax=ax, annot=True, linewidths=0.1)
plt.title('Pearson Correlation of Features', y=1.05, size=15)

# Compare and contrast how using scatterplots and heatmaps makes data interpretation different
# What does the scatterplot tell you and what does the heatmap tell you?
# Explain the observations from the heatmap - use the correlation information!

# We are interested in tabulating occupation / work-class and sex
# Pandas has certain commands to do this, use the correct one for this
# This process is known as cross-tabulation
# Cross-tabulation code:
pd.crosstab(data['work-class'], data['sex'], margins=True)
# Interpret your data

# Now we visualize the cross-tabulated information
# Box plot between work-class and age for different sex
plt.figure(figsize=(12, 6))
sns.boxplot(x="work-class", y="age", hue="sex", data=data)
# What did we see?
# It is easier to generate and convey info using graphs than the table
# Be smart, your data is skewed from the survey; do not say something that is not data driven

# Similarly we are now interested in sex but also in country of origin or native country
# First cross-tabulate, again using Pandas
# This is the cross-tabulate line:
# pd.crosstab(data['native-country'], data['sex'], margins=True)
# Look at the table, and describe what you can see
# Do tell, if you think the same way I do, then something must be done... perhaps convert the data/feature?
# How? Explain
# Plot and show

# Cross-tabulate sex and education
# Look at the classes, perhaps you can group?
# Think about it. It seems that the ones that:
# Plot the new grouped data and also cross-tabulate this data set
# Compare the previous and new results. Explain your findings and your grouping method

# Previously you grouped education, and native country
# It seems possible that you can also group data by marital-status
# Create new groups of marital-status using the knowledge you have

# RELATIONSHIP WITH RESPONSE ATTRIBUTE
# OK, up to this point we have looked at the data, its characteristics and trends. We know what is what inside the dataset,
# but how about its relationship to the response variable
# What is the response variable in our dataset?

# Going back to the introduction, age is numeric, and it is a continuous variable.
# What does this mean?
# Exactly! You must divide age into ranges
# An integer `bins` asks pd.cut for that many equal-width intervals over the observed ages.
age_ranges = pd.cut(data['age'], bins=10)
data['age_bin'] = age_ranges

# Plotting count plot for age and income
fig = plt.figure(figsize=(20, 5))
sns.countplot(data=data, x="age_bin", hue="income")
# Interpret

# OK, now look at sex and the response variable
# Use the right plot to do this
# Interpret

# Plot education and response variable
# Remember that you have grouped education
# Interpret

# Finally, you grouped people based on their relationship
# Plot response variable and relationship
# Interpret
# Do you think that we could do further groupings?
# DATA PROCESSING STEP
# We have grouped some features and changed the dataset based on the EDA
# Now we need to address 2 things: a) duplicates and b) missing values

# Remove missing values
data = data.dropna()

# Remove duplicate values
data = data.drop_duplicates()

# Remove extreme values (rows whose capital-gain exceeds 80000)
o = data[data['capital-gain'] > 80000].index
data = data.drop(o)

# Remove the column fnlwgt using the drop function, again
data = data.drop(columns='fnlwgt')

# Create new columns, by combining capital-gain and capital-loss
# A *net* gain is the gain minus the loss; the previous row-wise apply added
# the two columns. NOTE(review): this assumes capital-loss is recorded as a
# non-negative amount - confirm against the dataset description.
# The column arithmetic is vectorized instead of apply(axis=1).
data['netcapitalgain'] = data['capital-gain'] - data['capital-loss']
data = data.drop(columns=['capital-gain', 'capital-loss'])
data.info()
# Explain what dropping the capital gain outliers did
# Explain what is net-capital-gain
# Summarize or describe the main statistics of the data

# Correlation matrix
# Only numeric columns are used: since pandas 2.0, DataFrame.corr() raises on
# string/categorical columns instead of silently skipping them.
corr = data.select_dtypes(include='number').corr()

# Matplotlib figure
f, ax = plt.subplots(figsize=(16, 12))

# Generate the customized colormap
# (not passed to the heatmap below, which uses "YlGn"; use cmap=cmap to try it)
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap, however please read the library's readme and toy with color schemes
# `linewidths` is seaborn's documented name for the cell-border width
_ = sns.heatmap(corr, cmap="YlGn", square=True, ax=ax, annot=True, linewidths=0.1)
plt.title('Pearson Correlation of Features', y=1.05, size=15)