Data source: https://www.kaggle.com/datasets/tklimonova/gapminder-datacamp-2007/data
# Load in some packages
import calendar
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# Read the Gapminder CSV into a DataFrame and display it.
# NOTE: the location below is specific to one machine; adjust it when
# running the notebook elsewhere.
csv_path = r"C:\Users\jki\Downloads\gapminder_full.csv"
population_df = pd.read_csv(csv_path)
population_df
country | year | population | continent | life_exp | gdp_cap | |
---|---|---|---|---|---|---|
0 | Afghanistan | 1952 | 8425333 | Asia | 28.801 | 779.445314 |
1 | Afghanistan | 1957 | 9240934 | Asia | 30.332 | 820.853030 |
2 | Afghanistan | 1962 | 10267083 | Asia | 31.997 | 853.100710 |
3 | Afghanistan | 1967 | 11537966 | Asia | 34.020 | 836.197138 |
4 | Afghanistan | 1972 | 13079460 | Asia | 36.088 | 739.981106 |
... | ... | ... | ... | ... | ... | ... |
1699 | Zimbabwe | 1987 | 9216418 | Africa | 62.351 | 706.157306 |
1700 | Zimbabwe | 1992 | 10704340 | Africa | 60.377 | 693.420786 |
1701 | Zimbabwe | 1997 | 11404948 | Africa | 46.809 | 792.449960 |
1702 | Zimbabwe | 2002 | 11926563 | Africa | 39.989 | 672.038623 |
1703 | Zimbabwe | 2007 | 12311143 | Africa | 43.487 | 469.709298 |
1704 rows × 6 columns
After getting a sense of the data's structure, it is a good idea to look at a statistical summary of the numeric variables with `df.describe()`.
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
population_df.describe()
year | population | life_exp | gdp_cap | |
---|---|---|---|---|
count | 1704.00000 | 1.704000e+03 | 1704.000000 | 1704.000000 |
mean | 1979.50000 | 2.960121e+07 | 59.474439 | 7215.327081 |
std | 17.26533 | 1.061579e+08 | 12.917107 | 9857.454543 |
min | 1952.00000 | 6.001100e+04 | 23.599000 | 241.165876 |
25% | 1965.75000 | 2.793664e+06 | 48.198000 | 1202.060309 |
50% | 1979.50000 | 7.023596e+06 | 60.712500 | 3531.846988 |
75% | 1993.25000 | 1.958522e+07 | 70.845500 | 9325.462346 |
max | 2007.00000 | 1.318683e+09 | 82.603000 | 113523.132900 |
# Let's check for missing values: count the null entries in every column
missing_values = population_df.isnull().sum()
print(missing_values)
country 0 year 0 population 0 continent 0 life_exp 0 gdp_cap 0 dtype: int64
# Let's check the data types and non-null counts of each column
population_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1704 entries, 0 to 1703 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 country 1704 non-null object 1 year 1704 non-null int64 2 population 1704 non-null int64 3 continent 1704 non-null object 4 life_exp 1704 non-null float64 5 gdp_cap 1704 non-null float64 dtypes: float64(2), int64(2), object(2) memory usage: 80.0+ KB
# Peek at the first three rows
population_df.head(3)
country | year | population | continent | life_exp | gdp_cap | |
---|---|---|---|---|---|---|
0 | Afghanistan | 1952 | 8425333 | Asia | 28.801 | 779.445314 |
1 | Afghanistan | 1957 | 9240934 | Asia | 30.332 | 820.853030 |
2 | Afghanistan | 1962 | 10267083 | Asia | 31.997 | 853.100710 |
# Rank countries by their summed population.
# The 'population' column is selected before aggregating so that the other
# columns (including the text column 'continent') are not summed needlessly.
# NOTE(review): the frame holds one row per country and year, so this total
# adds the same country's population across all of its years rather than
# giving a single-year head count -- confirm this is the intended metric.
population_size = population_df.groupby('country')['population'].sum()

# Sort the values in descending order and select the top ten countries
top_ten_population_size = population_size.sort_values(ascending=False).head(10)

# Display the result
print(top_ten_population_size)

# Plot the top ten countries as a bar chart
ax = top_ten_population_size.plot(kind='bar', color='green', figsize=(10, 6))
ax.set_title('Top Ten Countries based on population')
ax.set_xlabel('Country')
ax.set_ylabel('population')
plt.show()
country China 11497920623 India 8413568878 United States 2738534790 Indonesia 1779874000 Brazil 1467745520 Japan 1341105696 Pakistan 1124200629 Bangladesh 1089064744 Germany 930564520 Nigeria 884496214 Name: population, dtype: int64
# Truncate life expectancy to whole years (e.g. 28.801 becomes 28).
# NOTE(review): this overwrites the column in population_df, so every later
# cell sees the truncated values -- consider a separate column if the
# fractional part is needed downstream.
population_df['life_exp'] = population_df['life_exp'].astype(int)

# Sum life expectancy per country. The column is selected before aggregating
# so the other columns (including the text column 'continent') are not summed.
# NOTE(review): this is a sum over each country's yearly rows, not an
# average, so the values are not themselves ages -- confirm the intent.
life_expectancy = population_df.groupby('country')['life_exp'].sum()

# Sort the values in descending order and select the top ten
top_ten_life_expectancy = life_expectancy.sort_values(ascending=False).head(10)

# Display the result
print(top_ten_life_expectancy)

# Plot the top ten countries as a bar chart
ax = top_ten_life_expectancy.plot(kind='bar', color='blue', figsize=(10, 6))
ax.set_title('Top Ten Countries based on life expectancy ')
ax.set_xlabel('Country')
ax.set_ylabel('life expectancy ')
plt.show()
country Iceland 911 Sweden 909 Norway 905 Netherlands 902 Switzerland 901 Japan 893 Canada 891 Australia 890 Denmark 886 France 885 Name: life_exp, dtype: int32
# Rank countries by their summed 'gdp_cap' values.
# The column is selected before aggregating so the other columns (including
# the text column 'continent') are not summed needlessly.
# NOTE(review): this adds each country's gdp_cap across all of its yearly
# rows; if gdp_cap is a per-capita figure, a mean or a single-year value may
# be easier to interpret -- confirm the intent.
income_amount = population_df.groupby('country')['gdp_cap'].sum()

# Sort the values in descending order and select the top ten
top_ten_income_amount = income_amount.sort_values(ascending=False).head(10)

# Display the result
print(top_ten_income_amount)

# Plot the top ten countries as a bar chart
ax = top_ten_income_amount.plot(kind='bar', color='brown', figsize=(10, 6))
ax.set_title('Top Ten Countries based on Income ')
ax.set_xlabel('Country')
ax.set_ylabel('GDP Income')
plt.show()
country Kuwait 783994.925660 Switzerland 324892.012860 Norway 320967.678650 United States 315133.816160 Canada 268928.956080 Netherlands 260986.226498 Denmark 260061.898655 Germany 246680.213193 Iceland 246377.067270 Austria 244942.995352 Name: gdp_cap, dtype: float64
# Re-display the frame: life_exp now holds the whole-number values produced by the astype(int) cast above
population_df
country | year | population | continent | life_exp | gdp_cap | |
---|---|---|---|---|---|---|
0 | Afghanistan | 1952 | 8425333 | Asia | 28 | 779.445314 |
1 | Afghanistan | 1957 | 9240934 | Asia | 30 | 820.853030 |
2 | Afghanistan | 1962 | 10267083 | Asia | 31 | 853.100710 |
3 | Afghanistan | 1967 | 11537966 | Asia | 34 | 836.197138 |
4 | Afghanistan | 1972 | 13079460 | Asia | 36 | 739.981106 |
... | ... | ... | ... | ... | ... | ... |
1699 | Zimbabwe | 1987 | 9216418 | Africa | 62 | 706.157306 |
1700 | Zimbabwe | 1992 | 10704340 | Africa | 60 | 693.420786 |
1701 | Zimbabwe | 1997 | 11404948 | Africa | 46 | 792.449960 |
1702 | Zimbabwe | 2002 | 11926563 | Africa | 39 | 672.038623 |
1703 | Zimbabwe | 2007 | 12311143 | Africa | 43 | 469.709298 |
1704 rows × 6 columns
# Keep only the rows that belong to Kuwait
is_kuwait = population_df['country'].eq("Kuwait")
kuwait_data = population_df.loc[is_kuwait]

# Columns that must be present for the analysis below
f1 = kuwait_data['life_exp']
f2 = kuwait_data['gdp_cap']

# Retain rows where both life_exp and gdp_cap are non-null
has_both = f1.notna() & f2.notna()
kuwait_filtered_data = kuwait_data.loc[has_both]
kuwait_filtered_data.head(5)
country | year | population | continent | life_exp | gdp_cap | |
---|---|---|---|---|---|---|
852 | Kuwait | 1952 | 160000 | Asia | 55 | 108382.35290 |
853 | Kuwait | 1957 | 212846 | Asia | 58 | 113523.13290 |
854 | Kuwait | 1962 | 358266 | Asia | 60 | 95458.11176 |
855 | Kuwait | 1967 | 575003 | Asia | 64 | 80894.88326 |
856 | Kuwait | 1972 | 841934 | Asia | 67 | 109347.86700 |
# Summary statistics of the numeric columns for the Kuwait rows
kuwait_filtered_data.describe()
year | population | life_exp | gdp_cap | |
---|---|---|---|---|
count | 12.000000 | 1.200000e+01 | 12.000000 | 12.000000 |
mean | 1979.500000 | 1.206496e+06 | 68.500000 | 65332.910472 |
std | 18.027756 | 7.836823e+05 | 7.692972 | 33882.139536 |
min | 1952.000000 | 1.600000e+05 | 55.000000 | 28118.429980 |
25% | 1965.750000 | 5.208188e+05 | 63.000000 | 35065.809143 |
50% | 1979.500000 | 1.279226e+06 | 70.000000 | 53286.233460 |
75% | 1993.250000 | 1.796880e+06 | 75.250000 | 98689.172045 |
max | 2007.000000 | 2.505559e+06 | 77.000000 | 113523.132900 |
import plotly.express as px

# Histogram of Kuwait's life-expectancy values
fig = px.histogram(
    kuwait_filtered_data,
    x='life_exp',
    title='Life Expectancy Histogram - Kuwait',
    labels={'life_exp': 'Life Expectancy'},
    height=320,
)

# Render the figure
fig.show()
import plotly.express as px

# Bar chart of Kuwait's life expectancy for each recorded year
fig = px.bar(
    kuwait_filtered_data,
    x='year',
    y='life_exp',
    title='Relationship between Life Expectancy and Years - Kuwait',
    labels={'life_exp': 'Life Expectancy'},
    height=320,
)

# Tilt the x-axis tick labels so they are easier to read
fig.update_layout(xaxis={'tickangle': -45})

# Render the figure
fig.show()
import plotly.express as px

# Bar chart of Kuwait's population per year, with bars coloured by life expectancy
fig = px.bar(
    kuwait_filtered_data,
    x='year',
    y='population',
    color='life_exp',
    title='Life Expectancy with Respect to Population Growth per Year - Kuwait',
    labels={'population': 'Population Kuwait'},
    height=320,
)

# Render the figure
fig.show()
# Re-display the full frame before drilling into the next country
population_df
country | year | population | continent | life_exp | gdp_cap | |
---|---|---|---|---|---|---|
0 | Afghanistan | 1952 | 8425333 | Asia | 28 | 779.445314 |
1 | Afghanistan | 1957 | 9240934 | Asia | 30 | 820.853030 |
2 | Afghanistan | 1962 | 10267083 | Asia | 31 | 853.100710 |
3 | Afghanistan | 1967 | 11537966 | Asia | 34 | 836.197138 |
4 | Afghanistan | 1972 | 13079460 | Asia | 36 | 739.981106 |
... | ... | ... | ... | ... | ... | ... |
1699 | Zimbabwe | 1987 | 9216418 | Africa | 62 | 706.157306 |
1700 | Zimbabwe | 1992 | 10704340 | Africa | 60 | 693.420786 |
1701 | Zimbabwe | 1997 | 11404948 | Africa | 46 | 792.449960 |
1702 | Zimbabwe | 2002 | 11926563 | Africa | 39 | 672.038623 |
1703 | Zimbabwe | 2007 | 12311143 | Africa | 43 | 469.709298 |
1704 rows × 6 columns
# Keep only the rows that belong to China
is_china = population_df['country'].eq("China")
China_data = population_df.loc[is_china]

# Columns that must be present for the analysis below
Chinaf1 = China_data['life_exp']
Chinaf2 = China_data['gdp_cap']

# Retain rows where both life_exp and gdp_cap are non-null
china_has_both = Chinaf1.notna() & Chinaf2.notna()
China_filtered_data = China_data.loc[china_has_both]
China_filtered_data.head(5)
country | year | population | continent | life_exp | gdp_cap | |
---|---|---|---|---|---|---|
288 | China | 1952 | 556263527 | Asia | 44 | 400.448611 |
289 | China | 1957 | 637408000 | Asia | 50 | 575.987001 |
290 | China | 1962 | 665770000 | Asia | 44 | 487.674018 |
291 | China | 1967 | 754550000 | Asia | 58 | 612.705693 |
292 | China | 1972 | 862030000 | Asia | 63 | 676.900092 |
# Summary statistics of the numeric columns for the China rows
China_filtered_data.describe()
year | population | life_exp | gdp_cap | |
---|---|---|---|---|
count | 12.000000 | 1.200000e+01 | 12.000000 | 12.000000 |
mean | 1979.500000 | 9.581601e+08 | 61.333333 | 1488.307694 |
std | 18.027756 | 2.643949e+08 | 10.174240 | 1370.628333 |
min | 1952.000000 | 5.562635e+08 | 44.000000 | 400.448611 |
25% | 1965.750000 | 7.323550e+08 | 56.000000 | 603.526020 |
50% | 1979.500000 | 9.718680e+08 | 64.000000 | 851.829425 |
75% | 1993.250000 | 1.181246e+09 | 68.500000 | 1814.146653 |
max | 2007.000000 | 1.318683e+09 | 72.000000 | 4959.114854 |
import plotly.express as px

# population_df is the frame loaded from the CSV earlier in the notebook.
# Keep only the rows that belong to China
china_rows = population_df['country'].eq("China")
china_data = population_df.loc[china_rows]

# Columns that must be present for the plots below
f1 = china_data['life_exp']
f2 = china_data['gdp_cap']

# Retain rows where both life_exp and gdp_cap are non-null
china_complete = f1.notna() & f2.notna()
china_filtered_data = china_data.loc[china_complete]

# Histogram of China's life-expectancy values
fig = px.histogram(
    china_filtered_data,
    x='life_exp',
    title='Life Expectancy Histogram - China',
    labels={'life_exp': 'Life Expectancy'},
    height=320,
)

# Render the figure
fig.show()
import plotly.express as px

# Bar chart of China's life expectancy for each recorded year
fig = px.bar(
    china_filtered_data,
    x='year',
    y='life_exp',
    title='Relationship between Life Expectancy and Years - China',
    labels={'life_exp': 'Life Expectancy'},
    height=320,
)

# Tilt the x-axis tick labels so they are easier to read
fig.update_layout(xaxis={'tickangle': -45})

# Render the figure
fig.show()
import plotly.express as px

# Bar chart of China's population per year, with bars coloured by life expectancy.
# The axis label previously read 'Population Kuwait' (copied from the Kuwait
# cell); it now names the country that is actually plotted.
fig = px.bar(
    china_filtered_data,
    x='year',
    y='population',
    color='life_exp',
    height=320,
    labels={'population': 'Population China'},
    title='Life Expectancy with Respect to Population Growth per Year - China',
)

# Show the plot
fig.show()
# Re-display the full frame before drilling into the next country
population_df
country | year | population | continent | life_exp | gdp_cap | |
---|---|---|---|---|---|---|
0 | Afghanistan | 1952 | 8425333 | Asia | 28 | 779.445314 |
1 | Afghanistan | 1957 | 9240934 | Asia | 30 | 820.853030 |
2 | Afghanistan | 1962 | 10267083 | Asia | 31 | 853.100710 |
3 | Afghanistan | 1967 | 11537966 | Asia | 34 | 836.197138 |
4 | Afghanistan | 1972 | 13079460 | Asia | 36 | 739.981106 |
... | ... | ... | ... | ... | ... | ... |
1699 | Zimbabwe | 1987 | 9216418 | Africa | 62 | 706.157306 |
1700 | Zimbabwe | 1992 | 10704340 | Africa | 60 | 693.420786 |
1701 | Zimbabwe | 1997 | 11404948 | Africa | 46 | 792.449960 |
1702 | Zimbabwe | 2002 | 11926563 | Africa | 39 | 672.038623 |
1703 | Zimbabwe | 2007 | 12311143 | Africa | 43 | 469.709298 |
1704 rows × 6 columns
# Keep only the rows that belong to Iceland
is_iceland = population_df['country'].eq("Iceland")
Iceland_data = population_df.loc[is_iceland]

# Columns that must be present for the analysis below
Icelandf1 = Iceland_data['life_exp']
Icelandf2 = Iceland_data['gdp_cap']

# Retain rows where both life_exp and gdp_cap are non-null
iceland_has_both = Icelandf1.notna() & Icelandf2.notna()
Iceland_filtered_data = Iceland_data.loc[iceland_has_both]
Iceland_filtered_data.head(5)
country | year | population | continent | life_exp | gdp_cap | |
---|---|---|---|---|---|---|
684 | Iceland | 1952 | 147962 | Europe | 72 | 7267.688428 |
685 | Iceland | 1957 | 165110 | Europe | 73 | 9244.001412 |
686 | Iceland | 1962 | 182053 | Europe | 73 | 10350.159060 |
687 | Iceland | 1967 | 198676 | Europe | 73 | 13319.895680 |
688 | Iceland | 1972 | 209275 | Europe | 74 | 15798.063620 |
# Summary statistics of the numeric columns for the Iceland rows
Iceland_filtered_data.describe()
year | population | life_exp | gdp_cap | |
---|---|---|---|---|
count | 12.000000 | 12.000000 | 12.000000 | 12.000000 |
mean | 1979.500000 | 226978.083333 | 75.916667 | 20531.422273 |
std | 18.027756 | 48541.684217 | 2.968267 | 9373.245893 |
min | 1952.000000 | 147962.000000 | 72.000000 | 7267.688428 |
25% | 1965.750000 | 194520.250000 | 73.000000 | 12577.461525 |
50% | 1979.500000 | 227910.000000 | 76.000000 | 21462.284985 |
75% | 1993.250000 | 262057.000000 | 78.000000 | 27207.679625 |
max | 2007.000000 | 301931.000000 | 81.000000 | 36180.789190 |
import plotly.express as px

# Histogram of Iceland's life-expectancy values
fig = px.histogram(
    Iceland_filtered_data,
    x='life_exp',
    title='Life Expectancy Histogram - Iceland',
    labels={'life_exp': 'Life Expectancy'},
    height=320,
)

# Render the figure
fig.show()
import plotly.express as px

# Bar chart of Iceland's life expectancy for each recorded year
fig = px.bar(
    Iceland_filtered_data,
    x='year',
    y='life_exp',
    title='Relationship between Life Expectancy and Years - Iceland',
    labels={'life_exp': 'Life Expectancy'},
    height=320,
)

# Tilt the x-axis tick labels so they are easier to read
fig.update_layout(xaxis={'tickangle': -45})

# Render the figure
fig.show()
import plotly.express as px

# Bar chart of Iceland's population per year, with bars coloured by life expectancy.
# The labels mapping previously used the key 'pop', which is not a column of
# the frame (the column is 'population'), so the custom axis label was never
# applied; its text also named Kuwait instead of Iceland. Both are corrected.
fig = px.bar(
    Iceland_filtered_data,
    x='year',
    y='population',
    color='life_exp',
    height=320,
    labels={'population': 'Population Iceland'},
    title='Life Expectancy with Respect to population Growth per Year - Iceland',
)

# Show the plot
fig.show()