# import libraries
import pandas as pd
import numpy as np

#load dataset
real_estate_df =  pd.read_csv(r"C:\Users\jki\Downloads\Real_Estate_Sales_2001-2020_GL.csv")
real_estate_df.head(5)

C:\Users\jki\AppData\Local\Temp\ipykernel_9032\2561183663.py:6: DtypeWarning: Columns (8,9,10,11,12) have mixed types. Specify dtype option on import or set low_memory=False.
  real_estate_df =  pd.read_csv(r"C:\Users\jki\Downloads\Real_Estate_Sales_2001-2020_GL.csv")


# lets check for  missing values
missing_values =  real_estate_df.isna().sum()
print(missing_values)

Serial Number            0
List Year                0
Date Recorded            2
Town                     0
Address                 51
Assessed Value           0
Sale Amount              0
Sales Ratio              0
Property Type       382446
Residential Type    388309
Non Use Code        707532
Assessor Remarks    847349
OPM remarks         987279
Location            799516
dtype: int64


# let remove the missing values
real_estate_df.dropna(subset=['Property Type','Residential Type','Non Use Code','Assessor Remarks','OPM remarks','Location'],inplace = True)
# lets confirm  missing values are removed
missing_values =  real_estate_df.isna().sum()
print(missing_values)

Serial Number       0
List Year           0
Date Recorded       0
Town                0
Address             0
Assessed Value      0
Sale Amount         0
Sales Ratio         0
Property Type       0
Residential Type    0
Non Use Code        0
Assessor Remarks    0
OPM remarks         0
Location            0
dtype: int64


# let check if we have unwanted negative values
real_estate_df.describe()


# lets check for data types to perform numerical calculations
real_estate_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264 entries, 806 to 993143
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Serial Number     264 non-null    int64  
 1   List Year         264 non-null    int64  
 2   Date Recorded     264 non-null    object 
 3   Town              264 non-null    object 
 4   Address           264 non-null    object 
 5   Assessed Value    264 non-null    float64
 6   Sale Amount       264 non-null    float64
 7   Sales Ratio       264 non-null    float64
 8   Property Type     264 non-null    object 
 9   Residential Type  264 non-null    object 
 10  Non Use Code      264 non-null    object 
 11  Assessor Remarks  264 non-null    object 
 12  OPM remarks       264 non-null    object 
 13  Location          264 non-null    object 
dtypes: float64(3), int64(2), object(9)
memory usage: 30.9+ KB


import locale

# Set the locale to your preferred format (e.g., US English)
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

# Calculate the rounded max
rounded_max = round(real_estate_df['Assessed Value'].max())

# Format and display as an accounting value
formatted_max = locale.currency(rounded_max, grouping=True)
print(formatted_max)

$2,083,410.00


import locale

# Set the locale to your preferred format (e.g., US English)
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

# Calculate the rounded mean
rounded_mean = round(real_estate_df['Assessed Value'].mean())

# Format and display as an accounting value
formatted_mean = locale.currency(rounded_mean, grouping=True)
print(formatted_mean)

$170,127.00


# Load in some packages
import calendar
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations
from collections import Counter

# Group by 'Town' and calculate the sum of 'Assesed Values'
towns_real_estate_values = real_estate_df.groupby('Town')['Assessed Value'].sum()

# Sort the values in descending order and select the top ten
top_ten_towns = towns_real_estate_values.sort_values(ascending=False).head(10)

# Display the result
print(top_ten_towns)

# Plot the top ten towns
top_ten_towns.plot(kind='bar', color='skyblue', figsize=(10, 6))
plt.title('Top Ten Towns with Most Assesed Values')
plt.xlabel('Town')
plt.ylabel('Assessed Value')
plt.show()

Town
Danbury        9146700.0
Norwalk        6698810.0
East Lyme      2690590.0
New Milford    2644360.0
Farmington     1493760.0
Bridgeport     1363515.0
East Haven     1324850.0
New Haven      1172080.0
Cromwell       1038730.0
Woodbridge      946330.0
Name: Assessed Value, dtype: float64


# Group by 'Property Type' and calculate the sum of 'Assesed Values'
Property_Type_real_estate_values = real_estate_df.groupby('Property Type')['Assessed Value'].sum()

# Sort the values in descending order and select the top ten
top_ten_Property_Type = Property_Type_real_estate_values.sort_values(ascending=False).head(10)

# Display the result
print(top_ten_Property_Type)

# Plot the top ten Property Values
top_ten_Property_Type.plot(kind='bar', color='red', figsize=(10, 6))
plt.title('Top Ten top Property Type as per  Assesed Values')
plt.xlabel('Property Type')
plt.ylabel('Assessed Value')
plt.show()

Property Type
Residential      19210790.0
Single Family    17872740.0
Condo             4319350.0
Three Family      1445475.0
Two Family        1326690.0
Four Family        738400.0
Name: Assessed Value, dtype: float64


# Group by 'Residential Type and calculate the sum of 'Assessed Value'
Residential_Type_real_estate_values = real_estate_df.groupby('Residential Type')['Assessed Value'].sum()

# Sort the values in descending order and select the top ten
top_ten_Residential_Type = Residential_Type_real_estate_values.sort_values(ascending=False).head(10)

# Display the result
print(top_ten_Residential_Type)

# Plot the top ten Property Values
top_ten_Residential_Type.plot(kind='bar', color='brown', figsize=(10, 6))
plt.title('Top Ten top Residential Type as per  Assesed Values')
plt.xlabel('Property Type')
plt.ylabel('Assesed Values')
plt.show()

Residential Type
Single Family    32029990.0
Condo             7027790.0
Two Family        3324000.0
Three Family      1793265.0
Four Family        738400.0
Name: Assessed Value, dtype: float64


# lets fix up the dates
real_estate_df['Date Recorded'] = pd.to_datetime(real_estate_df['Date Recorded'])
real_estate_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264 entries, 806 to 993143
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Serial Number     264 non-null    int64         
 1   List Year         264 non-null    int64         
 2   Date Recorded     264 non-null    datetime64[ns]
 3   Town              264 non-null    object        
 4   Address           264 non-null    object        
 5   Assessed Value    264 non-null    float64       
 6   Sale Amount       264 non-null    float64       
 7   Sales Ratio       264 non-null    float64       
 8   Property Type     264 non-null    object        
 9   Residential Type  264 non-null    object        
 10  Non Use Code      264 non-null    object        
 11  Assessor Remarks  264 non-null    object        
 12  OPM remarks       264 non-null    object        
 13  Location          264 non-null    object        
dtypes: datetime64[ns](1), float64(3), int64(2), object(8)
memory usage: 30.9+ KB


# lets create new column for month
real_estate_df['Month Recorded'] = real_estate_df['Date Recorded'].dt.month
real_estate_df.head(3)


# lets create new column for Year
real_estate_df['Year Recorded'] = real_estate_df['Date Recorded'].dt.year
real_estate_df.head(3)


# Load in some packages
import calendar
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations
from collections import Counter

warnings.filterwarnings("ignore")

import calendar

# Group by month and sum the 'Sale Amount'
monthly_sales = real_estate_df.groupby('Month Recorded')['Sale Amount'].sum()

# Find the month with the highest sales
max_month = monthly_sales.idxmax()
earnings_for_best_month = sales_by_month.max()
print(f"The month with the highest sales is {max_month} with earning of {earnings_for_best_month:,.2f}")

# Plot the highest number of sales for each month
plt.figure(figsize=(10, 6))
plt.bar(monthly_sales.index, monthly_sales, color='green')
plt.xlabel('Month')
plt.ylabel('Total Sales Amount')
plt.title('Total Sales Amount for Each Month')
plt.xticks(rotation=45)
plt.show()

The month with the highest sales is 7 with earning of 331,269,169.00


import calendar

# Group by Year and sum the 'Sale Amount'
yearly_sales = real_estate_df.groupby('Year Recorded')['Sale Amount'].sum()

# Find the month with the highest sales
max_year = yearly_sales.idxmax()
earnings_for_best_year = yearly_sales.max()
print(f"The Year with the highest sales is {max_year} with earning of {earnings_for_best_year:,.2f}")

# Plot the highest number of sales for each month
plt.figure(figsize=(10, 6))
plt.bar(yearly_sales.index, yearly_sales, color='purple')
plt.xlabel('Year')
plt.ylabel('Total Sales Amount')
plt.title('Total Sales Amount for Each Month')
plt.xticks(rotation=45)
plt.show()

The Year with the highest sales is 2021 with earning of 352,540,596.00


# Group by 'Town' and calculate the sum of 'Assesed Values'
towns_real_estate_values = real_estate_df.groupby('Town')['Sale Amount'].sum()

# Sort the values in descending order and select the top ten
top_ten_towns = towns_real_estate_values.sort_values(ascending=False).head(10)

# Display the result
print(top_ten_towns)

# Plot the top ten counties
top_ten_towns.plot(kind='bar', color='skyblue', figsize=(10, 6))
plt.title('Top Ten most expensive Towns ')
plt.xlabel('Town')
plt.ylabel('Sale Amount')
plt.show()

Town
Willington     318790019.0
Danbury         17757439.0
Norwalk         15537677.0
East Lyme        8303750.0
New London       6148000.0
New Milford      3382500.0
Farmington       2826187.0
Bridgeport       1759267.0
Cromwell         1527500.0
East Haven       1468900.0
Name: Sale Amount, dtype: float64


# Group by 'Property Type' and calculate the sum of 'Assesed Values'
Property_Type_real_estate_values = real_estate_df.groupby('Property Type')['Sale Amount'].sum()

# Sort the values in descending order and select the top ten
top_ten_Property_Type = Property_Type_real_estate_values.sort_values(ascending=False).head(10)

# Display the result
print(top_ten_Property_Type)

# Plot the top ten Property Values
top_ten_Property_Type.plot(kind='bar', color='skyblue', figsize=(10, 6))
plt.title('Top Ten most expensive  Property Types ')
plt.xlabel('Property Type')
plt.ylabel('Sale Amount')
plt.show()

Property Type
Residential      362613502.0
Single Family     24145005.0
Four Family        6605000.0
Condo              6328190.0
Two Family         1765000.0
Three Family       1529000.0
Name: Sale Amount, dtype: float64


# Group by 'Residential Type and calculate the sum of 'Assessed Value'
Residential_Type_real_estate_values = real_estate_df.groupby('Residential Type')['Sale Amount'].sum()

# Sort the values in descending order and select the top ten
top_ten_Residential_Type = Residential_Type_real_estate_values.sort_values(ascending=False).head(10)

# Display the result
print(top_ten_Residential_Type)

# Plot the top ten Property Values
top_ten_Residential_Type.plot(kind='bar', color='skyblue', figsize=(10, 6))
plt.title('Top Ten top Residential Type with Most Sale Amount')
plt.xlabel('Residential Type')
plt.ylabel('Sale Amount')
plt.show()

Residential Type
Single Family    373705042.0
Condo             13935155.0
Four Family        6605000.0
Two Family         6131500.0
Three Family       2609000.0
Name: Sale Amount, dtype: float64

	Serial Number	List Year	Date Recorded	Town	Address	Assessed Value	Sale Amount	Sales Ratio	Property Type	Residential Type	Non Use Code	Assessor Remarks	OPM remarks	Location
0	2020348	2020	09/13/2021	Ansonia	230 WAKELEE AVE	150500.0	325000.0	0.4630	Commercial	NaN	NaN	NaN	NaN	NaN
1	20002	2020	10/02/2020	Ashford	390 TURNPIKE RD	253000.0	430000.0	0.5883	Residential	Single Family	NaN	NaN	NaN	NaN
2	200212	2020	03/09/2021	Avon	5 CHESTNUT DRIVE	130400.0	179900.0	0.7248	Residential	Condo	NaN	NaN	NaN	NaN
3	200243	2020	04/13/2021	Avon	111 NORTHINGTON DRIVE	619290.0	890000.0	0.6958	Residential	Single Family	NaN	NaN	NaN	NaN
4	200377	2020	07/02/2021	Avon	70 FAR HILLS DRIVE	862330.0	1447500.0	0.5957	Residential	Single Family	NaN	NaN	NaN	NaN

	Serial Number	List Year	Assessed Value	Sale Amount	Sales Ratio
count	2.640000e+02	264.000000	2.640000e+02	2.640000e+02	264.000000
mean	3.518540e+06	2018.598485	1.701267e+05	1.526461e+06	0.789047
std	2.454170e+07	1.318830	1.675329e+05	1.960760e+07	0.619845
min	1.701400e+04	2016.000000	3.770000e+03	4.500000e+03	0.000700
25%	1.702550e+05	2017.000000	9.030250e+04	1.193750e+05	0.442072
50%	1.901695e+05	2019.000000	1.378300e+05	1.888500e+05	0.647560
75%	2.007905e+05	2020.000000	1.992075e+05	3.526500e+05	1.018919
max	2.020002e+08	2020.000000	2.083410e+06	3.187900e+08	7.422222

Return Home

Real Estate Sales 2001-2020 GL EDA¶

1 What was the highest property value¶

2 What was the Average Propery Value¶

3 Whick top Ten Towns have the highest Property Values¶

4 Whick top ten Property Type have the highest property values¶

5 Whick top ten Residential Type have the highest property values¶

6. What was the best month for sales? How much was earned that month?¶

7. What was the best Year for sales? How much was earned that month?¶

8. What are the top ten Towns most expensive towns?¶

9. What are the 10 most exponsive properties based on property type category?¶

10 Whick top ten Residential Type have the highest Sale Prices¶

	Serial Number	List Year	Date Recorded	Town	Address	Assessed Value	Sale Amount	Sales Ratio	Property Type	Residential Type	Non Use Code	Assessor Remarks	OPM remarks	Location	Month Recorded	Year Recorded
806	200594	2020	2021-02-16	Danbury	8 HICKORY ST	121600.0	146216.0	0.831646	Residential	Single Family	25 - Other	I11192	HOUSE HAS SETTLED PER MLS	POINT (-73.44696 41.41179)	2	2021
981	200562	2020	2021-02-03	Danbury	19 MILL RD	263600.0	415000.0	0.635181	Residential	Single Family	25 - Other	AFFORDABLE HOUSING / B15001-20-19	INCORRECT DATA PER TOWN RECORDS	POINT (-73.53692 41.38822)	2	2021
1529	200260	2020	2020-11-23	Danbury	32 COALPIT HILL RD #4	84900.0	181778.0	0.467053	Residential	Condo	25 - Other	J16087-4	MULTIPLE UNIT SALE	POINT (-73.43796 41.38549)	11	2020