1. Load the Data

ames <- read_csv("../data/ames.csv") %>%
clean_names()
glimpse(ames)
## Rows: 2,930
## Columns: 82
## $ order           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ pid             <chr> "0526301100", "0526350040", "0526351010", "0526353030"…
## $ ms_sub_class    <chr> "020", "020", "020", "020", "060", "060", "120", "120"…
## $ ms_zoning       <chr> "RL", "RH", "RL", "RL", "RL", "RL", "RL", "RL", "RL", …
## $ lot_frontage    <dbl> 141, 80, 81, 93, 74, 78, 41, 43, 39, 60, 75, NA, 63, 8…
## $ lot_area        <dbl> 31770, 11622, 14267, 11160, 13830, 9978, 4920, 5005, 5…
## $ street          <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave"…
## $ alley           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ lot_shape       <chr> "IR1", "Reg", "IR1", "Reg", "IR1", "IR1", "Reg", "IR1"…
## $ land_contour    <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "HLS"…
## $ utilities       <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "All…
## $ lot_config      <chr> "Corner", "Inside", "Corner", "Corner", "Inside", "Ins…
## $ land_slope      <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl"…
## $ neighborhood    <chr> "NAmes", "NAmes", "NAmes", "NAmes", "Gilbert", "Gilber…
## $ condition_1     <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm…
## $ condition_2     <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm"…
## $ bldg_type       <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "Twnhs…
## $ house_style     <chr> "1Story", "1Story", "1Story", "1Story", "2Story", "2St…
## $ overall_qual    <dbl> 6, 5, 6, 7, 5, 6, 8, 8, 8, 7, 6, 6, 6, 7, 8, 8, 8, 9, …
## $ overall_cond    <dbl> 5, 6, 6, 5, 5, 6, 5, 5, 5, 5, 5, 7, 5, 5, 5, 5, 7, 2, …
## $ year_built      <dbl> 1960, 1961, 1958, 1968, 1997, 1998, 2001, 1992, 1995, …
## $ year_remod_add  <dbl> 1960, 1961, 1958, 1968, 1998, 1998, 2001, 1992, 1996, …
## $ roof_style      <chr> "Hip", "Gable", "Hip", "Hip", "Gable", "Gable", "Gable…
## $ roof_matl       <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg",…
## $ exterior_1st    <chr> "BrkFace", "VinylSd", "Wd Sdng", "BrkFace", "VinylSd",…
## $ exterior_2nd    <chr> "Plywood", "VinylSd", "Wd Sdng", "BrkFace", "VinylSd",…
## $ mas_vnr_type    <chr> "Stone", "None", "BrkFace", "None", "None", "BrkFace",…
## $ mas_vnr_area    <dbl> 112, 0, 108, 0, 0, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 603,…
## $ exter_qual      <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "Gd", "Gd", "Gd", …
## $ exter_cond      <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
## $ foundation      <chr> "CBlock", "CBlock", "CBlock", "CBlock", "PConc", "PCon…
## $ bsmt_qual       <chr> "TA", "TA", "TA", "TA", "Gd", "TA", "Gd", "Gd", "Gd", …
## $ bsmt_cond       <chr> "Gd", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
## $ bsmt_exposure   <chr> "Gd", "No", "No", "No", "No", "No", "Mn", "No", "No", …
## $ bsmt_fin_type_1 <chr> "BLQ", "Rec", "ALQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ"…
## $ bsmt_fin_sf_1   <dbl> 639, 468, 923, 1065, 791, 602, 616, 263, 1180, 0, 0, 9…
## $ bsmt_fin_type_2 <chr> "Unf", "LwQ", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf"…
## $ bsmt_fin_sf_2   <dbl> 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1120, 0, 0…
## $ bsmt_unf_sf     <dbl> 441, 270, 406, 1045, 137, 324, 722, 1017, 415, 994, 76…
## $ total_bsmt_sf   <dbl> 1080, 882, 1329, 2110, 928, 926, 1338, 1280, 1595, 994…
## $ heating         <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA"…
## $ heating_qc      <chr> "Fa", "TA", "TA", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", …
## $ central_air     <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",…
## $ electrical      <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", …
## $ x1st_flr_sf     <dbl> 1656, 896, 1329, 2110, 928, 926, 1338, 1280, 1616, 102…
## $ x2nd_flr_sf     <dbl> 0, 0, 0, 0, 701, 678, 0, 0, 0, 776, 892, 0, 676, 0, 0,…
## $ low_qual_fin_sf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ gr_liv_area     <dbl> 1656, 896, 1329, 2110, 1629, 1604, 1338, 1280, 1616, 1…
## $ bsmt_full_bath  <dbl> 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, …
## $ bsmt_half_bath  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ full_bath       <dbl> 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 3, 2, 1, …
## $ half_bath       <dbl> 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, …
## $ bedroom_abv_gr  <dbl> 3, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 3, 2, 1, 4, 4, 1, …
## $ kitchen_abv_gr  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ kitchen_qual    <chr> "TA", "TA", "Gd", "Ex", "TA", "Gd", "Gd", "Gd", "Gd", …
## $ tot_rms_abv_grd <dbl> 7, 5, 6, 8, 6, 7, 6, 5, 5, 7, 7, 6, 7, 5, 4, 12, 8, 8,…
## $ functional      <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ"…
## $ fireplaces      <dbl> 2, 0, 0, 2, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, …
## $ fireplace_qu    <chr> "Gd", NA, NA, "TA", "TA", "Gd", NA, NA, "TA", "TA", "T…
## $ garage_type     <chr> "Attchd", "Attchd", "Attchd", "Attchd", "Attchd", "Att…
## $ garage_yr_blt   <dbl> 1960, 1961, 1958, 1968, 1997, 1998, 2001, 1992, 1995, …
## $ garage_finish   <chr> "Fin", "Unf", "Unf", "Fin", "Fin", "Fin", "Fin", "RFn"…
## $ garage_cars     <dbl> 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, …
## $ garage_area     <dbl> 528, 730, 312, 522, 482, 470, 582, 506, 608, 442, 440,…
## $ garage_qual     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
## $ garage_cond     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
## $ paved_drive     <chr> "P", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",…
## $ wood_deck_sf    <dbl> 210, 140, 393, 0, 212, 360, 0, 0, 237, 140, 157, 483, …
## $ open_porch_sf   <dbl> 62, 0, 36, 0, 34, 36, 0, 82, 152, 60, 84, 21, 75, 0, 5…
## $ enclosed_porch  <dbl> 0, 0, 0, 0, 0, 0, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ x3ssn_porch     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ screen_porch    <dbl> 0, 120, 0, 0, 0, 0, 0, 144, 0, 0, 0, 0, 0, 0, 140, 210…
## $ pool_area       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pool_qc         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ fence           <chr> NA, "MnPrv", NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, …
## $ misc_feature    <chr> NA, NA, "Gar2", NA, NA, NA, NA, NA, NA, NA, NA, "Shed"…
## $ misc_val        <dbl> 0, 0, 12500, 0, 0, 0, 0, 0, 0, 0, 0, 500, 0, 0, 0, 0, …
## $ mo_sold         <dbl> 5, 6, 6, 4, 3, 6, 4, 1, 3, 6, 4, 3, 5, 2, 6, 6, 6, 6, …
## $ yr_sold         <dbl> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, …
## $ sale_type       <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", …
## $ sale_condition  <chr> "Normal", "Normal", "Normal", "Normal", "Normal", "Nor…
## $ sale_price      <dbl> 215000, 105000, 172000, 244000, 189900, 195500, 213500…

2. Summary Statistics

skim(ames)
Data summary
Name ames
Number of rows 2930
Number of columns 82
_______________________
Column type frequency:
character 45
numeric 37
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
pid 0 1.00 10 10 0 2930 0
ms_sub_class 0 1.00 3 3 0 16 0
ms_zoning 0 1.00 2 7 0 7 0
street 0 1.00 4 4 0 2 0
alley 2732 0.07 4 4 0 2 0
lot_shape 0 1.00 3 3 0 4 0
land_contour 0 1.00 3 3 0 4 0
utilities 0 1.00 6 6 0 3 0
lot_config 0 1.00 3 7 0 5 0
land_slope 0 1.00 3 3 0 3 0
neighborhood 0 1.00 5 7 0 28 0
condition_1 0 1.00 4 6 0 9 0
condition_2 0 1.00 4 6 0 8 0
bldg_type 0 1.00 4 6 0 5 0
house_style 0 1.00 4 6 0 8 0
roof_style 0 1.00 3 7 0 6 0
roof_matl 0 1.00 4 7 0 8 0
exterior_1st 0 1.00 5 7 0 16 0
exterior_2nd 0 1.00 5 7 0 17 0
mas_vnr_type 23 0.99 4 7 0 5 0
exter_qual 0 1.00 2 2 0 4 0
exter_cond 0 1.00 2 2 0 5 0
foundation 0 1.00 4 6 0 6 0
bsmt_qual 80 0.97 2 2 0 5 0
bsmt_cond 80 0.97 2 2 0 5 0
bsmt_exposure 83 0.97 2 2 0 4 0
bsmt_fin_type_1 80 0.97 3 3 0 6 0
bsmt_fin_type_2 81 0.97 3 3 0 6 0
heating 0 1.00 4 5 0 6 0
heating_qc 0 1.00 2 2 0 5 0
central_air 0 1.00 1 1 0 2 0
electrical 1 1.00 3 5 0 5 0
kitchen_qual 0 1.00 2 2 0 5 0
functional 0 1.00 3 4 0 8 0
fireplace_qu 1422 0.51 2 2 0 5 0
garage_type 157 0.95 6 7 0 6 0
garage_finish 159 0.95 3 3 0 3 0
garage_qual 159 0.95 2 2 0 5 0
garage_cond 159 0.95 2 2 0 5 0
paved_drive 0 1.00 1 1 0 3 0
pool_qc 2917 0.00 2 2 0 4 0
fence 2358 0.20 4 5 0 4 0
misc_feature 2824 0.04 4 4 0 5 0
sale_type 0 1.00 2 5 0 10 0
sale_condition 0 1.00 6 7 0 6 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
order 0 1.00 1465.50 845.96 1 733.25 1465.5 2197.75 2930 ▇▇▇▇▇
lot_frontage 490 0.83 69.22 23.37 21 58.00 68.0 80.00 313 ▇▃▁▁▁
lot_area 0 1.00 10147.92 7880.02 1300 7440.25 9436.5 11555.25 215245 ▇▁▁▁▁
overall_qual 0 1.00 6.09 1.41 1 5.00 6.0 7.00 10 ▁▂▇▅▁
overall_cond 0 1.00 5.56 1.11 1 5.00 5.0 6.00 9 ▁▁▇▅▁
year_built 0 1.00 1971.36 30.25 1872 1954.00 1973.0 2001.00 2010 ▁▂▃▆▇
year_remod_add 0 1.00 1984.27 20.86 1950 1965.00 1993.0 2004.00 2010 ▅▂▂▃▇
mas_vnr_area 23 0.99 101.90 179.11 0 0.00 0.0 164.00 1600 ▇▁▁▁▁
bsmt_fin_sf_1 1 1.00 442.63 455.59 0 0.00 370.0 734.00 5644 ▇▁▁▁▁
bsmt_fin_sf_2 1 1.00 49.72 169.17 0 0.00 0.0 0.00 1526 ▇▁▁▁▁
bsmt_unf_sf 1 1.00 559.26 439.49 0 219.00 466.0 802.00 2336 ▇▅▂▁▁
total_bsmt_sf 1 1.00 1051.61 440.62 0 793.00 990.0 1302.00 6110 ▇▃▁▁▁
x1st_flr_sf 0 1.00 1159.56 391.89 334 876.25 1084.0 1384.00 5095 ▇▃▁▁▁
x2nd_flr_sf 0 1.00 335.46 428.40 0 0.00 0.0 703.75 2065 ▇▃▂▁▁
low_qual_fin_sf 0 1.00 4.68 46.31 0 0.00 0.0 0.00 1064 ▇▁▁▁▁
gr_liv_area 0 1.00 1499.69 505.51 334 1126.00 1442.0 1742.75 5642 ▇▇▁▁▁
bsmt_full_bath 2 1.00 0.43 0.52 0 0.00 0.0 1.00 3 ▇▆▁▁▁
bsmt_half_bath 2 1.00 0.06 0.25 0 0.00 0.0 0.00 2 ▇▁▁▁▁
full_bath 0 1.00 1.57 0.55 0 1.00 2.0 2.00 4 ▁▇▇▁▁
half_bath 0 1.00 0.38 0.50 0 0.00 0.0 1.00 2 ▇▁▅▁▁
bedroom_abv_gr 0 1.00 2.85 0.83 0 2.00 3.0 3.00 8 ▁▇▂▁▁
kitchen_abv_gr 0 1.00 1.04 0.21 0 1.00 1.0 1.00 3 ▁▇▁▁▁
tot_rms_abv_grd 0 1.00 6.44 1.57 2 5.00 6.0 7.00 15 ▁▇▂▁▁
fireplaces 0 1.00 0.60 0.65 0 0.00 1.0 1.00 4 ▇▇▁▁▁
garage_yr_blt 159 0.95 1978.13 25.53 1895 1960.00 1979.0 2002.00 2207 ▂▇▁▁▁
garage_cars 1 1.00 1.77 0.76 0 1.00 2.0 2.00 5 ▅▇▂▁▁
garage_area 1 1.00 472.82 215.05 0 320.00 480.0 576.00 1488 ▃▇▃▁▁
wood_deck_sf 0 1.00 93.75 126.36 0 0.00 0.0 168.00 1424 ▇▁▁▁▁
open_porch_sf 0 1.00 47.53 67.48 0 0.00 27.0 70.00 742 ▇▁▁▁▁
enclosed_porch 0 1.00 23.01 64.14 0 0.00 0.0 0.00 1012 ▇▁▁▁▁
x3ssn_porch 0 1.00 2.59 25.14 0 0.00 0.0 0.00 508 ▇▁▁▁▁
screen_porch 0 1.00 16.00 56.09 0 0.00 0.0 0.00 576 ▇▁▁▁▁
pool_area 0 1.00 2.24 35.60 0 0.00 0.0 0.00 800 ▇▁▁▁▁
misc_val 0 1.00 50.64 566.34 0 0.00 0.0 0.00 17000 ▇▁▁▁▁
mo_sold 0 1.00 6.22 2.71 1 4.00 6.0 8.00 12 ▅▆▇▃▃
yr_sold 0 1.00 2007.79 1.32 2006 2007.00 2008.0 2009.00 2010 ▇▇▇▇▃
sale_price 0 1.00 180796.06 79886.69 12789 129500.00 160000.0 213500.00 755000 ▇▇▁▁▁

3. Missing Values Overview

ames %>% 
summarise(across(everything(), ~sum(is.na(.)))) %>% 
gather(variable, missing_count) %>% 
arrange(desc(missing_count))
## # A tibble: 82 × 2
##    variable      missing_count
##    <chr>                 <int>
##  1 pool_qc                2917
##  2 misc_feature           2824
##  3 alley                  2732
##  4 fence                  2358
##  5 fireplace_qu           1422
##  6 lot_frontage            490
##  7 garage_yr_blt           159
##  8 garage_finish           159
##  9 garage_qual             159
## 10 garage_cond             159
## # ℹ 72 more rows

4. Key Numeric Distributions

ames %>% 
select(sale_price, gr_liv_area, total_bsmt_sf, garage_area, lot_area) %>%
gather(variable, value) %>%
ggplot(aes(value)) +
geom_histogram(bins = 30) +
facet_wrap(~ variable, scales = "free") +
theme_minimal()

5. Pairwise Relationships

ames %>%
drop_na(sale_price, gr_liv_area, overall_qual, garage_area, year_built) %>%
select(sale_price, gr_liv_area, overall_qual, garage_area, year_built) %>%
ggpairs()

6. Data Cleaning & Feature Engineering

ames_clean <- ames %>% 
mutate(
across(where(is.character), as.factor),
fireplace_qu = fct_na_value_to_level(fireplace_qu, "None"),
garage_qual  = fct_na_value_to_level(garage_qual, "None"),
garage_type  = fct_na_value_to_level(garage_type, "None"),
alley        = fct_na_value_to_level(alley, "None"),
pool_qc      = fct_na_value_to_level(pool_qc, "None"),
misc_feature = fct_na_value_to_level(misc_feature, "None")
)

6.2 Check Missing Values After Cleaning

ames_clean %>% 
summarise(across(everything(), ~sum(is.na(.)))) %>% 
gather(variable, missing_count) %>% 
filter(missing_count > 0) %>%
arrange(desc(missing_count))
## # A tibble: 21 × 2
##    variable        missing_count
##    <chr>                   <int>
##  1 fence                    2328
##  2 lot_frontage              489
##  3 garage_yr_blt             159
##  4 garage_finish             159
##  5 garage_cond               159
##  6 bsmt_exposure              83
##  7 bsmt_fin_type_2            81
##  8 bsmt_qual                  80
##  9 bsmt_cond                  80
## 10 bsmt_fin_type_1            80
## # ℹ 11 more rows

6.3 Train/Test Split

data_split <- initial_split(ames_clean, prop = 0.8, strata = sale_price)
train_data <- training(data_split)
test_data  <- testing(data_split)

train_data %>% head()
## # A tibble: 6 × 82
##   order pid        ms_sub_class ms_zoning lot_frontage lot_area street alley
##   <dbl> <fct>      <fct>        <fct>            <dbl>    <dbl> <fct>  <fct>
## 1    27 0527404120 020          RL                  70     8400 Pave   None 
## 2    30 0527451180 160          RM                  21     1680 Pave   None 
## 3    31 0527451330 160          RM                  21     1680 Pave   None 
## 4    32 0527451410 160          RM                  21     1680 Pave   None 
## 5    33 0527452190 120          RL                  53     4043 Pave   None 
## 6    35 0527453150 120          RL                  24     2280 Pave   None 
## # ℹ 74 more variables: lot_shape <fct>, land_contour <fct>, utilities <fct>,
## #   lot_config <fct>, land_slope <fct>, neighborhood <fct>, condition_1 <fct>,
## #   condition_2 <fct>, bldg_type <fct>, house_style <fct>, overall_qual <dbl>,
## #   overall_cond <dbl>, year_built <dbl>, year_remod_add <dbl>,
## #   roof_style <fct>, roof_matl <fct>, exterior_1st <fct>, exterior_2nd <fct>,
## #   mas_vnr_type <fct>, mas_vnr_area <dbl>, exter_qual <fct>, exter_cond <fct>,
## #   foundation <fct>, bsmt_qual <fct>, bsmt_cond <fct>, bsmt_exposure <fct>, …
train_model <- train_data %>%
  mutate(log_sale_price = log(sale_price + 1)) %>%
  select(-sale_price)

test_model <- test_data %>%
  mutate(log_sale_price = log(sale_price + 1)) %>%
  select(-sale_price)

7. Feature Engineering Recipe (Tidymodels)

numeric_vars <- train_data %>%
  select(where(is.numeric)) %>%
  select(-sale_price) %>% 
  pivot_longer(everything(), names_to = "var", values_to = "value") %>%
  group_by(var) %>%
  summarise(skew = abs(skewness(value, na.rm = TRUE)), .groups = "drop") %>%
  filter(skew > 1) %>%
  pull(var)

numeric_vars
##  [1] "bsmt_fin_sf_2"   "bsmt_half_bath"  "enclosed_porch"  "kitchen_abv_gr" 
##  [5] "lot_area"        "lot_frontage"    "low_qual_fin_sf" "mas_vnr_area"   
##  [9] "misc_val"        "open_porch_sf"   "pool_area"       "screen_porch"   
## [13] "wood_deck_sf"    "x3ssn_porch"

7.1 Build the Recipe

ames_recipe <- recipe(log_sale_price ~ ., data = train_model) %>%

  # Imputation
  step_impute_median(all_numeric_predictors()) %>%
  step_impute_mode(all_nominal_predictors()) %>%

  # Log-transform skewed predictors
  step_log(all_of(numeric_vars), offset = 1e-6) %>%

  # Encoding
  step_dummy(all_nominal_predictors()) %>%

  # Cleanup
  step_zv(all_predictors()) %>%
  step_lincomb(all_predictors()) %>%
  step_corr(all_numeric_predictors(), threshold = 0.9) %>%
  step_normalize(all_numeric_predictors())

7.2 Prep the Recipe

ames_prep <- prep(ames_recipe)
train_processed <- bake(ames_prep, new_data = NULL)
test_processed <- bake(ames_prep, new_data = test_model)
train_processed %>% head()
## # A tibble: 6 × 2,316
##   lot_frontage lot_area overall_qual overall_cond year_built year_remod_add
##          <dbl>    <dbl>        <dbl>        <dbl>      <dbl>          <dbl>
## 1        0.201  -0.0961      -1.50         -0.522   -0.0279          -0.669
## 2       -3.47   -3.26        -0.0357       -0.522    0.00526         -0.622
## 3       -3.47   -3.26        -0.767        -0.522    0.00526         -0.622
## 4       -3.47   -3.26        -0.0357       -0.522    0.00526         -0.622
## 5       -0.647  -1.53        -0.0357       -0.522    0.204           -0.334
## 6       -3.06   -2.66         0.695         0.379    0.138           -0.430
## # ℹ 2,310 more variables: mas_vnr_area <dbl>, bsmt_fin_sf_1 <dbl>,
## #   bsmt_fin_sf_2 <dbl>, bsmt_unf_sf <dbl>, total_bsmt_sf <dbl>,
## #   x1st_flr_sf <dbl>, x2nd_flr_sf <dbl>, low_qual_fin_sf <dbl>,
## #   gr_liv_area <dbl>, bsmt_full_bath <dbl>, bsmt_half_bath <dbl>,
## #   full_bath <dbl>, half_bath <dbl>, bedroom_abv_gr <dbl>,
## #   kitchen_abv_gr <dbl>, tot_rms_abv_grd <dbl>, fireplaces <dbl>,
## #   garage_yr_blt <dbl>, garage_cars <dbl>, garage_area <dbl>, …

8. Model Training

folds <- vfold_cv(train_model, v = 5)

9. Linear Regression Model

lin_mod <- linear_reg() %>%
set_engine("lm")
lin_wf <- workflow() %>%
  add_model(lin_mod) %>%
  add_recipe(ames_recipe)

lin_res <- fit_resamples(
  lin_wf,
  folds,
  metrics = metric_set(rmse, rsq),
  control = control_resamples(save_pred = TRUE)
)
collect_metrics(lin_res)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config        
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>          
## 1 rmse    standard   0.764     5  0.0663 pre0_mod0_post0
## 2 rsq     standard   0.144     5  0.0429 pre0_mod0_post0

10. Random Forest Model

# 10.1 RF model specification
rf_mod <- rand_forest(
mtry  = tune(),
trees = 800,
min_n = tune()
) %>%
set_engine("ranger") %>%
set_mode("regression")

# 10.2 Workflow
rf_wf <- workflow() %>%
add_model(rf_mod) %>%
add_recipe(ames_recipe)

# 10.3 RF hyperparameter grid
rf_grid <- grid_regular(
mtry(range = c(5, 15)),
min_n(range = c(2, 20)),
levels = 5
)

# 10.4 Tune with CV
rf_res <- tune_grid(
rf_wf,
resamples = folds,
grid = rf_grid,
metrics = metric_set(rmse, rsq),
control = control_grid(save_pred = TRUE)
)
rf_res
## # Tuning results
## # 5-fold cross-validation 
## # A tibble: 5 × 5
##   splits             id    .metrics          .notes           .predictions
##   <list>             <chr> <list>            <list>           <list>      
## 1 <split [1852/464]> Fold1 <tibble [50 × 6]> <tibble [0 × 4]> <tibble>    
## 2 <split [1853/463]> Fold2 <tibble [50 × 6]> <tibble [0 × 4]> <tibble>    
## 3 <split [1853/463]> Fold3 <tibble [50 × 6]> <tibble [0 × 4]> <tibble>    
## 4 <split [1853/463]> Fold4 <tibble [50 × 6]> <tibble [0 × 4]> <tibble>    
## 5 <split [1853/463]> Fold5 <tibble [50 × 6]> <tibble [0 × 4]> <tibble>

11. Select best RF model

rf_best <- select_best(rf_res, metric = "rmse")

rf_final_wf <- finalize_workflow(rf_wf, rf_best)

rf_final_fit <- fit(rf_final_wf, data = train_model)

rf_test_preds <- predict(
  rf_final_fit,
  new_data = test_model
) %>%
  mutate(
    sale_price = test_data$sale_price,
    .pred = exp(.pred) - 1
  )

rf_metrics <- rf_test_preds %>%
  metrics(truth = sale_price, estimate = .pred)

rf_metrics
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard   42998.   
## 2 rsq     standard       0.835
## 3 mae     standard   26760.

12. XGBoost Model

library(xgboost)

# 12.1 Model specification
xgb_mod <- boost_tree(
  trees = tune(),
  learn_rate = tune(),
  mtry = tune(),
  tree_depth = tune(),
  loss_reduction = tune(),
  min_n = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

# 12.2 Workflow
xgb_wf <- workflow() %>%
  add_recipe(ames_recipe) %>%
  add_model(xgb_mod)

# 12.3 Improved tuning grid
xgb_grid <- grid_space_filling(
  trees(range = c(500, 1500)),
  learn_rate(range = c(0.05, 0.3)),
  mtry(range = c(10, 40)),
  tree_depth(range = c(4, 10)),
  loss_reduction(range = c(0, 5)),
  min_n(range = c(5, 30)),
  size = 20
)

# 12.4 Tune with cross-validation
set.seed(123)
xgb_res <- tune_grid(
  xgb_wf,
  resamples = folds,
  grid = xgb_grid,
  metrics = metric_set(rmse, rsq),
  control = control_grid(
    save_pred = TRUE,
    verbose = FALSE
  )
)
xgb_res
## # Tuning results
## # 5-fold cross-validation 
## # A tibble: 5 × 5
##   splits             id    .metrics           .notes            .predictions
##   <list>             <chr> <list>             <list>            <list>      
## 1 <split [1852/464]> Fold1 <tibble [40 × 10]> <tibble [12 × 4]> <tibble>    
## 2 <split [1853/463]> Fold2 <tibble [40 × 10]> <tibble [12 × 4]> <tibble>    
## 3 <split [1853/463]> Fold3 <tibble [40 × 10]> <tibble [12 × 4]> <tibble>    
## 4 <split [1853/463]> Fold4 <tibble [40 × 10]> <tibble [12 × 4]> <tibble>    
## 5 <split [1853/463]> Fold5 <tibble [40 × 10]> <tibble [12 × 4]> <tibble>    
## 
## There were issues with some computations:
## 
##   - Warning(s) x60: A correlation computation is required, but `estimate` is constant...
## 
## Run `show_notes(.Last.tune.result)` for more information.

13. Select Best XGBoost Model

xgb_best <- select_best(xgb_res, metric = "rmse")

xgb_final_wf <- finalize_workflow(xgb_wf, xgb_best)

xgb_final_fit <- fit(xgb_final_wf, data = train_model)

xgb_test_preds <- predict(
  xgb_final_fit,
  new_data = test_model
) %>%
  mutate(
    sale_price = test_data$sale_price,
    .pred = exp(.pred) - 1
  )

xgb_metrics <- xgb_test_preds %>%
  metrics(truth = sale_price, estimate = .pred)

xgb_metrics
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard   32330.   
## 2 rsq     standard       0.808
## 3 mae     standard   23579.

14. Model Comparison Table (Linear vs RF vs XGB)

model_comparison <- bind_rows(
  lin_res %>%
    collect_metrics() %>%
    filter(.metric == "rmse") %>%
    mutate(model = "Linear Regression"),
  
  rf_res %>%
    collect_metrics() %>%
    filter(.metric == "rmse") %>%
    mutate(model = "Random Forest"),
  
  xgb_res %>%
    collect_metrics() %>%
    filter(.metric == "rmse") %>%
    mutate(model = "XGBoost")
) %>%
  select(model, mean, std_err) %>%
  arrange(mean)

model_comparison
## # A tibble: 46 × 3
##    model          mean std_err
##    <chr>         <dbl>   <dbl>
##  1 XGBoost       0.200 0.00778
##  2 XGBoost       0.208 0.0122 
##  3 XGBoost       0.223 0.00664
##  4 Random Forest 0.224 0.00926
##  5 Random Forest 0.224 0.00981
##  6 Random Forest 0.224 0.00987
##  7 Random Forest 0.224 0.00990
##  8 Random Forest 0.224 0.00978
##  9 Random Forest 0.238 0.00924
## 10 Random Forest 0.238 0.00982
## # ℹ 36 more rows

15. Cross-Validation RMSE Plot (All Models)

library(ggplot2)

cv_results <- bind_rows(
  lin_res %>% collect_metrics() %>% filter(.metric == "rmse") %>% mutate(model = "Linear"),
  rf_res  %>% collect_metrics() %>% filter(.metric == "rmse") %>% mutate(model = "Random Forest"),
  xgb_res %>% collect_metrics() %>% filter(.metric == "rmse") %>% mutate(model = "XGBoost")
)

ggplot(cv_results, aes(x = model, y = mean, fill = model)) +
  geom_col(width = 0.6) +
  scale_fill_manual(
    values = c(
      "Linear" = "#4C72B0",
      "Random Forest" = "#55A868",
      "XGBoost" = "#C44E52"
    )
  ) +
  labs(
    title = "Cross-Validation RMSE Comparison (Log Sale Price)",
    x = "",
    y = "RMSE"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

16. Final Test-Set Predictions Plot

ggplot(xgb_test_preds, aes(x = sale_price, y = .pred)) +
  geom_point(alpha = 0.4, color = "#4C72B0") +
  geom_abline(slope = 1, intercept = 0,
              linetype = "dashed",
              color = "#C44E52") +
  labs(
    title = "XGBoost Predictions vs Actual Sale Price",
    x = "Actual Sale Price",
    y = "Predicted Sale Price"
  ) +
  theme_minimal(base_size = 13)

17. Residual Diagnostics (XGBoost) - Residuals are the standard way to show model fit quality

xgb_test_preds <- xgb_test_preds %>%
  mutate(residual = sale_price - .pred)

ggplot(xgb_test_preds, aes(x = .pred, y = residual)) +
  geom_point(alpha = 0.35, color = "#55A868") +
  geom_hline(yintercept = 0,
             linetype = "dashed",
             color = "#C44E52") +
  labs(
    title = "Residuals vs Predicted Values (XGBoost)",
    x = "Predicted Sale Price",
    y = "Residuals"
  ) +
  theme_minimal(base_size = 13)

18. Distribution of Prediction Errors - Standard check for error symmetry, well-behaviour, and not wild skewness

ggplot(xgb_test_preds, aes(residual)) +
  geom_histogram(bins = 30,
                 fill = "#4C72B0",
                 alpha = 0.85) +
  labs(
    title = "Distribution of Prediction Errors (XGBoost)",
    x = "Prediction Error",
    y = "Count"
  ) +
  theme_minimal(base_size = 13)

19: Real-Life Impact of Price Predictions

Among the models, XGBoost demonstrated the strongest predictive performance on the test set. The XGBoost model is very accurate in predicting actual house prices:

# Add prediction errors
xgb_test_preds <- xgb_test_preds %>%
  mutate(
    error = abs(.pred - sale_price),
    pct_error = abs(.pred - sale_price) / sale_price * 100,
    underpriced = .pred > sale_price,
    potential_gain = if_else(underpriced, .pred - sale_price, 0)
  )

# 1. Model R² report

rsq_value <- rsq(xgb_test_preds, truth = sale_price, estimate = .pred)$.estimate
rsq_value
## [1] 0.80782
cat("The XGBoost model achieves an R² of", round(rsq_value, 3), "on the test set, indicating strong predictive performance.\n")
## The XGBoost model achieves an R² of 0.808 on the test set, indicating strong predictive performance.
# 2. Plot predicted vs actual prices

ggplot(xgb_test_preds, aes(x=.pred, y=sale_price)) +
  geom_point(alpha=0.5, color="blue") +
  geom_abline(slope=1, intercept=0, color='red', linetype="dashed") +
  labs(
    title="Predicted vs Actual Sale Price",
    x="Predicted Price",
    y="Actual Sale Price"
  ) +
  theme_minimal()

20. Real-World Prediction Example:

# This simulates a real client scenario, where a new house is defined and the predictive model predicts its market value based on the data it trained on.
new_house <- test_data[1, ] %>%
  mutate(
    lot_area = 8500,
    gr_liv_area = 1800,
    overall_qual = 7,
    overall_cond = 5,
    year_built = 2005,
    garage_area = 450,
    total_bsmt_sf = 900,
    neighborhood = "NridgHt",
    house_style = "2Story",
    bldg_type = "1Fam",
    foundation = "PConc"
  ) %>%
  select(-sale_price)

new_house_pred <- predict(
  xgb_final_fit,
  new_data = new_house
) %>%
  mutate(predicted_price = exp(.pred) - 1)
new_house_pred
## # A tibble: 1 × 2
##   .pred predicted_price
##   <dbl>           <dbl>
## 1  11.8         128642.

21. Feature Importance: What actually drives price?

vip(
  xgb_final_fit,
  num_features = 15,
  geom = "col",
  aesthetics = list(fill = "#4C72B0")
) +
  labs(
    title = "Top 15 Important Features in House Price Prediction",
    x = "Importance",
    y = ""
  ) +
  theme_minimal(base_size = 13)

22. Sensitivity Analysis: Living Area vs Price

template_house <- test_data[1, ] %>%
  select(-sale_price)

area_grid <- template_house[rep(1, 23), ] %>%
  mutate(
    gr_liv_area = seq(800, 3000, by = 100),
    lot_area = 8500,
    overall_qual = 7,
    overall_cond = 5,
    year_built = 2005,
    garage_area = 450,
    total_bsmt_sf = 900,
    neighborhood = "NridgHt",
    house_style = "2Story",
    bldg_type = "1Fam",
    foundation = "PConc"
  )

area_preds <- predict(
  xgb_final_fit,
  new_data = area_grid
) %>%
  mutate(predicted_price = exp(.pred) - 1) %>%
  bind_cols(area_grid)

ggplot(area_preds, aes(gr_liv_area, predicted_price)) +
  geom_point(alpha = 0.4) +
  geom_line(linewidth = 1.2) +
  labs(
    title = "Effect of Living Area on Predicted House Price",
    x = "Above-Ground Living Area (sq ft)",
    y = "Predicted Sale Price"
  ) +
  theme_minimal(base_size = 13)

cat("The shape shows a sharp increase in predicted price from ~800 to ~1000 sq ft. Additional square footage adds little value afterwards.")
## The shape shows a sharp increase in predicted price from ~800 to ~1000 sq ft. Additional square footage adds little value afterwards.