1. Load the Data

ames <- read_csv("../data/ames.csv") %>%
clean_names()
glimpse(ames)

## Rows: 2,930
## Columns: 82
## $ order           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ pid             <chr> "0526301100", "0526350040", "0526351010", "0526353030"…
## $ ms_sub_class    <chr> "020", "020", "020", "020", "060", "060", "120", "120"…
## $ ms_zoning       <chr> "RL", "RH", "RL", "RL", "RL", "RL", "RL", "RL", "RL", …
## $ lot_frontage    <dbl> 141, 80, 81, 93, 74, 78, 41, 43, 39, 60, 75, NA, 63, 8…
## $ lot_area        <dbl> 31770, 11622, 14267, 11160, 13830, 9978, 4920, 5005, 5…
## $ street          <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave"…
## $ alley           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ lot_shape       <chr> "IR1", "Reg", "IR1", "Reg", "IR1", "IR1", "Reg", "IR1"…
## $ land_contour    <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "HLS"…
## $ utilities       <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "All…
## $ lot_config      <chr> "Corner", "Inside", "Corner", "Corner", "Inside", "Ins…
## $ land_slope      <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl"…
## $ neighborhood    <chr> "NAmes", "NAmes", "NAmes", "NAmes", "Gilbert", "Gilber…
## $ condition_1     <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm…
## $ condition_2     <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm"…
## $ bldg_type       <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "Twnhs…
## $ house_style     <chr> "1Story", "1Story", "1Story", "1Story", "2Story", "2St…
## $ overall_qual    <dbl> 6, 5, 6, 7, 5, 6, 8, 8, 8, 7, 6, 6, 6, 7, 8, 8, 8, 9, …
## $ overall_cond    <dbl> 5, 6, 6, 5, 5, 6, 5, 5, 5, 5, 5, 7, 5, 5, 5, 5, 7, 2, …
## $ year_built      <dbl> 1960, 1961, 1958, 1968, 1997, 1998, 2001, 1992, 1995, …
## $ year_remod_add  <dbl> 1960, 1961, 1958, 1968, 1998, 1998, 2001, 1992, 1996, …
## $ roof_style      <chr> "Hip", "Gable", "Hip", "Hip", "Gable", "Gable", "Gable…
## $ roof_matl       <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg",…
## $ exterior_1st    <chr> "BrkFace", "VinylSd", "Wd Sdng", "BrkFace", "VinylSd",…
## $ exterior_2nd    <chr> "Plywood", "VinylSd", "Wd Sdng", "BrkFace", "VinylSd",…
## $ mas_vnr_type    <chr> "Stone", "None", "BrkFace", "None", "None", "BrkFace",…
## $ mas_vnr_area    <dbl> 112, 0, 108, 0, 0, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 603,…
## $ exter_qual      <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "Gd", "Gd", "Gd", …
## $ exter_cond      <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
## $ foundation      <chr> "CBlock", "CBlock", "CBlock", "CBlock", "PConc", "PCon…
## $ bsmt_qual       <chr> "TA", "TA", "TA", "TA", "Gd", "TA", "Gd", "Gd", "Gd", …
## $ bsmt_cond       <chr> "Gd", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
## $ bsmt_exposure   <chr> "Gd", "No", "No", "No", "No", "No", "Mn", "No", "No", …
## $ bsmt_fin_type_1 <chr> "BLQ", "Rec", "ALQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ"…
## $ bsmt_fin_sf_1   <dbl> 639, 468, 923, 1065, 791, 602, 616, 263, 1180, 0, 0, 9…
## $ bsmt_fin_type_2 <chr> "Unf", "LwQ", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf"…
## $ bsmt_fin_sf_2   <dbl> 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1120, 0, 0…
## $ bsmt_unf_sf     <dbl> 441, 270, 406, 1045, 137, 324, 722, 1017, 415, 994, 76…
## $ total_bsmt_sf   <dbl> 1080, 882, 1329, 2110, 928, 926, 1338, 1280, 1595, 994…
## $ heating         <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA"…
## $ heating_qc      <chr> "Fa", "TA", "TA", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", …
## $ central_air     <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",…
## $ electrical      <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", …
## $ x1st_flr_sf     <dbl> 1656, 896, 1329, 2110, 928, 926, 1338, 1280, 1616, 102…
## $ x2nd_flr_sf     <dbl> 0, 0, 0, 0, 701, 678, 0, 0, 0, 776, 892, 0, 676, 0, 0,…
## $ low_qual_fin_sf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ gr_liv_area     <dbl> 1656, 896, 1329, 2110, 1629, 1604, 1338, 1280, 1616, 1…
## $ bsmt_full_bath  <dbl> 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, …
## $ bsmt_half_bath  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ full_bath       <dbl> 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 3, 2, 1, …
## $ half_bath       <dbl> 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, …
## $ bedroom_abv_gr  <dbl> 3, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 3, 2, 1, 4, 4, 1, …
## $ kitchen_abv_gr  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ kitchen_qual    <chr> "TA", "TA", "Gd", "Ex", "TA", "Gd", "Gd", "Gd", "Gd", …
## $ tot_rms_abv_grd <dbl> 7, 5, 6, 8, 6, 7, 6, 5, 5, 7, 7, 6, 7, 5, 4, 12, 8, 8,…
## $ functional      <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ"…
## $ fireplaces      <dbl> 2, 0, 0, 2, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, …
## $ fireplace_qu    <chr> "Gd", NA, NA, "TA", "TA", "Gd", NA, NA, "TA", "TA", "T…
## $ garage_type     <chr> "Attchd", "Attchd", "Attchd", "Attchd", "Attchd", "Att…
## $ garage_yr_blt   <dbl> 1960, 1961, 1958, 1968, 1997, 1998, 2001, 1992, 1995, …
## $ garage_finish   <chr> "Fin", "Unf", "Unf", "Fin", "Fin", "Fin", "Fin", "RFn"…
## $ garage_cars     <dbl> 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, …
## $ garage_area     <dbl> 528, 730, 312, 522, 482, 470, 582, 506, 608, 442, 440,…
## $ garage_qual     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
## $ garage_cond     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
## $ paved_drive     <chr> "P", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",…
## $ wood_deck_sf    <dbl> 210, 140, 393, 0, 212, 360, 0, 0, 237, 140, 157, 483, …
## $ open_porch_sf   <dbl> 62, 0, 36, 0, 34, 36, 0, 82, 152, 60, 84, 21, 75, 0, 5…
## $ enclosed_porch  <dbl> 0, 0, 0, 0, 0, 0, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ x3ssn_porch     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ screen_porch    <dbl> 0, 120, 0, 0, 0, 0, 0, 144, 0, 0, 0, 0, 0, 0, 140, 210…
## $ pool_area       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pool_qc         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ fence           <chr> NA, "MnPrv", NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, …
## $ misc_feature    <chr> NA, NA, "Gar2", NA, NA, NA, NA, NA, NA, NA, NA, "Shed"…
## $ misc_val        <dbl> 0, 0, 12500, 0, 0, 0, 0, 0, 0, 0, 0, 500, 0, 0, 0, 0, …
## $ mo_sold         <dbl> 5, 6, 6, 4, 3, 6, 4, 1, 3, 6, 4, 3, 5, 2, 6, 6, 6, 6, …
## $ yr_sold         <dbl> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, …
## $ sale_type       <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", …
## $ sale_condition  <chr> "Normal", "Normal", "Normal", "Normal", "Normal", "Nor…
## $ sale_price      <dbl> 215000, 105000, 172000, 244000, 189900, 195500, 213500…

2. Summary Statistics

skim(ames)

Data summary
Name	ames
Number of rows	2930
Number of columns	82
_______________________
Column type frequency:
character	45
numeric	37
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
pid	0	1.00	10	10	2930
ms_sub_class	0	1.00	3	3	16
ms_zoning	0	1.00	2	7	7
street	0	1.00	4	4	2
alley	2732	0.07	4	4	2
lot_shape	0	1.00	3	3	4
land_contour	0	1.00	3	3	4
utilities	0	1.00	6	6	3
lot_config	0	1.00	3	7	5
land_slope	0	1.00	3	3	3
neighborhood	0	1.00	5	7	28
condition_1	0	1.00	4	6	9
condition_2	0	1.00	4	6	8
bldg_type	0	1.00	4	6	5
house_style	0	1.00	4	6	8
roof_style	0	1.00	3	7	6
roof_matl	0	1.00	4	7	8
exterior_1st	0	1.00	5	7	16
exterior_2nd	0	1.00	5	7	17
mas_vnr_type	23	0.99	4	7	5
exter_qual	0	1.00	2	2	4
exter_cond	0	1.00	2	2	5
foundation	0	1.00	4	6	6
bsmt_qual	80	0.97	2	2	5
bsmt_cond	80	0.97	2	2	5
bsmt_exposure	83	0.97	2	2	4
bsmt_fin_type_1	80	0.97	3	3	6
bsmt_fin_type_2	81	0.97	3	3	6
heating	0	1.00	4	5	6
heating_qc	0	1.00	2	2	5
central_air	0	1.00	1	1	2
electrical	1	1.00	3	5	5
kitchen_qual	0	1.00	2	2	5
functional	0	1.00	3	4	8
fireplace_qu	1422	0.51	2	2	5
garage_type	157	0.95	6	7	6
garage_finish	159	0.95	3	3	3
garage_qual	159	0.95	2	2	5
garage_cond	159	0.95	2	2	5
paved_drive	0	1.00	1	1	3
pool_qc	2917	0.00	2	2	4
fence	2358	0.20	4	5	4
misc_feature	2824	0.04	4	4	5
sale_type	0	1.00	2	5	10
sale_condition	0	1.00	6	7	6

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
order	0	1.00	1465.50	845.96	1	733.25	1465.5	2197.75	2930	▇▇▇▇▇
lot_frontage	490	0.83	69.22	23.37	21	58.00	68.0	80.00	313	▇▃▁▁▁
lot_area	0	1.00	10147.92	7880.02	1300	7440.25	9436.5	11555.25	215245	▇▁▁▁▁
overall_qual	0	1.00	6.09	1.41	1	5.00	6.0	7.00	10	▁▂▇▅▁
overall_cond	0	1.00	5.56	1.11	1	5.00	5.0	6.00	9	▁▁▇▅▁
year_built	0	1.00	1971.36	30.25	1872	1954.00	1973.0	2001.00	2010	▁▂▃▆▇
year_remod_add	0	1.00	1984.27	20.86	1950	1965.00	1993.0	2004.00	2010	▅▂▂▃▇
mas_vnr_area	23	0.99	101.90	179.11	0	0.00	0.0	164.00	1600	▇▁▁▁▁
bsmt_fin_sf_1	1	1.00	442.63	455.59	0	0.00	370.0	734.00	5644	▇▁▁▁▁
bsmt_fin_sf_2	1	1.00	49.72	169.17	0	0.00	0.0	0.00	1526	▇▁▁▁▁
bsmt_unf_sf	1	1.00	559.26	439.49	0	219.00	466.0	802.00	2336	▇▅▂▁▁
total_bsmt_sf	1	1.00	1051.61	440.62	0	793.00	990.0	1302.00	6110	▇▃▁▁▁
x1st_flr_sf	0	1.00	1159.56	391.89	334	876.25	1084.0	1384.00	5095	▇▃▁▁▁
x2nd_flr_sf	0	1.00	335.46	428.40	0	0.00	0.0	703.75	2065	▇▃▂▁▁
low_qual_fin_sf	0	1.00	4.68	46.31	0	0.00	0.0	0.00	1064	▇▁▁▁▁
gr_liv_area	0	1.00	1499.69	505.51	334	1126.00	1442.0	1742.75	5642	▇▇▁▁▁
bsmt_full_bath	2	1.00	0.43	0.52	0	0.00	0.0	1.00	3	▇▆▁▁▁
bsmt_half_bath	2	1.00	0.06	0.25	0	0.00	0.0	0.00	2	▇▁▁▁▁
full_bath	0	1.00	1.57	0.55	0	1.00	2.0	2.00	4	▁▇▇▁▁
half_bath	0	1.00	0.38	0.50	0	0.00	0.0	1.00	2	▇▁▅▁▁
bedroom_abv_gr	0	1.00	2.85	0.83	0	2.00	3.0	3.00	8	▁▇▂▁▁
kitchen_abv_gr	0	1.00	1.04	0.21	0	1.00	1.0	1.00	3	▁▇▁▁▁
tot_rms_abv_grd	0	1.00	6.44	1.57	2	5.00	6.0	7.00	15	▁▇▂▁▁
fireplaces	0	1.00	0.60	0.65	0	0.00	1.0	1.00	4	▇▇▁▁▁
garage_yr_blt	159	0.95	1978.13	25.53	1895	1960.00	1979.0	2002.00	2207	▂▇▁▁▁
garage_cars	1	1.00	1.77	0.76	0	1.00	2.0	2.00	5	▅▇▂▁▁
garage_area	1	1.00	472.82	215.05	0	320.00	480.0	576.00	1488	▃▇▃▁▁
wood_deck_sf	0	1.00	93.75	126.36	0	0.00	0.0	168.00	1424	▇▁▁▁▁
open_porch_sf	0	1.00	47.53	67.48	0	0.00	27.0	70.00	742	▇▁▁▁▁
enclosed_porch	0	1.00	23.01	64.14	0	0.00	0.0	0.00	1012	▇▁▁▁▁
x3ssn_porch	0	1.00	2.59	25.14	0	0.00	0.0	0.00	508	▇▁▁▁▁
screen_porch	0	1.00	16.00	56.09	0	0.00	0.0	0.00	576	▇▁▁▁▁
pool_area	0	1.00	2.24	35.60	0	0.00	0.0	0.00	800	▇▁▁▁▁
misc_val	0	1.00	50.64	566.34	0	0.00	0.0	0.00	17000	▇▁▁▁▁
mo_sold	0	1.00	6.22	2.71	1	4.00	6.0	8.00	12	▅▆▇▃▃
yr_sold	0	1.00	2007.79	1.32	2006	2007.00	2008.0	2009.00	2010	▇▇▇▇▃
sale_price	0	1.00	180796.06	79886.69	12789	129500.00	160000.0	213500.00	755000	▇▇▁▁▁

3. Missing Values Overview

ames %>% 
summarise(across(everything(), ~sum(is.na(.)))) %>% 
gather(variable, missing_count) %>% 
arrange(desc(missing_count))

## # A tibble: 82 × 2
##    variable      missing_count
##    <chr>                 <int>
##  1 pool_qc                2917
##  2 misc_feature           2824
##  3 alley                  2732
##  4 fence                  2358
##  5 fireplace_qu           1422
##  6 lot_frontage            490
##  7 garage_yr_blt           159
##  8 garage_finish           159
##  9 garage_qual             159
## 10 garage_cond             159
## # ℹ 72 more rows

4. Key Numeric Distributions

ames %>% 
select(sale_price, gr_liv_area, total_bsmt_sf, garage_area, lot_area) %>%
gather(variable, value) %>%
ggplot(aes(value)) +
geom_histogram(bins = 30) +
facet_wrap(~ variable, scales = "free") +
theme_minimal()

5. Pairwise Relationships

ames %>%
drop_na(sale_price, gr_liv_area, overall_qual, garage_area, year_built) %>%
select(sale_price, gr_liv_area, overall_qual, garage_area, year_built) %>%
ggpairs()

6. Data Cleaning & Feature Engineering

ames_clean <- ames %>% 
mutate(
across(where(is.character), as.factor),
fireplace_qu = fct_na_value_to_level(fireplace_qu, "None"),
garage_qual  = fct_na_value_to_level(garage_qual, "None"),
garage_type  = fct_na_value_to_level(garage_type, "None"),
alley        = fct_na_value_to_level(alley, "None"),
pool_qc      = fct_na_value_to_level(pool_qc, "None"),
misc_feature = fct_na_value_to_level(misc_feature, "None")
)

6.1 Remove Highly Skewed Outliers (Highly recommended by most ML textbooks I have come across)

ames_clean <- ames_clean %>%
filter(
sale_price < quantile(sale_price, 0.99),  # remove top 1% price outliers
gr_liv_area < 4000                        # common Ames rule: drop mega-houses
)

6.2 Check Missing Values After Cleaning

ames_clean %>% 
summarise(across(everything(), ~sum(is.na(.)))) %>% 
gather(variable, missing_count) %>% 
filter(missing_count > 0) %>%
arrange(desc(missing_count))

## # A tibble: 21 × 2
##    variable        missing_count
##    <chr>                   <int>
##  1 fence                    2328
##  2 lot_frontage              489
##  3 garage_yr_blt             159
##  4 garage_finish             159
##  5 garage_cond               159
##  6 bsmt_exposure              83
##  7 bsmt_fin_type_2            81
##  8 bsmt_qual                  80
##  9 bsmt_cond                  80
## 10 bsmt_fin_type_1            80
## # ℹ 11 more rows

6.3 Train/Test Split

data_split <- initial_split(ames_clean, prop = 0.8, strata = sale_price)
train_data <- training(data_split)
test_data  <- testing(data_split)

train_data %>% head()

## # A tibble: 6 × 82
##   order pid        ms_sub_class ms_zoning lot_frontage lot_area street alley
##   <dbl> <fct>      <fct>        <fct>            <dbl>    <dbl> <fct>  <fct>
## 1    27 0527404120 020          RL                  70     8400 Pave   None 
## 2    30 0527451180 160          RM                  21     1680 Pave   None 
## 3    31 0527451330 160          RM                  21     1680 Pave   None 
## 4    32 0527451410 160          RM                  21     1680 Pave   None 
## 5    33 0527452190 120          RL                  53     4043 Pave   None 
## 6    35 0527453150 120          RL                  24     2280 Pave   None 
## # ℹ 74 more variables: lot_shape <fct>, land_contour <fct>, utilities <fct>,
## #   lot_config <fct>, land_slope <fct>, neighborhood <fct>, condition_1 <fct>,
## #   condition_2 <fct>, bldg_type <fct>, house_style <fct>, overall_qual <dbl>,
## #   overall_cond <dbl>, year_built <dbl>, year_remod_add <dbl>,
## #   roof_style <fct>, roof_matl <fct>, exterior_1st <fct>, exterior_2nd <fct>,
## #   mas_vnr_type <fct>, mas_vnr_area <dbl>, exter_qual <fct>, exter_cond <fct>,
## #   foundation <fct>, bsmt_qual <fct>, bsmt_cond <fct>, bsmt_exposure <fct>, …

train_model <- train_data %>%
  mutate(log_sale_price = log(sale_price + 1)) %>%
  select(-sale_price)

test_model <- test_data %>%
  mutate(log_sale_price = log(sale_price + 1)) %>%
  select(-sale_price)

7. Feature Engineering Recipe (Tidymodels)

numeric_vars <- train_data %>%
  select(where(is.numeric)) %>%
  select(-sale_price) %>% 
  pivot_longer(everything(), names_to = "var", values_to = "value") %>%
  group_by(var) %>%
  summarise(skew = abs(skewness(value, na.rm = TRUE)), .groups = "drop") %>%
  filter(skew > 1) %>%
  pull(var)

numeric_vars

##  [1] "bsmt_fin_sf_2"   "bsmt_half_bath"  "enclosed_porch"  "kitchen_abv_gr" 
##  [5] "lot_area"        "lot_frontage"    "low_qual_fin_sf" "mas_vnr_area"   
##  [9] "misc_val"        "open_porch_sf"   "pool_area"       "screen_porch"   
## [13] "wood_deck_sf"    "x3ssn_porch"

7.1 Build the Recipe

ames_recipe <- recipe(log_sale_price ~ ., data = train_model) %>%

  # Imputation
  step_impute_median(all_numeric_predictors()) %>%
  step_impute_mode(all_nominal_predictors()) %>%

  # Log-transform skewed predictors
  step_log(all_of(numeric_vars), offset = 1e-6) %>%

  # Encoding
  step_dummy(all_nominal_predictors()) %>%

  # Cleanup
  step_zv(all_predictors()) %>%
  step_lincomb(all_predictors()) %>%
  step_corr(all_numeric_predictors(), threshold = 0.9) %>%
  step_normalize(all_numeric_predictors())

7.2 Prep the Recipe

ames_prep <- prep(ames_recipe)
train_processed <- bake(ames_prep, new_data = NULL)
test_processed <- bake(ames_prep, new_data = test_model)
train_processed %>% head()

## # A tibble: 6 × 2,316
##   lot_frontage lot_area overall_qual overall_cond year_built year_remod_add
##          <dbl>    <dbl>        <dbl>        <dbl>      <dbl>          <dbl>
## 1        0.201  -0.0961      -1.50         -0.522   -0.0279          -0.669
## 2       -3.47   -3.26        -0.0357       -0.522    0.00526         -0.622
## 3       -3.47   -3.26        -0.767        -0.522    0.00526         -0.622
## 4       -3.47   -3.26        -0.0357       -0.522    0.00526         -0.622
## 5       -0.647  -1.53        -0.0357       -0.522    0.204           -0.334
## 6       -3.06   -2.66         0.695         0.379    0.138           -0.430
## # ℹ 2,310 more variables: mas_vnr_area <dbl>, bsmt_fin_sf_1 <dbl>,
## #   bsmt_fin_sf_2 <dbl>, bsmt_unf_sf <dbl>, total_bsmt_sf <dbl>,
## #   x1st_flr_sf <dbl>, x2nd_flr_sf <dbl>, low_qual_fin_sf <dbl>,
## #   gr_liv_area <dbl>, bsmt_full_bath <dbl>, bsmt_half_bath <dbl>,
## #   full_bath <dbl>, half_bath <dbl>, bedroom_abv_gr <dbl>,
## #   kitchen_abv_gr <dbl>, tot_rms_abv_grd <dbl>, fireplaces <dbl>,
## #   garage_yr_blt <dbl>, garage_cars <dbl>, garage_area <dbl>, …

8. Model Training

folds <- vfold_cv(train_model, v = 5)

9. Linear Regression Model

lin_mod <- linear_reg() %>%
set_engine("lm")
lin_wf <- workflow() %>%
  add_model(lin_mod) %>%
  add_recipe(ames_recipe)

lin_res <- fit_resamples(
  lin_wf,
  folds,
  metrics = metric_set(rmse, rsq),
  control = control_resamples(save_pred = TRUE)
)
collect_metrics(lin_res)

## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config        
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>          
## 1 rmse    standard   0.764     5  0.0663 pre0_mod0_post0
## 2 rsq     standard   0.144     5  0.0429 pre0_mod0_post0

10. Random Forest Model

# 10.1 RF model specification
rf_mod <- rand_forest(
mtry  = tune(),
trees = 800,
min_n = tune()
) %>%
set_engine("ranger") %>%
set_mode("regression")

# 10.2 Workflow
rf_wf <- workflow() %>%
add_model(rf_mod) %>%
add_recipe(ames_recipe)

# 10.3 RF hyperparameter grid
rf_grid <- grid_regular(
mtry(range = c(5, 15)),
min_n(range = c(2, 20)),
levels = 5
)

# 10.4 Tune with CV
rf_res <- tune_grid(
rf_wf,
resamples = folds,
grid = rf_grid,
metrics = metric_set(rmse, rsq),
control = control_grid(save_pred = TRUE)
)
rf_res

## # Tuning results
## # 5-fold cross-validation 
## # A tibble: 5 × 5
##   splits             id    .metrics          .notes           .predictions
##   <list>             <chr> <list>            <list>           <list>      
## 1 <split [1852/464]> Fold1 <tibble [50 × 6]> <tibble [0 × 4]> <tibble>    
## 2 <split [1853/463]> Fold2 <tibble [50 × 6]> <tibble [0 × 4]> <tibble>    
## 3 <split [1853/463]> Fold3 <tibble [50 × 6]> <tibble [0 × 4]> <tibble>    
## 4 <split [1853/463]> Fold4 <tibble [50 × 6]> <tibble [0 × 4]> <tibble>    
## 5 <split [1853/463]> Fold5 <tibble [50 × 6]> <tibble [0 × 4]> <tibble>

11. Select best RF model

rf_best <- select_best(rf_res, metric = "rmse")

rf_final_wf <- finalize_workflow(rf_wf, rf_best)

rf_final_fit <- fit(rf_final_wf, data = train_model)

rf_test_preds <- predict(
  rf_final_fit,
  new_data = test_model
) %>%
  mutate(
    sale_price = test_data$sale_price,
    .pred = exp(.pred) - 1
  )

rf_metrics <- rf_test_preds %>%
  metrics(truth = sale_price, estimate = .pred)

rf_metrics

## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard   42998.   
## 2 rsq     standard       0.835
## 3 mae     standard   26760.

12. XGBoost Model

library(xgboost)

# 12.1 Model specification
xgb_mod <- boost_tree(
  trees = tune(),
  learn_rate = tune(),
  mtry = tune(),
  tree_depth = tune(),
  loss_reduction = tune(),
  min_n = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

# 12.2 Workflow
xgb_wf <- workflow() %>%
  add_recipe(ames_recipe) %>%
  add_model(xgb_mod)

# 12.3 Improved tuning grid
xgb_grid <- grid_space_filling(
  trees(range = c(500, 1500)),
  learn_rate(range = c(0.05, 0.3)),
  mtry(range = c(10, 40)),
  tree_depth(range = c(4, 10)),
  loss_reduction(range = c(0, 5)),
  min_n(range = c(5, 30)),
  size = 20
)

# 12.4 Tune with cross-validation
set.seed(123)
xgb_res <- tune_grid(
  xgb_wf,
  resamples = folds,
  grid = xgb_grid,
  metrics = metric_set(rmse, rsq),
  control = control_grid(
    save_pred = TRUE,
    verbose = FALSE
  )
)
xgb_res

## # Tuning results
## # 5-fold cross-validation 
## # A tibble: 5 × 5
##   splits             id    .metrics           .notes            .predictions
##   <list>             <chr> <list>             <list>            <list>      
## 1 <split [1852/464]> Fold1 <tibble [40 × 10]> <tibble [12 × 4]> <tibble>    
## 2 <split [1853/463]> Fold2 <tibble [40 × 10]> <tibble [12 × 4]> <tibble>    
## 3 <split [1853/463]> Fold3 <tibble [40 × 10]> <tibble [12 × 4]> <tibble>    
## 4 <split [1853/463]> Fold4 <tibble [40 × 10]> <tibble [12 × 4]> <tibble>    
## 5 <split [1853/463]> Fold5 <tibble [40 × 10]> <tibble [12 × 4]> <tibble>    
## 
## There were issues with some computations:
## 
##   - Warning(s) x60: A correlation computation is required, but `estimate` is constant...
## 
## Run `show_notes(.Last.tune.result)` for more information.

13. Select Best XGBoost Model

xgb_best <- select_best(xgb_res, metric = "rmse")

xgb_final_wf <- finalize_workflow(xgb_wf, xgb_best)

xgb_final_fit <- fit(xgb_final_wf, data = train_model)

xgb_test_preds <- predict(
  xgb_final_fit,
  new_data = test_model
) %>%
  mutate(
    sale_price = test_data$sale_price,
    .pred = exp(.pred) - 1
  )

xgb_metrics <- xgb_test_preds %>%
  metrics(truth = sale_price, estimate = .pred)

xgb_metrics

## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard   32330.   
## 2 rsq     standard       0.808
## 3 mae     standard   23579.

14. Model Comparison Table (Linear vs RF vs XGB)

model_comparison <- bind_rows(
  lin_res %>%
    collect_metrics() %>%
    filter(.metric == "rmse") %>%
    mutate(model = "Linear Regression"),
  
  rf_res %>%
    collect_metrics() %>%
    filter(.metric == "rmse") %>%
    mutate(model = "Random Forest"),
  
  xgb_res %>%
    collect_metrics() %>%
    filter(.metric == "rmse") %>%
    mutate(model = "XGBoost")
) %>%
  select(model, mean, std_err) %>%
  arrange(mean)

model_comparison

## # A tibble: 46 × 3
##    model          mean std_err
##    <chr>         <dbl>   <dbl>
##  1 XGBoost       0.200 0.00778
##  2 XGBoost       0.208 0.0122 
##  3 XGBoost       0.223 0.00664
##  4 Random Forest 0.224 0.00926
##  5 Random Forest 0.224 0.00981
##  6 Random Forest 0.224 0.00987
##  7 Random Forest 0.224 0.00990
##  8 Random Forest 0.224 0.00978
##  9 Random Forest 0.238 0.00924
## 10 Random Forest 0.238 0.00982
## # ℹ 36 more rows

15. Cross-Validation RMSE Plot (All Models)

library(ggplot2)

cv_results <- bind_rows(
  lin_res %>% collect_metrics() %>% filter(.metric == "rmse") %>% mutate(model = "Linear"),
  rf_res  %>% collect_metrics() %>% filter(.metric == "rmse") %>% mutate(model = "Random Forest"),
  xgb_res %>% collect_metrics() %>% filter(.metric == "rmse") %>% mutate(model = "XGBoost")
)

ggplot(cv_results, aes(x = model, y = mean, fill = model)) +
  geom_col(width = 0.6) +
  scale_fill_manual(
    values = c(
      "Linear" = "#4C72B0",
      "Random Forest" = "#55A868",
      "XGBoost" = "#C44E52"
    )
  ) +
  labs(
    title = "Cross-Validation RMSE Comparison (Log Sale Price)",
    x = "",
    y = "RMSE"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

16. Final Test-Set Predictions Plot

ggplot(xgb_test_preds, aes(x = sale_price, y = .pred)) +
  geom_point(alpha = 0.4, color = "#4C72B0") +
  geom_abline(slope = 1, intercept = 0,
              linetype = "dashed",
              color = "#C44E52") +
  labs(
    title = "XGBoost Predictions vs Actual Sale Price",
    x = "Actual Sale Price",
    y = "Predicted Sale Price"
  ) +
  theme_minimal(base_size = 13)

17. Residual Diagnostics (XGBoost) - Residuals are the standard way to show model fit quality

xgb_test_preds <- xgb_test_preds %>%
  mutate(residual = sale_price - .pred)

ggplot(xgb_test_preds, aes(x = .pred, y = residual)) +
  geom_point(alpha = 0.35, color = "#55A868") +
  geom_hline(yintercept = 0,
             linetype = "dashed",
             color = "#C44E52") +
  labs(
    title = "Residuals vs Predicted Values (XGBoost)",
    x = "Predicted Sale Price",
    y = "Residuals"
  ) +
  theme_minimal(base_size = 13)

18. Distribution of Prediction Errors - Standard check for error symmetry, well-behaviour, and not wild skewness

ggplot(xgb_test_preds, aes(residual)) +
  geom_histogram(bins = 30,
                 fill = "#4C72B0",
                 alpha = 0.85) +
  labs(
    title = "Distribution of Prediction Errors (XGBoost)",
    x = "Prediction Error",
    y = "Count"
  ) +
  theme_minimal(base_size = 13)

19: Real-Life Impact of Price Predictions

Among the models, XGBoost demonstrated the strongest predictive performance on the test set. The XGBoost model is very accurate in predicting actual house prices:

# Add prediction errors
xgb_test_preds <- xgb_test_preds %>%
  mutate(
    error = abs(.pred - sale_price),
    pct_error = abs(.pred - sale_price) / sale_price * 100,
    underpriced = .pred > sale_price,
    potential_gain = if_else(underpriced, .pred - sale_price, 0)
  )

# 1. Model R² report

rsq_value <- rsq(xgb_test_preds, truth = sale_price, estimate = .pred)$.estimate
rsq_value

## [1] 0.80782

cat("The XGBoost model achieves an R² of", round(rsq_value, 3), "on the test set, indicating strong predictive performance.\n")

## The XGBoost model achieves an R² of 0.808 on the test set, indicating strong predictive performance.

# 2. Plot predicted vs actual prices

ggplot(xgb_test_preds, aes(x=.pred, y=sale_price)) +
  geom_point(alpha=0.5, color="blue") +
  geom_abline(slope=1, intercept=0, color='red', linetype="dashed") +
  labs(
    title="Predicted vs Actual Sale Price",
    x="Predicted Price",
    y="Actual Sale Price"
  ) +
  theme_minimal()

20. Real-World Prediction Example:

# This simulates a real client scenario, where a new house is defined and the predictive model predicts its market value based on the data it trained on.
new_house <- test_data[1, ] %>%
  mutate(
    lot_area = 8500,
    gr_liv_area = 1800,
    overall_qual = 7,
    overall_cond = 5,
    year_built = 2005,
    garage_area = 450,
    total_bsmt_sf = 900,
    neighborhood = "NridgHt",
    house_style = "2Story",
    bldg_type = "1Fam",
    foundation = "PConc"
  ) %>%
  select(-sale_price)

new_house_pred <- predict(
  xgb_final_fit,
  new_data = new_house
) %>%
  mutate(predicted_price = exp(.pred) - 1)
new_house_pred

## # A tibble: 1 × 2
##   .pred predicted_price
##   <dbl>           <dbl>
## 1  11.8         128642.

21. Feature Importance: What actually drives price?

vip(
  xgb_final_fit,
  num_features = 15,
  geom = "col",
  aesthetics = list(fill = "#4C72B0")
) +
  labs(
    title = "Top 15 Important Features in House Price Prediction",
    x = "Importance",
    y = ""
  ) +
  theme_minimal(base_size = 13)

22. Sensitivity Analysis: Living Area vs Price

template_house <- test_data[1, ] %>%
  select(-sale_price)

area_grid <- template_house[rep(1, 23), ] %>%
  mutate(
    gr_liv_area = seq(800, 3000, by = 100),
    lot_area = 8500,
    overall_qual = 7,
    overall_cond = 5,
    year_built = 2005,
    garage_area = 450,
    total_bsmt_sf = 900,
    neighborhood = "NridgHt",
    house_style = "2Story",
    bldg_type = "1Fam",
    foundation = "PConc"
  )

area_preds <- predict(
  xgb_final_fit,
  new_data = area_grid
) %>%
  mutate(predicted_price = exp(.pred) - 1) %>%
  bind_cols(area_grid)

ggplot(area_preds, aes(gr_liv_area, predicted_price)) +
  geom_point(alpha = 0.4) +
  geom_line(linewidth = 1.2) +
  labs(
    title = "Effect of Living Area on Predicted House Price",
    x = "Above-Ground Living Area (sq ft)",
    y = "Predicted Sale Price"
  ) +
  theme_minimal(base_size = 13)

cat("The shape shows a sharp increase in predicted price from ~800 to ~1000 sq ft. Additional square footage adds little value afterwards.")

## The shape shows a sharp increase in predicted price from ~800 to ~1000 sq ft. Additional square footage adds little value afterwards.

Predicting House Prices by Training Models in R

Joshua Kohlmeyer

1. Load the Data

2. Summary Statistics

3. Missing Values Overview

4. Key Numeric Distributions

5. Pairwise Relationships

6. Data Cleaning & Feature Engineering

6.1 Remove Highly Skewed Outliers (Highly recommended by most ML textbooks I have come across)

6.2 Check Missing Values After Cleaning

6.3 Train/Test Split

7. Feature Engineering Recipe (Tidymodels)

7.1 Build the Recipe

7.2 Prep the Recipe

8. Model Training

9. Linear Regression Model

10. Random Forest Model

11. Select best RF model

12. XGBoost Model

13. Select Best XGBoost Model

14. Model Comparison Table (Linear vs RF vs XGB)

15. Cross-Validation RMSE Plot (All Models)

16. Final Test-Set Predictions Plot

17. Residual Diagnostics (XGBoost) - Residuals are the standard way to show model fit quality

18. Distribution of Prediction Errors - Standard check for error symmetry, well-behaviour, and not wild skewness

19: Real-Life Impact of Price Predictions

Among the models, XGBoost demonstrated the strongest predictive performance on the test set. The XGBoost model is very accurate in predicting actual house prices:

20. Real-World Prediction Example:

21. Feature Importance: What actually drives price?

22. Sensitivity Analysis: Living Area vs Price