ames <- read_csv("../data/ames.csv") %>%
clean_names()
glimpse(ames)
## Rows: 2,930
## Columns: 82
## $ order <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ pid <chr> "0526301100", "0526350040", "0526351010", "0526353030"…
## $ ms_sub_class <chr> "020", "020", "020", "020", "060", "060", "120", "120"…
## $ ms_zoning <chr> "RL", "RH", "RL", "RL", "RL", "RL", "RL", "RL", "RL", …
## $ lot_frontage <dbl> 141, 80, 81, 93, 74, 78, 41, 43, 39, 60, 75, NA, 63, 8…
## $ lot_area <dbl> 31770, 11622, 14267, 11160, 13830, 9978, 4920, 5005, 5…
## $ street <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave"…
## $ alley <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ lot_shape <chr> "IR1", "Reg", "IR1", "Reg", "IR1", "IR1", "Reg", "IR1"…
## $ land_contour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "HLS"…
## $ utilities <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "All…
## $ lot_config <chr> "Corner", "Inside", "Corner", "Corner", "Inside", "Ins…
## $ land_slope <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl"…
## $ neighborhood <chr> "NAmes", "NAmes", "NAmes", "NAmes", "Gilbert", "Gilber…
## $ condition_1 <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm…
## $ condition_2 <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm"…
## $ bldg_type <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "Twnhs…
## $ house_style <chr> "1Story", "1Story", "1Story", "1Story", "2Story", "2St…
## $ overall_qual <dbl> 6, 5, 6, 7, 5, 6, 8, 8, 8, 7, 6, 6, 6, 7, 8, 8, 8, 9, …
## $ overall_cond <dbl> 5, 6, 6, 5, 5, 6, 5, 5, 5, 5, 5, 7, 5, 5, 5, 5, 7, 2, …
## $ year_built <dbl> 1960, 1961, 1958, 1968, 1997, 1998, 2001, 1992, 1995, …
## $ year_remod_add <dbl> 1960, 1961, 1958, 1968, 1998, 1998, 2001, 1992, 1996, …
## $ roof_style <chr> "Hip", "Gable", "Hip", "Hip", "Gable", "Gable", "Gable…
## $ roof_matl <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg",…
## $ exterior_1st <chr> "BrkFace", "VinylSd", "Wd Sdng", "BrkFace", "VinylSd",…
## $ exterior_2nd <chr> "Plywood", "VinylSd", "Wd Sdng", "BrkFace", "VinylSd",…
## $ mas_vnr_type <chr> "Stone", "None", "BrkFace", "None", "None", "BrkFace",…
## $ mas_vnr_area <dbl> 112, 0, 108, 0, 0, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 603,…
## $ exter_qual <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "Gd", "Gd", "Gd", …
## $ exter_cond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
## $ foundation <chr> "CBlock", "CBlock", "CBlock", "CBlock", "PConc", "PCon…
## $ bsmt_qual <chr> "TA", "TA", "TA", "TA", "Gd", "TA", "Gd", "Gd", "Gd", …
## $ bsmt_cond <chr> "Gd", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
## $ bsmt_exposure <chr> "Gd", "No", "No", "No", "No", "No", "Mn", "No", "No", …
## $ bsmt_fin_type_1 <chr> "BLQ", "Rec", "ALQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ"…
## $ bsmt_fin_sf_1 <dbl> 639, 468, 923, 1065, 791, 602, 616, 263, 1180, 0, 0, 9…
## $ bsmt_fin_type_2 <chr> "Unf", "LwQ", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf"…
## $ bsmt_fin_sf_2 <dbl> 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1120, 0, 0…
## $ bsmt_unf_sf <dbl> 441, 270, 406, 1045, 137, 324, 722, 1017, 415, 994, 76…
## $ total_bsmt_sf <dbl> 1080, 882, 1329, 2110, 928, 926, 1338, 1280, 1595, 994…
## $ heating <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA"…
## $ heating_qc <chr> "Fa", "TA", "TA", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", …
## $ central_air <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",…
## $ electrical <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", …
## $ x1st_flr_sf <dbl> 1656, 896, 1329, 2110, 928, 926, 1338, 1280, 1616, 102…
## $ x2nd_flr_sf <dbl> 0, 0, 0, 0, 701, 678, 0, 0, 0, 776, 892, 0, 676, 0, 0,…
## $ low_qual_fin_sf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ gr_liv_area <dbl> 1656, 896, 1329, 2110, 1629, 1604, 1338, 1280, 1616, 1…
## $ bsmt_full_bath <dbl> 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, …
## $ bsmt_half_bath <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ full_bath <dbl> 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 3, 2, 1, …
## $ half_bath <dbl> 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, …
## $ bedroom_abv_gr <dbl> 3, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 3, 2, 1, 4, 4, 1, …
## $ kitchen_abv_gr <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ kitchen_qual <chr> "TA", "TA", "Gd", "Ex", "TA", "Gd", "Gd", "Gd", "Gd", …
## $ tot_rms_abv_grd <dbl> 7, 5, 6, 8, 6, 7, 6, 5, 5, 7, 7, 6, 7, 5, 4, 12, 8, 8,…
## $ functional <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ"…
## $ fireplaces <dbl> 2, 0, 0, 2, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, …
## $ fireplace_qu <chr> "Gd", NA, NA, "TA", "TA", "Gd", NA, NA, "TA", "TA", "T…
## $ garage_type <chr> "Attchd", "Attchd", "Attchd", "Attchd", "Attchd", "Att…
## $ garage_yr_blt <dbl> 1960, 1961, 1958, 1968, 1997, 1998, 2001, 1992, 1995, …
## $ garage_finish <chr> "Fin", "Unf", "Unf", "Fin", "Fin", "Fin", "Fin", "RFn"…
## $ garage_cars <dbl> 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, …
## $ garage_area <dbl> 528, 730, 312, 522, 482, 470, 582, 506, 608, 442, 440,…
## $ garage_qual <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
## $ garage_cond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
## $ paved_drive <chr> "P", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",…
## $ wood_deck_sf <dbl> 210, 140, 393, 0, 212, 360, 0, 0, 237, 140, 157, 483, …
## $ open_porch_sf <dbl> 62, 0, 36, 0, 34, 36, 0, 82, 152, 60, 84, 21, 75, 0, 5…
## $ enclosed_porch <dbl> 0, 0, 0, 0, 0, 0, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ x3ssn_porch <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ screen_porch <dbl> 0, 120, 0, 0, 0, 0, 0, 144, 0, 0, 0, 0, 0, 0, 140, 210…
## $ pool_area <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pool_qc <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ fence <chr> NA, "MnPrv", NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, …
## $ misc_feature <chr> NA, NA, "Gar2", NA, NA, NA, NA, NA, NA, NA, NA, "Shed"…
## $ misc_val <dbl> 0, 0, 12500, 0, 0, 0, 0, 0, 0, 0, 0, 500, 0, 0, 0, 0, …
## $ mo_sold <dbl> 5, 6, 6, 4, 3, 6, 4, 1, 3, 6, 4, 3, 5, 2, 6, 6, 6, 6, …
## $ yr_sold <dbl> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, …
## $ sale_type <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", …
## $ sale_condition <chr> "Normal", "Normal", "Normal", "Normal", "Normal", "Nor…
## $ sale_price <dbl> 215000, 105000, 172000, 244000, 189900, 195500, 213500…
skim(ames)
| Name | ames |
| Number of rows | 2930 |
| Number of columns | 82 |
| _______________________ | |
| Column type frequency: | |
| character | 45 |
| numeric | 37 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| pid | 0 | 1.00 | 10 | 10 | 0 | 2930 | 0 |
| ms_sub_class | 0 | 1.00 | 3 | 3 | 0 | 16 | 0 |
| ms_zoning | 0 | 1.00 | 2 | 7 | 0 | 7 | 0 |
| street | 0 | 1.00 | 4 | 4 | 0 | 2 | 0 |
| alley | 2732 | 0.07 | 4 | 4 | 0 | 2 | 0 |
| lot_shape | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
| land_contour | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
| utilities | 0 | 1.00 | 6 | 6 | 0 | 3 | 0 |
| lot_config | 0 | 1.00 | 3 | 7 | 0 | 5 | 0 |
| land_slope | 0 | 1.00 | 3 | 3 | 0 | 3 | 0 |
| neighborhood | 0 | 1.00 | 5 | 7 | 0 | 28 | 0 |
| condition_1 | 0 | 1.00 | 4 | 6 | 0 | 9 | 0 |
| condition_2 | 0 | 1.00 | 4 | 6 | 0 | 8 | 0 |
| bldg_type | 0 | 1.00 | 4 | 6 | 0 | 5 | 0 |
| house_style | 0 | 1.00 | 4 | 6 | 0 | 8 | 0 |
| roof_style | 0 | 1.00 | 3 | 7 | 0 | 6 | 0 |
| roof_matl | 0 | 1.00 | 4 | 7 | 0 | 8 | 0 |
| exterior_1st | 0 | 1.00 | 5 | 7 | 0 | 16 | 0 |
| exterior_2nd | 0 | 1.00 | 5 | 7 | 0 | 17 | 0 |
| mas_vnr_type | 23 | 0.99 | 4 | 7 | 0 | 5 | 0 |
| exter_qual | 0 | 1.00 | 2 | 2 | 0 | 4 | 0 |
| exter_cond | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
| foundation | 0 | 1.00 | 4 | 6 | 0 | 6 | 0 |
| bsmt_qual | 80 | 0.97 | 2 | 2 | 0 | 5 | 0 |
| bsmt_cond | 80 | 0.97 | 2 | 2 | 0 | 5 | 0 |
| bsmt_exposure | 83 | 0.97 | 2 | 2 | 0 | 4 | 0 |
| bsmt_fin_type_1 | 80 | 0.97 | 3 | 3 | 0 | 6 | 0 |
| bsmt_fin_type_2 | 81 | 0.97 | 3 | 3 | 0 | 6 | 0 |
| heating | 0 | 1.00 | 4 | 5 | 0 | 6 | 0 |
| heating_qc | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
| central_air | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
| electrical | 1 | 1.00 | 3 | 5 | 0 | 5 | 0 |
| kitchen_qual | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
| functional | 0 | 1.00 | 3 | 4 | 0 | 8 | 0 |
| fireplace_qu | 1422 | 0.51 | 2 | 2 | 0 | 5 | 0 |
| garage_type | 157 | 0.95 | 6 | 7 | 0 | 6 | 0 |
| garage_finish | 159 | 0.95 | 3 | 3 | 0 | 3 | 0 |
| garage_qual | 159 | 0.95 | 2 | 2 | 0 | 5 | 0 |
| garage_cond | 159 | 0.95 | 2 | 2 | 0 | 5 | 0 |
| paved_drive | 0 | 1.00 | 1 | 1 | 0 | 3 | 0 |
| pool_qc | 2917 | 0.00 | 2 | 2 | 0 | 4 | 0 |
| fence | 2358 | 0.20 | 4 | 5 | 0 | 4 | 0 |
| misc_feature | 2824 | 0.04 | 4 | 4 | 0 | 5 | 0 |
| sale_type | 0 | 1.00 | 2 | 5 | 0 | 10 | 0 |
| sale_condition | 0 | 1.00 | 6 | 7 | 0 | 6 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| order | 0 | 1.00 | 1465.50 | 845.96 | 1 | 733.25 | 1465.5 | 2197.75 | 2930 | ▇▇▇▇▇ |
| lot_frontage | 490 | 0.83 | 69.22 | 23.37 | 21 | 58.00 | 68.0 | 80.00 | 313 | ▇▃▁▁▁ |
| lot_area | 0 | 1.00 | 10147.92 | 7880.02 | 1300 | 7440.25 | 9436.5 | 11555.25 | 215245 | ▇▁▁▁▁ |
| overall_qual | 0 | 1.00 | 6.09 | 1.41 | 1 | 5.00 | 6.0 | 7.00 | 10 | ▁▂▇▅▁ |
| overall_cond | 0 | 1.00 | 5.56 | 1.11 | 1 | 5.00 | 5.0 | 6.00 | 9 | ▁▁▇▅▁ |
| year_built | 0 | 1.00 | 1971.36 | 30.25 | 1872 | 1954.00 | 1973.0 | 2001.00 | 2010 | ▁▂▃▆▇ |
| year_remod_add | 0 | 1.00 | 1984.27 | 20.86 | 1950 | 1965.00 | 1993.0 | 2004.00 | 2010 | ▅▂▂▃▇ |
| mas_vnr_area | 23 | 0.99 | 101.90 | 179.11 | 0 | 0.00 | 0.0 | 164.00 | 1600 | ▇▁▁▁▁ |
| bsmt_fin_sf_1 | 1 | 1.00 | 442.63 | 455.59 | 0 | 0.00 | 370.0 | 734.00 | 5644 | ▇▁▁▁▁ |
| bsmt_fin_sf_2 | 1 | 1.00 | 49.72 | 169.17 | 0 | 0.00 | 0.0 | 0.00 | 1526 | ▇▁▁▁▁ |
| bsmt_unf_sf | 1 | 1.00 | 559.26 | 439.49 | 0 | 219.00 | 466.0 | 802.00 | 2336 | ▇▅▂▁▁ |
| total_bsmt_sf | 1 | 1.00 | 1051.61 | 440.62 | 0 | 793.00 | 990.0 | 1302.00 | 6110 | ▇▃▁▁▁ |
| x1st_flr_sf | 0 | 1.00 | 1159.56 | 391.89 | 334 | 876.25 | 1084.0 | 1384.00 | 5095 | ▇▃▁▁▁ |
| x2nd_flr_sf | 0 | 1.00 | 335.46 | 428.40 | 0 | 0.00 | 0.0 | 703.75 | 2065 | ▇▃▂▁▁ |
| low_qual_fin_sf | 0 | 1.00 | 4.68 | 46.31 | 0 | 0.00 | 0.0 | 0.00 | 1064 | ▇▁▁▁▁ |
| gr_liv_area | 0 | 1.00 | 1499.69 | 505.51 | 334 | 1126.00 | 1442.0 | 1742.75 | 5642 | ▇▇▁▁▁ |
| bsmt_full_bath | 2 | 1.00 | 0.43 | 0.52 | 0 | 0.00 | 0.0 | 1.00 | 3 | ▇▆▁▁▁ |
| bsmt_half_bath | 2 | 1.00 | 0.06 | 0.25 | 0 | 0.00 | 0.0 | 0.00 | 2 | ▇▁▁▁▁ |
| full_bath | 0 | 1.00 | 1.57 | 0.55 | 0 | 1.00 | 2.0 | 2.00 | 4 | ▁▇▇▁▁ |
| half_bath | 0 | 1.00 | 0.38 | 0.50 | 0 | 0.00 | 0.0 | 1.00 | 2 | ▇▁▅▁▁ |
| bedroom_abv_gr | 0 | 1.00 | 2.85 | 0.83 | 0 | 2.00 | 3.0 | 3.00 | 8 | ▁▇▂▁▁ |
| kitchen_abv_gr | 0 | 1.00 | 1.04 | 0.21 | 0 | 1.00 | 1.0 | 1.00 | 3 | ▁▇▁▁▁ |
| tot_rms_abv_grd | 0 | 1.00 | 6.44 | 1.57 | 2 | 5.00 | 6.0 | 7.00 | 15 | ▁▇▂▁▁ |
| fireplaces | 0 | 1.00 | 0.60 | 0.65 | 0 | 0.00 | 1.0 | 1.00 | 4 | ▇▇▁▁▁ |
| garage_yr_blt | 159 | 0.95 | 1978.13 | 25.53 | 1895 | 1960.00 | 1979.0 | 2002.00 | 2207 | ▂▇▁▁▁ |
| garage_cars | 1 | 1.00 | 1.77 | 0.76 | 0 | 1.00 | 2.0 | 2.00 | 5 | ▅▇▂▁▁ |
| garage_area | 1 | 1.00 | 472.82 | 215.05 | 0 | 320.00 | 480.0 | 576.00 | 1488 | ▃▇▃▁▁ |
| wood_deck_sf | 0 | 1.00 | 93.75 | 126.36 | 0 | 0.00 | 0.0 | 168.00 | 1424 | ▇▁▁▁▁ |
| open_porch_sf | 0 | 1.00 | 47.53 | 67.48 | 0 | 0.00 | 27.0 | 70.00 | 742 | ▇▁▁▁▁ |
| enclosed_porch | 0 | 1.00 | 23.01 | 64.14 | 0 | 0.00 | 0.0 | 0.00 | 1012 | ▇▁▁▁▁ |
| x3ssn_porch | 0 | 1.00 | 2.59 | 25.14 | 0 | 0.00 | 0.0 | 0.00 | 508 | ▇▁▁▁▁ |
| screen_porch | 0 | 1.00 | 16.00 | 56.09 | 0 | 0.00 | 0.0 | 0.00 | 576 | ▇▁▁▁▁ |
| pool_area | 0 | 1.00 | 2.24 | 35.60 | 0 | 0.00 | 0.0 | 0.00 | 800 | ▇▁▁▁▁ |
| misc_val | 0 | 1.00 | 50.64 | 566.34 | 0 | 0.00 | 0.0 | 0.00 | 17000 | ▇▁▁▁▁ |
| mo_sold | 0 | 1.00 | 6.22 | 2.71 | 1 | 4.00 | 6.0 | 8.00 | 12 | ▅▆▇▃▃ |
| yr_sold | 0 | 1.00 | 2007.79 | 1.32 | 2006 | 2007.00 | 2008.0 | 2009.00 | 2010 | ▇▇▇▇▃ |
| sale_price | 0 | 1.00 | 180796.06 | 79886.69 | 12789 | 129500.00 | 160000.0 | 213500.00 | 755000 | ▇▇▁▁▁ |
ames %>%
summarise(across(everything(), ~sum(is.na(.)))) %>%
gather(variable, missing_count) %>%
arrange(desc(missing_count))
## # A tibble: 82 × 2
## variable missing_count
## <chr> <int>
## 1 pool_qc 2917
## 2 misc_feature 2824
## 3 alley 2732
## 4 fence 2358
## 5 fireplace_qu 1422
## 6 lot_frontage 490
## 7 garage_yr_blt 159
## 8 garage_finish 159
## 9 garage_qual 159
## 10 garage_cond 159
## # ℹ 72 more rows
ames %>%
select(sale_price, gr_liv_area, total_bsmt_sf, garage_area, lot_area) %>%
gather(variable, value) %>%
ggplot(aes(value)) +
geom_histogram(bins = 30) +
facet_wrap(~ variable, scales = "free") +
theme_minimal()
ames %>%
drop_na(sale_price, gr_liv_area, overall_qual, garage_area, year_built) %>%
select(sale_price, gr_liv_area, overall_qual, garage_area, year_built) %>%
ggpairs()
ames_clean <- ames %>%
mutate(
across(where(is.character), as.factor),
fireplace_qu = fct_na_value_to_level(fireplace_qu, "None"),
garage_qual = fct_na_value_to_level(garage_qual, "None"),
garage_type = fct_na_value_to_level(garage_type, "None"),
alley = fct_na_value_to_level(alley, "None"),
pool_qc = fct_na_value_to_level(pool_qc, "None"),
misc_feature = fct_na_value_to_level(misc_feature, "None")
)
ames_clean <- ames_clean %>%
filter(
sale_price < quantile(sale_price, 0.99), # remove top 1% price outliers
gr_liv_area < 4000 # common Ames rule: drop mega-houses
)
ames_clean %>%
summarise(across(everything(), ~sum(is.na(.)))) %>%
gather(variable, missing_count) %>%
filter(missing_count > 0) %>%
arrange(desc(missing_count))
## # A tibble: 21 × 2
## variable missing_count
## <chr> <int>
## 1 fence 2328
## 2 lot_frontage 489
## 3 garage_yr_blt 159
## 4 garage_finish 159
## 5 garage_cond 159
## 6 bsmt_exposure 83
## 7 bsmt_fin_type_2 81
## 8 bsmt_qual 80
## 9 bsmt_cond 80
## 10 bsmt_fin_type_1 80
## # ℹ 11 more rows
data_split <- initial_split(ames_clean, prop = 0.8, strata = sale_price)
train_data <- training(data_split)
test_data <- testing(data_split)
train_data %>% head()
## # A tibble: 6 × 82
## order pid ms_sub_class ms_zoning lot_frontage lot_area street alley
## <dbl> <fct> <fct> <fct> <dbl> <dbl> <fct> <fct>
## 1 27 0527404120 020 RL 70 8400 Pave None
## 2 30 0527451180 160 RM 21 1680 Pave None
## 3 31 0527451330 160 RM 21 1680 Pave None
## 4 32 0527451410 160 RM 21 1680 Pave None
## 5 33 0527452190 120 RL 53 4043 Pave None
## 6 35 0527453150 120 RL 24 2280 Pave None
## # ℹ 74 more variables: lot_shape <fct>, land_contour <fct>, utilities <fct>,
## # lot_config <fct>, land_slope <fct>, neighborhood <fct>, condition_1 <fct>,
## # condition_2 <fct>, bldg_type <fct>, house_style <fct>, overall_qual <dbl>,
## # overall_cond <dbl>, year_built <dbl>, year_remod_add <dbl>,
## # roof_style <fct>, roof_matl <fct>, exterior_1st <fct>, exterior_2nd <fct>,
## # mas_vnr_type <fct>, mas_vnr_area <dbl>, exter_qual <fct>, exter_cond <fct>,
## # foundation <fct>, bsmt_qual <fct>, bsmt_cond <fct>, bsmt_exposure <fct>, …
train_model <- train_data %>%
mutate(log_sale_price = log(sale_price + 1)) %>%
select(-sale_price)
test_model <- test_data %>%
mutate(log_sale_price = log(sale_price + 1)) %>%
select(-sale_price)
numeric_vars <- train_data %>%
select(where(is.numeric)) %>%
select(-sale_price) %>%
pivot_longer(everything(), names_to = "var", values_to = "value") %>%
group_by(var) %>%
summarise(skew = abs(skewness(value, na.rm = TRUE)), .groups = "drop") %>%
filter(skew > 1) %>%
pull(var)
numeric_vars
## [1] "bsmt_fin_sf_2" "bsmt_half_bath" "enclosed_porch" "kitchen_abv_gr"
## [5] "lot_area" "lot_frontage" "low_qual_fin_sf" "mas_vnr_area"
## [9] "misc_val" "open_porch_sf" "pool_area" "screen_porch"
## [13] "wood_deck_sf" "x3ssn_porch"
ames_recipe <- recipe(log_sale_price ~ ., data = train_model) %>%
# Imputation
step_impute_median(all_numeric_predictors()) %>%
step_impute_mode(all_nominal_predictors()) %>%
# Log-transform skewed predictors
step_log(all_of(numeric_vars), offset = 1e-6) %>%
# Encoding
step_dummy(all_nominal_predictors()) %>%
# Cleanup
step_zv(all_predictors()) %>%
step_lincomb(all_predictors()) %>%
step_corr(all_numeric_predictors(), threshold = 0.9) %>%
step_normalize(all_numeric_predictors())
ames_prep <- prep(ames_recipe)
train_processed <- bake(ames_prep, new_data = NULL)
test_processed <- bake(ames_prep, new_data = test_model)
train_processed %>% head()
## # A tibble: 6 × 2,316
## lot_frontage lot_area overall_qual overall_cond year_built year_remod_add
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.201 -0.0961 -1.50 -0.522 -0.0279 -0.669
## 2 -3.47 -3.26 -0.0357 -0.522 0.00526 -0.622
## 3 -3.47 -3.26 -0.767 -0.522 0.00526 -0.622
## 4 -3.47 -3.26 -0.0357 -0.522 0.00526 -0.622
## 5 -0.647 -1.53 -0.0357 -0.522 0.204 -0.334
## 6 -3.06 -2.66 0.695 0.379 0.138 -0.430
## # ℹ 2,310 more variables: mas_vnr_area <dbl>, bsmt_fin_sf_1 <dbl>,
## # bsmt_fin_sf_2 <dbl>, bsmt_unf_sf <dbl>, total_bsmt_sf <dbl>,
## # x1st_flr_sf <dbl>, x2nd_flr_sf <dbl>, low_qual_fin_sf <dbl>,
## # gr_liv_area <dbl>, bsmt_full_bath <dbl>, bsmt_half_bath <dbl>,
## # full_bath <dbl>, half_bath <dbl>, bedroom_abv_gr <dbl>,
## # kitchen_abv_gr <dbl>, tot_rms_abv_grd <dbl>, fireplaces <dbl>,
## # garage_yr_blt <dbl>, garage_cars <dbl>, garage_area <dbl>, …
folds <- vfold_cv(train_model, v = 5)
lin_mod <- linear_reg() %>%
set_engine("lm")
lin_wf <- workflow() %>%
add_model(lin_mod) %>%
add_recipe(ames_recipe)
lin_res <- fit_resamples(
lin_wf,
folds,
metrics = metric_set(rmse, rsq),
control = control_resamples(save_pred = TRUE)
)
collect_metrics(lin_res)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 0.764 5 0.0663 pre0_mod0_post0
## 2 rsq standard 0.144 5 0.0429 pre0_mod0_post0
# 10.1 RF model specification
rf_mod <- rand_forest(
mtry = tune(),
trees = 800,
min_n = tune()
) %>%
set_engine("ranger") %>%
set_mode("regression")
# 10.2 Workflow
rf_wf <- workflow() %>%
add_model(rf_mod) %>%
add_recipe(ames_recipe)
# 10.3 RF hyperparameter grid
rf_grid <- grid_regular(
mtry(range = c(5, 15)),
min_n(range = c(2, 20)),
levels = 5
)
# 10.4 Tune with CV
rf_res <- tune_grid(
rf_wf,
resamples = folds,
grid = rf_grid,
metrics = metric_set(rmse, rsq),
control = control_grid(save_pred = TRUE)
)
rf_res
## # Tuning results
## # 5-fold cross-validation
## # A tibble: 5 × 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [1852/464]> Fold1 <tibble [50 × 6]> <tibble [0 × 4]> <tibble>
## 2 <split [1853/463]> Fold2 <tibble [50 × 6]> <tibble [0 × 4]> <tibble>
## 3 <split [1853/463]> Fold3 <tibble [50 × 6]> <tibble [0 × 4]> <tibble>
## 4 <split [1853/463]> Fold4 <tibble [50 × 6]> <tibble [0 × 4]> <tibble>
## 5 <split [1853/463]> Fold5 <tibble [50 × 6]> <tibble [0 × 4]> <tibble>
rf_best <- select_best(rf_res, metric = "rmse")
rf_final_wf <- finalize_workflow(rf_wf, rf_best)
rf_final_fit <- fit(rf_final_wf, data = train_model)
rf_test_preds <- predict(
rf_final_fit,
new_data = test_model
) %>%
mutate(
sale_price = test_data$sale_price,
.pred = exp(.pred) - 1
)
rf_metrics <- rf_test_preds %>%
metrics(truth = sale_price, estimate = .pred)
rf_metrics
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 42998.
## 2 rsq standard 0.835
## 3 mae standard 26760.
library(xgboost)
# 12.1 Model specification
xgb_mod <- boost_tree(
trees = tune(),
learn_rate = tune(),
mtry = tune(),
tree_depth = tune(),
loss_reduction = tune(),
min_n = tune()
) %>%
set_engine("xgboost") %>%
set_mode("regression")
# 12.2 Workflow
xgb_wf <- workflow() %>%
add_recipe(ames_recipe) %>%
add_model(xgb_mod)
# 12.3 Improved tuning grid
xgb_grid <- grid_space_filling(
trees(range = c(500, 1500)),
learn_rate(range = c(0.05, 0.3)),
mtry(range = c(10, 40)),
tree_depth(range = c(4, 10)),
loss_reduction(range = c(0, 5)),
min_n(range = c(5, 30)),
size = 20
)
# 12.4 Tune with cross-validation
set.seed(123)
xgb_res <- tune_grid(
xgb_wf,
resamples = folds,
grid = xgb_grid,
metrics = metric_set(rmse, rsq),
control = control_grid(
save_pred = TRUE,
verbose = FALSE
)
)
xgb_res
## # Tuning results
## # 5-fold cross-validation
## # A tibble: 5 × 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [1852/464]> Fold1 <tibble [40 × 10]> <tibble [12 × 4]> <tibble>
## 2 <split [1853/463]> Fold2 <tibble [40 × 10]> <tibble [12 × 4]> <tibble>
## 3 <split [1853/463]> Fold3 <tibble [40 × 10]> <tibble [12 × 4]> <tibble>
## 4 <split [1853/463]> Fold4 <tibble [40 × 10]> <tibble [12 × 4]> <tibble>
## 5 <split [1853/463]> Fold5 <tibble [40 × 10]> <tibble [12 × 4]> <tibble>
##
## There were issues with some computations:
##
## - Warning(s) x60: A correlation computation is required, but `estimate` is constant...
##
## Run `show_notes(.Last.tune.result)` for more information.
xgb_best <- select_best(xgb_res, metric = "rmse")
xgb_final_wf <- finalize_workflow(xgb_wf, xgb_best)
xgb_final_fit <- fit(xgb_final_wf, data = train_model)
xgb_test_preds <- predict(
xgb_final_fit,
new_data = test_model
) %>%
mutate(
sale_price = test_data$sale_price,
.pred = exp(.pred) - 1
)
xgb_metrics <- xgb_test_preds %>%
metrics(truth = sale_price, estimate = .pred)
xgb_metrics
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 32330.
## 2 rsq standard 0.808
## 3 mae standard 23579.
model_comparison <- bind_rows(
lin_res %>%
collect_metrics() %>%
filter(.metric == "rmse") %>%
mutate(model = "Linear Regression"),
rf_res %>%
collect_metrics() %>%
filter(.metric == "rmse") %>%
mutate(model = "Random Forest"),
xgb_res %>%
collect_metrics() %>%
filter(.metric == "rmse") %>%
mutate(model = "XGBoost")
) %>%
select(model, mean, std_err) %>%
arrange(mean)
model_comparison
## # A tibble: 46 × 3
## model mean std_err
## <chr> <dbl> <dbl>
## 1 XGBoost 0.200 0.00778
## 2 XGBoost 0.208 0.0122
## 3 XGBoost 0.223 0.00664
## 4 Random Forest 0.224 0.00926
## 5 Random Forest 0.224 0.00981
## 6 Random Forest 0.224 0.00987
## 7 Random Forest 0.224 0.00990
## 8 Random Forest 0.224 0.00978
## 9 Random Forest 0.238 0.00924
## 10 Random Forest 0.238 0.00982
## # ℹ 36 more rows
library(ggplot2)
cv_results <- bind_rows(
lin_res %>% collect_metrics() %>% filter(.metric == "rmse") %>% mutate(model = "Linear"),
rf_res %>% collect_metrics() %>% filter(.metric == "rmse") %>% mutate(model = "Random Forest"),
xgb_res %>% collect_metrics() %>% filter(.metric == "rmse") %>% mutate(model = "XGBoost")
)
ggplot(cv_results, aes(x = model, y = mean, fill = model)) +
geom_col(width = 0.6) +
scale_fill_manual(
values = c(
"Linear" = "#4C72B0",
"Random Forest" = "#55A868",
"XGBoost" = "#C44E52"
)
) +
labs(
title = "Cross-Validation RMSE Comparison (Log Sale Price)",
x = "",
y = "RMSE"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "none")
ggplot(xgb_test_preds, aes(x = sale_price, y = .pred)) +
geom_point(alpha = 0.4, color = "#4C72B0") +
geom_abline(slope = 1, intercept = 0,
linetype = "dashed",
color = "#C44E52") +
labs(
title = "XGBoost Predictions vs Actual Sale Price",
x = "Actual Sale Price",
y = "Predicted Sale Price"
) +
theme_minimal(base_size = 13)
xgb_test_preds <- xgb_test_preds %>%
mutate(residual = sale_price - .pred)
ggplot(xgb_test_preds, aes(x = .pred, y = residual)) +
geom_point(alpha = 0.35, color = "#55A868") +
geom_hline(yintercept = 0,
linetype = "dashed",
color = "#C44E52") +
labs(
title = "Residuals vs Predicted Values (XGBoost)",
x = "Predicted Sale Price",
y = "Residuals"
) +
theme_minimal(base_size = 13)
ggplot(xgb_test_preds, aes(residual)) +
geom_histogram(bins = 30,
fill = "#4C72B0",
alpha = 0.85) +
labs(
title = "Distribution of Prediction Errors (XGBoost)",
x = "Prediction Error",
y = "Count"
) +
theme_minimal(base_size = 13)
# Add prediction errors
xgb_test_preds <- xgb_test_preds %>%
mutate(
error = abs(.pred - sale_price),
pct_error = abs(.pred - sale_price) / sale_price * 100,
underpriced = .pred > sale_price,
potential_gain = if_else(underpriced, .pred - sale_price, 0)
)
# 1. Model R² report
rsq_value <- rsq(xgb_test_preds, truth = sale_price, estimate = .pred)$.estimate
rsq_value
## [1] 0.80782
cat("The XGBoost model achieves an R² of", round(rsq_value, 3), "on the test set, indicating strong predictive performance.\n")
## The XGBoost model achieves an R² of 0.808 on the test set, indicating strong predictive performance.
# 2. Plot predicted vs actual prices
ggplot(xgb_test_preds, aes(x=.pred, y=sale_price)) +
geom_point(alpha=0.5, color="blue") +
geom_abline(slope=1, intercept=0, color='red', linetype="dashed") +
labs(
title="Predicted vs Actual Sale Price",
x="Predicted Price",
y="Actual Sale Price"
) +
theme_minimal()
# This simulates a real client scenario, where a new house is defined and the predictive model predicts its market value based on the data it trained on.
new_house <- test_data[1, ] %>%
mutate(
lot_area = 8500,
gr_liv_area = 1800,
overall_qual = 7,
overall_cond = 5,
year_built = 2005,
garage_area = 450,
total_bsmt_sf = 900,
neighborhood = "NridgHt",
house_style = "2Story",
bldg_type = "1Fam",
foundation = "PConc"
) %>%
select(-sale_price)
new_house_pred <- predict(
xgb_final_fit,
new_data = new_house
) %>%
mutate(predicted_price = exp(.pred) - 1)
new_house_pred
## # A tibble: 1 × 2
## .pred predicted_price
## <dbl> <dbl>
## 1 11.8 128642.
vip(
xgb_final_fit,
num_features = 15,
geom = "col",
aesthetics = list(fill = "#4C72B0")
) +
labs(
title = "Top 15 Important Features in House Price Prediction",
x = "Importance",
y = ""
) +
theme_minimal(base_size = 13)
template_house <- test_data[1, ] %>%
select(-sale_price)
area_grid <- template_house[rep(1, 23), ] %>%
mutate(
gr_liv_area = seq(800, 3000, by = 100),
lot_area = 8500,
overall_qual = 7,
overall_cond = 5,
year_built = 2005,
garage_area = 450,
total_bsmt_sf = 900,
neighborhood = "NridgHt",
house_style = "2Story",
bldg_type = "1Fam",
foundation = "PConc"
)
area_preds <- predict(
xgb_final_fit,
new_data = area_grid
) %>%
mutate(predicted_price = exp(.pred) - 1) %>%
bind_cols(area_grid)
ggplot(area_preds, aes(gr_liv_area, predicted_price)) +
geom_point(alpha = 0.4) +
geom_line(linewidth = 1.2) +
labs(
title = "Effect of Living Area on Predicted House Price",
x = "Above-Ground Living Area (sq ft)",
y = "Predicted Sale Price"
) +
theme_minimal(base_size = 13)
cat("The shape shows a sharp increase in predicted price from ~800 to ~1000 sq ft. Additional square footage adds little value afterwards.")
## The shape shows a sharp increase in predicted price from ~800 to ~1000 sq ft. Additional square footage adds little value afterwards.