top of page

Housing Data Cleaning with SQL

Dataset: https://kaggle.com

Software: SQL Server Management Studio (SSMS)

SELECT * FROM HousingProject..HousingData

--Standardize date format

SELECT SaleDate, CAST(SaleDate AS date)
FROM HousingProject..HousingData

UPDATE HousingData
SET SaleDate = CAST(SaleDate AS date)

--Query successful but does not show up converted
--Workaround:

ALTER TABLE HousingData
ADD CastSaleDate date;

UPDATE HousingData
SET CastSaleDate = CAST(SaleDate AS date)

--Populate property address data

SELECT PropertyAddress
FROM HousingProject..HousingData

SELECT PropertyAddress
FROM HousingProject..HousingData
WHERE PropertyAddress IS NULL

SELECT *
FROM HousingProject..HousingData
WHERE PropertyAddress IS NULL

SELECT *
FROM HousingProject..HousingData
ORDER BY ParcelID

SELECT *
FROM HousingProject..HousingData AS hda
JOIN HousingProject..HousingData AS hdb
         
ON hda.ParcelID = hdb.ParcelID
         
AND hda.[UniqueID ] <> hdb.[UniqueID ]

SELECT hda.ParcelID, hda.PropertyAddress, hdb.ParcelID, hdb.PropertyAddress, ISNULL(hda.PropertyAddress, hdb.PropertyAddress)
FROM HousingProject..HousingData AS hda
JOIN HousingProject..HousingData AS hdb
         
ON hda.ParcelID = hdb.ParcelID
         
AND hda.[UniqueID ] <> hdb.[UniqueID ]
WHERE hda.PropertyAddress IS NULL

UPDATE hda
SET PropertyAddress = ISNULL(hda.PropertyAddress, hdb.PropertyAddress)
FROM HousingProject..HousingData AS hda
JOIN HousingProject..HousingData AS hdb
         
ON hda.ParcelID = hdb.ParcelID
         
AND hda.[UniqueID ] <> hdb.[UniqueID ]
WHERE hda.PropertyAddress IS NULL

--Break out property address into individual columns (address, city) using SUBSTRING

SELECT PropertyAddress
FROM HousingProject..HousingData

SELECT
SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) -1) AS Address
FROM HousingProject..HousingData

SELECT
SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) -1) AS Address,
SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) +1, LEN(PropertyAddress)) AS Address
FROM HousingProject..HousingData

ALTER TABLE HousingData
ADD SplitAddress nvarchar(255);

UPDATE HousingData
SET SplitAddress = SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) -1)

ALTER TABLE HousingData
ADD SplitCity nvarchar(255);

UPDATE HousingData
SET SplitCity = SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) +1, LEN(PropertyAddress))

sp_rename 'HousingData.SplitAddress', 'SplitPropertyAddress', 'COLUMN';
sp_rename 'HousingData.SplitCity', 'SplitPropertyCity', 'COLUMN';

--Break out Owner Address into individual columns (address, city, state) using PARSENAME

SELECT *

FROM HousingProject..HousingData

SELECT OwnerAddress
FROM HousingProject..HousingData

SELECT
PARSENAME(REPLACE(OwnerAddress, ',', '.'),3),
PARSENAME(REPLACE(OwnerAddress, ',', '.'),2),
PARSENAME(REPLACE(OwnerAddress, ',', '.'),1)
FROM HousingProject..HousingData

ALTER TABLE HousingData
ADD SplitOwnerAddress nvarchar(255);

UPDATE HousingData
SET SplitOwnerAddress = PARSENAME(REPLACE(OwnerAddress, ',', '.'),3)

ALTER TABLE HousingData
ADD SplitOwnerCity nvarchar(255);

UPDATE HousingData
SET SplitOwnerCity = PARSENAME(REPLACE(OwnerAddress, ',', '.'),2)

ALTER TABLE HousingData
ADD SplitOwnerState nvarchar(255);

UPDATE HousingData
SET SplitOwnerState = PARSENAME(REPLACE(OwnerAddress, ',', '.'),1)

--Change Y and N to Yes and No in "Sold as Vacant" field

SELECT DISTINCT(SoldAsVacant), COUNT(SoldAsVacant)
FROM HousingProject..HousingData
GROUP BY SoldAsVacant
ORDER BY 2

SELECT SoldAsVacant,
CASE WHEN SoldAsVacant = 'Y' THEN 'Yes'
          WHEN SoldAsVacant = 'N' THEN 'No'
          ELSE SoldAsVacant
         
END
FROM HousingProject..HousingData

UPDATE HousingData
SET SoldAsVacant = CASE WHEN SoldAsVacant = 'Y' THEN 'Yes'
          WHEN SoldAsVacant = 'N' THEN 'No'
          ELSE SoldAsVacant
         
END

--Remove duplicate columns

WITH RowNumCTE AS
(
SELECT *, 
   
ROW_NUMBER() OVER(
 
 PARTITION BY ParcelID, 
                             PropertyAddress, 
                             SalePrice, 
                             SaleDate, 
                             LegalReference 
   
ORDER BY UniqueID) AS RowNum
FROM HousingProject..HousingData
)

DELETE FROM RowNumCTE
WHERE RowNum > 1

--Delete unused columns

SELECT *
FROM HousingProject..HousingData

ALTER TABLE HousingProject..HousingData
DROP COLUMN OwnerAddress, TaxDistrict, PropertyAddress

ALTER TABLE HousingProject..HousingData
DROP COLUMN SaleDate

bottom of page