Housing Data Cleaning with SQL
Dataset: https://kaggle.com
Software: SQL Server Management Studio (SSMS)
SELECT * FROM HousingProject..HousingData
--Standardize date format
SELECT SaleDate, CAST(SaleDate AS date)
FROM HousingProject..HousingData
UPDATE HousingData
SET SaleDate = CAST(SaleDate AS date)
--Query successful but does not show up converted
--Workaround:
ALTER TABLE HousingData
ADD CastSaleDate date;
UPDATE HousingData
SET CastSaleDate = CAST(SaleDate AS date)
--Populate property address data
SELECT PropertyAddress
FROM HousingProject..HousingData
SELECT PropertyAddress
FROM HousingProject..HousingData
WHERE PropertyAddress IS NULL
SELECT *
FROM HousingProject..HousingData
WHERE PropertyAddress IS NULL
SELECT *
FROM HousingProject..HousingData
ORDER BY ParcelID
SELECT *
FROM HousingProject..HousingData AS hda
JOIN HousingProject..HousingData AS hdb
ON hda.ParcelID = hdb.ParcelID
AND hda.[UniqueID ] <> hdb.[UniqueID ]
SELECT hda.ParcelID, hda.PropertyAddress, hdb.ParcelID, hdb.PropertyAddress, ISNULL(hda.PropertyAddress, hdb.PropertyAddress)
FROM HousingProject..HousingData AS hda
JOIN HousingProject..HousingData AS hdb
ON hda.ParcelID = hdb.ParcelID
AND hda.[UniqueID ] <> hdb.[UniqueID ]
WHERE hda.PropertyAddress IS NULL
UPDATE hda
SET PropertyAddress = ISNULL(hda.PropertyAddress, hdb.PropertyAddress)
FROM HousingProject..HousingData AS hda
JOIN HousingProject..HousingData AS hdb
ON hda.ParcelID = hdb.ParcelID
AND hda.[UniqueID ] <> hdb.[UniqueID ]
WHERE hda.PropertyAddress IS NULL
--Break out property address into individual columns (address, city) using SUBSTRING
SELECT PropertyAddress
FROM HousingProject..HousingData
SELECT
SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) -1) AS Address
FROM HousingProject..HousingData
SELECT
SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) -1) AS Address,
SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) +1, LEN(PropertyAddress)) AS Address
FROM HousingProject..HousingData
ALTER TABLE HousingData
ADD SplitAddress nvarchar(255);
UPDATE HousingData
SET SplitAddress = SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) -1)
ALTER TABLE HousingData
ADD SplitCity nvarchar(255);
UPDATE HousingData
SET SplitCity = SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) +1, LEN(PropertyAddress))
sp_rename 'HousingData.SplitAddress', 'SplitPropertyAddress', 'COLUMN';
sp_rename 'HousingData.SplitCity', 'SplitPropertyCity', 'COLUMN';
--Break out Owner Address into individual columns (address, city, state) using PARSENAME
SELECT *
FROM HousingProject..HousingData
SELECT OwnerAddress
FROM HousingProject..HousingData
SELECT
PARSENAME(REPLACE(OwnerAddress, ',', '.'),3),
PARSENAME(REPLACE(OwnerAddress, ',', '.'),2),
PARSENAME(REPLACE(OwnerAddress, ',', '.'),1)
FROM HousingProject..HousingData
ALTER TABLE HousingData
ADD SplitOwnerAddress nvarchar(255);
UPDATE HousingData
SET SplitOwnerAddress = PARSENAME(REPLACE(OwnerAddress, ',', '.'),3)
ALTER TABLE HousingData
ADD SplitOwnerCity nvarchar(255);
UPDATE HousingData
SET SplitOwnerCity = PARSENAME(REPLACE(OwnerAddress, ',', '.'),2)
ALTER TABLE HousingData
ADD SplitOwnerState nvarchar(255);
UPDATE HousingData
SET SplitOwnerState = PARSENAME(REPLACE(OwnerAddress, ',', '.'),1)
--Change Y and N to Yes and No in "Sold as Vacant" field
SELECT DISTINCT(SoldAsVacant), COUNT(SoldAsVacant)
FROM HousingProject..HousingData
GROUP BY SoldAsVacant
ORDER BY 2
SELECT SoldAsVacant,
CASE WHEN SoldAsVacant = 'Y' THEN 'Yes'
WHEN SoldAsVacant = 'N' THEN 'No'
ELSE SoldAsVacant
END
FROM HousingProject..HousingData
UPDATE HousingData
SET SoldAsVacant = CASE WHEN SoldAsVacant = 'Y' THEN 'Yes'
WHEN SoldAsVacant = 'N' THEN 'No'
ELSE SoldAsVacant
END
--Remove duplicate columns
WITH RowNumCTE AS
(
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY ParcelID,
PropertyAddress,
SalePrice,
SaleDate,
LegalReference
ORDER BY UniqueID) AS RowNum
FROM HousingProject..HousingData
)
DELETE FROM RowNumCTE
WHERE RowNum > 1
--Delete unused columns
SELECT *
FROM HousingProject..HousingData
ALTER TABLE HousingProject..HousingData
DROP COLUMN OwnerAddress, TaxDistrict, PropertyAddress
ALTER TABLE HousingProject..HousingData
DROP COLUMN SaleDate