read_convert_data.py
# --------------------------------------------------------------------------- #
# Data Processing Tools for AttomData Housing Index Database
# Author: Cory Ye
# Purpose: ECE C247 project code to prepare data for ML.
# Permissions: Shared with Dimitri Zafirov (source of data) and ECE C247.
# --------------------------------------------------------------------------- #
import json
import os
import numpy as np
import pandas as pd
import time
# Recursive Method to Parse Architecture of Nested Dictionary Data for Database Analysis
# def recursive_items(dictionary, lvl):
# for key, value in dictionary.items():
# # Append key, dictionary, nesting level,
# # and if the data is a leaf/dead-end, then print data!
# if type(value) is dict:
# yield (key, value, lvl, False)
# yield from recursive_items(value, lvl+1)
# else:
# yield (key, value, lvl, True)
def convert_time(t, date=True):
"""
Convert YYYY-MM-DD string to integer time step if date is True.
Otherwise, convert integer time step to YYYY-MM-DD string.
"""
if date:
t_split = t.split('-')
y = int(t_split[0])
m = int(t_split[1])
d = int(t_split[2])
return y * 365 + m * 31 + d
else:
y = t // 365
m = (t % 365) // 31
d = (t % 365) % 31
return str(y)+'-'+str(m)+'-'+str(d)
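# Illustrative (hypothetical) values for convert_time, using the approximate
# 365-day-year / 31-day-month encoding above:
#   convert_time("2019-03-15")         # -> 2019*365 + 3*31 + 15 = 737043
#   convert_time(737043, date=False)   # -> "2019-3-15" (approximate inverse)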
def convert_loc(px, py, gran=15, map=True):
"""
Convert (px=longitude, py=latitude) to integer coordinates for spatial analysis and convolution on a 2-D grid.
    gran: Second-level granularity of the mapping. One second (1/3600 of a degree) of longitude/latitude is about 30 meters.
If map=False, then retrieve an approximate longitude and latitude from the integer coordinate on the 2-D grid.
"""
if map:
return round(float(px) * 3600 / gran), round(float(py) * 3600 / gran)
else:
return px * gran / 3600, py * gran / 3600
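# Illustrative (hypothetical) values for convert_loc at the default gran=15,
# where one grid cell spans 15 arc-seconds (roughly 450 meters):
#   convert_loc(-118.25, 34.05)            # -> (-28380, 8172), since 3600/15 = 240 cells per degree
#   convert_loc(-28380, 8172, map=False)   # -> (-118.25, 34.05), the approximate inverse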
def spacetime_integrate(data, time_frame, space_frame, center, complete=True):
"""
    Aggregate property transaction data into fixed-length time intervals on a 2-D spatial grid.
:param data: NumPy array with processed property transaction data. Sorted in time and space in order (t,x,y).
:param time_frame: Time interval (in day-length time steps) of integration.
:param space_frame: Dimension (in spatial units) of the (square-cropped) spatial map.
Sets the gran-resolution spatial dimension of the 2-D feature matrix to an odd integer S.
    :param center: Center (longitude, latitude) coordinates, in spatial units, of a region, e.g. a city.
    :param complete: If True, keep all K property/exogenous features; if False, keep only the sale price.
:return: (T, S, S, K) time-series data tensor that represents the 2-D heatmap of property transactions with
T time-steps of time_frame intervals and K property characteristics (i.e. price, proptype, etc.).
"""
# Initialize list of time-slice heat-maps. Concatenate elements to construct a time-series of heat-maps.
map_cache = []
# Normalize the spatial coordinates of the data to center.
data[:, 1] -= center[0]
data[:, 2] -= center[1]
# Loop over intervals of time.
t = data[0, 0] # Initial time of data subset.
k = 0 # Time interval multiplier.
while True:
# Compute index mask of data within the time interval
# t + [k*time_frame, (k+1)*time_frame).
time_index = np.logical_and(data[:, 0] >= t + k*time_frame,
data[:, 0] < t + (k+1)*time_frame)
if not np.any(time_index):
# All data integrated, assuming that there are no temporal
# discontinuities in the data with length greater than time_frame.
break
# Filter out the data in time interval/slice k.
data_tsf = data[time_index, :]
        # Initialize the time-slice spatial matrix of features, i.e. excluding the time and space columns.
dim = int(np.floor(space_frame/2) + np.ceil(space_frame/2) + 1)
if complete:
spat_map = np.zeros((dim, dim, data.shape[1] - 3))
else:
spat_map = np.zeros((dim, dim))
# Extract and crop the spatial coordinates to dimension space_frame.
for x in range(-int(np.floor(space_frame/2)), int(np.ceil(space_frame/2) + 1)):
for y in range(-int(np.floor(space_frame/2)), int(np.ceil(space_frame/2) + 1)):
# Compute index mask of data for the fixed spatial coordinates (x,y).
space_index = np.logical_and(data_tsf[:, 1] == x, data_tsf[:, 2] == y)
if not np.any(space_index):
# No transaction at coordinate (x,y). Continue extraction.
continue
                # Extract and average the non-spatial data in the time interval.
data_avg = np.mean(data_tsf[space_index, :], axis=0)
# Insert the averaged data in the time-slice matrix at (x,y).
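                # Note: negative x or y offsets wrap to the far side of the array
                # (NumPy negative indexing), so the region center lands at index (0, 0).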
if complete:
spat_map[x, y, :] = data_avg[3:]
else:
                    spat_map[x, y] = data_avg[3]  # Average sale price only.
# Cache the time-slice spatial feature matrix in chronological order.
map_cache.append(spat_map)
# Increment time interval.
k += 1
# Concatenate all elements in the cache of spatial feature matrix to a time-series of heat-maps.
time_map = np.array(map_cache, dtype=np.single)
# Return the heat-map time series with dimension (T,S,S,K).
return time_map
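# Sketch of the expected output, assuming complete=True and a data array whose columns
# are [time, x, y, feature_1, ..., feature_K] (a hypothetical usage, not executed here):
#   out = spacetime_integrate(data_array, time_frame=365, space_frame=120, center=(cx, cy))
#   # out.shape == (T, S, S, K), where T is the number of 365-day slices in the data,
#   # S = floor(120/2) + ceil(120/2) + 1 = 121, and K = data_array.shape[1] - 3.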
# ------------------------------------------------- #
# Data Processing Method for AttomData Database
# ------------------------------------------------- #
def data_process(tf, sf, spat_res, center, dir, region, exog_features, complete=True):
# data_cache is a list of tuples containing housing transaction data.
# data_cache = [ ..., (saleTransDate, longitude, latitude, saleamt, proptype, yearbuilt, elevation,
# lotSize1, universalsize, beds, bathstotal, priceperbed, pricepersizeunit, ...), ... ]
data_cache = []
    # Search the data directory for all JSON data stored as .txt files.
for filename in os.listdir(dir):
        # Analyze data files for the specified county/region.
if filename.endswith(".txt") and filename.find(region) > -1:
# Read file to list of dictionaries.
f = open(dir + "/" + filename, 'r')
data = json.loads('[' + f.read() + ']')
print(filename)
# Extract and process the data.
for d in data:
                if len(d) > 1:  # There is transaction data in this file.
for p in d['property']:
# # Map out architecture of source data.
# for key, value, lvl, leaf in recursive_items(p, 0):
# print(lvl*'\t'+str(key))
# if leaf == True:
# print(lvl*'\t'+str(value))
                        # Get the sale search date of the observation.
t = p['sale']['salesearchdate']
# Extract and process training and description data for ML.
time = convert_time(t)
x, y = convert_loc(p['location']['longitude'], p['location']['latitude'], gran=spat_res)
price = p['sale']['amount']['saleamt']
prop = float(p['summary']['propIndicator'])
origin = p['summary']['yearbuilt']
elev = p['location']['elevation']
p_size = p['lot']['lotSize1']
b_size = p['building']['size']['universalsize']
beds = p['building']['rooms']['beds']
baths = p['building']['rooms']['bathstotal']
ppb = p['sale']['calculation']['priceperbed']
ppu = p['sale']['calculation']['pricepersizeunit']
# Append data as tuple to data cache.
year = int(t.split('-')[0])
if complete: # Input data. Exogenous features also appended here.
if year <= 2018: # Exogenous features only exist until 2018.
                                exogs = exog_features.loc[exog_features['date'] == year]
data_cache.append([time, x, y, price, prop, origin, elev, p_size, b_size, beds, baths, ppb, ppu] + exogs.drop(columns='date').values.tolist()[0])
else: # Output data.
if year > 2018:
data_cache.append((time, x, y, price))
# Close file.
f.close()
continue
else:
continue
# Sort all data with respect to time and space.
data_array = np.array(sorted(data_cache, key=lambda z: (z[0], z[1], z[2])), dtype=np.single)
print(data_array.shape)
# Convert longitudinal/latitudinal degrees to res-granularity spatial units for space_frame.
s, _ = convert_loc(sf, 0, gran=spat_res)
cx, cy = convert_loc(center[0], center[1], gran=spat_res)
# Compress the data through spatio-temporal interpolation.
out = spacetime_integrate(data_array, tf, s, (cx, cy), complete=complete)
print(out.shape)
# Convert array to .npy file with np.save(). Extract compressed file with np.load().
# Change the file name as necessary to organize the processed data!
if complete:
np.save(region + "_heatmap_time_data_input", out)
else:
np.save(region + "_heatmap_time_data_output", out)
# -------------------------------------------------------------------------------------------------------------- #
# Timer
# start = time.time()
# Execute the data processing method on data in the specified directory.
"""
Parameters for data_process:
tf is the time frame of interpolation in days.
sf is the space frame in degrees longitude/latitude.
spat_res is the resolution (in arc-seconds) of a spatial unit in the conversion to a heatmap.
center is the center coordinate of the data in degrees longitude/latitude.
region is the county being processed.
exog_features contains the exogenous features appended for that county in each year.
complete is the flag for whether to include the non-price features in the data.
"""
dir = "C:/Users/cory0/Downloads/C247 Project Cache/"
counties = pd.read_csv(dir+'cities_by_major_county_with_centers.csv') # source of lat/lon county centroids: https://en.wikipedia.org/wiki/User:Michael_J/County_table
exogenous_features = pd.read_csv(dir+'county macro data/selected_data.csv')
exogenous_features = exogenous_features.drop(exogenous_features.columns[0], axis=1)
for i in range(1, len(counties)):
print(counties.iloc[i]['countystate'])
print(counties.iloc[i]['countynum'])
df = exogenous_features.loc[exogenous_features['countynum'] == counties.iloc[i]['countynum']]
df = df.drop(columns='countynum')
data_process(tf=365, sf=0.5, spat_res=15, center=(counties.iloc[i]['longitude'], counties.iloc[i]['latitude']),
dir=dir+"countydata", region=counties.iloc[i]['file'], exog_features=df, complete=True)
# end = time.time()
# print("Time Elapsed: ", end - start)