-
Notifications
You must be signed in to change notification settings - Fork 0
/
3_2_create_modelled_chronologies.py
105 lines (81 loc) · 4.1 KB
/
3_2_create_modelled_chronologies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from functions.extract_data import get_downscaled_cru_ts
from functions.extract_data import get_soil
# Overview table of the chronologies (semicolon-separated CSV)
overview = pd.read_csv('overview_after_1_2.csv', sep=';')
# Prediction points: the first two columns of the array are paired up into
# one coordinate tuple per point
points = np.load('prediction_points.npy')
pred_coords = list(zip(points[:, 0], points[:, 1]))
# Fetch the meteo and soil data for every prediction coordinate
meteo = get_downscaled_cru_ts(pred_coords)
soil = get_soil(pred_coords)
# Number of temporary chunk files written so far; also used to name the next one
tempfilecounter = 0
# Number of points gathered in memory before their rows are flushed to a
# temporary file (keeps memory use bounded)
POINTS_PER_CHUNK = 2500
# Calendar years covered by the meteo series (one row per year)
METEO_YEARS = range(1901, 2022)

# np.save does not create missing directories, so make sure the target exists
os.makedirs('temp', exist_ok=True)


def _save_chunk(rows, n_onehot, counter):
    """Write the collected rows to a numbered temporary .npy file.

    rows: non-empty list of row lists, as collected in the loop below.
    n_onehot: number of zero-filled columns appended on the right, standing in
        for the columns where the training table has one-hot encoding.
    counter: sequence number used in the file name.
    """
    chunk = np.asarray(rows)
    zeros = np.zeros((len(chunk), n_onehot))
    np.save('temp/prediction_table_' + str(counter) + '.npy', np.hstack((chunk, zeros)))


# Rows collected since the last flush. Always bound to a list, so the remainder
# handling after the loop also works when the number of points is an exact
# multiple of POINTS_PER_CHUNK (or zero).
total = []
# Loop over the points, for each point create data records like in the training table
for i in tqdm(range(len(points)), desc="Creating prediction table", smoothing=0):
    # Yearly meteo values, their centered 15-year rolling mean, and the
    # deviation from that rolling mean, side by side
    point_meteo = pd.DataFrame(meteo[i])
    point_meteo.index = METEO_YEARS
    # Rolling runs along the rows (years) by default; the explicit `axis`
    # keyword is deprecated in pandas and therefore omitted
    point_meteo_avg = point_meteo.rolling(15, min_periods=1, center=True).mean()
    point_meteo_dev = point_meteo - point_meteo_avg
    meteo_full = pd.concat([point_meteo, point_meteo_avg, point_meteo_dev], axis=1)

    # Soil data stays the same every year: repeat the transposed soil frame
    # once per meteo year so it can be joined on the year index
    point_soil = pd.DataFrame(soil[i])
    point_soil = pd.concat([point_soil.T] * len(point_meteo))
    point_soil.index = METEO_YEARS

    # Pair every year with the previous year's meteo values; rows containing
    # NaN (at least the first year, which has no predecessor) are dropped
    combined = pd.merge(meteo_full, meteo_full.shift(), left_index=True, right_index=True).dropna()
    # Combine soil- and meteo data
    combined = pd.merge(combined, point_soil, left_index=True, right_index=True, suffixes=('_i', '_j')).dropna()

    # Add columns for year, latitude and longitude
    combined['year'] = list(combined.index)
    combined['lat'] = points[i][0]
    combined['lon'] = points[i][1]

    total.extend(combined.to_numpy().tolist())

    # Flush to a temporary file after every full chunk of points
    if (i + 1) % POINTS_PER_CHUNK == 0 and total:
        _save_chunk(total, len(overview), tempfilecounter)
        tempfilecounter += 1
        total = []

# Flush whatever is left over after the last full chunk
if total:
    _save_chunk(total, len(overview), tempfilecounter)
    tempfilecounter += 1

# Release the large intermediate objects before the next stage
del total
del soil
del meteo
# Recombine the temp files into one big array and remove temporary files
def _chunk_number(filename):
    """Return the sequence number of a temporary chunk file, or None.

    A chunk file is named 'prediction_table', optionally followed by '_',
    followed by its number and the '.npy' extension. Any other file name
    yields None so unrelated files in the directory are left alone.
    """
    stem, ext = os.path.splitext(filename)
    digits = stem[len(stem.rstrip('0123456789')):]
    if ext != '.npy' or not digits:
        return None
    if stem[:len(stem) - len(digits)] not in ('prediction_table', 'prediction_table_'):
        return None
    return int(digits)


# os.listdir returns names in arbitrary order, so order the chunks by their
# number explicitly; otherwise rows could end up out of point order
chunk_files = []
for filename in os.listdir('temp'):
    number = _chunk_number(filename)
    if number is not None:
        chunk_files.append((number, os.path.join('temp', filename)))
chunk_files.sort()

if not chunk_files:
    raise FileNotFoundError("No temporary prediction table chunks found in 'temp'")

# Stack all chunks in a single call instead of re-copying a growing array
prediction_table = np.vstack([np.load(path) for _, path in chunk_files])
np.save('prediction_table.npy', prediction_table)

# Only remove the temporary files once the combined table is safely on disk
for _, path in chunk_files:
    os.remove(path)
# Number of rows each prediction point contributes to the prediction table.
# NOTE(review): presumably the 121 years 1901-2021 minus the first year, which
# is dropped after the one-year shift when the table is built - confirm.
YEARS_PER_POINT = 120

# Iterate over the records in the overview table (chronologies)
# NOTE(review): the prediction table is passed unchanged to every model, so
# the zero-filled one-hot columns stay zero for all of them - confirm intended.
for _, record in tqdm(overview.iterrows(), desc="Creating modelled chronologies", total=len(overview)):
    # Load the random forest model to be used for the current record
    rf = joblib.load('2_1_random_forests/rf_without_' + str(record['name']) + '.joblib')
    # Use the random forest model to predict the values in the prediction table
    modelled_crn = rf.predict(prediction_table)
    # One row per point, one column per year; integer division keeps the row
    # count exact without a float round trip
    modelled_crn = modelled_crn.reshape(len(prediction_table) // YEARS_PER_POINT, YEARS_PER_POINT)
    # Save array to file
    np.save('2_2_modelled_chronologies/modelled_crn_without_' + str(record['name']) + '.npy', modelled_crn)