Files
MidasEngine/src/nn_tests/RNN-LSTM/output.txt
2025-01-25 15:29:52 -05:00

39520 lines
1.7 MiB

Total valid daily bars used: 227
First day: 2024-01-08 O=59.23 H=60.68 L=58.82 C=59.64 V=124629
Last day: 2024-11-29 O=64.45 H=64.45 L=63.00 C=63.77 V=62082
Target Min: 40.86, Target Max: 74.47
Normalized Targets (First 5 Samples):
Sample 0: 0.933
Sample 1: 0.930
Sample 2: 0.965
Sample 3: 1.000
Sample 4: 0.534
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.677, o_gate[0] = 0.522, c_hat[0] = -0.501
c_state[0] = -0.032, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.066, f_gate[0] = 0.703, o_gate[0] = 0.514, c_hat[0] = -0.465
c_state[0] = -0.053, h_state[0] = -0.027
Time Step 2:
i_gate[0] = 0.064, f_gate[0] = 0.701, o_gate[0] = 0.521, c_hat[0] = -0.467
c_state[0] = -0.067, h_state[0] = -0.035
Time Step 3:
i_gate[0] = 0.060, f_gate[0] = 0.730, o_gate[0] = 0.502, c_hat[0] = -0.485
c_state[0] = -0.078, h_state[0] = -0.039
Time Step 4:
i_gate[0] = 0.055, f_gate[0] = 0.746, o_gate[0] = 0.530, c_hat[0] = -0.576
c_state[0] = -0.090, h_state[0] = -0.048
Backward Time Step 4:
Gradient di[0] = -0.013, df[0] = 0.061, dc_hat[0] = 0.016
Gradient do_[0] = -0.050
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.061, dc_hat[0] = 0.019
Gradient do_[0] = -0.047
Backward Time Step 2:
Gradient di[0] = -0.016, df[0] = 0.085, dc_hat[0] = 0.029
Gradient do_[0] = -0.050
Backward Time Step 1:
Gradient di[0] = -0.016, df[0] = 0.083, dc_hat[0] = 0.029
Gradient do_[0] = -0.040
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.077, dc_hat[0] = 0.025
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.676, o_gate[0] = 0.523, c_hat[0] = -0.504
c_state[0] = -0.033, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.066, f_gate[0] = 0.702, o_gate[0] = 0.516, c_hat[0] = -0.470
c_state[0] = -0.054, h_state[0] = -0.028
Time Step 2:
i_gate[0] = 0.065, f_gate[0] = 0.700, o_gate[0] = 0.522, c_hat[0] = -0.472
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 3:
i_gate[0] = 0.061, f_gate[0] = 0.729, o_gate[0] = 0.503, c_hat[0] = -0.489
c_state[0] = -0.079, h_state[0] = -0.040
Time Step 4:
i_gate[0] = 0.056, f_gate[0] = 0.745, o_gate[0] = 0.530, c_hat[0] = -0.580
c_state[0] = -0.091, h_state[0] = -0.048
Backward Time Step 4:
Gradient di[0] = -0.013, df[0] = 0.060, dc_hat[0] = 0.016
Gradient do_[0] = -0.050
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.060, dc_hat[0] = 0.019
Gradient do_[0] = -0.048
Backward Time Step 2:
Gradient di[0] = -0.016, df[0] = 0.085, dc_hat[0] = 0.029
Gradient do_[0] = -0.050
Backward Time Step 1:
Gradient di[0] = -0.016, df[0] = 0.083, dc_hat[0] = 0.029
Gradient do_[0] = -0.040
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.077, dc_hat[0] = 0.025
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.675, o_gate[0] = 0.524, c_hat[0] = -0.508
c_state[0] = -0.033, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.066, f_gate[0] = 0.701, o_gate[0] = 0.517, c_hat[0] = -0.474
c_state[0] = -0.054, h_state[0] = -0.028
Time Step 2:
i_gate[0] = 0.065, f_gate[0] = 0.699, o_gate[0] = 0.523, c_hat[0] = -0.476
c_state[0] = -0.069, h_state[0] = -0.036
Time Step 3:
i_gate[0] = 0.061, f_gate[0] = 0.728, o_gate[0] = 0.504, c_hat[0] = -0.493
c_state[0] = -0.080, h_state[0] = -0.040
Time Step 4:
i_gate[0] = 0.056, f_gate[0] = 0.743, o_gate[0] = 0.531, c_hat[0] = -0.583
c_state[0] = -0.092, h_state[0] = -0.049
Backward Time Step 4:
Gradient di[0] = -0.013, df[0] = 0.059, dc_hat[0] = 0.015
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.060, dc_hat[0] = 0.019
Gradient do_[0] = -0.048
Backward Time Step 2:
Gradient di[0] = -0.017, df[0] = 0.085, dc_hat[0] = 0.029
Gradient do_[0] = -0.050
Backward Time Step 1:
Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.029
Gradient do_[0] = -0.041
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.077, dc_hat[0] = 0.025
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.674, o_gate[0] = 0.525, c_hat[0] = -0.511
c_state[0] = -0.033, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.066, f_gate[0] = 0.700, o_gate[0] = 0.518, c_hat[0] = -0.478
c_state[0] = -0.055, h_state[0] = -0.028
Time Step 2:
i_gate[0] = 0.065, f_gate[0] = 0.698, o_gate[0] = 0.524, c_hat[0] = -0.481
c_state[0] = -0.070, h_state[0] = -0.036
Time Step 3:
i_gate[0] = 0.061, f_gate[0] = 0.727, o_gate[0] = 0.504, c_hat[0] = -0.497
c_state[0] = -0.081, h_state[0] = -0.041
Time Step 4:
i_gate[0] = 0.056, f_gate[0] = 0.742, o_gate[0] = 0.531, c_hat[0] = -0.587
c_state[0] = -0.093, h_state[0] = -0.049
Backward Time Step 4:
Gradient di[0] = -0.013, df[0] = 0.058, dc_hat[0] = 0.015
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.060, dc_hat[0] = 0.019
Gradient do_[0] = -0.049
Backward Time Step 2:
Gradient di[0] = -0.017, df[0] = 0.084, dc_hat[0] = 0.029
Gradient do_[0] = -0.051
Backward Time Step 1:
Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.029
Gradient do_[0] = -0.041
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.077, dc_hat[0] = 0.025
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.673, o_gate[0] = 0.526, c_hat[0] = -0.515
c_state[0] = -0.033, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.066, f_gate[0] = 0.699, o_gate[0] = 0.519, c_hat[0] = -0.483
c_state[0] = -0.055, h_state[0] = -0.029
Time Step 2:
i_gate[0] = 0.065, f_gate[0] = 0.697, o_gate[0] = 0.525, c_hat[0] = -0.485
c_state[0] = -0.070, h_state[0] = -0.037
Time Step 3:
i_gate[0] = 0.062, f_gate[0] = 0.726, o_gate[0] = 0.505, c_hat[0] = -0.501
c_state[0] = -0.082, h_state[0] = -0.041
Time Step 4:
i_gate[0] = 0.057, f_gate[0] = 0.741, o_gate[0] = 0.532, c_hat[0] = -0.590
c_state[0] = -0.094, h_state[0] = -0.050
Backward Time Step 4:
Gradient di[0] = -0.013, df[0] = 0.058, dc_hat[0] = 0.015
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.060, dc_hat[0] = 0.019
Gradient do_[0] = -0.049
Backward Time Step 2:
Gradient di[0] = -0.017, df[0] = 0.084, dc_hat[0] = 0.029
Gradient do_[0] = -0.051
Backward Time Step 1:
Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.029
Gradient do_[0] = -0.041
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.077, dc_hat[0] = 0.025
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.672, o_gate[0] = 0.528, c_hat[0] = -0.518
c_state[0] = -0.034, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.066, f_gate[0] = 0.698, o_gate[0] = 0.520, c_hat[0] = -0.487
c_state[0] = -0.056, h_state[0] = -0.029
Time Step 2:
i_gate[0] = 0.066, f_gate[0] = 0.696, o_gate[0] = 0.526, c_hat[0] = -0.489
c_state[0] = -0.071, h_state[0] = -0.037
Time Step 3:
i_gate[0] = 0.062, f_gate[0] = 0.724, o_gate[0] = 0.506, c_hat[0] = -0.505
c_state[0] = -0.083, h_state[0] = -0.042
Time Step 4:
i_gate[0] = 0.057, f_gate[0] = 0.739, o_gate[0] = 0.532, c_hat[0] = -0.593
c_state[0] = -0.095, h_state[0] = -0.050
Backward Time Step 4:
Gradient di[0] = -0.013, df[0] = 0.057, dc_hat[0] = 0.015
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.060, dc_hat[0] = 0.019
Gradient do_[0] = -0.050
Backward Time Step 2:
Gradient di[0] = -0.017, df[0] = 0.084, dc_hat[0] = 0.028
Gradient do_[0] = -0.051
Backward Time Step 1:
Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.029
Gradient do_[0] = -0.041
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.077, dc_hat[0] = 0.025
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.671, o_gate[0] = 0.529, c_hat[0] = -0.521
c_state[0] = -0.034, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.067, f_gate[0] = 0.698, o_gate[0] = 0.521, c_hat[0] = -0.491
c_state[0] = -0.056, h_state[0] = -0.029
Time Step 2:
i_gate[0] = 0.066, f_gate[0] = 0.695, o_gate[0] = 0.527, c_hat[0] = -0.493
c_state[0] = -0.072, h_state[0] = -0.038
Time Step 3:
i_gate[0] = 0.062, f_gate[0] = 0.723, o_gate[0] = 0.507, c_hat[0] = -0.509
c_state[0] = -0.084, h_state[0] = -0.042
Time Step 4:
i_gate[0] = 0.057, f_gate[0] = 0.738, o_gate[0] = 0.533, c_hat[0] = -0.597
c_state[0] = -0.096, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.013, df[0] = 0.056, dc_hat[0] = 0.015
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.059, dc_hat[0] = 0.019
Gradient do_[0] = -0.050
Backward Time Step 2:
Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.028
Gradient do_[0] = -0.052
Backward Time Step 1:
Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.028
Gradient do_[0] = -0.042
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.077, dc_hat[0] = 0.025
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.670, o_gate[0] = 0.530, c_hat[0] = -0.524
c_state[0] = -0.034, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.067, f_gate[0] = 0.697, o_gate[0] = 0.523, c_hat[0] = -0.495
c_state[0] = -0.057, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.066, f_gate[0] = 0.694, o_gate[0] = 0.528, c_hat[0] = -0.497
c_state[0] = -0.072, h_state[0] = -0.038
Time Step 3:
i_gate[0] = 0.063, f_gate[0] = 0.722, o_gate[0] = 0.507, c_hat[0] = -0.513
c_state[0] = -0.084, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.058, f_gate[0] = 0.737, o_gate[0] = 0.533, c_hat[0] = -0.600
c_state[0] = -0.097, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.013, df[0] = 0.056, dc_hat[0] = 0.014
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.059, dc_hat[0] = 0.019
Gradient do_[0] = -0.051
Backward Time Step 2:
Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.028
Gradient do_[0] = -0.052
Backward Time Step 1:
Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.028
Gradient do_[0] = -0.042
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.025
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.669, o_gate[0] = 0.531, c_hat[0] = -0.527
c_state[0] = -0.034, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.067, f_gate[0] = 0.696, o_gate[0] = 0.524, c_hat[0] = -0.499
c_state[0] = -0.057, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.066, f_gate[0] = 0.693, o_gate[0] = 0.529, c_hat[0] = -0.501
c_state[0] = -0.073, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.063, f_gate[0] = 0.721, o_gate[0] = 0.508, c_hat[0] = -0.517
c_state[0] = -0.085, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.058, f_gate[0] = 0.736, o_gate[0] = 0.534, c_hat[0] = -0.603
c_state[0] = -0.098, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.013, df[0] = 0.055, dc_hat[0] = 0.014
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.059, dc_hat[0] = 0.019
Gradient do_[0] = -0.051
Backward Time Step 2:
Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.028
Gradient do_[0] = -0.052
Backward Time Step 1:
Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.028
Gradient do_[0] = -0.042
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.025
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.668, o_gate[0] = 0.532, c_hat[0] = -0.530
c_state[0] = -0.034, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.067, f_gate[0] = 0.695, o_gate[0] = 0.525, c_hat[0] = -0.503
c_state[0] = -0.058, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.067, f_gate[0] = 0.692, o_gate[0] = 0.530, c_hat[0] = -0.505
c_state[0] = -0.074, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.063, f_gate[0] = 0.720, o_gate[0] = 0.509, c_hat[0] = -0.520
c_state[0] = -0.086, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.058, f_gate[0] = 0.734, o_gate[0] = 0.534, c_hat[0] = -0.606
c_state[0] = -0.098, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.013, df[0] = 0.054, dc_hat[0] = 0.014
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.059, dc_hat[0] = 0.019
Gradient do_[0] = -0.052
Backward Time Step 2:
Gradient di[0] = -0.018, df[0] = 0.082, dc_hat[0] = 0.028
Gradient do_[0] = -0.052
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.082, dc_hat[0] = 0.028
Gradient do_[0] = -0.043
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.024
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.668, o_gate[0] = 0.533, c_hat[0] = -0.533
c_state[0] = -0.035, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.067, f_gate[0] = 0.694, o_gate[0] = 0.526, c_hat[0] = -0.507
c_state[0] = -0.058, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.067, f_gate[0] = 0.691, o_gate[0] = 0.531, c_hat[0] = -0.509
c_state[0] = -0.074, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.509, c_hat[0] = -0.524
c_state[0] = -0.087, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.059, f_gate[0] = 0.733, o_gate[0] = 0.535, c_hat[0] = -0.609
c_state[0] = -0.099, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.013, df[0] = 0.054, dc_hat[0] = 0.014
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.013, df[0] = 0.058, dc_hat[0] = 0.018
Gradient do_[0] = -0.052
Backward Time Step 2:
Gradient di[0] = -0.018, df[0] = 0.082, dc_hat[0] = 0.028
Gradient do_[0] = -0.052
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.082, dc_hat[0] = 0.028
Gradient do_[0] = -0.043
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.024
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.667, o_gate[0] = 0.534, c_hat[0] = -0.536
c_state[0] = -0.035, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.068, f_gate[0] = 0.693, o_gate[0] = 0.527, c_hat[0] = -0.510
c_state[0] = -0.059, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.067, f_gate[0] = 0.691, o_gate[0] = 0.532, c_hat[0] = -0.513
c_state[0] = -0.075, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.510, c_hat[0] = -0.527
c_state[0] = -0.087, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.059, f_gate[0] = 0.732, o_gate[0] = 0.535, c_hat[0] = -0.612
c_state[0] = -0.100, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.013, df[0] = 0.058, dc_hat[0] = 0.018
Gradient do_[0] = -0.052
Backward Time Step 2:
Gradient di[0] = -0.018, df[0] = 0.081, dc_hat[0] = 0.027
Gradient do_[0] = -0.053
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.082, dc_hat[0] = 0.028
Gradient do_[0] = -0.043
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.024
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.666, o_gate[0] = 0.535, c_hat[0] = -0.539
c_state[0] = -0.035, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.068, f_gate[0] = 0.693, o_gate[0] = 0.528, c_hat[0] = -0.514
c_state[0] = -0.059, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.067, f_gate[0] = 0.690, o_gate[0] = 0.532, c_hat[0] = -0.516
c_state[0] = -0.076, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.064, f_gate[0] = 0.717, o_gate[0] = 0.511, c_hat[0] = -0.531
c_state[0] = -0.088, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.059, f_gate[0] = 0.731, o_gate[0] = 0.535, c_hat[0] = -0.615
c_state[0] = -0.101, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.012, df[0] = 0.052, dc_hat[0] = 0.013
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.013, df[0] = 0.058, dc_hat[0] = 0.018
Gradient do_[0] = -0.053
Backward Time Step 2:
Gradient di[0] = -0.018, df[0] = 0.081, dc_hat[0] = 0.027
Gradient do_[0] = -0.053
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.082, dc_hat[0] = 0.028
Gradient do_[0] = -0.043
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.024
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.665, o_gate[0] = 0.536, c_hat[0] = -0.541
c_state[0] = -0.035, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.068, f_gate[0] = 0.692, o_gate[0] = 0.529, c_hat[0] = -0.517
c_state[0] = -0.060, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.068, f_gate[0] = 0.689, o_gate[0] = 0.533, c_hat[0] = -0.520
c_state[0] = -0.076, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.064, f_gate[0] = 0.717, o_gate[0] = 0.511, c_hat[0] = -0.534
c_state[0] = -0.089, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.060, f_gate[0] = 0.730, o_gate[0] = 0.536, c_hat[0] = -0.618
c_state[0] = -0.102, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.012, df[0] = 0.052, dc_hat[0] = 0.013
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.013, df[0] = 0.057, dc_hat[0] = 0.018
Gradient do_[0] = -0.053
Backward Time Step 2:
Gradient di[0] = -0.018, df[0] = 0.080, dc_hat[0] = 0.027
Gradient do_[0] = -0.053
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.082, dc_hat[0] = 0.028
Gradient do_[0] = -0.043
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.024
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.665, o_gate[0] = 0.536, c_hat[0] = -0.544
c_state[0] = -0.036, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.068, f_gate[0] = 0.691, o_gate[0] = 0.530, c_hat[0] = -0.520
c_state[0] = -0.060, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.068, f_gate[0] = 0.688, o_gate[0] = 0.534, c_hat[0] = -0.523
c_state[0] = -0.077, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.065, f_gate[0] = 0.716, o_gate[0] = 0.512, c_hat[0] = -0.537
c_state[0] = -0.090, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.060, f_gate[0] = 0.729, o_gate[0] = 0.536, c_hat[0] = -0.621
c_state[0] = -0.103, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.012, df[0] = 0.051, dc_hat[0] = 0.013
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.013, df[0] = 0.057, dc_hat[0] = 0.018
Gradient do_[0] = -0.053
Backward Time Step 2:
Gradient di[0] = -0.018, df[0] = 0.080, dc_hat[0] = 0.027
Gradient do_[0] = -0.053
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.081, dc_hat[0] = 0.027
Gradient do_[0] = -0.043
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.024
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.664, o_gate[0] = 0.537, c_hat[0] = -0.546
c_state[0] = -0.036, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.068, f_gate[0] = 0.691, o_gate[0] = 0.531, c_hat[0] = -0.524
c_state[0] = -0.060, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.068, f_gate[0] = 0.688, o_gate[0] = 0.535, c_hat[0] = -0.526
c_state[0] = -0.077, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.065, f_gate[0] = 0.715, o_gate[0] = 0.512, c_hat[0] = -0.540
c_state[0] = -0.090, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.060, f_gate[0] = 0.728, o_gate[0] = 0.536, c_hat[0] = -0.623
c_state[0] = -0.103, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.012, df[0] = 0.050, dc_hat[0] = 0.013
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.013, df[0] = 0.056, dc_hat[0] = 0.018
Gradient do_[0] = -0.053
Backward Time Step 2:
Gradient di[0] = -0.018, df[0] = 0.079, dc_hat[0] = 0.026
Gradient do_[0] = -0.053
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.081, dc_hat[0] = 0.027
Gradient do_[0] = -0.043
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.024
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.663, o_gate[0] = 0.538, c_hat[0] = -0.548
c_state[0] = -0.036, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.068, f_gate[0] = 0.690, o_gate[0] = 0.531, c_hat[0] = -0.527
c_state[0] = -0.061, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.068, f_gate[0] = 0.687, o_gate[0] = 0.535, c_hat[0] = -0.529
c_state[0] = -0.078, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.065, f_gate[0] = 0.714, o_gate[0] = 0.513, c_hat[0] = -0.543
c_state[0] = -0.091, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.060, f_gate[0] = 0.728, o_gate[0] = 0.537, c_hat[0] = -0.626
c_state[0] = -0.104, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.012, df[0] = 0.050, dc_hat[0] = 0.013
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.013, df[0] = 0.056, dc_hat[0] = 0.018
Gradient do_[0] = -0.053
Backward Time Step 2:
Gradient di[0] = -0.018, df[0] = 0.079, dc_hat[0] = 0.026
Gradient do_[0] = -0.052
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.081, dc_hat[0] = 0.027
Gradient do_[0] = -0.044
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.024
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.663, o_gate[0] = 0.539, c_hat[0] = -0.550
c_state[0] = -0.036, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.069, f_gate[0] = 0.689, o_gate[0] = 0.532, c_hat[0] = -0.529
c_state[0] = -0.061, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.069, f_gate[0] = 0.686, o_gate[0] = 0.536, c_hat[0] = -0.532
c_state[0] = -0.078, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.065, f_gate[0] = 0.713, o_gate[0] = 0.513, c_hat[0] = -0.546
c_state[0] = -0.092, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.061, f_gate[0] = 0.727, o_gate[0] = 0.537, c_hat[0] = -0.628
c_state[0] = -0.105, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.012
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.013, df[0] = 0.055, dc_hat[0] = 0.017
Gradient do_[0] = -0.053
Backward Time Step 2:
Gradient di[0] = -0.018, df[0] = 0.078, dc_hat[0] = 0.026
Gradient do_[0] = -0.052
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.080, dc_hat[0] = 0.027
Gradient do_[0] = -0.044
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.024
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.662, o_gate[0] = 0.539, c_hat[0] = -0.553
c_state[0] = -0.036, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.069, f_gate[0] = 0.689, o_gate[0] = 0.533, c_hat[0] = -0.532
c_state[0] = -0.062, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.069, f_gate[0] = 0.686, o_gate[0] = 0.536, c_hat[0] = -0.535
c_state[0] = -0.079, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.066, f_gate[0] = 0.713, o_gate[0] = 0.513, c_hat[0] = -0.548
c_state[0] = -0.092, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.061, f_gate[0] = 0.726, o_gate[0] = 0.537, c_hat[0] = -0.631
c_state[0] = -0.105, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012
Gradient do_[0] = -0.049
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.054, dc_hat[0] = 0.017
Gradient do_[0] = -0.053
Backward Time Step 2:
Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.026
Gradient do_[0] = -0.052
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.080, dc_hat[0] = 0.027
Gradient do_[0] = -0.044
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.024
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.662, o_gate[0] = 0.540, c_hat[0] = -0.555
c_state[0] = -0.036, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.069, f_gate[0] = 0.688, o_gate[0] = 0.533, c_hat[0] = -0.535
c_state[0] = -0.062, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.069, f_gate[0] = 0.685, o_gate[0] = 0.537, c_hat[0] = -0.538
c_state[0] = -0.080, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.066, f_gate[0] = 0.712, o_gate[0] = 0.514, c_hat[0] = -0.551
c_state[0] = -0.093, h_state[0] = -0.048
Time Step 4:
i_gate[0] = 0.061, f_gate[0] = 0.725, o_gate[0] = 0.537, c_hat[0] = -0.633
c_state[0] = -0.106, h_state[0] = -0.057
Backward Time Step 4:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012
Gradient do_[0] = -0.048
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.053, dc_hat[0] = 0.017
Gradient do_[0] = -0.052
Backward Time Step 2:
Gradient di[0] = -0.018, df[0] = 0.076, dc_hat[0] = 0.025
Gradient do_[0] = -0.052
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.080, dc_hat[0] = 0.026
Gradient do_[0] = -0.044
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.024
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.661, o_gate[0] = 0.540, c_hat[0] = -0.556
c_state[0] = -0.036, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.069, f_gate[0] = 0.688, o_gate[0] = 0.534, c_hat[0] = -0.538
c_state[0] = -0.062, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.069, f_gate[0] = 0.685, o_gate[0] = 0.537, c_hat[0] = -0.541
c_state[0] = -0.080, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.066, f_gate[0] = 0.711, o_gate[0] = 0.514, c_hat[0] = -0.554
c_state[0] = -0.094, h_state[0] = -0.048
Time Step 4:
i_gate[0] = 0.061, f_gate[0] = 0.724, o_gate[0] = 0.537, c_hat[0] = -0.635
c_state[0] = -0.107, h_state[0] = -0.057
Backward Time Step 4:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012
Gradient do_[0] = -0.048
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.052, dc_hat[0] = 0.017
Gradient do_[0] = -0.052
Backward Time Step 2:
Gradient di[0] = -0.018, df[0] = 0.076, dc_hat[0] = 0.025
Gradient do_[0] = -0.051
Backward Time Step 1:
Gradient di[0] = -0.019, df[0] = 0.079, dc_hat[0] = 0.026
Gradient do_[0] = -0.044
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.024
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.660, o_gate[0] = 0.541, c_hat[0] = -0.558
c_state[0] = -0.037, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.069, f_gate[0] = 0.687, o_gate[0] = 0.534, c_hat[0] = -0.540
c_state[0] = -0.063, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.069, f_gate[0] = 0.684, o_gate[0] = 0.538, c_hat[0] = -0.543
c_state[0] = -0.081, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.066, f_gate[0] = 0.711, o_gate[0] = 0.514, c_hat[0] = -0.556
c_state[0] = -0.094, h_state[0] = -0.048
Time Step 4:
i_gate[0] = 0.062, f_gate[0] = 0.724, o_gate[0] = 0.537, c_hat[0] = -0.638
c_state[0] = -0.108, h_state[0] = -0.058
Backward Time Step 4:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012
Gradient do_[0] = -0.048
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.052, dc_hat[0] = 0.016
Gradient do_[0] = -0.052
Backward Time Step 2:
Gradient di[0] = -0.018, df[0] = 0.075, dc_hat[0] = 0.025
Gradient do_[0] = -0.051
Backward Time Step 1:
Gradient di[0] = -0.019, df[0] = 0.079, dc_hat[0] = 0.026
Gradient do_[0] = -0.043
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.023
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.660, o_gate[0] = 0.541, c_hat[0] = -0.560
c_state[0] = -0.037, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.069, f_gate[0] = 0.687, o_gate[0] = 0.535, c_hat[0] = -0.543
c_state[0] = -0.063, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.070, f_gate[0] = 0.684, o_gate[0] = 0.538, c_hat[0] = -0.546
c_state[0] = -0.081, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.067, f_gate[0] = 0.710, o_gate[0] = 0.514, c_hat[0] = -0.558
c_state[0] = -0.095, h_state[0] = -0.049
Time Step 4:
i_gate[0] = 0.062, f_gate[0] = 0.723, o_gate[0] = 0.537, c_hat[0] = -0.640
c_state[0] = -0.108, h_state[0] = -0.058
Backward Time Step 4:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.011
Gradient do_[0] = -0.047
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.051, dc_hat[0] = 0.016
Gradient do_[0] = -0.051
Backward Time Step 2:
Gradient di[0] = -0.018, df[0] = 0.074, dc_hat[0] = 0.024
Gradient do_[0] = -0.051
Backward Time Step 1:
Gradient di[0] = -0.019, df[0] = 0.078, dc_hat[0] = 0.026
Gradient do_[0] = -0.043
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.023
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.660, o_gate[0] = 0.541, c_hat[0] = -0.562
c_state[0] = -0.037, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.069, f_gate[0] = 0.687, o_gate[0] = 0.535, c_hat[0] = -0.545
c_state[0] = -0.063, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.070, f_gate[0] = 0.683, o_gate[0] = 0.538, c_hat[0] = -0.548
c_state[0] = -0.081, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.067, f_gate[0] = 0.710, o_gate[0] = 0.514, c_hat[0] = -0.561
c_state[0] = -0.095, h_state[0] = -0.049
Time Step 4:
i_gate[0] = 0.062, f_gate[0] = 0.722, o_gate[0] = 0.537, c_hat[0] = -0.642
c_state[0] = -0.109, h_state[0] = -0.058
Backward Time Step 4:
Gradient di[0] = -0.011, df[0] = 0.044, dc_hat[0] = 0.011
Gradient do_[0] = -0.047
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.050, dc_hat[0] = 0.016
Gradient do_[0] = -0.050
Backward Time Step 2:
Gradient di[0] = -0.018, df[0] = 0.073, dc_hat[0] = 0.024
Gradient do_[0] = -0.050
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.078, dc_hat[0] = 0.026
Gradient do_[0] = -0.043
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.076, dc_hat[0] = 0.023
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.659, o_gate[0] = 0.542, c_hat[0] = -0.563
c_state[0] = -0.037, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.686, o_gate[0] = 0.535, c_hat[0] = -0.547
c_state[0] = -0.063, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.070, f_gate[0] = 0.683, o_gate[0] = 0.538, c_hat[0] = -0.551
c_state[0] = -0.082, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.067, f_gate[0] = 0.709, o_gate[0] = 0.514, c_hat[0] = -0.563
c_state[0] = -0.096, h_state[0] = -0.049
Time Step 4:
i_gate[0] = 0.062, f_gate[0] = 0.722, o_gate[0] = 0.537, c_hat[0] = -0.644
c_state[0] = -0.109, h_state[0] = -0.059
Backward Time Step 4:
Gradient di[0] = -0.011, df[0] = 0.043, dc_hat[0] = 0.011
Gradient do_[0] = -0.046
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.015
Gradient do_[0] = -0.050
Backward Time Step 2:
Gradient di[0] = -0.017, df[0] = 0.072, dc_hat[0] = 0.024
Gradient do_[0] = -0.050
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.025
Gradient do_[0] = -0.043
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.076, dc_hat[0] = 0.023
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.659, o_gate[0] = 0.542, c_hat[0] = -0.565
c_state[0] = -0.037, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.686, o_gate[0] = 0.536, c_hat[0] = -0.550
c_state[0] = -0.064, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.070, f_gate[0] = 0.682, o_gate[0] = 0.538, c_hat[0] = -0.553
c_state[0] = -0.082, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.067, f_gate[0] = 0.709, o_gate[0] = 0.514, c_hat[0] = -0.565
c_state[0] = -0.096, h_state[0] = -0.049
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.721, o_gate[0] = 0.537, c_hat[0] = -0.646
c_state[0] = -0.110, h_state[0] = -0.059
Backward Time Step 4:
Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.011
Gradient do_[0] = -0.045
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.015
Gradient do_[0] = -0.049
Backward Time Step 2:
Gradient di[0] = -0.017, df[0] = 0.071, dc_hat[0] = 0.023
Gradient do_[0] = -0.049
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.076, dc_hat[0] = 0.025
Gradient do_[0] = -0.043
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.076, dc_hat[0] = 0.023
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.658, o_gate[0] = 0.542, c_hat[0] = -0.566
c_state[0] = -0.037, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.685, o_gate[0] = 0.536, c_hat[0] = -0.552
c_state[0] = -0.064, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.070, f_gate[0] = 0.682, o_gate[0] = 0.538, c_hat[0] = -0.555
c_state[0] = -0.083, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.068, f_gate[0] = 0.708, o_gate[0] = 0.514, c_hat[0] = -0.567
c_state[0] = -0.097, h_state[0] = -0.050
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.721, o_gate[0] = 0.537, c_hat[0] = -0.648
c_state[0] = -0.111, h_state[0] = -0.059
Backward Time Step 4:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.010
Gradient do_[0] = -0.045
Backward Time Step 3:
Gradient di[0] = -0.011, df[0] = 0.046, dc_hat[0] = 0.015
Gradient do_[0] = -0.048
Backward Time Step 2:
Gradient di[0] = -0.017, df[0] = 0.070, dc_hat[0] = 0.023
Gradient do_[0] = -0.049
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.076, dc_hat[0] = 0.025
Gradient do_[0] = -0.043
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.076, dc_hat[0] = 0.023
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.658, o_gate[0] = 0.542, c_hat[0] = -0.568
c_state[0] = -0.037, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.685, o_gate[0] = 0.536, c_hat[0] = -0.554
c_state[0] = -0.064, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.538, c_hat[0] = -0.557
c_state[0] = -0.083, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.068, f_gate[0] = 0.708, o_gate[0] = 0.514, c_hat[0] = -0.569
c_state[0] = -0.097, h_state[0] = -0.050
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.720, o_gate[0] = 0.537, c_hat[0] = -0.650
c_state[0] = -0.111, h_state[0] = -0.059
Backward Time Step 4:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.010
Gradient do_[0] = -0.044
Backward Time Step 3:
Gradient di[0] = -0.011, df[0] = 0.045, dc_hat[0] = 0.014
Gradient do_[0] = -0.048
Backward Time Step 2:
Gradient di[0] = -0.017, df[0] = 0.069, dc_hat[0] = 0.023
Gradient do_[0] = -0.048
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.075, dc_hat[0] = 0.025
Gradient do_[0] = -0.043
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.075, dc_hat[0] = 0.023
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.658, o_gate[0] = 0.542, c_hat[0] = -0.569
c_state[0] = -0.037, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.685, o_gate[0] = 0.536, c_hat[0] = -0.556
c_state[0] = -0.065, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.538, c_hat[0] = -0.559
c_state[0] = -0.084, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.068, f_gate[0] = 0.707, o_gate[0] = 0.513, c_hat[0] = -0.571
c_state[0] = -0.098, h_state[0] = -0.050
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.720, o_gate[0] = 0.536, c_hat[0] = -0.651
c_state[0] = -0.112, h_state[0] = -0.060
Backward Time Step 4:
Gradient di[0] = -0.010, df[0] = 0.039, dc_hat[0] = 0.010
Gradient do_[0] = -0.043
Backward Time Step 3:
Gradient di[0] = -0.011, df[0] = 0.044, dc_hat[0] = 0.014
Gradient do_[0] = -0.047
Backward Time Step 2:
Gradient di[0] = -0.017, df[0] = 0.068, dc_hat[0] = 0.022
Gradient do_[0] = -0.048
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.074, dc_hat[0] = 0.024
Gradient do_[0] = -0.042
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.075, dc_hat[0] = 0.022
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.657, o_gate[0] = 0.542, c_hat[0] = -0.571
c_state[0] = -0.037, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.685, o_gate[0] = 0.536, c_hat[0] = -0.557
c_state[0] = -0.065, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.538, c_hat[0] = -0.561
c_state[0] = -0.084, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.068, f_gate[0] = 0.707, o_gate[0] = 0.513, c_hat[0] = -0.573
c_state[0] = -0.098, h_state[0] = -0.050
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.536, c_hat[0] = -0.653
c_state[0] = -0.112, h_state[0] = -0.060
Backward Time Step 4:
Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.010
Gradient do_[0] = -0.043
Backward Time Step 3:
Gradient di[0] = -0.011, df[0] = 0.043, dc_hat[0] = 0.014
Gradient do_[0] = -0.046
Backward Time Step 2:
Gradient di[0] = -0.017, df[0] = 0.067, dc_hat[0] = 0.022
Gradient do_[0] = -0.047
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.074, dc_hat[0] = 0.024
Gradient do_[0] = -0.042
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.075, dc_hat[0] = 0.022
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.657, o_gate[0] = 0.542, c_hat[0] = -0.572
c_state[0] = -0.038, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.684, o_gate[0] = 0.536, c_hat[0] = -0.559
c_state[0] = -0.065, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.538, c_hat[0] = -0.563
c_state[0] = -0.084, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.068, f_gate[0] = 0.707, o_gate[0] = 0.513, c_hat[0] = -0.575
c_state[0] = -0.099, h_state[0] = -0.050
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.536, c_hat[0] = -0.655
c_state[0] = -0.113, h_state[0] = -0.060
Backward Time Step 4:
Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.009
Gradient do_[0] = -0.042
Backward Time Step 3:
Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013
Gradient do_[0] = -0.045
Backward Time Step 2:
Gradient di[0] = -0.017, df[0] = 0.066, dc_hat[0] = 0.022
Gradient do_[0] = -0.047
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.073, dc_hat[0] = 0.024
Gradient do_[0] = -0.042
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.074, dc_hat[0] = 0.022
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.657, o_gate[0] = 0.542, c_hat[0] = -0.573
c_state[0] = -0.038, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.684, o_gate[0] = 0.536, c_hat[0] = -0.561
c_state[0] = -0.065, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.538, c_hat[0] = -0.564
c_state[0] = -0.085, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.706, o_gate[0] = 0.512, c_hat[0] = -0.576
c_state[0] = -0.099, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.535, c_hat[0] = -0.656
c_state[0] = -0.113, h_state[0] = -0.060
Backward Time Step 4:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.009
Gradient do_[0] = -0.041
Backward Time Step 3:
Gradient di[0] = -0.010, df[0] = 0.041, dc_hat[0] = 0.013
Gradient do_[0] = -0.044
Backward Time Step 2:
Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.021
Gradient do_[0] = -0.046
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.073, dc_hat[0] = 0.024
Gradient do_[0] = -0.042
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.074, dc_hat[0] = 0.022
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.656, o_gate[0] = 0.542, c_hat[0] = -0.574
c_state[0] = -0.038, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.684, o_gate[0] = 0.536, c_hat[0] = -0.562
c_state[0] = -0.065, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.538, c_hat[0] = -0.566
c_state[0] = -0.085, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.706, o_gate[0] = 0.512, c_hat[0] = -0.578
c_state[0] = -0.100, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.535, c_hat[0] = -0.657
c_state[0] = -0.114, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.009
Gradient do_[0] = -0.040
Backward Time Step 3:
Gradient di[0] = -0.010, df[0] = 0.040, dc_hat[0] = 0.012
Gradient do_[0] = -0.043
Backward Time Step 2:
Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.021
Gradient do_[0] = -0.045
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.072, dc_hat[0] = 0.023
Gradient do_[0] = -0.041
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.074, dc_hat[0] = 0.022
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.656, o_gate[0] = 0.541, c_hat[0] = -0.575
c_state[0] = -0.038, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.684, o_gate[0] = 0.535, c_hat[0] = -0.564
c_state[0] = -0.066, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.680, o_gate[0] = 0.537, c_hat[0] = -0.567
c_state[0] = -0.085, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.706, o_gate[0] = 0.512, c_hat[0] = -0.579
c_state[0] = -0.100, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.534, c_hat[0] = -0.659
c_state[0] = -0.114, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.009, df[0] = 0.035, dc_hat[0] = 0.009
Gradient do_[0] = -0.040
Backward Time Step 3:
Gradient di[0] = -0.010, df[0] = 0.039, dc_hat[0] = 0.012
Gradient do_[0] = -0.042
Backward Time Step 2:
Gradient di[0] = -0.016, df[0] = 0.063, dc_hat[0] = 0.021
Gradient do_[0] = -0.045
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.071, dc_hat[0] = 0.023
Gradient do_[0] = -0.041
Backward Time Step 0:
Gradient di[0] = -0.018, df[0] = 0.073, dc_hat[0] = 0.022
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.656, o_gate[0] = 0.541, c_hat[0] = -0.576
c_state[0] = -0.038, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.683, o_gate[0] = 0.535, c_hat[0] = -0.565
c_state[0] = -0.066, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.680, o_gate[0] = 0.537, c_hat[0] = -0.569
c_state[0] = -0.085, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.706, o_gate[0] = 0.511, c_hat[0] = -0.580
c_state[0] = -0.100, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.534, c_hat[0] = -0.660
c_state[0] = -0.114, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.008
Gradient do_[0] = -0.039
Backward Time Step 3:
Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012
Gradient do_[0] = -0.041
Backward Time Step 2:
Gradient di[0] = -0.016, df[0] = 0.062, dc_hat[0] = 0.020
Gradient do_[0] = -0.044
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.071, dc_hat[0] = 0.023
Gradient do_[0] = -0.041
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.073, dc_hat[0] = 0.022
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.656, o_gate[0] = 0.541, c_hat[0] = -0.577
c_state[0] = -0.038, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.683, o_gate[0] = 0.535, c_hat[0] = -0.567
c_state[0] = -0.066, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.679, o_gate[0] = 0.536, c_hat[0] = -0.570
c_state[0] = -0.086, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.511, c_hat[0] = -0.582
c_state[0] = -0.101, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.533, c_hat[0] = -0.661
c_state[0] = -0.115, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.008
Gradient do_[0] = -0.038
Backward Time Step 3:
Gradient di[0] = -0.009, df[0] = 0.037, dc_hat[0] = 0.012
Gradient do_[0] = -0.041
Backward Time Step 2:
Gradient di[0] = -0.016, df[0] = 0.061, dc_hat[0] = 0.020
Gradient do_[0] = -0.044
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.070, dc_hat[0] = 0.023
Gradient do_[0] = -0.041
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.073, dc_hat[0] = 0.022
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.656, o_gate[0] = 0.541, c_hat[0] = -0.578
c_state[0] = -0.038, h_state[0] = -0.021
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.683, o_gate[0] = 0.535, c_hat[0] = -0.568
c_state[0] = -0.066, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.679, o_gate[0] = 0.536, c_hat[0] = -0.571
c_state[0] = -0.086, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.510, c_hat[0] = -0.583
c_state[0] = -0.101, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.533, c_hat[0] = -0.663
c_state[0] = -0.115, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.008
Gradient do_[0] = -0.037
Backward Time Step 3:
Gradient di[0] = -0.009, df[0] = 0.036, dc_hat[0] = 0.011
Gradient do_[0] = -0.040
Backward Time Step 2:
Gradient di[0] = -0.016, df[0] = 0.060, dc_hat[0] = 0.020
Gradient do_[0] = -0.043
Backward Time Step 1:
Gradient di[0] = -0.018, df[0] = 0.069, dc_hat[0] = 0.022
Gradient do_[0] = -0.040
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.072, dc_hat[0] = 0.021
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.655, o_gate[0] = 0.540, c_hat[0] = -0.579
c_state[0] = -0.038, h_state[0] = -0.021
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.683, o_gate[0] = 0.534, c_hat[0] = -0.569
c_state[0] = -0.066, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.679, o_gate[0] = 0.536, c_hat[0] = -0.573
c_state[0] = -0.086, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.509, c_hat[0] = -0.584
c_state[0] = -0.101, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.532, c_hat[0] = -0.664
c_state[0] = -0.116, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.008
Gradient do_[0] = -0.036
Backward Time Step 3:
Gradient di[0] = -0.009, df[0] = 0.035, dc_hat[0] = 0.011
Gradient do_[0] = -0.039
Backward Time Step 2:
Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.019
Gradient do_[0] = -0.042
Backward Time Step 1:
Gradient di[0] = -0.017, df[0] = 0.069, dc_hat[0] = 0.022
Gradient do_[0] = -0.040
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.072, dc_hat[0] = 0.021
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.655, o_gate[0] = 0.540, c_hat[0] = -0.580
c_state[0] = -0.038, h_state[0] = -0.021
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.683, o_gate[0] = 0.534, c_hat[0] = -0.571
c_state[0] = -0.066, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.679, o_gate[0] = 0.535, c_hat[0] = -0.574
c_state[0] = -0.087, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.509, c_hat[0] = -0.585
c_state[0] = -0.102, h_state[0] = -0.052
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.532, c_hat[0] = -0.665
c_state[0] = -0.116, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.008
Gradient do_[0] = -0.036
Backward Time Step 3:
Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.011
Gradient do_[0] = -0.038
Backward Time Step 2:
Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.019
Gradient do_[0] = -0.042
Backward Time Step 1:
Gradient di[0] = -0.017, df[0] = 0.068, dc_hat[0] = 0.022
Gradient do_[0] = -0.040
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.072, dc_hat[0] = 0.021
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.655, o_gate[0] = 0.539, c_hat[0] = -0.581
c_state[0] = -0.038, h_state[0] = -0.021
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.683, o_gate[0] = 0.533, c_hat[0] = -0.572
c_state[0] = -0.067, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.679, o_gate[0] = 0.534, c_hat[0] = -0.575
c_state[0] = -0.087, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.705, o_gate[0] = 0.508, c_hat[0] = -0.586
c_state[0] = -0.102, h_state[0] = -0.052
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.531, c_hat[0] = -0.666
c_state[0] = -0.116, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.007
Gradient do_[0] = -0.035
Backward Time Step 3:
Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010
Gradient do_[0] = -0.037
Backward Time Step 2:
Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.019
Gradient do_[0] = -0.041
Backward Time Step 1:
Gradient di[0] = -0.017, df[0] = 0.068, dc_hat[0] = 0.022
Gradient do_[0] = -0.040
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.071, dc_hat[0] = 0.021
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.655, o_gate[0] = 0.539, c_hat[0] = -0.581
c_state[0] = -0.038, h_state[0] = -0.021
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.683, o_gate[0] = 0.533, c_hat[0] = -0.573
c_state[0] = -0.067, h_state[0] = -0.036
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.679, o_gate[0] = 0.534, c_hat[0] = -0.576
c_state[0] = -0.087, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.705, o_gate[0] = 0.507, c_hat[0] = -0.587
c_state[0] = -0.102, h_state[0] = -0.052
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.530, c_hat[0] = -0.667
c_state[0] = -0.117, h_state[0] = -0.062
Backward Time Step 4:
Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.007
Gradient do_[0] = -0.034
Backward Time Step 3:
Gradient di[0] = -0.008, df[0] = 0.032, dc_hat[0] = 0.010
Gradient do_[0] = -0.036
Backward Time Step 2:
Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.019
Gradient do_[0] = -0.041
Backward Time Step 1:
Gradient di[0] = -0.017, df[0] = 0.067, dc_hat[0] = 0.022
Gradient do_[0] = -0.039
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.071, dc_hat[0] = 0.021
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.655, o_gate[0] = 0.539, c_hat[0] = -0.582
c_state[0] = -0.038, h_state[0] = -0.021
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.532, c_hat[0] = -0.574
c_state[0] = -0.067, h_state[0] = -0.036
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.679, o_gate[0] = 0.533, c_hat[0] = -0.577
c_state[0] = -0.087, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.507, c_hat[0] = -0.588
c_state[0] = -0.102, h_state[0] = -0.052
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.529, c_hat[0] = -0.668
c_state[0] = -0.117, h_state[0] = -0.062
Backward Time Step 4:
Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.007
Gradient do_[0] = -0.033
Backward Time Step 3:
Gradient di[0] = -0.008, df[0] = 0.031, dc_hat[0] = 0.010
Gradient do_[0] = -0.035
Backward Time Step 2:
Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.018
Gradient do_[0] = -0.040
Backward Time Step 1:
Gradient di[0] = -0.017, df[0] = 0.066, dc_hat[0] = 0.021
Gradient do_[0] = -0.039
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.071, dc_hat[0] = 0.021
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.655, o_gate[0] = 0.538, c_hat[0] = -0.583
c_state[0] = -0.038, h_state[0] = -0.021
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.532, c_hat[0] = -0.575
c_state[0] = -0.067, h_state[0] = -0.036
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.533, c_hat[0] = -0.578
c_state[0] = -0.087, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.506, c_hat[0] = -0.589
c_state[0] = -0.103, h_state[0] = -0.052
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.529, c_hat[0] = -0.668
c_state[0] = -0.117, h_state[0] = -0.062
Backward Time Step 4:
Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.007
Gradient do_[0] = -0.033
Backward Time Step 3:
Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009
Gradient do_[0] = -0.035
Backward Time Step 2:
Gradient di[0] = -0.015, df[0] = 0.055, dc_hat[0] = 0.018
Gradient do_[0] = -0.040
Backward Time Step 1:
Gradient di[0] = -0.017, df[0] = 0.066, dc_hat[0] = 0.021
Gradient do_[0] = -0.039
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.071, dc_hat[0] = 0.021
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.654, o_gate[0] = 0.537, c_hat[0] = -0.583
c_state[0] = -0.038, h_state[0] = -0.021
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.531, c_hat[0] = -0.576
c_state[0] = -0.067, h_state[0] = -0.036
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.532, c_hat[0] = -0.578
c_state[0] = -0.087, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.505, c_hat[0] = -0.590
c_state[0] = -0.103, h_state[0] = -0.052
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.528, c_hat[0] = -0.669
c_state[0] = -0.117, h_state[0] = -0.062
Backward Time Step 4:
Gradient di[0] = -0.007, df[0] = 0.027, dc_hat[0] = 0.007
Gradient do_[0] = -0.032
Backward Time Step 3:
Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009
Gradient do_[0] = -0.034
Backward Time Step 2:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.018
Gradient do_[0] = -0.039
Backward Time Step 1:
Gradient di[0] = -0.017, df[0] = 0.065, dc_hat[0] = 0.021
Gradient do_[0] = -0.039
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.070, dc_hat[0] = 0.021
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.654, o_gate[0] = 0.537, c_hat[0] = -0.584
c_state[0] = -0.038, h_state[0] = -0.021
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.531, c_hat[0] = -0.576
c_state[0] = -0.067, h_state[0] = -0.036
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.531, c_hat[0] = -0.579
c_state[0] = -0.088, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.504, c_hat[0] = -0.590
c_state[0] = -0.103, h_state[0] = -0.052
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.527, c_hat[0] = -0.670
c_state[0] = -0.118, h_state[0] = -0.062
Backward Time Step 4:
Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.006
Gradient do_[0] = -0.031
Backward Time Step 3:
Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.009
Gradient do_[0] = -0.033
Backward Time Step 2:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.018
Gradient do_[0] = -0.039
Backward Time Step 1:
Gradient di[0] = -0.017, df[0] = 0.065, dc_hat[0] = 0.021
Gradient do_[0] = -0.038
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.070, dc_hat[0] = 0.020
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.654, o_gate[0] = 0.536, c_hat[0] = -0.585
c_state[0] = -0.038, h_state[0] = -0.021
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.530, c_hat[0] = -0.577
c_state[0] = -0.067, h_state[0] = -0.036
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.530, c_hat[0] = -0.580
c_state[0] = -0.088, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.503, c_hat[0] = -0.591
c_state[0] = -0.103, h_state[0] = -0.052
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.526, c_hat[0] = -0.671
c_state[0] = -0.118, h_state[0] = -0.062
Backward Time Step 4:
Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.006
Gradient do_[0] = -0.031
Backward Time Step 3:
Gradient di[0] = -0.007, df[0] = 0.028, dc_hat[0] = 0.009
Gradient do_[0] = -0.033
Backward Time Step 2:
Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.017
Gradient do_[0] = -0.038
Backward Time Step 1:
Gradient di[0] = -0.017, df[0] = 0.064, dc_hat[0] = 0.021
Gradient do_[0] = -0.038
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.070, dc_hat[0] = 0.020
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.654, o_gate[0] = 0.536, c_hat[0] = -0.585
c_state[0] = -0.038, h_state[0] = -0.021
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.529, c_hat[0] = -0.578
c_state[0] = -0.067, h_state[0] = -0.036
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.530, c_hat[0] = -0.580
c_state[0] = -0.088, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.502, c_hat[0] = -0.592
c_state[0] = -0.103, h_state[0] = -0.052
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.525, c_hat[0] = -0.671
c_state[0] = -0.118, h_state[0] = -0.062
Backward Time Step 4:
Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.006
Gradient do_[0] = -0.030
Backward Time Step 3:
Gradient di[0] = -0.007, df[0] = 0.028, dc_hat[0] = 0.009
Gradient do_[0] = -0.032
Backward Time Step 2:
Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.017
Gradient do_[0] = -0.038
Backward Time Step 1:
Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.020
Gradient do_[0] = -0.038
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.069, dc_hat[0] = 0.020
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.654, o_gate[0] = 0.535, c_hat[0] = -0.586
c_state[0] = -0.038, h_state[0] = -0.021
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.529, c_hat[0] = -0.578
c_state[0] = -0.067, h_state[0] = -0.036
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.529, c_hat[0] = -0.581
c_state[0] = -0.088, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.502, c_hat[0] = -0.592
c_state[0] = -0.103, h_state[0] = -0.052
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.524, c_hat[0] = -0.672
c_state[0] = -0.118, h_state[0] = -0.062
Backward Time Step 4:
Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.006
Gradient do_[0] = -0.030
Backward Time Step 3:
Gradient di[0] = -0.007, df[0] = 0.027, dc_hat[0] = 0.008
Gradient do_[0] = -0.031
Backward Time Step 2:
Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.017
Gradient do_[0] = -0.037
Backward Time Step 1:
Gradient di[0] = -0.016, df[0] = 0.063, dc_hat[0] = 0.020
Gradient do_[0] = -0.038
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.069, dc_hat[0] = 0.020
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.654, o_gate[0] = 0.534, c_hat[0] = -0.586
c_state[0] = -0.038, h_state[0] = -0.021
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.528, c_hat[0] = -0.579
c_state[0] = -0.067, h_state[0] = -0.036
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.528, c_hat[0] = -0.582
c_state[0] = -0.088, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.501, c_hat[0] = -0.593
c_state[0] = -0.104, h_state[0] = -0.052
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.523, c_hat[0] = -0.672
c_state[0] = -0.118, h_state[0] = -0.062
Backward Time Step 4:
Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.006
Gradient do_[0] = -0.029
Backward Time Step 3:
Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.008
Gradient do_[0] = -0.031
Backward Time Step 2:
Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.017
Gradient do_[0] = -0.037
Backward Time Step 1:
Gradient di[0] = -0.016, df[0] = 0.063, dc_hat[0] = 0.020
Gradient do_[0] = -0.038
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.069, dc_hat[0] = 0.020
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.066, f_gate[0] = 0.654, o_gate[0] = 0.534, c_hat[0] = -0.587
c_state[0] = -0.038, h_state[0] = -0.021
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.527, c_hat[0] = -0.580
c_state[0] = -0.068, h_state[0] = -0.036
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.527, c_hat[0] = -0.582
c_state[0] = -0.088, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.500, c_hat[0] = -0.593
c_state[0] = -0.104, h_state[0] = -0.052
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.522, c_hat[0] = -0.673
c_state[0] = -0.118, h_state[0] = -0.062
Backward Time Step 4:
Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.006
Gradient do_[0] = -0.029
Backward Time Step 3:
Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.008
Gradient do_[0] = -0.030
Backward Time Step 2:
Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.017
Gradient do_[0] = -0.037
Backward Time Step 1:
Gradient di[0] = -0.016, df[0] = 0.062, dc_hat[0] = 0.020
Gradient do_[0] = -0.038
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.069, dc_hat[0] = 0.020
Gradient do_[0] = -0.024
Epoch 50, Train Loss=0.070382, Weight Norm=8.702058
Sample Predictions at Epoch 50:
Day 192 (2024-10-11) => Predicted: 65.196, Actual: 63.870, Error: 1.33
Day 193 (2024-10-14) => Predicted: 65.778, Actual: 66.550, Error: 0.77
Day 194 (2024-10-15) => Predicted: 66.128, Actual: 66.000, Error: 0.13
Day 195 (2024-10-16) => Predicted: 66.036, Actual: 67.200, Error: 1.16
Day 196 (2024-10-17) => Predicted: 65.810, Actual: 66.760, Error: 0.95
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.654, o_gate[0] = 0.533, c_hat[0] = -0.587
c_state[0] = -0.038, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.526, c_hat[0] = -0.580
c_state[0] = -0.068, h_state[0] = -0.036
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.526, c_hat[0] = -0.582
c_state[0] = -0.088, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.499, c_hat[0] = -0.594
c_state[0] = -0.104, h_state[0] = -0.052
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.522, c_hat[0] = -0.673
c_state[0] = -0.118, h_state[0] = -0.062
Backward Time Step 4:
Gradient di[0] = -0.006, df[0] = 0.023, dc_hat[0] = 0.006
Gradient do_[0] = -0.028
Backward Time Step 3:
Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.008
Gradient do_[0] = -0.030
Backward Time Step 2:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 1:
Gradient di[0] = -0.016, df[0] = 0.062, dc_hat[0] = 0.020
Gradient do_[0] = -0.037
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.069, dc_hat[0] = 0.020
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.654, o_gate[0] = 0.532, c_hat[0] = -0.588
c_state[0] = -0.038, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.525, c_hat[0] = -0.581
c_state[0] = -0.068, h_state[0] = -0.036
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.525, c_hat[0] = -0.583
c_state[0] = -0.088, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.498, c_hat[0] = -0.594
c_state[0] = -0.104, h_state[0] = -0.052
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.521, c_hat[0] = -0.674
c_state[0] = -0.119, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.005
Gradient do_[0] = -0.028
Backward Time Step 3:
Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.008
Gradient do_[0] = -0.029
Backward Time Step 2:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 1:
Gradient di[0] = -0.016, df[0] = 0.062, dc_hat[0] = 0.020
Gradient do_[0] = -0.037
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.068, dc_hat[0] = 0.020
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.654, o_gate[0] = 0.532, c_hat[0] = -0.588
c_state[0] = -0.038, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.525, c_hat[0] = -0.581
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.524, c_hat[0] = -0.583
c_state[0] = -0.088, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.497, c_hat[0] = -0.594
c_state[0] = -0.104, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.520, c_hat[0] = -0.674
c_state[0] = -0.119, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.005
Gradient do_[0] = -0.027
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = 0.024, dc_hat[0] = 0.008
Gradient do_[0] = -0.029
Backward Time Step 2:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 1:
Gradient di[0] = -0.016, df[0] = 0.061, dc_hat[0] = 0.020
Gradient do_[0] = -0.037
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.068, dc_hat[0] = 0.020
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.654, o_gate[0] = 0.531, c_hat[0] = -0.589
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.524, c_hat[0] = -0.582
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.523, c_hat[0] = -0.584
c_state[0] = -0.088, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.496, c_hat[0] = -0.595
c_state[0] = -0.104, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.519, c_hat[0] = -0.674
c_state[0] = -0.119, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.005
Gradient do_[0] = -0.027
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = 0.024, dc_hat[0] = 0.007
Gradient do_[0] = -0.028
Backward Time Step 2:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.016
Gradient do_[0] = -0.035
Backward Time Step 1:
Gradient di[0] = -0.016, df[0] = 0.061, dc_hat[0] = 0.019
Gradient do_[0] = -0.037
Backward Time Step 0:
Gradient di[0] = -0.017, df[0] = 0.068, dc_hat[0] = 0.020
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.530, c_hat[0] = -0.589
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.523, c_hat[0] = -0.582
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.522, c_hat[0] = -0.584
c_state[0] = -0.089, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.495, c_hat[0] = -0.595
c_state[0] = -0.104, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.518, c_hat[0] = -0.675
c_state[0] = -0.119, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.005
Gradient do_[0] = -0.027
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = 0.023, dc_hat[0] = 0.007
Gradient do_[0] = -0.028
Backward Time Step 2:
Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.016
Gradient do_[0] = -0.035
Backward Time Step 1:
Gradient di[0] = -0.016, df[0] = 0.061, dc_hat[0] = 0.019
Gradient do_[0] = -0.037
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.068, dc_hat[0] = 0.020
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.530, c_hat[0] = -0.589
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.522, c_hat[0] = -0.582
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.521, c_hat[0] = -0.584
c_state[0] = -0.089, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.494, c_hat[0] = -0.595
c_state[0] = -0.104, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.517, c_hat[0] = -0.675
c_state[0] = -0.119, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.005
Gradient do_[0] = -0.026
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = 0.023, dc_hat[0] = 0.007
Gradient do_[0] = -0.027
Backward Time Step 2:
Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.016
Gradient do_[0] = -0.035
Backward Time Step 1:
Gradient di[0] = -0.016, df[0] = 0.060, dc_hat[0] = 0.019
Gradient do_[0] = -0.037
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.068, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.529, c_hat[0] = -0.590
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.521, c_hat[0] = -0.583
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.521, c_hat[0] = -0.584
c_state[0] = -0.089, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.493, c_hat[0] = -0.595
c_state[0] = -0.104, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.516, c_hat[0] = -0.675
c_state[0] = -0.119, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.005
Gradient do_[0] = -0.026
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = 0.023, dc_hat[0] = 0.007
Gradient do_[0] = -0.027
Backward Time Step 2:
Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.015
Gradient do_[0] = -0.035
Backward Time Step 1:
Gradient di[0] = -0.016, df[0] = 0.060, dc_hat[0] = 0.019
Gradient do_[0] = -0.037
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.067, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.528, c_hat[0] = -0.590
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.520, c_hat[0] = -0.583
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.520, c_hat[0] = -0.585
c_state[0] = -0.089, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.492, c_hat[0] = -0.596
c_state[0] = -0.104, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.515, c_hat[0] = -0.676
c_state[0] = -0.119, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.006, df[0] = 0.020, dc_hat[0] = 0.005
Gradient do_[0] = -0.026
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.007
Gradient do_[0] = -0.027
Backward Time Step 2:
Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.015
Gradient do_[0] = -0.034
Backward Time Step 1:
Gradient di[0] = -0.016, df[0] = 0.060, dc_hat[0] = 0.019
Gradient do_[0] = -0.037
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.067, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.527, c_hat[0] = -0.591
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.520, c_hat[0] = -0.583
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.519, c_hat[0] = -0.585
c_state[0] = -0.089, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.491, c_hat[0] = -0.596
c_state[0] = -0.104, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.514, c_hat[0] = -0.676
c_state[0] = -0.119, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.006, df[0] = 0.020, dc_hat[0] = 0.005
Gradient do_[0] = -0.025
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.007
Gradient do_[0] = -0.027
Backward Time Step 2:
Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.015
Gradient do_[0] = -0.034
Backward Time Step 1:
Gradient di[0] = -0.016, df[0] = 0.059, dc_hat[0] = 0.019
Gradient do_[0] = -0.037
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.067, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.527, c_hat[0] = -0.591
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.519, c_hat[0] = -0.584
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.518, c_hat[0] = -0.585
c_state[0] = -0.089, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.490, c_hat[0] = -0.596
c_state[0] = -0.104, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.513, c_hat[0] = -0.676
c_state[0] = -0.119, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.006, df[0] = 0.020, dc_hat[0] = 0.005
Gradient do_[0] = -0.025
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.007
Gradient do_[0] = -0.026
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.015
Gradient do_[0] = -0.034
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.019
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.067, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.526, c_hat[0] = -0.591
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.518, c_hat[0] = -0.584
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.517, c_hat[0] = -0.585
c_state[0] = -0.089, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.489, c_hat[0] = -0.596
c_state[0] = -0.104, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.512, c_hat[0] = -0.676
c_state[0] = -0.119, h_state[0] = -0.061
Backward Time Step 4:
Gradient di[0] = -0.006, df[0] = 0.020, dc_hat[0] = 0.005
Gradient do_[0] = -0.025
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.007
Gradient do_[0] = -0.026
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.015
Gradient do_[0] = -0.034
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.019
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.067, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.525, c_hat[0] = -0.592
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.517, c_hat[0] = -0.584
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.516, c_hat[0] = -0.585
c_state[0] = -0.089, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.488, c_hat[0] = -0.596
c_state[0] = -0.104, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.511, c_hat[0] = -0.677
c_state[0] = -0.119, h_state[0] = -0.060
Backward Time Step 4:
Gradient di[0] = -0.006, df[0] = 0.019, dc_hat[0] = 0.005
Gradient do_[0] = -0.025
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.007
Gradient do_[0] = -0.026
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.015
Gradient do_[0] = -0.034
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.019
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.067, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.524, c_hat[0] = -0.592
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.516, c_hat[0] = -0.585
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.515, c_hat[0] = -0.586
c_state[0] = -0.089, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.487, c_hat[0] = -0.597
c_state[0] = -0.104, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.510, c_hat[0] = -0.677
c_state[0] = -0.119, h_state[0] = -0.060
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.005
Gradient do_[0] = -0.025
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.007
Gradient do_[0] = -0.026
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.015
Gradient do_[0] = -0.034
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.019
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.524, c_hat[0] = -0.592
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.515, c_hat[0] = -0.585
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.514, c_hat[0] = -0.586
c_state[0] = -0.089, h_state[0] = -0.046
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.486, c_hat[0] = -0.597
c_state[0] = -0.104, h_state[0] = -0.051
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.509, c_hat[0] = -0.677
c_state[0] = -0.119, h_state[0] = -0.060
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006
Gradient do_[0] = -0.026
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.015
Gradient do_[0] = -0.034
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.523, c_hat[0] = -0.593
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.514, c_hat[0] = -0.585
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.513, c_hat[0] = -0.586
c_state[0] = -0.089, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.485, c_hat[0] = -0.597
c_state[0] = -0.104, h_state[0] = -0.050
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.508, c_hat[0] = -0.677
c_state[0] = -0.119, h_state[0] = -0.060
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.015
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.522, c_hat[0] = -0.593
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.514, c_hat[0] = -0.585
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.512, c_hat[0] = -0.586
c_state[0] = -0.089, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.484, c_hat[0] = -0.597
c_state[0] = -0.104, h_state[0] = -0.050
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.507, c_hat[0] = -0.677
c_state[0] = -0.119, h_state[0] = -0.060
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.021, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.015
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.521, c_hat[0] = -0.593
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.513, c_hat[0] = -0.585
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.511, c_hat[0] = -0.586
c_state[0] = -0.089, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.483, c_hat[0] = -0.597
c_state[0] = -0.104, h_state[0] = -0.050
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.506, c_hat[0] = -0.678
c_state[0] = -0.119, h_state[0] = -0.060
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.021, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.015
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.521, c_hat[0] = -0.593
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.512, c_hat[0] = -0.586
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.510, c_hat[0] = -0.586
c_state[0] = -0.089, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.482, c_hat[0] = -0.597
c_state[0] = -0.104, h_state[0] = -0.050
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.505, c_hat[0] = -0.678
c_state[0] = -0.119, h_state[0] = -0.060
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.520, c_hat[0] = -0.594
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.511, c_hat[0] = -0.586
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.509, c_hat[0] = -0.586
c_state[0] = -0.089, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.481, c_hat[0] = -0.597
c_state[0] = -0.104, h_state[0] = -0.050
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.504, c_hat[0] = -0.678
c_state[0] = -0.119, h_state[0] = -0.060
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.519, c_hat[0] = -0.594
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.510, c_hat[0] = -0.586
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.508, c_hat[0] = -0.587
c_state[0] = -0.089, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.480, c_hat[0] = -0.598
c_state[0] = -0.104, h_state[0] = -0.050
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.503, c_hat[0] = -0.678
c_state[0] = -0.119, h_state[0] = -0.060
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.518, c_hat[0] = -0.594
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.509, c_hat[0] = -0.586
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.507, c_hat[0] = -0.587
c_state[0] = -0.089, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.479, c_hat[0] = -0.598
c_state[0] = -0.104, h_state[0] = -0.050
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.502, c_hat[0] = -0.678
c_state[0] = -0.119, h_state[0] = -0.059
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.518, c_hat[0] = -0.595
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.508, c_hat[0] = -0.587
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.506, c_hat[0] = -0.587
c_state[0] = -0.089, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.478, c_hat[0] = -0.598
c_state[0] = -0.104, h_state[0] = -0.050
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.501, c_hat[0] = -0.678
c_state[0] = -0.119, h_state[0] = -0.059
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.517, c_hat[0] = -0.595
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.507, c_hat[0] = -0.587
c_state[0] = -0.068, h_state[0] = -0.035
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.506, c_hat[0] = -0.587
c_state[0] = -0.089, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.477, c_hat[0] = -0.598
c_state[0] = -0.104, h_state[0] = -0.050
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.500, c_hat[0] = -0.679
c_state[0] = -0.119, h_state[0] = -0.059
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.516, c_hat[0] = -0.595
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.507, c_hat[0] = -0.587
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.505, c_hat[0] = -0.587
c_state[0] = -0.089, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.476, c_hat[0] = -0.598
c_state[0] = -0.104, h_state[0] = -0.049
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.499, c_hat[0] = -0.679
c_state[0] = -0.119, h_state[0] = -0.059
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.019
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.515, c_hat[0] = -0.596
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.506, c_hat[0] = -0.587
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.504, c_hat[0] = -0.587
c_state[0] = -0.089, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.475, c_hat[0] = -0.598
c_state[0] = -0.104, h_state[0] = -0.049
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.498, c_hat[0] = -0.679
c_state[0] = -0.119, h_state[0] = -0.059
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.515, c_hat[0] = -0.596
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.505, c_hat[0] = -0.587
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.503, c_hat[0] = -0.587
c_state[0] = -0.089, h_state[0] = -0.045
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.474, c_hat[0] = -0.598
c_state[0] = -0.104, h_state[0] = -0.049
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.497, c_hat[0] = -0.679
c_state[0] = -0.119, h_state[0] = -0.059
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.514, c_hat[0] = -0.596
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.504, c_hat[0] = -0.588
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.502, c_hat[0] = -0.588
c_state[0] = -0.089, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.474, c_hat[0] = -0.599
c_state[0] = -0.104, h_state[0] = -0.049
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.496, c_hat[0] = -0.679
c_state[0] = -0.119, h_state[0] = -0.059
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.513, c_hat[0] = -0.596
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.503, c_hat[0] = -0.588
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.501, c_hat[0] = -0.588
c_state[0] = -0.089, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.473, c_hat[0] = -0.599
c_state[0] = -0.104, h_state[0] = -0.049
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.495, c_hat[0] = -0.679
c_state[0] = -0.119, h_state[0] = -0.059
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.513, c_hat[0] = -0.597
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.502, c_hat[0] = -0.588
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.500, c_hat[0] = -0.588
c_state[0] = -0.089, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.472, c_hat[0] = -0.599
c_state[0] = -0.104, h_state[0] = -0.049
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.494, c_hat[0] = -0.680
c_state[0] = -0.119, h_state[0] = -0.058
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.512, c_hat[0] = -0.597
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.502, c_hat[0] = -0.588
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.499, c_hat[0] = -0.588
c_state[0] = -0.089, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.471, c_hat[0] = -0.599
c_state[0] = -0.104, h_state[0] = -0.049
Time Step 4:
i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.494, c_hat[0] = -0.680
c_state[0] = -0.119, h_state[0] = -0.058
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.511, c_hat[0] = -0.597
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.501, c_hat[0] = -0.589
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.498, c_hat[0] = -0.588
c_state[0] = -0.089, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.470, c_hat[0] = -0.599
c_state[0] = -0.104, h_state[0] = -0.049
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.493, c_hat[0] = -0.680
c_state[0] = -0.119, h_state[0] = -0.058
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.043, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.510, c_hat[0] = -0.597
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.500, c_hat[0] = -0.589
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.498, c_hat[0] = -0.588
c_state[0] = -0.089, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.469, c_hat[0] = -0.599
c_state[0] = -0.104, h_state[0] = -0.049
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.492, c_hat[0] = -0.680
c_state[0] = -0.119, h_state[0] = -0.058
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.043, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.018
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.510, c_hat[0] = -0.598
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.499, c_hat[0] = -0.589
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.497, c_hat[0] = -0.589
c_state[0] = -0.089, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.468, c_hat[0] = -0.600
c_state[0] = -0.104, h_state[0] = -0.049
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.491, c_hat[0] = -0.680
c_state[0] = -0.119, h_state[0] = -0.058
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.043, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.509, c_hat[0] = -0.598
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.498, c_hat[0] = -0.589
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.496, c_hat[0] = -0.589
c_state[0] = -0.089, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.467, c_hat[0] = -0.600
c_state[0] = -0.104, h_state[0] = -0.049
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.490, c_hat[0] = -0.681
c_state[0] = -0.119, h_state[0] = -0.058
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.043, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.508, c_hat[0] = -0.598
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.498, c_hat[0] = -0.590
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.495, c_hat[0] = -0.589
c_state[0] = -0.089, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.466, c_hat[0] = -0.600
c_state[0] = -0.104, h_state[0] = -0.048
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.489, c_hat[0] = -0.681
c_state[0] = -0.119, h_state[0] = -0.058
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.043, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.508, c_hat[0] = -0.599
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.497, c_hat[0] = -0.590
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.494, c_hat[0] = -0.589
c_state[0] = -0.089, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.466, c_hat[0] = -0.600
c_state[0] = -0.104, h_state[0] = -0.048
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.488, c_hat[0] = -0.681
c_state[0] = -0.119, h_state[0] = -0.058
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.043, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.507, c_hat[0] = -0.599
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.496, c_hat[0] = -0.590
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.493, c_hat[0] = -0.589
c_state[0] = -0.089, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.465, c_hat[0] = -0.600
c_state[0] = -0.104, h_state[0] = -0.048
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.488, c_hat[0] = -0.681
c_state[0] = -0.119, h_state[0] = -0.058
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.014
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.055, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.506, c_hat[0] = -0.599
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.495, c_hat[0] = -0.590
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.493, c_hat[0] = -0.589
c_state[0] = -0.089, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.464, c_hat[0] = -0.601
c_state[0] = -0.104, h_state[0] = -0.048
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.487, c_hat[0] = -0.681
c_state[0] = -0.119, h_state[0] = -0.057
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.055, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.506, c_hat[0] = -0.599
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.495, c_hat[0] = -0.591
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.492, c_hat[0] = -0.590
c_state[0] = -0.089, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.463, c_hat[0] = -0.601
c_state[0] = -0.104, h_state[0] = -0.048
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.486, c_hat[0] = -0.682
c_state[0] = -0.119, h_state[0] = -0.057
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = 0.055, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.505, c_hat[0] = -0.600
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.494, c_hat[0] = -0.591
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.491, c_hat[0] = -0.590
c_state[0] = -0.089, h_state[0] = -0.044
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.462, c_hat[0] = -0.601
c_state[0] = -0.104, h_state[0] = -0.048
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.485, c_hat[0] = -0.682
c_state[0] = -0.119, h_state[0] = -0.057
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.504, c_hat[0] = -0.600
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.493, c_hat[0] = -0.591
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.490, c_hat[0] = -0.590
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.462, c_hat[0] = -0.601
c_state[0] = -0.104, h_state[0] = -0.048
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.484, c_hat[0] = -0.682
c_state[0] = -0.119, h_state[0] = -0.057
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.504, c_hat[0] = -0.600
c_state[0] = -0.039, h_state[0] = -0.020
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.492, c_hat[0] = -0.591
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.490, c_hat[0] = -0.590
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.461, c_hat[0] = -0.602
c_state[0] = -0.104, h_state[0] = -0.048
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.483, c_hat[0] = -0.682
c_state[0] = -0.119, h_state[0] = -0.057
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.503, c_hat[0] = -0.601
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.492, c_hat[0] = -0.592
c_state[0] = -0.068, h_state[0] = -0.034
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.489, c_hat[0] = -0.591
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.460, c_hat[0] = -0.602
c_state[0] = -0.104, h_state[0] = -0.048
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.483, c_hat[0] = -0.682
c_state[0] = -0.119, h_state[0] = -0.057
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.502, c_hat[0] = -0.601
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.491, c_hat[0] = -0.592
c_state[0] = -0.068, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.488, c_hat[0] = -0.591
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.459, c_hat[0] = -0.602
c_state[0] = -0.104, h_state[0] = -0.048
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.482, c_hat[0] = -0.683
c_state[0] = -0.119, h_state[0] = -0.057
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.502, c_hat[0] = -0.601
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.490, c_hat[0] = -0.592
c_state[0] = -0.068, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.487, c_hat[0] = -0.591
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.458, c_hat[0] = -0.602
c_state[0] = -0.104, h_state[0] = -0.048
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.481, c_hat[0] = -0.683
c_state[0] = -0.119, h_state[0] = -0.057
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.501, c_hat[0] = -0.602
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.490, c_hat[0] = -0.592
c_state[0] = -0.068, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.486, c_hat[0] = -0.591
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.458, c_hat[0] = -0.603
c_state[0] = -0.104, h_state[0] = -0.048
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.480, c_hat[0] = -0.683
c_state[0] = -0.119, h_state[0] = -0.057
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.500, c_hat[0] = -0.602
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.489, c_hat[0] = -0.593
c_state[0] = -0.068, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.486, c_hat[0] = -0.592
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.457, c_hat[0] = -0.603
c_state[0] = -0.104, h_state[0] = -0.048
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.479, c_hat[0] = -0.683
c_state[0] = -0.119, h_state[0] = -0.057
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.063, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.652, o_gate[0] = 0.500, c_hat[0] = -0.602
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.488, c_hat[0] = -0.593
c_state[0] = -0.068, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.485, c_hat[0] = -0.592
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.456, c_hat[0] = -0.603
c_state[0] = -0.104, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.479, c_hat[0] = -0.684
c_state[0] = -0.119, h_state[0] = -0.057
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.063, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.652, o_gate[0] = 0.499, c_hat[0] = -0.602
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.487, c_hat[0] = -0.593
c_state[0] = -0.068, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.484, c_hat[0] = -0.592
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.455, c_hat[0] = -0.603
c_state[0] = -0.104, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.478, c_hat[0] = -0.684
c_state[0] = -0.119, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.063, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.652, o_gate[0] = 0.499, c_hat[0] = -0.603
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.487, c_hat[0] = -0.594
c_state[0] = -0.068, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.484, c_hat[0] = -0.592
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.455, c_hat[0] = -0.604
c_state[0] = -0.104, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.477, c_hat[0] = -0.684
c_state[0] = -0.119, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.016, df[0] = 0.063, dc_hat[0] = 0.018
Gradient do_[0] = -0.024
Epoch 100, Train Loss=0.032092, Weight Norm=8.704198
Sample Predictions at Epoch 100:
Day 192 (2024-10-11) => Predicted: 61.580, Actual: 63.870, Error: 2.29
Day 193 (2024-10-14) => Predicted: 62.203, Actual: 66.550, Error: 4.35
Day 194 (2024-10-15) => Predicted: 62.507, Actual: 66.000, Error: 3.49
Day 195 (2024-10-16) => Predicted: 62.350, Actual: 67.200, Error: 4.85
Day 196 (2024-10-17) => Predicted: 62.186, Actual: 66.760, Error: 4.57
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.652, o_gate[0] = 0.498, c_hat[0] = -0.603
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.486, c_hat[0] = -0.594
c_state[0] = -0.068, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.483, c_hat[0] = -0.593
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.454, c_hat[0] = -0.604
c_state[0] = -0.104, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.476, c_hat[0] = -0.684
c_state[0] = -0.119, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.063, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.652, o_gate[0] = 0.497, c_hat[0] = -0.603
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.485, c_hat[0] = -0.594
c_state[0] = -0.068, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.482, c_hat[0] = -0.593
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.453, c_hat[0] = -0.604
c_state[0] = -0.104, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.476, c_hat[0] = -0.685
c_state[0] = -0.119, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.063, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.652, o_gate[0] = 0.497, c_hat[0] = -0.604
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.485, c_hat[0] = -0.595
c_state[0] = -0.068, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.481, c_hat[0] = -0.593
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.453, c_hat[0] = -0.604
c_state[0] = -0.104, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.475, c_hat[0] = -0.685
c_state[0] = -0.119, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.063, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.496, c_hat[0] = -0.604
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.484, c_hat[0] = -0.595
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.481, c_hat[0] = -0.594
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.452, c_hat[0] = -0.605
c_state[0] = -0.104, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.474, c_hat[0] = -0.685
c_state[0] = -0.119, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.063, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.496, c_hat[0] = -0.604
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.483, c_hat[0] = -0.595
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.480, c_hat[0] = -0.594
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.451, c_hat[0] = -0.605
c_state[0] = -0.105, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.474, c_hat[0] = -0.686
c_state[0] = -0.119, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.063, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.495, c_hat[0] = -0.604
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.483, c_hat[0] = -0.595
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.479, c_hat[0] = -0.594
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.450, c_hat[0] = -0.605
c_state[0] = -0.105, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.473, c_hat[0] = -0.686
c_state[0] = -0.119, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.013
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.063, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.494, c_hat[0] = -0.605
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.482, c_hat[0] = -0.596
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.479, c_hat[0] = -0.594
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.450, c_hat[0] = -0.606
c_state[0] = -0.105, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.472, c_hat[0] = -0.686
c_state[0] = -0.119, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.013
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.494, c_hat[0] = -0.605
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.482, c_hat[0] = -0.596
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.478, c_hat[0] = -0.595
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.449, c_hat[0] = -0.606
c_state[0] = -0.105, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.471, c_hat[0] = -0.686
c_state[0] = -0.119, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.013
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.493, c_hat[0] = -0.605
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.481, c_hat[0] = -0.596
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.477, c_hat[0] = -0.595
c_state[0] = -0.089, h_state[0] = -0.043
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.448, c_hat[0] = -0.606
c_state[0] = -0.105, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.471, c_hat[0] = -0.687
c_state[0] = -0.119, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.013
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.493, c_hat[0] = -0.606
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.480, c_hat[0] = -0.597
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.477, c_hat[0] = -0.595
c_state[0] = -0.089, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.448, c_hat[0] = -0.607
c_state[0] = -0.105, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.470, c_hat[0] = -0.687
c_state[0] = -0.119, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.013
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.492, c_hat[0] = -0.606
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.480, c_hat[0] = -0.597
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.476, c_hat[0] = -0.596
c_state[0] = -0.089, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.447, c_hat[0] = -0.607
c_state[0] = -0.105, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.469, c_hat[0] = -0.687
c_state[0] = -0.119, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005
Gradient do_[0] = -0.025
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.492, c_hat[0] = -0.606
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.479, c_hat[0] = -0.597
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.475, c_hat[0] = -0.596
c_state[0] = -0.089, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.446, c_hat[0] = -0.607
c_state[0] = -0.105, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.469, c_hat[0] = -0.688
c_state[0] = -0.119, h_state[0] = -0.056
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.491, c_hat[0] = -0.606
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.478, c_hat[0] = -0.598
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.475, c_hat[0] = -0.596
c_state[0] = -0.089, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.446, c_hat[0] = -0.608
c_state[0] = -0.105, h_state[0] = -0.047
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.468, c_hat[0] = -0.688
c_state[0] = -0.119, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.490, c_hat[0] = -0.607
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.478, c_hat[0] = -0.598
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.474, c_hat[0] = -0.597
c_state[0] = -0.089, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.445, c_hat[0] = -0.608
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.467, c_hat[0] = -0.688
c_state[0] = -0.119, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.490, c_hat[0] = -0.607
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.477, c_hat[0] = -0.598
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.474, c_hat[0] = -0.597
c_state[0] = -0.089, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.444, c_hat[0] = -0.608
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.467, c_hat[0] = -0.688
c_state[0] = -0.119, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.489, c_hat[0] = -0.607
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.477, c_hat[0] = -0.599
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.473, c_hat[0] = -0.597
c_state[0] = -0.089, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.444, c_hat[0] = -0.609
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.466, c_hat[0] = -0.689
c_state[0] = -0.119, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.489, c_hat[0] = -0.608
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.476, c_hat[0] = -0.599
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.472, c_hat[0] = -0.598
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.443, c_hat[0] = -0.609
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.465, c_hat[0] = -0.689
c_state[0] = -0.119, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.488, c_hat[0] = -0.608
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.475, c_hat[0] = -0.599
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.472, c_hat[0] = -0.598
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.443, c_hat[0] = -0.609
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.465, c_hat[0] = -0.689
c_state[0] = -0.119, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.488, c_hat[0] = -0.608
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.475, c_hat[0] = -0.600
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.471, c_hat[0] = -0.598
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.442, c_hat[0] = -0.610
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.464, c_hat[0] = -0.690
c_state[0] = -0.119, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.039, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.487, c_hat[0] = -0.608
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.474, c_hat[0] = -0.600
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.471, c_hat[0] = -0.599
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.441, c_hat[0] = -0.610
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.464, c_hat[0] = -0.690
c_state[0] = -0.119, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.039, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.487, c_hat[0] = -0.609
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.474, c_hat[0] = -0.600
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.470, c_hat[0] = -0.599
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.441, c_hat[0] = -0.610
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.463, c_hat[0] = -0.690
c_state[0] = -0.119, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.039, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.486, c_hat[0] = -0.609
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.473, c_hat[0] = -0.601
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.469, c_hat[0] = -0.599
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.440, c_hat[0] = -0.611
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.462, c_hat[0] = -0.691
c_state[0] = -0.119, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.039, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.486, c_hat[0] = -0.609
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.473, c_hat[0] = -0.601
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.469, c_hat[0] = -0.600
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.439, c_hat[0] = -0.611
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.462, c_hat[0] = -0.691
c_state[0] = -0.119, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.485, c_hat[0] = -0.610
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.472, c_hat[0] = -0.601
c_state[0] = -0.069, h_state[0] = -0.033
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.468, c_hat[0] = -0.600
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.439, c_hat[0] = -0.611
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.461, c_hat[0] = -0.691
c_state[0] = -0.119, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.016, dc_hat[0] = 0.004
Gradient do_[0] = -0.024
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.485, c_hat[0] = -0.610
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.472, c_hat[0] = -0.602
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.468, c_hat[0] = -0.600
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.438, c_hat[0] = -0.612
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.460, c_hat[0] = -0.692
c_state[0] = -0.119, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.016, dc_hat[0] = 0.004
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.484, c_hat[0] = -0.610
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.471, c_hat[0] = -0.602
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.467, c_hat[0] = -0.601
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.438, c_hat[0] = -0.612
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.460, c_hat[0] = -0.692
c_state[0] = -0.119, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.016, dc_hat[0] = 0.004
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.016
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.484, c_hat[0] = -0.610
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.470, c_hat[0] = -0.602
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.466, c_hat[0] = -0.601
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.437, c_hat[0] = -0.612
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.459, c_hat[0] = -0.692
c_state[0] = -0.120, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.016, dc_hat[0] = 0.004
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.483, c_hat[0] = -0.611
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.470, c_hat[0] = -0.603
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.466, c_hat[0] = -0.601
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.437, c_hat[0] = -0.613
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.459, c_hat[0] = -0.692
c_state[0] = -0.120, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.016, dc_hat[0] = 0.004
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.483, c_hat[0] = -0.611
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.469, c_hat[0] = -0.603
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.465, c_hat[0] = -0.602
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.436, c_hat[0] = -0.613
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.458, c_hat[0] = -0.693
c_state[0] = -0.120, h_state[0] = -0.055
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.016, dc_hat[0] = 0.004
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.017, dc_hat[0] = 0.005
Gradient do_[0] = -0.024
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.482, c_hat[0] = -0.611
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.469, c_hat[0] = -0.603
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.465, c_hat[0] = -0.602
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.435, c_hat[0] = -0.613
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.457, c_hat[0] = -0.693
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.016, dc_hat[0] = 0.004
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.017, dc_hat[0] = 0.005
Gradient do_[0] = -0.023
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.482, c_hat[0] = -0.612
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.468, c_hat[0] = -0.604
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.464, c_hat[0] = -0.602
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.435, c_hat[0] = -0.614
c_state[0] = -0.105, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.457, c_hat[0] = -0.693
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = 0.016, dc_hat[0] = 0.004
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.017, dc_hat[0] = 0.005
Gradient do_[0] = -0.023
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.012
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.481, c_hat[0] = -0.612
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.679, o_gate[0] = 0.468, c_hat[0] = -0.604
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.464, c_hat[0] = -0.603
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.434, c_hat[0] = -0.614
c_state[0] = -0.106, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.456, c_hat[0] = -0.694
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.004
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.017, dc_hat[0] = 0.005
Gradient do_[0] = -0.023
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.012
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.481, c_hat[0] = -0.612
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.467, c_hat[0] = -0.604
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.463, c_hat[0] = -0.603
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.434, c_hat[0] = -0.614
c_state[0] = -0.106, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.456, c_hat[0] = -0.694
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.004
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005
Gradient do_[0] = -0.023
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.480, c_hat[0] = -0.612
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.467, c_hat[0] = -0.605
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.463, c_hat[0] = -0.603
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.433, c_hat[0] = -0.615
c_state[0] = -0.106, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.455, c_hat[0] = -0.694
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.004
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005
Gradient do_[0] = -0.023
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.480, c_hat[0] = -0.613
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.466, c_hat[0] = -0.605
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.462, c_hat[0] = -0.604
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.433, c_hat[0] = -0.615
c_state[0] = -0.106, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.455, c_hat[0] = -0.695
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.004
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005
Gradient do_[0] = -0.023
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.479, c_hat[0] = -0.613
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.466, c_hat[0] = -0.605
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.462, c_hat[0] = -0.604
c_state[0] = -0.090, h_state[0] = -0.042
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.432, c_hat[0] = -0.615
c_state[0] = -0.106, h_state[0] = -0.046
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.454, c_hat[0] = -0.695
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.004
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005
Gradient do_[0] = -0.023
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.479, c_hat[0] = -0.613
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.465, c_hat[0] = -0.606
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.461, c_hat[0] = -0.604
c_state[0] = -0.090, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.432, c_hat[0] = -0.616
c_state[0] = -0.106, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.453, c_hat[0] = -0.695
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.003
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005
Gradient do_[0] = -0.023
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.478, c_hat[0] = -0.613
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.465, c_hat[0] = -0.606
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.460, c_hat[0] = -0.605
c_state[0] = -0.090, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.431, c_hat[0] = -0.616
c_state[0] = -0.106, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.453, c_hat[0] = -0.696
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005
Gradient do_[0] = -0.023
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.478, c_hat[0] = -0.614
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.464, c_hat[0] = -0.606
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.460, c_hat[0] = -0.605
c_state[0] = -0.090, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.430, c_hat[0] = -0.616
c_state[0] = -0.106, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.452, c_hat[0] = -0.696
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005
Gradient do_[0] = -0.023
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.477, c_hat[0] = -0.614
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.464, c_hat[0] = -0.607
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.459, c_hat[0] = -0.605
c_state[0] = -0.090, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.430, c_hat[0] = -0.617
c_state[0] = -0.106, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.452, c_hat[0] = -0.696
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005
Gradient do_[0] = -0.023
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.477, c_hat[0] = -0.614
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.463, c_hat[0] = -0.607
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.459, c_hat[0] = -0.606
c_state[0] = -0.090, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.429, c_hat[0] = -0.617
c_state[0] = -0.106, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.451, c_hat[0] = -0.696
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003
Gradient do_[0] = -0.022
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005
Gradient do_[0] = -0.023
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011
Gradient do_[0] = -0.030
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.477, c_hat[0] = -0.614
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.463, c_hat[0] = -0.607
c_state[0] = -0.069, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.458, c_hat[0] = -0.606
c_state[0] = -0.090, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.429, c_hat[0] = -0.618
c_state[0] = -0.106, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.451, c_hat[0] = -0.697
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003
Gradient do_[0] = -0.022
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005
Gradient do_[0] = -0.023
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011
Gradient do_[0] = -0.030
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.476, c_hat[0] = -0.615
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.462, c_hat[0] = -0.608
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.458, c_hat[0] = -0.606
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.428, c_hat[0] = -0.618
c_state[0] = -0.106, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.450, c_hat[0] = -0.697
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003
Gradient do_[0] = -0.022
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005
Gradient do_[0] = -0.022
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011
Gradient do_[0] = -0.030
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.476, c_hat[0] = -0.615
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.462, c_hat[0] = -0.608
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.457, c_hat[0] = -0.607
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.428, c_hat[0] = -0.618
c_state[0] = -0.106, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.450, c_hat[0] = -0.697
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003
Gradient do_[0] = -0.022
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.004
Gradient do_[0] = -0.022
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011
Gradient do_[0] = -0.030
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.475, c_hat[0] = -0.615
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.461, c_hat[0] = -0.608
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.457, c_hat[0] = -0.607
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.427, c_hat[0] = -0.619
c_state[0] = -0.106, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.449, c_hat[0] = -0.698
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003
Gradient do_[0] = -0.022
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004
Gradient do_[0] = -0.022
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011
Gradient do_[0] = -0.030
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.475, c_hat[0] = -0.616
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.461, c_hat[0] = -0.609
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.456, c_hat[0] = -0.607
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.427, c_hat[0] = -0.619
c_state[0] = -0.106, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.449, c_hat[0] = -0.698
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003
Gradient do_[0] = -0.022
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004
Gradient do_[0] = -0.022
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011
Gradient do_[0] = -0.030
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.474, c_hat[0] = -0.616
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.460, c_hat[0] = -0.609
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.456, c_hat[0] = -0.608
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.426, c_hat[0] = -0.619
c_state[0] = -0.106, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.448, c_hat[0] = -0.698
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003
Gradient do_[0] = -0.022
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004
Gradient do_[0] = -0.022
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011
Gradient do_[0] = -0.030
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.474, c_hat[0] = -0.616
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.460, c_hat[0] = -0.609
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.455, c_hat[0] = -0.608
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.426, c_hat[0] = -0.620
c_state[0] = -0.106, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.448, c_hat[0] = -0.699
c_state[0] = -0.120, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003
Gradient do_[0] = -0.022
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004
Gradient do_[0] = -0.022
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.011
Gradient do_[0] = -0.030
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.014
Gradient do_[0] = -0.036
Backward Time Step 0:
Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.473, c_hat[0] = -0.616
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.459, c_hat[0] = -0.610
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.455, c_hat[0] = -0.608
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.425, c_hat[0] = -0.620
c_state[0] = -0.106, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.447, c_hat[0] = -0.699
c_state[0] = -0.121, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003
Gradient do_[0] = -0.022
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004
Gradient do_[0] = -0.022
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.011
Gradient do_[0] = -0.030
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.058, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.473, c_hat[0] = -0.617
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.459, c_hat[0] = -0.610
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.455, c_hat[0] = -0.609
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.425, c_hat[0] = -0.620
c_state[0] = -0.106, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.447, c_hat[0] = -0.699
c_state[0] = -0.121, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003
Gradient do_[0] = -0.022
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004
Gradient do_[0] = -0.022
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.011
Gradient do_[0] = -0.030
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.058, dc_hat[0] = 0.016
Gradient do_[0] = -0.024
Epoch 150, Train Loss=0.022807, Weight Norm=8.707580
Sample Predictions at Epoch 150:
Day 192 (2024-10-11) => Predicted: 59.591, Actual: 63.870, Error: 4.28
Day 193 (2024-10-14) => Predicted: 60.233, Actual: 66.550, Error: 6.32
Day 194 (2024-10-15) => Predicted: 60.512, Actual: 66.000, Error: 5.49
Day 195 (2024-10-16) => Predicted: 60.396, Actual: 67.200, Error: 6.80
Day 196 (2024-10-17) => Predicted: 60.265, Actual: 66.760, Error: 6.49
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.473, c_hat[0] = -0.617
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.459, c_hat[0] = -0.610
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.454, c_hat[0] = -0.609
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.424, c_hat[0] = -0.621
c_state[0] = -0.106, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.446, c_hat[0] = -0.699
c_state[0] = -0.121, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003
Gradient do_[0] = -0.022
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004
Gradient do_[0] = -0.022
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.011
Gradient do_[0] = -0.030
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.058, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.472, c_hat[0] = -0.617
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.458, c_hat[0] = -0.611
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.454, c_hat[0] = -0.609
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.424, c_hat[0] = -0.621
c_state[0] = -0.106, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.446, c_hat[0] = -0.700
c_state[0] = -0.121, h_state[0] = -0.054
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003
Gradient do_[0] = -0.022
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004
Gradient do_[0] = -0.022
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.011
Gradient do_[0] = -0.030
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.058, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.472, c_hat[0] = -0.617
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.458, c_hat[0] = -0.611
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.453, c_hat[0] = -0.610
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.423, c_hat[0] = -0.621
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.445, c_hat[0] = -0.700
c_state[0] = -0.121, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003
Gradient do_[0] = -0.022
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004
Gradient do_[0] = -0.022
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.011
Gradient do_[0] = -0.030
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.058, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.471, c_hat[0] = -0.618
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.457, c_hat[0] = -0.611
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.453, c_hat[0] = -0.610
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.423, c_hat[0] = -0.622
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.445, c_hat[0] = -0.700
c_state[0] = -0.121, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003
Gradient do_[0] = -0.021
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004
Gradient do_[0] = -0.022
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.011
Gradient do_[0] = -0.030
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.058, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.471, c_hat[0] = -0.618
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.457, c_hat[0] = -0.612
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.452, c_hat[0] = -0.610
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.423, c_hat[0] = -0.622
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.444, c_hat[0] = -0.701
c_state[0] = -0.121, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003
Gradient do_[0] = -0.021
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004
Gradient do_[0] = -0.021
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.035, dc_hat[0] = 0.011
Gradient do_[0] = -0.029
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.471, c_hat[0] = -0.618
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.456, c_hat[0] = -0.612
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.452, c_hat[0] = -0.611
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.422, c_hat[0] = -0.622
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.444, c_hat[0] = -0.701
c_state[0] = -0.121, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003
Gradient do_[0] = -0.021
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004
Gradient do_[0] = -0.021
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.470, c_hat[0] = -0.618
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.456, c_hat[0] = -0.612
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.451, c_hat[0] = -0.611
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.422, c_hat[0] = -0.623
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.443, c_hat[0] = -0.701
c_state[0] = -0.121, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003
Gradient do_[0] = -0.021
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004
Gradient do_[0] = -0.021
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.470, c_hat[0] = -0.619
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.455, c_hat[0] = -0.613
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.451, c_hat[0] = -0.611
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.421, c_hat[0] = -0.623
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.443, c_hat[0] = -0.702
c_state[0] = -0.121, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003
Gradient do_[0] = -0.021
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004
Gradient do_[0] = -0.021
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.469, c_hat[0] = -0.619
c_state[0] = -0.039, h_state[0] = -0.019
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.455, c_hat[0] = -0.613
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.450, c_hat[0] = -0.612
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.421, c_hat[0] = -0.623
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.442, c_hat[0] = -0.702
c_state[0] = -0.121, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003
Gradient do_[0] = -0.021
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004
Gradient do_[0] = -0.021
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.469, c_hat[0] = -0.619
c_state[0] = -0.039, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.455, c_hat[0] = -0.613
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.450, c_hat[0] = -0.612
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.420, c_hat[0] = -0.624
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.442, c_hat[0] = -0.702
c_state[0] = -0.121, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003
Gradient do_[0] = -0.021
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004
Gradient do_[0] = -0.021
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.469, c_hat[0] = -0.619
c_state[0] = -0.039, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.454, c_hat[0] = -0.614
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.450, c_hat[0] = -0.612
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.420, c_hat[0] = -0.624
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.441, c_hat[0] = -0.703
c_state[0] = -0.121, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003
Gradient do_[0] = -0.021
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004
Gradient do_[0] = -0.021
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.468, c_hat[0] = -0.619
c_state[0] = -0.039, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.454, c_hat[0] = -0.614
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.449, c_hat[0] = -0.613
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.419, c_hat[0] = -0.624
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.441, c_hat[0] = -0.703
c_state[0] = -0.121, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003
Gradient do_[0] = -0.021
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004
Gradient do_[0] = -0.021
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.468, c_hat[0] = -0.620
c_state[0] = -0.039, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.453, c_hat[0] = -0.614
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.449, c_hat[0] = -0.613
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.419, c_hat[0] = -0.625
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.440, c_hat[0] = -0.703
c_state[0] = -0.121, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003
Gradient do_[0] = -0.021
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004
Gradient do_[0] = -0.021
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.467, c_hat[0] = -0.620
c_state[0] = -0.039, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.678, o_gate[0] = 0.453, c_hat[0] = -0.615
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.448, c_hat[0] = -0.613
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.418, c_hat[0] = -0.625
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.440, c_hat[0] = -0.703
c_state[0] = -0.121, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003
Gradient do_[0] = -0.021
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004
Gradient do_[0] = -0.021
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.467, c_hat[0] = -0.620
c_state[0] = -0.039, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.678, o_gate[0] = 0.453, c_hat[0] = -0.615
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.448, c_hat[0] = -0.614
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.418, c_hat[0] = -0.625
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.439, c_hat[0] = -0.704
c_state[0] = -0.121, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003
Gradient do_[0] = -0.021
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004
Gradient do_[0] = -0.021
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.046, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.467, c_hat[0] = -0.620
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.678, o_gate[0] = 0.452, c_hat[0] = -0.615
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.447, c_hat[0] = -0.614
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.418, c_hat[0] = -0.626
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.439, c_hat[0] = -0.704
c_state[0] = -0.121, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003
Gradient do_[0] = -0.020
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004
Gradient do_[0] = -0.020
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.046, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.466, c_hat[0] = -0.621
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.452, c_hat[0] = -0.616
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.447, c_hat[0] = -0.614
c_state[0] = -0.091, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.417, c_hat[0] = -0.626
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.439, c_hat[0] = -0.704
c_state[0] = -0.121, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003
Gradient do_[0] = -0.020
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004
Gradient do_[0] = -0.020
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.046, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.466, c_hat[0] = -0.621
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.451, c_hat[0] = -0.616
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.447, c_hat[0] = -0.615
c_state[0] = -0.092, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.417, c_hat[0] = -0.626
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.438, c_hat[0] = -0.705
c_state[0] = -0.121, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003
Gradient do_[0] = -0.020
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.004
Gradient do_[0] = -0.020
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.046, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.465, c_hat[0] = -0.621
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.451, c_hat[0] = -0.616
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.446, c_hat[0] = -0.615
c_state[0] = -0.092, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.416, c_hat[0] = -0.627
c_state[0] = -0.107, h_state[0] = -0.045
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.438, c_hat[0] = -0.705
c_state[0] = -0.122, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003
Gradient do_[0] = -0.020
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.004
Gradient do_[0] = -0.020
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010
Gradient do_[0] = -0.028
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.046, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.465, c_hat[0] = -0.621
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.451, c_hat[0] = -0.616
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.446, c_hat[0] = -0.615
c_state[0] = -0.092, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.416, c_hat[0] = -0.627
c_state[0] = -0.107, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.437, c_hat[0] = -0.705
c_state[0] = -0.122, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003
Gradient do_[0] = -0.020
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.004
Gradient do_[0] = -0.020
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010
Gradient do_[0] = -0.028
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.046, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.465, c_hat[0] = -0.622
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.450, c_hat[0] = -0.617
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.445, c_hat[0] = -0.616
c_state[0] = -0.092, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.415, c_hat[0] = -0.627
c_state[0] = -0.107, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.437, c_hat[0] = -0.705
c_state[0] = -0.122, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003
Gradient do_[0] = -0.020
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.004
Gradient do_[0] = -0.020
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010
Gradient do_[0] = -0.028
Backward Time Step 1:
Gradient di[0] = -0.013, df[0] = 0.046, dc_hat[0] = 0.014
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.464, c_hat[0] = -0.622
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.450, c_hat[0] = -0.617
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.445, c_hat[0] = -0.616
c_state[0] = -0.092, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.415, c_hat[0] = -0.628
c_state[0] = -0.107, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.436, c_hat[0] = -0.706
c_state[0] = -0.122, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003
Gradient do_[0] = -0.020
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.004
Gradient do_[0] = -0.020
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010
Gradient do_[0] = -0.028
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.013
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.464, c_hat[0] = -0.622
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.449, c_hat[0] = -0.617
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.444, c_hat[0] = -0.616
c_state[0] = -0.092, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.415, c_hat[0] = -0.628
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.436, c_hat[0] = -0.706
c_state[0] = -0.122, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003
Gradient do_[0] = -0.020
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.004
Gradient do_[0] = -0.020
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.010
Gradient do_[0] = -0.028
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.013
Gradient do_[0] = -0.035
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.464, c_hat[0] = -0.622
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.449, c_hat[0] = -0.618
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.444, c_hat[0] = -0.617
c_state[0] = -0.092, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.414, c_hat[0] = -0.628
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.436, c_hat[0] = -0.706
c_state[0] = -0.122, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003
Gradient do_[0] = -0.020
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.004
Gradient do_[0] = -0.020
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.010
Gradient do_[0] = -0.028
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.463, c_hat[0] = -0.622
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.449, c_hat[0] = -0.618
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.444, c_hat[0] = -0.617
c_state[0] = -0.092, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.414, c_hat[0] = -0.629
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.435, c_hat[0] = -0.707
c_state[0] = -0.122, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003
Gradient do_[0] = -0.020
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.004
Gradient do_[0] = -0.020
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.010
Gradient do_[0] = -0.028
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.463, c_hat[0] = -0.623
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.448, c_hat[0] = -0.618
c_state[0] = -0.070, h_state[0] = -0.032
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.443, c_hat[0] = -0.617
c_state[0] = -0.092, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.413, c_hat[0] = -0.629
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.435, c_hat[0] = -0.707
c_state[0] = -0.122, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003
Gradient do_[0] = -0.020
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.013, dc_hat[0] = 0.004
Gradient do_[0] = -0.019
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.010
Gradient do_[0] = -0.028
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.463, c_hat[0] = -0.623
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.448, c_hat[0] = -0.619
c_state[0] = -0.070, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.443, c_hat[0] = -0.618
c_state[0] = -0.092, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.413, c_hat[0] = -0.629
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.434, c_hat[0] = -0.707
c_state[0] = -0.122, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.013, dc_hat[0] = 0.004
Gradient do_[0] = -0.019
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.010
Gradient do_[0] = -0.028
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.462, c_hat[0] = -0.623
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.447, c_hat[0] = -0.619
c_state[0] = -0.070, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.443, c_hat[0] = -0.618
c_state[0] = -0.092, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.413, c_hat[0] = -0.630
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.434, c_hat[0] = -0.707
c_state[0] = -0.122, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.013, dc_hat[0] = 0.004
Gradient do_[0] = -0.019
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.010
Gradient do_[0] = -0.028
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.015
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.462, c_hat[0] = -0.623
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.447, c_hat[0] = -0.619
c_state[0] = -0.070, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.442, c_hat[0] = -0.618
c_state[0] = -0.092, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.412, c_hat[0] = -0.630
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.433, c_hat[0] = -0.708
c_state[0] = -0.122, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.013, dc_hat[0] = 0.004
Gradient do_[0] = -0.019
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.010
Gradient do_[0] = -0.028
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.014
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.461, c_hat[0] = -0.624
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.447, c_hat[0] = -0.619
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.442, c_hat[0] = -0.619
c_state[0] = -0.092, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.412, c_hat[0] = -0.630
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.433, c_hat[0] = -0.708
c_state[0] = -0.122, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.009
Gradient do_[0] = -0.028
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.014
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.461, c_hat[0] = -0.624
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.446, c_hat[0] = -0.620
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.441, c_hat[0] = -0.619
c_state[0] = -0.092, h_state[0] = -0.041
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.411, c_hat[0] = -0.631
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.433, c_hat[0] = -0.708
c_state[0] = -0.122, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.009
Gradient do_[0] = -0.028
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.014
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.461, c_hat[0] = -0.624
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.446, c_hat[0] = -0.620
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.441, c_hat[0] = -0.619
c_state[0] = -0.092, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.411, c_hat[0] = -0.631
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.432, c_hat[0] = -0.709
c_state[0] = -0.122, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009
Gradient do_[0] = -0.027
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.014
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.460, c_hat[0] = -0.624
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.446, c_hat[0] = -0.620
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.441, c_hat[0] = -0.620
c_state[0] = -0.092, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.411, c_hat[0] = -0.631
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.432, c_hat[0] = -0.709
c_state[0] = -0.122, h_state[0] = -0.053
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009
Gradient do_[0] = -0.027
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.014
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.646, o_gate[0] = 0.460, c_hat[0] = -0.624
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.445, c_hat[0] = -0.621
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.440, c_hat[0] = -0.620
c_state[0] = -0.092, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.410, c_hat[0] = -0.632
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.431, c_hat[0] = -0.709
c_state[0] = -0.122, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009
Gradient do_[0] = -0.027
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.014
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.646, o_gate[0] = 0.460, c_hat[0] = -0.625
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.445, c_hat[0] = -0.621
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.440, c_hat[0] = -0.620
c_state[0] = -0.092, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.410, c_hat[0] = -0.632
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.431, c_hat[0] = -0.709
c_state[0] = -0.122, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009
Gradient do_[0] = -0.027
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.646, o_gate[0] = 0.459, c_hat[0] = -0.625
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.445, c_hat[0] = -0.621
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.439, c_hat[0] = -0.621
c_state[0] = -0.092, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.409, c_hat[0] = -0.632
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.431, c_hat[0] = -0.710
c_state[0] = -0.122, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009
Gradient do_[0] = -0.027
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.646, o_gate[0] = 0.459, c_hat[0] = -0.625
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.444, c_hat[0] = -0.622
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.439, c_hat[0] = -0.621
c_state[0] = -0.092, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.409, c_hat[0] = -0.633
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.430, c_hat[0] = -0.710
c_state[0] = -0.122, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.019
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.018
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009
Gradient do_[0] = -0.027
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.646, o_gate[0] = 0.459, c_hat[0] = -0.625
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.444, c_hat[0] = -0.622
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.439, c_hat[0] = -0.621
c_state[0] = -0.092, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.409, c_hat[0] = -0.633
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.430, c_hat[0] = -0.710
c_state[0] = -0.122, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.002
Gradient do_[0] = -0.018
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.018
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009
Gradient do_[0] = -0.027
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014
Gradient do_[0] = -0.024
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.646, o_gate[0] = 0.458, c_hat[0] = -0.625
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.443, c_hat[0] = -0.622
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.438, c_hat[0] = -0.622
c_state[0] = -0.092, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.408, c_hat[0] = -0.633
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.429, c_hat[0] = -0.711
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.002
Gradient do_[0] = -0.018
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.018
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009
Gradient do_[0] = -0.027
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.064, f_gate[0] = 0.646, o_gate[0] = 0.458, c_hat[0] = -0.626
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.443, c_hat[0] = -0.622
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.438, c_hat[0] = -0.622
c_state[0] = -0.092, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.408, c_hat[0] = -0.634
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.429, c_hat[0] = -0.711
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002
Gradient do_[0] = -0.018
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.018
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009
Gradient do_[0] = -0.027
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.458, c_hat[0] = -0.626
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.443, c_hat[0] = -0.623
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.438, c_hat[0] = -0.622
c_state[0] = -0.092, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.408, c_hat[0] = -0.634
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.429, c_hat[0] = -0.711
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002
Gradient do_[0] = -0.018
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003
Gradient do_[0] = -0.018
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.030, dc_hat[0] = 0.009
Gradient do_[0] = -0.027
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.457, c_hat[0] = -0.626
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.442, c_hat[0] = -0.623
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.437, c_hat[0] = -0.622
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.407, c_hat[0] = -0.634
c_state[0] = -0.108, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.428, c_hat[0] = -0.711
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002
Gradient do_[0] = -0.018
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003
Gradient do_[0] = -0.018
Backward Time Step 2:
Gradient di[0] = -0.009, df[0] = 0.030, dc_hat[0] = 0.009
Gradient do_[0] = -0.027
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.457, c_hat[0] = -0.626
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.442, c_hat[0] = -0.623
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.437, c_hat[0] = -0.623
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.407, c_hat[0] = -0.634
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.428, c_hat[0] = -0.712
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002
Gradient do_[0] = -0.018
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003
Gradient do_[0] = -0.018
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009
Gradient do_[0] = -0.027
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.457, c_hat[0] = -0.626
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.442, c_hat[0] = -0.624
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.437, c_hat[0] = -0.623
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.406, c_hat[0] = -0.635
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.427, c_hat[0] = -0.712
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002
Gradient do_[0] = -0.018
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003
Gradient do_[0] = -0.018
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009
Gradient do_[0] = -0.026
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.456, c_hat[0] = -0.627
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.441, c_hat[0] = -0.624
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.436, c_hat[0] = -0.623
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.406, c_hat[0] = -0.635
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.427, c_hat[0] = -0.712
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002
Gradient do_[0] = -0.018
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003
Gradient do_[0] = -0.018
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009
Gradient do_[0] = -0.026
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.054, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.456, c_hat[0] = -0.627
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.441, c_hat[0] = -0.624
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.436, c_hat[0] = -0.624
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.406, c_hat[0] = -0.635
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.427, c_hat[0] = -0.712
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002
Gradient do_[0] = -0.018
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003
Gradient do_[0] = -0.018
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009
Gradient do_[0] = -0.026
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.013
Gradient do_[0] = -0.034
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.456, c_hat[0] = -0.627
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.441, c_hat[0] = -0.624
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.436, c_hat[0] = -0.624
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.405, c_hat[0] = -0.636
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.426, c_hat[0] = -0.713
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002
Gradient do_[0] = -0.018
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003
Gradient do_[0] = -0.017
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009
Gradient do_[0] = -0.026
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.013
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.455, c_hat[0] = -0.627
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.440, c_hat[0] = -0.625
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.435, c_hat[0] = -0.624
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.405, c_hat[0] = -0.636
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.426, c_hat[0] = -0.713
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002
Gradient do_[0] = -0.018
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003
Gradient do_[0] = -0.017
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009
Gradient do_[0] = -0.026
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.455, c_hat[0] = -0.627
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.440, c_hat[0] = -0.625
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.435, c_hat[0] = -0.625
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.405, c_hat[0] = -0.636
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.426, c_hat[0] = -0.713
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002
Gradient do_[0] = -0.017
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003
Gradient do_[0] = -0.017
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009
Gradient do_[0] = -0.026
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.455, c_hat[0] = -0.628
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.440, c_hat[0] = -0.625
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.434, c_hat[0] = -0.625
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.404, c_hat[0] = -0.637
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.425, c_hat[0] = -0.713
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002
Gradient do_[0] = -0.017
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003
Gradient do_[0] = -0.017
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009
Gradient do_[0] = -0.026
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Epoch 200, Train Loss=0.018094, Weight Norm=8.711754
Sample Predictions at Epoch 200:
Day 192 (2024-10-11) => Predicted: 58.925, Actual: 63.870, Error: 4.95
Day 193 (2024-10-14) => Predicted: 59.571, Actual: 66.550, Error: 6.98
Day 194 (2024-10-15) => Predicted: 59.838, Actual: 66.000, Error: 6.16
Day 195 (2024-10-16) => Predicted: 59.802, Actual: 67.200, Error: 7.40
Day 196 (2024-10-17) => Predicted: 59.701, Actual: 66.760, Error: 7.06
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.454, c_hat[0] = -0.628
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.439, c_hat[0] = -0.625
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.434, c_hat[0] = -0.625
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.404, c_hat[0] = -0.637
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.425, c_hat[0] = -0.714
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002
Gradient do_[0] = -0.017
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003
Gradient do_[0] = -0.017
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.009
Gradient do_[0] = -0.026
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.454, c_hat[0] = -0.628
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.439, c_hat[0] = -0.626
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.434, c_hat[0] = -0.625
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.404, c_hat[0] = -0.637
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.425, c_hat[0] = -0.714
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002
Gradient do_[0] = -0.017
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003
Gradient do_[0] = -0.017
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.009
Gradient do_[0] = -0.026
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.454, c_hat[0] = -0.628
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.439, c_hat[0] = -0.626
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.433, c_hat[0] = -0.626
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.403, c_hat[0] = -0.638
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.424, c_hat[0] = -0.714
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002
Gradient do_[0] = -0.017
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003
Gradient do_[0] = -0.017
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.009
Gradient do_[0] = -0.026
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.454, c_hat[0] = -0.628
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.438, c_hat[0] = -0.626
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.433, c_hat[0] = -0.626
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.403, c_hat[0] = -0.638
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.424, c_hat[0] = -0.714
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002
Gradient do_[0] = -0.017
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003
Gradient do_[0] = -0.017
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.009
Gradient do_[0] = -0.026
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.453, c_hat[0] = -0.628
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.438, c_hat[0] = -0.627
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.433, c_hat[0] = -0.626
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.403, c_hat[0] = -0.638
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.423, c_hat[0] = -0.715
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002
Gradient do_[0] = -0.017
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003
Gradient do_[0] = -0.017
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.009
Gradient do_[0] = -0.026
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.453, c_hat[0] = -0.629
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.438, c_hat[0] = -0.627
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.432, c_hat[0] = -0.627
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.402, c_hat[0] = -0.638
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.423, c_hat[0] = -0.715
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002
Gradient do_[0] = -0.017
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003
Gradient do_[0] = -0.017
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.009
Gradient do_[0] = -0.026
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.453, c_hat[0] = -0.629
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.437, c_hat[0] = -0.627
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.432, c_hat[0] = -0.627
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.402, c_hat[0] = -0.639
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.423, c_hat[0] = -0.715
c_state[0] = -0.123, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002
Gradient do_[0] = -0.017
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003
Gradient do_[0] = -0.016
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.009
Gradient do_[0] = -0.025
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.452, c_hat[0] = -0.629
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.437, c_hat[0] = -0.627
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.432, c_hat[0] = -0.627
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.402, c_hat[0] = -0.639
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.422, c_hat[0] = -0.716
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002
Gradient do_[0] = -0.017
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003
Gradient do_[0] = -0.016
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.008
Gradient do_[0] = -0.025
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.452, c_hat[0] = -0.629
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.437, c_hat[0] = -0.628
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.432, c_hat[0] = -0.628
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.401, c_hat[0] = -0.639
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.422, c_hat[0] = -0.716
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002
Gradient do_[0] = -0.017
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003
Gradient do_[0] = -0.016
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.008
Gradient do_[0] = -0.025
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.452, c_hat[0] = -0.629
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.437, c_hat[0] = -0.628
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.431, c_hat[0] = -0.628
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.401, c_hat[0] = -0.640
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.422, c_hat[0] = -0.716
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002
Gradient do_[0] = -0.016
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003
Gradient do_[0] = -0.016
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.008
Gradient do_[0] = -0.025
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.014
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.451, c_hat[0] = -0.630
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.436, c_hat[0] = -0.628
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.431, c_hat[0] = -0.628
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.401, c_hat[0] = -0.640
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.421, c_hat[0] = -0.716
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002
Gradient do_[0] = -0.016
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003
Gradient do_[0] = -0.016
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008
Gradient do_[0] = -0.025
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.451, c_hat[0] = -0.630
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.436, c_hat[0] = -0.628
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.431, c_hat[0] = -0.628
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.400, c_hat[0] = -0.640
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.421, c_hat[0] = -0.717
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002
Gradient do_[0] = -0.016
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003
Gradient do_[0] = -0.016
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008
Gradient do_[0] = -0.025
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.041, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.451, c_hat[0] = -0.630
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.436, c_hat[0] = -0.629
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.430, c_hat[0] = -0.629
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.400, c_hat[0] = -0.640
c_state[0] = -0.109, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.421, c_hat[0] = -0.717
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002
Gradient do_[0] = -0.016
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003
Gradient do_[0] = -0.016
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008
Gradient do_[0] = -0.025
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.041, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.451, c_hat[0] = -0.630
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.435, c_hat[0] = -0.629
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.430, c_hat[0] = -0.629
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.400, c_hat[0] = -0.641
c_state[0] = -0.110, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.420, c_hat[0] = -0.717
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002
Gradient do_[0] = -0.016
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003
Gradient do_[0] = -0.016
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008
Gradient do_[0] = -0.025
Backward Time Step 1:
Gradient di[0] = -0.012, df[0] = 0.041, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.450, c_hat[0] = -0.630
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.435, c_hat[0] = -0.629
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.430, c_hat[0] = -0.629
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.399, c_hat[0] = -0.641
c_state[0] = -0.110, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.420, c_hat[0] = -0.717
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002
Gradient do_[0] = -0.016
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003
Gradient do_[0] = -0.016
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008
Gradient do_[0] = -0.025
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.450, c_hat[0] = -0.630
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.435, c_hat[0] = -0.629
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.429, c_hat[0] = -0.630
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.399, c_hat[0] = -0.641
c_state[0] = -0.110, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.420, c_hat[0] = -0.718
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002
Gradient do_[0] = -0.016
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003
Gradient do_[0] = -0.016
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008
Gradient do_[0] = -0.025
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.450, c_hat[0] = -0.631
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.434, c_hat[0] = -0.630
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.429, c_hat[0] = -0.630
c_state[0] = -0.093, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.399, c_hat[0] = -0.642
c_state[0] = -0.110, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.419, c_hat[0] = -0.718
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002
Gradient do_[0] = -0.016
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.003
Gradient do_[0] = -0.015
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008
Gradient do_[0] = -0.025
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.012
Gradient do_[0] = -0.033
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.449, c_hat[0] = -0.631
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.434, c_hat[0] = -0.630
c_state[0] = -0.071, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.429, c_hat[0] = -0.630
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.398, c_hat[0] = -0.642
c_state[0] = -0.110, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.419, c_hat[0] = -0.718
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.016
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.003
Gradient do_[0] = -0.015
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008
Gradient do_[0] = -0.025
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.449, c_hat[0] = -0.631
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.434, c_hat[0] = -0.630
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.428, c_hat[0] = -0.630
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.398, c_hat[0] = -0.642
c_state[0] = -0.110, h_state[0] = -0.044
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.419, c_hat[0] = -0.718
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.016
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.003
Gradient do_[0] = -0.015
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008
Gradient do_[0] = -0.025
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.449, c_hat[0] = -0.631
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.434, c_hat[0] = -0.630
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.428, c_hat[0] = -0.631
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.398, c_hat[0] = -0.642
c_state[0] = -0.110, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.418, c_hat[0] = -0.718
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.016
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.003
Gradient do_[0] = -0.015
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008
Gradient do_[0] = -0.024
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.449, c_hat[0] = -0.631
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.433, c_hat[0] = -0.631
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.428, c_hat[0] = -0.631
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.397, c_hat[0] = -0.643
c_state[0] = -0.110, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.418, c_hat[0] = -0.719
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.003
Gradient do_[0] = -0.015
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008
Gradient do_[0] = -0.024
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.448, c_hat[0] = -0.631
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.433, c_hat[0] = -0.631
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.427, c_hat[0] = -0.631
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.397, c_hat[0] = -0.643
c_state[0] = -0.110, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.418, c_hat[0] = -0.719
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008
Gradient do_[0] = -0.024
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.448, c_hat[0] = -0.632
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.433, c_hat[0] = -0.631
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.427, c_hat[0] = -0.631
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.397, c_hat[0] = -0.643
c_state[0] = -0.110, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.417, c_hat[0] = -0.719
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008
Gradient do_[0] = -0.024
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.448, c_hat[0] = -0.632
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.432, c_hat[0] = -0.631
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.427, c_hat[0] = -0.632
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.396, c_hat[0] = -0.644
c_state[0] = -0.110, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.417, c_hat[0] = -0.719
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008
Gradient do_[0] = -0.024
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.447, c_hat[0] = -0.632
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.432, c_hat[0] = -0.632
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.427, c_hat[0] = -0.632
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.396, c_hat[0] = -0.644
c_state[0] = -0.110, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.417, c_hat[0] = -0.720
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008
Gradient do_[0] = -0.024
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.447, c_hat[0] = -0.632
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.432, c_hat[0] = -0.632
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.426, c_hat[0] = -0.632
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.396, c_hat[0] = -0.644
c_state[0] = -0.110, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.416, c_hat[0] = -0.720
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008
Gradient do_[0] = -0.024
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.012
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.447, c_hat[0] = -0.632
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.432, c_hat[0] = -0.632
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.426, c_hat[0] = -0.633
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.396, c_hat[0] = -0.644
c_state[0] = -0.110, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.416, c_hat[0] = -0.720
c_state[0] = -0.124, h_state[0] = -0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008
Gradient do_[0] = -0.024
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.011
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.447, c_hat[0] = -0.632
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.431, c_hat[0] = -0.632
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.426, c_hat[0] = -0.633
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.395, c_hat[0] = -0.645
c_state[0] = -0.110, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.416, c_hat[0] = -0.720
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008
Gradient do_[0] = -0.024
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.011
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.446, c_hat[0] = -0.633
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.431, c_hat[0] = -0.633
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.425, c_hat[0] = -0.633
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.395, c_hat[0] = -0.645
c_state[0] = -0.110, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.415, c_hat[0] = -0.721
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008
Gradient do_[0] = -0.024
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.011
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.446, c_hat[0] = -0.633
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.431, c_hat[0] = -0.633
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.425, c_hat[0] = -0.633
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.395, c_hat[0] = -0.645
c_state[0] = -0.110, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.415, c_hat[0] = -0.721
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008
Gradient do_[0] = -0.024
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.011
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.446, c_hat[0] = -0.633
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.430, c_hat[0] = -0.633
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.425, c_hat[0] = -0.634
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.394, c_hat[0] = -0.645
c_state[0] = -0.110, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.415, c_hat[0] = -0.721
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002
Gradient do_[0] = -0.015
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008
Gradient do_[0] = -0.024
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.011
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.446, c_hat[0] = -0.633
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.430, c_hat[0] = -0.633
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.425, c_hat[0] = -0.634
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.394, c_hat[0] = -0.646
c_state[0] = -0.110, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.415, c_hat[0] = -0.721
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.026, dc_hat[0] = 0.008
Gradient do_[0] = -0.023
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.011
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.445, c_hat[0] = -0.633
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.430, c_hat[0] = -0.634
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.424, c_hat[0] = -0.634
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.394, c_hat[0] = -0.646
c_state[0] = -0.110, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.414, c_hat[0] = -0.722
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.026, dc_hat[0] = 0.008
Gradient do_[0] = -0.023
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.445, c_hat[0] = -0.633
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.430, c_hat[0] = -0.634
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.424, c_hat[0] = -0.634
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.393, c_hat[0] = -0.646
c_state[0] = -0.110, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.414, c_hat[0] = -0.722
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.026, dc_hat[0] = 0.008
Gradient do_[0] = -0.023
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.445, c_hat[0] = -0.633
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.429, c_hat[0] = -0.634
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.424, c_hat[0] = -0.635
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.393, c_hat[0] = -0.647
c_state[0] = -0.110, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.414, c_hat[0] = -0.722
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = 0.026, dc_hat[0] = 0.008
Gradient do_[0] = -0.023
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.444, c_hat[0] = -0.634
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.429, c_hat[0] = -0.634
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.423, c_hat[0] = -0.635
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.393, c_hat[0] = -0.647
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.413, c_hat[0] = -0.722
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.008
Gradient do_[0] = -0.023
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.444, c_hat[0] = -0.634
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.429, c_hat[0] = -0.635
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.423, c_hat[0] = -0.635
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.393, c_hat[0] = -0.647
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.413, c_hat[0] = -0.722
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.008
Gradient do_[0] = -0.023
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.444, c_hat[0] = -0.634
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.429, c_hat[0] = -0.635
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.423, c_hat[0] = -0.635
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.392, c_hat[0] = -0.647
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.413, c_hat[0] = -0.723
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.008
Gradient do_[0] = -0.023
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013
Gradient do_[0] = -0.023
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.444, c_hat[0] = -0.634
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.428, c_hat[0] = -0.635
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.423, c_hat[0] = -0.636
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.392, c_hat[0] = -0.648
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.412, c_hat[0] = -0.723
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.007
Gradient do_[0] = -0.023
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.443, c_hat[0] = -0.634
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.428, c_hat[0] = -0.635
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.422, c_hat[0] = -0.636
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.392, c_hat[0] = -0.648
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.412, c_hat[0] = -0.723
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.007
Gradient do_[0] = -0.023
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.443, c_hat[0] = -0.634
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.428, c_hat[0] = -0.635
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.422, c_hat[0] = -0.636
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.391, c_hat[0] = -0.648
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.412, c_hat[0] = -0.723
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.007
Gradient do_[0] = -0.023
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.013
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.443, c_hat[0] = -0.635
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.427, c_hat[0] = -0.636
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.422, c_hat[0] = -0.637
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.391, c_hat[0] = -0.648
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.412, c_hat[0] = -0.724
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.014
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.007
Gradient do_[0] = -0.023
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.013
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.443, c_hat[0] = -0.635
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.427, c_hat[0] = -0.636
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.421, c_hat[0] = -0.637
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.391, c_hat[0] = -0.649
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.411, c_hat[0] = -0.724
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007
Gradient do_[0] = -0.023
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.013
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.442, c_hat[0] = -0.635
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.427, c_hat[0] = -0.636
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.421, c_hat[0] = -0.637
c_state[0] = -0.094, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.391, c_hat[0] = -0.649
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.411, c_hat[0] = -0.724
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007
Gradient do_[0] = -0.023
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.013
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.442, c_hat[0] = -0.635
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.427, c_hat[0] = -0.636
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.421, c_hat[0] = -0.637
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.390, c_hat[0] = -0.649
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.411, c_hat[0] = -0.724
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007
Gradient do_[0] = -0.022
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.013
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.442, c_hat[0] = -0.635
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.426, c_hat[0] = -0.637
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.421, c_hat[0] = -0.638
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.390, c_hat[0] = -0.649
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.410, c_hat[0] = -0.724
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007
Gradient do_[0] = -0.022
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.013
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.442, c_hat[0] = -0.635
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.426, c_hat[0] = -0.637
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.420, c_hat[0] = -0.638
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.390, c_hat[0] = -0.650
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.410, c_hat[0] = -0.725
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007
Gradient do_[0] = -0.022
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.013
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.441, c_hat[0] = -0.635
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.426, c_hat[0] = -0.637
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.420, c_hat[0] = -0.638
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.390, c_hat[0] = -0.650
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.410, c_hat[0] = -0.725
c_state[0] = -0.125, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007
Gradient do_[0] = -0.022
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.441, c_hat[0] = -0.636
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.426, c_hat[0] = -0.637
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.420, c_hat[0] = -0.638
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.389, c_hat[0] = -0.650
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.715, o_gate[0] = 0.410, c_hat[0] = -0.725
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.012
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007
Gradient do_[0] = -0.022
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.441, c_hat[0] = -0.636
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.425, c_hat[0] = -0.638
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.420, c_hat[0] = -0.639
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.389, c_hat[0] = -0.650
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.715, o_gate[0] = 0.409, c_hat[0] = -0.725
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.012
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007
Gradient do_[0] = -0.022
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Epoch 250, Train Loss=0.015130, Weight Norm=8.716688
Sample Predictions at Epoch 250:
Day 192 (2024-10-11) => Predicted: 58.722, Actual: 63.870, Error: 5.15
Day 193 (2024-10-14) => Predicted: 59.364, Actual: 66.550, Error: 7.19
Day 194 (2024-10-15) => Predicted: 59.624, Actual: 66.000, Error: 6.38
Day 195 (2024-10-16) => Predicted: 59.677, Actual: 67.200, Error: 7.52
Day 196 (2024-10-17) => Predicted: 59.600, Actual: 66.760, Error: 7.16
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.441, c_hat[0] = -0.636
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.425, c_hat[0] = -0.638
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.419, c_hat[0] = -0.639
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.389, c_hat[0] = -0.651
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.715, o_gate[0] = 0.409, c_hat[0] = -0.726
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.013
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.012
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007
Gradient do_[0] = -0.022
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.440, c_hat[0] = -0.636
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.425, c_hat[0] = -0.638
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.419, c_hat[0] = -0.639
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.388, c_hat[0] = -0.651
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.715, o_gate[0] = 0.409, c_hat[0] = -0.726
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001
Gradient do_[0] = -0.013
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.012
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007
Gradient do_[0] = -0.022
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.440, c_hat[0] = -0.636
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.425, c_hat[0] = -0.638
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.419, c_hat[0] = -0.639
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.388, c_hat[0] = -0.651
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.715, o_gate[0] = 0.408, c_hat[0] = -0.726
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001
Gradient do_[0] = -0.013
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.012
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007
Gradient do_[0] = -0.022
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.440, c_hat[0] = -0.636
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.424, c_hat[0] = -0.638
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.419, c_hat[0] = -0.640
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.388, c_hat[0] = -0.651
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.715, o_gate[0] = 0.408, c_hat[0] = -0.726
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001
Gradient do_[0] = -0.012
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.012
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007
Gradient do_[0] = -0.022
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.440, c_hat[0] = -0.636
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.424, c_hat[0] = -0.639
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.640
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.388, c_hat[0] = -0.652
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.063, f_gate[0] = 0.715, o_gate[0] = 0.408, c_hat[0] = -0.726
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001
Gradient do_[0] = -0.012
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.012
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007
Gradient do_[0] = -0.022
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.439, c_hat[0] = -0.637
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.424, c_hat[0] = -0.639
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.640
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.387, c_hat[0] = -0.652
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.408, c_hat[0] = -0.727
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001
Gradient do_[0] = -0.012
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.012
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007
Gradient do_[0] = -0.022
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.037, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.439, c_hat[0] = -0.637
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.424, c_hat[0] = -0.639
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.640
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.387, c_hat[0] = -0.652
c_state[0] = -0.111, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.407, c_hat[0] = -0.727
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001
Gradient do_[0] = -0.012
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.012
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007
Gradient do_[0] = -0.022
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.037, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.439, c_hat[0] = -0.637
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.423, c_hat[0] = -0.639
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.640
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.387, c_hat[0] = -0.652
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.407, c_hat[0] = -0.727
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001
Gradient do_[0] = -0.012
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002
Gradient do_[0] = -0.012
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007
Gradient do_[0] = -0.021
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.037, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.439, c_hat[0] = -0.637
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.423, c_hat[0] = -0.639
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.417, c_hat[0] = -0.641
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.387, c_hat[0] = -0.653
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.407, c_hat[0] = -0.727
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001
Gradient do_[0] = -0.012
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002
Gradient do_[0] = -0.011
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007
Gradient do_[0] = -0.021
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.037, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.439, c_hat[0] = -0.637
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.423, c_hat[0] = -0.640
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.417, c_hat[0] = -0.641
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.386, c_hat[0] = -0.653
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.407, c_hat[0] = -0.727
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001
Gradient do_[0] = -0.012
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002
Gradient do_[0] = -0.011
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007
Gradient do_[0] = -0.021
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.037, dc_hat[0] = 0.011
Gradient do_[0] = -0.031
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.438, c_hat[0] = -0.637
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.423, c_hat[0] = -0.640
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.417, c_hat[0] = -0.641
c_state[0] = -0.095, h_state[0] = -0.040
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.386, c_hat[0] = -0.653
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.406, c_hat[0] = -0.728
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001
Gradient do_[0] = -0.012
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002
Gradient do_[0] = -0.011
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007
Gradient do_[0] = -0.021
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.037, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.438, c_hat[0] = -0.637
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.422, c_hat[0] = -0.640
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.417, c_hat[0] = -0.641
c_state[0] = -0.095, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.386, c_hat[0] = -0.653
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.406, c_hat[0] = -0.728
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001
Gradient do_[0] = -0.012
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002
Gradient do_[0] = -0.011
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007
Gradient do_[0] = -0.021
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.037, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.438, c_hat[0] = -0.637
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.422, c_hat[0] = -0.640
c_state[0] = -0.072, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.416, c_hat[0] = -0.642
c_state[0] = -0.095, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.386, c_hat[0] = -0.654
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.406, c_hat[0] = -0.728
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001
Gradient do_[0] = -0.012
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002
Gradient do_[0] = -0.011
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007
Gradient do_[0] = -0.021
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = 0.037, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.438, c_hat[0] = -0.638
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.422, c_hat[0] = -0.641
c_state[0] = -0.073, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.416, c_hat[0] = -0.642
c_state[0] = -0.095, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.385, c_hat[0] = -0.654
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.405, c_hat[0] = -0.728
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.012
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002
Gradient do_[0] = -0.011
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007
Gradient do_[0] = -0.021
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.437, c_hat[0] = -0.638
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.422, c_hat[0] = -0.641
c_state[0] = -0.073, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.416, c_hat[0] = -0.642
c_state[0] = -0.095, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.385, c_hat[0] = -0.654
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.405, c_hat[0] = -0.728
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.012
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002
Gradient do_[0] = -0.011
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007
Gradient do_[0] = -0.021
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.437, c_hat[0] = -0.638
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.422, c_hat[0] = -0.641
c_state[0] = -0.073, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.416, c_hat[0] = -0.642
c_state[0] = -0.095, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.385, c_hat[0] = -0.654
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.405, c_hat[0] = -0.729
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.011
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002
Gradient do_[0] = -0.011
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007
Gradient do_[0] = -0.021
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.437, c_hat[0] = -0.638
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.421, c_hat[0] = -0.641
c_state[0] = -0.073, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.415, c_hat[0] = -0.643
c_state[0] = -0.095, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.385, c_hat[0] = -0.655
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.405, c_hat[0] = -0.729
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.011
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002
Gradient do_[0] = -0.011
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007
Gradient do_[0] = -0.021
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.437, c_hat[0] = -0.638
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.421, c_hat[0] = -0.641
c_state[0] = -0.073, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.415, c_hat[0] = -0.643
c_state[0] = -0.095, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.384, c_hat[0] = -0.655
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.404, c_hat[0] = -0.729
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.011
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002
Gradient do_[0] = -0.011
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007
Gradient do_[0] = -0.021
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.436, c_hat[0] = -0.638
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.421, c_hat[0] = -0.642
c_state[0] = -0.073, h_state[0] = -0.031
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.415, c_hat[0] = -0.643
c_state[0] = -0.095, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.384, c_hat[0] = -0.655
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.404, c_hat[0] = -0.729
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.011
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002
Gradient do_[0] = -0.011
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007
Gradient do_[0] = -0.021
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.436, c_hat[0] = -0.638
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.421, c_hat[0] = -0.642
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.415, c_hat[0] = -0.643
c_state[0] = -0.095, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.384, c_hat[0] = -0.655
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.404, c_hat[0] = -0.729
c_state[0] = -0.126, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.011
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002
Gradient do_[0] = -0.010
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007
Gradient do_[0] = -0.021
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.436, c_hat[0] = -0.639
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.420, c_hat[0] = -0.642
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.414, c_hat[0] = -0.644
c_state[0] = -0.095, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.384, c_hat[0] = -0.655
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.404, c_hat[0] = -0.730
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.011
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002
Gradient do_[0] = -0.010
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007
Gradient do_[0] = -0.020
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.436, c_hat[0] = -0.639
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.420, c_hat[0] = -0.642
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.414, c_hat[0] = -0.644
c_state[0] = -0.095, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.383, c_hat[0] = -0.656
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.403, c_hat[0] = -0.730
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.011
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002
Gradient do_[0] = -0.010
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007
Gradient do_[0] = -0.020
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.436, c_hat[0] = -0.639
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.420, c_hat[0] = -0.642
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.414, c_hat[0] = -0.644
c_state[0] = -0.095, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.383, c_hat[0] = -0.656
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.403, c_hat[0] = -0.730
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.011
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002
Gradient do_[0] = -0.010
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007
Gradient do_[0] = -0.020
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.435, c_hat[0] = -0.639
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.420, c_hat[0] = -0.643
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.414, c_hat[0] = -0.644
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.383, c_hat[0] = -0.656
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.403, c_hat[0] = -0.730
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.011
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007
Gradient do_[0] = -0.020
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.435, c_hat[0] = -0.639
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.419, c_hat[0] = -0.643
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.414, c_hat[0] = -0.644
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.383, c_hat[0] = -0.656
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.403, c_hat[0] = -0.730
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.011
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007
Gradient do_[0] = -0.020
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.435, c_hat[0] = -0.639
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.419, c_hat[0] = -0.643
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.413, c_hat[0] = -0.645
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.382, c_hat[0] = -0.657
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.402, c_hat[0] = -0.731
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.011
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.006
Gradient do_[0] = -0.020
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.435, c_hat[0] = -0.639
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.419, c_hat[0] = -0.643
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.413, c_hat[0] = -0.645
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.382, c_hat[0] = -0.657
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.402, c_hat[0] = -0.731
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.006
Gradient do_[0] = -0.020
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.435, c_hat[0] = -0.639
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.419, c_hat[0] = -0.643
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.413, c_hat[0] = -0.645
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.382, c_hat[0] = -0.657
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.402, c_hat[0] = -0.731
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.022, dc_hat[0] = 0.006
Gradient do_[0] = -0.020
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.434, c_hat[0] = -0.640
c_state[0] = -0.040, h_state[0] = -0.018
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.419, c_hat[0] = -0.644
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.413, c_hat[0] = -0.645
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.382, c_hat[0] = -0.657
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.402, c_hat[0] = -0.731
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.022, dc_hat[0] = 0.006
Gradient do_[0] = -0.020
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.434, c_hat[0] = -0.640
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.644
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.412, c_hat[0] = -0.646
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.382, c_hat[0] = -0.658
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.401, c_hat[0] = -0.731
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.022, dc_hat[0] = 0.006
Gradient do_[0] = -0.020
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.434, c_hat[0] = -0.640
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.644
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.412, c_hat[0] = -0.646
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.381, c_hat[0] = -0.658
c_state[0] = -0.112, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.401, c_hat[0] = -0.732
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.022, dc_hat[0] = 0.006
Gradient do_[0] = -0.020
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010
Gradient do_[0] = -0.030
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.434, c_hat[0] = -0.640
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.644
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.412, c_hat[0] = -0.646
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.381, c_hat[0] = -0.658
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.401, c_hat[0] = -0.732
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.022, dc_hat[0] = 0.006
Gradient do_[0] = -0.020
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.433, c_hat[0] = -0.640
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.644
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.412, c_hat[0] = -0.646
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.381, c_hat[0] = -0.658
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.401, c_hat[0] = -0.732
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = 0.022, dc_hat[0] = 0.006
Gradient do_[0] = -0.020
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.433, c_hat[0] = -0.640
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.645
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.411, c_hat[0] = -0.646
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.381, c_hat[0] = -0.658
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.401, c_hat[0] = -0.732
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.006
Gradient do_[0] = -0.019
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.433, c_hat[0] = -0.640
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.417, c_hat[0] = -0.645
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.411, c_hat[0] = -0.647
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.380, c_hat[0] = -0.659
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.400, c_hat[0] = -0.732
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.006
Gradient do_[0] = -0.019
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.433, c_hat[0] = -0.640
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.417, c_hat[0] = -0.645
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.411, c_hat[0] = -0.647
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.380, c_hat[0] = -0.659
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.400, c_hat[0] = -0.733
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.006
Gradient do_[0] = -0.019
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012
Gradient do_[0] = -0.022
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.433, c_hat[0] = -0.640
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.417, c_hat[0] = -0.645
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.411, c_hat[0] = -0.647
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.380, c_hat[0] = -0.659
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.400, c_hat[0] = -0.733
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.006
Gradient do_[0] = -0.019
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012
Gradient do_[0] = -0.021
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.432, c_hat[0] = -0.641
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.417, c_hat[0] = -0.645
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.411, c_hat[0] = -0.647
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.380, c_hat[0] = -0.659
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.400, c_hat[0] = -0.733
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.010
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.006
Gradient do_[0] = -0.019
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012
Gradient do_[0] = -0.021
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.432, c_hat[0] = -0.641
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.416, c_hat[0] = -0.645
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.410, c_hat[0] = -0.648
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.380, c_hat[0] = -0.660
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.399, c_hat[0] = -0.733
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.006
Gradient do_[0] = -0.019
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012
Gradient do_[0] = -0.021
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.432, c_hat[0] = -0.641
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.416, c_hat[0] = -0.646
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.410, c_hat[0] = -0.648
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.379, c_hat[0] = -0.660
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.399, c_hat[0] = -0.733
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.006
Gradient do_[0] = -0.019
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012
Gradient do_[0] = -0.021
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.432, c_hat[0] = -0.641
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.416, c_hat[0] = -0.646
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.410, c_hat[0] = -0.648
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.379, c_hat[0] = -0.660
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.399, c_hat[0] = -0.733
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001
Gradient do_[0] = -0.008
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006
Gradient do_[0] = -0.019
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012
Gradient do_[0] = -0.021
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.432, c_hat[0] = -0.641
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.416, c_hat[0] = -0.646
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.410, c_hat[0] = -0.648
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.379, c_hat[0] = -0.660
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.399, c_hat[0] = -0.734
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001
Gradient do_[0] = -0.008
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006
Gradient do_[0] = -0.019
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.011
Gradient do_[0] = -0.021
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.431, c_hat[0] = -0.641
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.416, c_hat[0] = -0.646
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.410, c_hat[0] = -0.648
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.379, c_hat[0] = -0.660
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.398, c_hat[0] = -0.734
c_state[0] = -0.127, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001
Gradient do_[0] = -0.008
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006
Gradient do_[0] = -0.019
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.011
Gradient do_[0] = -0.021
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.431, c_hat[0] = -0.641
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.415, c_hat[0] = -0.646
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.409, c_hat[0] = -0.649
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.698, o_gate[0] = 0.378, c_hat[0] = -0.661
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.398, c_hat[0] = -0.734
c_state[0] = -0.128, h_state[0] = -0.051
Backward Time Step 4:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001
Gradient do_[0] = -0.008
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006
Gradient do_[0] = -0.019
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.034, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.011
Gradient do_[0] = -0.021
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.431, c_hat[0] = -0.641
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.415, c_hat[0] = -0.647
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.409, c_hat[0] = -0.649
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.698, o_gate[0] = 0.378, c_hat[0] = -0.661
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.398, c_hat[0] = -0.734
c_state[0] = -0.128, h_state[0] = -0.050
Backward Time Step 4:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001
Gradient do_[0] = -0.008
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006
Gradient do_[0] = -0.019
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.034, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.011
Gradient do_[0] = -0.021
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.431, c_hat[0] = -0.641
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.415, c_hat[0] = -0.647
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.409, c_hat[0] = -0.649
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.698, o_gate[0] = 0.378, c_hat[0] = -0.661
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.398, c_hat[0] = -0.734
c_state[0] = -0.128, h_state[0] = -0.050
Backward Time Step 4:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001
Gradient do_[0] = -0.008
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006
Gradient do_[0] = -0.019
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.034, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.011
Gradient do_[0] = -0.021
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.431, c_hat[0] = -0.642
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.415, c_hat[0] = -0.647
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.409, c_hat[0] = -0.649
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.698, o_gate[0] = 0.378, c_hat[0] = -0.661
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.398, c_hat[0] = -0.735
c_state[0] = -0.128, h_state[0] = -0.050
Backward Time Step 4:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001
Gradient do_[0] = -0.008
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006
Gradient do_[0] = -0.018
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.034, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.011
Gradient do_[0] = -0.021
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.430, c_hat[0] = -0.642
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.415, c_hat[0] = -0.647
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.409, c_hat[0] = -0.649
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.698, o_gate[0] = 0.378, c_hat[0] = -0.661
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.397, c_hat[0] = -0.735
c_state[0] = -0.128, h_state[0] = -0.050
Backward Time Step 4:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001
Gradient do_[0] = -0.008
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006
Gradient do_[0] = -0.018
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.034, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.011
Gradient do_[0] = -0.021
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.430, c_hat[0] = -0.642
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.414, c_hat[0] = -0.647
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.408, c_hat[0] = -0.650
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.698, o_gate[0] = 0.377, c_hat[0] = -0.662
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.397, c_hat[0] = -0.735
c_state[0] = -0.128, h_state[0] = -0.050
Backward Time Step 4:
Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001
Gradient do_[0] = -0.009
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001
Gradient do_[0] = -0.008
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006
Gradient do_[0] = -0.018
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.034, dc_hat[0] = 0.010
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.011
Gradient do_[0] = -0.021
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.430, c_hat[0] = -0.642
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.670, o_gate[0] = 0.414, c_hat[0] = -0.647
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.408, c_hat[0] = -0.650
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.698, o_gate[0] = 0.377, c_hat[0] = -0.662
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.397, c_hat[0] = -0.735
c_state[0] = -0.128, h_state[0] = -0.050
Backward Time Step 4:
Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001
Gradient do_[0] = -0.008
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001
Gradient do_[0] = -0.008
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006
Gradient do_[0] = -0.018
Backward Time Step 1:
Gradient di[0] = -0.010, df[0] = 0.034, dc_hat[0] = 0.009
Gradient do_[0] = -0.029
Backward Time Step 0:
Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.011
Gradient do_[0] = -0.021
Epoch 300, Train Loss=0.013052, Weight Norm=8.722285
Sample Predictions at Epoch 300:
Day 192 (2024-10-11) => Predicted: 58.728, Actual: 63.870, Error: 5.14
Day 193 (2024-10-14) => Predicted: 59.362, Actual: 66.550, Error: 7.19
Day 194 (2024-10-15) => Predicted: 59.616, Actual: 66.000, Error: 6.38
Day 195 (2024-10-16) => Predicted: 59.759, Actual: 67.200, Error: 7.44
Day 196 (2024-10-17) => Predicted: 59.704, Actual: 66.760, Error: 7.06
Time Step 0:
i_gate[0] = 0.063, f_gate[0] = 0.640, o_gate[0] = 0.430, c_hat[0] = -0.642
c_state[0] = -0.040, h_state[0] = -0.017
Time Step 1:
i_gate[0] = 0.071, f_gate[0] = 0.670, o_gate[0] = 0.414, c_hat[0] = -0.648
c_state[0] = -0.073, h_state[0] = -0.030
Time Step 2:
i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.408, c_hat[0] = -0.650
c_state[0] = -0.096, h_state[0] = -0.039
Time Step 3:
i_gate[0] = 0.070, f_gate[0] = 0.698, o_gate[0] = 0.377, c_hat[0] = -0.662
c_state[0] = -0.113, h_state[0] = -0.043
Time Step 4:
i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.397, c_hat[0] = -0.735
c_state[0] = -0.128, h_state[0] = -0.050
Validation (last 30 days):
Day 197 (2024-10-18) => Predict=60.821, Actual=61.520, Error=0.70
Day 198 (2024-10-21) => Predict=59.374, Actual=60.680, Error=1.31
Day 199 (2024-10-22) => Predict=59.129, Actual=61.020, Error=1.89
Day 200 (2024-10-23) => Predict=58.994, Actual=58.630, Error=0.36
Day 201 (2024-10-24) => Predict=59.008, Actual=59.050, Error=0.04
Day 202 (2024-10-25) => Predict=57.945, Actual=59.180, Error=1.24
Day 203 (2024-10-28) => Predict=58.681, Actual=57.290, Error=1.39
Day 204 (2024-10-29) => Predict=58.279, Actual=55.640, Error=2.64
Day 205 (2024-10-30) => Predict=57.079, Actual=59.830, Error=2.75
Day 206 (2024-10-31) => Predict=56.069, Actual=60.010, Error=3.94
Day 207 (2024-11-01) => Predict=54.329, Actual=60.490, Error=6.16
Day 208 (2024-11-04) => Predict=53.864, Actual=59.740, Error=5.88
Day 209 (2024-11-05) => Predict=53.572, Actual=58.020, Error=4.45
Day 210 (2024-11-06) => Predict=53.235, Actual=56.460, Error=3.22
Day 211 (2024-11-07) => Predict=52.301, Actual=56.340, Error=4.04
Day 212 (2024-11-08) => Predict=51.879, Actual=56.420, Error=4.54
Day 213 (2024-11-11) => Predict=51.576, Actual=59.760, Error=8.18
Day 214 (2024-11-12) => Predict=51.059, Actual=61.610, Error=10.55
Day 215 (2024-11-13) => Predict=50.151, Actual=60.580, Error=10.43
Day 216 (2024-11-14) => Predict=49.912, Actual=61.620, Error=11.71
Day 217 (2024-11-15) => Predict=48.988, Actual=59.920, Error=10.93
Day 218 (2024-11-18) => Predict=47.550, Actual=57.390, Error=9.84
Day 219 (2024-11-19) => Predict=47.246, Actual=61.190, Error=13.94
Day 220 (2024-11-20) => Predict=46.853, Actual=62.950, Error=16.10
Day 221 (2024-11-21) => Predict=47.210, Actual=64.170, Error=16.96
Day 222 (2024-11-22) => Predict=47.339, Actual=63.000, Error=15.66
Day 223 (2024-11-25) => Predict=47.516, Actual=65.060, Error=17.54
Day 224 (2024-11-26) => Predict=50.052, Actual=63.680, Error=13.63
Day 225 (2024-11-27) => Predict=51.703, Actual=63.680, Error=11.98
Total valid daily bars used: 227
First day: 2024-01-08 O=59.23 H=60.68 L=58.82 C=59.64 V=124629
Last day: 2024-11-29 O=64.45 H=64.45 L=63.00 C=63.77 V=62082
Target Min: 40.86, Target Max: 74.47
Normalized Targets (First 5 Samples):
Sample 0: 0.933
Sample 1: 0.930
Sample 2: 0.965
Sample 3: 1.000
Sample 4: 0.534
Time Step 0:
i_gate[0] = 0.539, f_gate[0] = 0.688, o_gate[0] = 0.411, c_hat[0] = 0.666
c_state[0] = 0.359, h_state[0] = 0.141
Time Step 1:
i_gate[0] = 0.484, f_gate[0] = 0.667, o_gate[0] = 0.398, c_hat[0] = 0.726
c_state[0] = 0.591, h_state[0] = 0.211
Time Step 2:
i_gate[0] = 0.451, f_gate[0] = 0.655, o_gate[0] = 0.395, c_hat[0] = 0.737
c_state[0] = 0.720, h_state[0] = 0.243
Time Step 3:
i_gate[0] = 0.422, f_gate[0] = 0.639, o_gate[0] = 0.400, c_hat[0] = 0.799
c_state[0] = 0.796, h_state[0] = 0.265
Time Step 4:
i_gate[0] = 0.398, f_gate[0] = 0.636, o_gate[0] = 0.410, c_hat[0] = 0.794
c_state[0] = 0.823, h_state[0] = 0.277
Backward Time Step 4:
Gradient di[0] = 0.031, df[0] = 0.024, dc_hat[0] = 0.024
Gradient do_[0] = 0.306
Backward Time Step 3:
Gradient di[0] = 0.033, df[0] = 0.025, dc_hat[0] = 0.026
Gradient do_[0] = 0.347
Backward Time Step 2:
Gradient di[0] = 0.057, df[0] = 0.046, dc_hat[0] = 0.064
Gradient do_[0] = 0.476
Backward Time Step 1:
Gradient di[0] = 0.075, df[0] = 0.061, dc_hat[0] = 0.095
Gradient do_[0] = 0.504
Backward Time Step 0:
Gradient di[0] = 0.085, df[0] = 0.076, dc_hat[0] = 0.155
Gradient do_[0] = 0.320
Epoch 1, Train Loss=0.036383, Weight Norm=12.005491
Sample Predictions at Epoch 1:
-------------------------------------------------------------
| Day | Date | Predicted Close | Actual Close | Error |
-------------------------------------------------------------
| 192 | 2024-10-11 | 49.44 | 63.87 | 14.43 |
| 193 | 2024-10-14 | 49.80 | 66.55 | 16.75 |
| 194 | 2024-10-15 | 49.96 | 66.00 | 16.04 |
| 195 | 2024-10-16 | 49.04 | 67.20 | 18.16 |
| 196 | 2024-10-17 | 49.31 | 66.76 | 17.45 |
-------------------------------------------------------------
Time Step 0:
i_gate[0] = 0.538, f_gate[0] = 0.686, o_gate[0] = 0.399, c_hat[0] = 0.658
c_state[0] = 0.354, h_state[0] = 0.135
Time Step 1:
i_gate[0] = 0.482, f_gate[0] = 0.665, o_gate[0] = 0.383, c_hat[0] = 0.718
c_state[0] = 0.581, h_state[0] = 0.201
Time Step 2:
i_gate[0] = 0.449, f_gate[0] = 0.652, o_gate[0] = 0.379, c_hat[0] = 0.730
c_state[0] = 0.707, h_state[0] = 0.231
Time Step 3:
i_gate[0] = 0.419, f_gate[0] = 0.635, o_gate[0] = 0.383, c_hat[0] = 0.794
c_state[0] = 0.782, h_state[0] = 0.250
Time Step 4:
i_gate[0] = 0.396, f_gate[0] = 0.632, o_gate[0] = 0.391, c_hat[0] = 0.790
c_state[0] = 0.806, h_state[0] = 0.261
Backward Time Step 4:
Gradient di[0] = 0.027, df[0] = 0.021, dc_hat[0] = 0.021
Gradient do_[0] = 0.267
Backward Time Step 3:
Gradient di[0] = 0.029, df[0] = 0.022, dc_hat[0] = 0.023
Gradient do_[0] = 0.308
Backward Time Step 2:
Gradient di[0] = 0.051, df[0] = 0.042, dc_hat[0] = 0.060
Gradient do_[0] = 0.439
Backward Time Step 1:
Gradient di[0] = 0.069, df[0] = 0.057, dc_hat[0] = 0.090
Gradient do_[0] = 0.474
Backward Time Step 0:
Gradient di[0] = 0.080, df[0] = 0.072, dc_hat[0] = 0.148
Gradient do_[0] = 0.304
Time Step 0:
i_gate[0] = 0.536, f_gate[0] = 0.685, o_gate[0] = 0.387, c_hat[0] = 0.652
c_state[0] = 0.349, h_state[0] = 0.130
Time Step 1:
i_gate[0] = 0.480, f_gate[0] = 0.664, o_gate[0] = 0.370, c_hat[0] = 0.712
c_state[0] = 0.574, h_state[0] = 0.191
Time Step 2:
i_gate[0] = 0.447, f_gate[0] = 0.650, o_gate[0] = 0.364, c_hat[0] = 0.725
c_state[0] = 0.697, h_state[0] = 0.219
Time Step 3:
i_gate[0] = 0.417, f_gate[0] = 0.633, o_gate[0] = 0.367, c_hat[0] = 0.791
c_state[0] = 0.771, h_state[0] = 0.237
Time Step 4:
i_gate[0] = 0.393, f_gate[0] = 0.629, o_gate[0] = 0.373, c_hat[0] = 0.787
c_state[0] = 0.794, h_state[0] = 0.246
Backward Time Step 4:
Gradient di[0] = 0.023, df[0] = 0.018, dc_hat[0] = 0.018
Gradient do_[0] = 0.234
Backward Time Step 3:
Gradient di[0] = 0.025, df[0] = 0.019, dc_hat[0] = 0.020
Gradient do_[0] = 0.275
Backward Time Step 2:
Gradient di[0] = 0.047, df[0] = 0.039, dc_hat[0] = 0.055
Gradient do_[0] = 0.407
Backward Time Step 1:
Gradient di[0] = 0.064, df[0] = 0.053, dc_hat[0] = 0.084
Gradient do_[0] = 0.448
Backward Time Step 0:
Gradient di[0] = 0.074, df[0] = 0.068, dc_hat[0] = 0.141
Gradient do_[0] = 0.291
Time Step 0:
i_gate[0] = 0.535, f_gate[0] = 0.684, o_gate[0] = 0.375, c_hat[0] = 0.648
c_state[0] = 0.346, h_state[0] = 0.125
Time Step 1:
i_gate[0] = 0.478, f_gate[0] = 0.662, o_gate[0] = 0.356, c_hat[0] = 0.709
c_state[0] = 0.568, h_state[0] = 0.183
Time Step 2:
i_gate[0] = 0.445, f_gate[0] = 0.648, o_gate[0] = 0.349, c_hat[0] = 0.722
c_state[0] = 0.690, h_state[0] = 0.209
Time Step 3:
i_gate[0] = 0.415, f_gate[0] = 0.631, o_gate[0] = 0.351, c_hat[0] = 0.789
c_state[0] = 0.763, h_state[0] = 0.225
Time Step 4:
i_gate[0] = 0.392, f_gate[0] = 0.626, o_gate[0] = 0.356, c_hat[0] = 0.785
c_state[0] = 0.785, h_state[0] = 0.233
Backward Time Step 4:
Gradient di[0] = 0.020, df[0] = 0.015, dc_hat[0] = 0.016
Gradient do_[0] = 0.206
Backward Time Step 3:
Gradient di[0] = 0.022, df[0] = 0.017, dc_hat[0] = 0.018
Gradient do_[0] = 0.246
Backward Time Step 2:
Gradient di[0] = 0.043, df[0] = 0.035, dc_hat[0] = 0.051
Gradient do_[0] = 0.381
Backward Time Step 1:
Gradient di[0] = 0.059, df[0] = 0.049, dc_hat[0] = 0.079
Gradient do_[0] = 0.426
Backward Time Step 0:
Gradient di[0] = 0.070, df[0] = 0.064, dc_hat[0] = 0.134
Gradient do_[0] = 0.280
Time Step 0:
i_gate[0] = 0.534, f_gate[0] = 0.683, o_gate[0] = 0.364, c_hat[0] = 0.645
c_state[0] = 0.344, h_state[0] = 0.121
Time Step 1:
i_gate[0] = 0.476, f_gate[0] = 0.661, o_gate[0] = 0.343, c_hat[0] = 0.707
c_state[0] = 0.565, h_state[0] = 0.175
Time Step 2:
i_gate[0] = 0.444, f_gate[0] = 0.647, o_gate[0] = 0.335, c_hat[0] = 0.721
c_state[0] = 0.685, h_state[0] = 0.199
Time Step 3:
i_gate[0] = 0.414, f_gate[0] = 0.629, o_gate[0] = 0.336, c_hat[0] = 0.789
c_state[0] = 0.757, h_state[0] = 0.215
Time Step 4:
i_gate[0] = 0.390, f_gate[0] = 0.624, o_gate[0] = 0.340, c_hat[0] = 0.786
c_state[0] = 0.779, h_state[0] = 0.221
Backward Time Step 4:
Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.014
Gradient do_[0] = 0.184
Backward Time Step 3:
Gradient di[0] = 0.019, df[0] = 0.015, dc_hat[0] = 0.016
Gradient do_[0] = 0.223
Backward Time Step 2:
Gradient di[0] = 0.039, df[0] = 0.032, dc_hat[0] = 0.047
Gradient do_[0] = 0.359
Backward Time Step 1:
Gradient di[0] = 0.055, df[0] = 0.046, dc_hat[0] = 0.074
Gradient do_[0] = 0.408
Backward Time Step 0:
Gradient di[0] = 0.066, df[0] = 0.061, dc_hat[0] = 0.128
Gradient do_[0] = 0.271
Time Step 0:
i_gate[0] = 0.533, f_gate[0] = 0.683, o_gate[0] = 0.353, c_hat[0] = 0.644
c_state[0] = 0.343, h_state[0] = 0.117
Time Step 1:
i_gate[0] = 0.475, f_gate[0] = 0.661, o_gate[0] = 0.331, c_hat[0] = 0.706
c_state[0] = 0.562, h_state[0] = 0.169
Time Step 2:
i_gate[0] = 0.442, f_gate[0] = 0.646, o_gate[0] = 0.322, c_hat[0] = 0.722
c_state[0] = 0.682, h_state[0] = 0.191
Time Step 3:
i_gate[0] = 0.412, f_gate[0] = 0.628, o_gate[0] = 0.322, c_hat[0] = 0.790
c_state[0] = 0.754, h_state[0] = 0.205
Time Step 4:
i_gate[0] = 0.388, f_gate[0] = 0.623, o_gate[0] = 0.324, c_hat[0] = 0.787
c_state[0] = 0.775, h_state[0] = 0.211
Backward Time Step 4:
Gradient di[0] = 0.015, df[0] = 0.012, dc_hat[0] = 0.012
Gradient do_[0] = 0.166
Backward Time Step 3:
Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.014
Gradient do_[0] = 0.204
Backward Time Step 2:
Gradient di[0] = 0.036, df[0] = 0.030, dc_hat[0] = 0.043
Gradient do_[0] = 0.342
Backward Time Step 1:
Gradient di[0] = 0.051, df[0] = 0.043, dc_hat[0] = 0.069
Gradient do_[0] = 0.394
Backward Time Step 0:
Gradient di[0] = 0.062, df[0] = 0.057, dc_hat[0] = 0.121
Gradient do_[0] = 0.264
Time Step 0:
i_gate[0] = 0.533, f_gate[0] = 0.683, o_gate[0] = 0.343, c_hat[0] = 0.644
c_state[0] = 0.343, h_state[0] = 0.113
Time Step 1:
i_gate[0] = 0.474, f_gate[0] = 0.660, o_gate[0] = 0.320, c_hat[0] = 0.707
c_state[0] = 0.562, h_state[0] = 0.163
Time Step 2:
i_gate[0] = 0.441, f_gate[0] = 0.645, o_gate[0] = 0.310, c_hat[0] = 0.723
c_state[0] = 0.681, h_state[0] = 0.184
Time Step 3:
i_gate[0] = 0.411, f_gate[0] = 0.627, o_gate[0] = 0.309, c_hat[0] = 0.791
c_state[0] = 0.752, h_state[0] = 0.197
Time Step 4:
i_gate[0] = 0.387, f_gate[0] = 0.621, o_gate[0] = 0.311, c_hat[0] = 0.789
c_state[0] = 0.773, h_state[0] = 0.202
Backward Time Step 4:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.011
Gradient do_[0] = 0.154
Backward Time Step 3:
Gradient di[0] = 0.015, df[0] = 0.012, dc_hat[0] = 0.012
Gradient do_[0] = 0.191
Backward Time Step 2:
Gradient di[0] = 0.034, df[0] = 0.028, dc_hat[0] = 0.040
Gradient do_[0] = 0.330
Backward Time Step 1:
Gradient di[0] = 0.049, df[0] = 0.041, dc_hat[0] = 0.065
Gradient do_[0] = 0.384
Backward Time Step 0:
Gradient di[0] = 0.060, df[0] = 0.055, dc_hat[0] = 0.116
Gradient do_[0] = 0.260
Time Step 0:
i_gate[0] = 0.532, f_gate[0] = 0.682, o_gate[0] = 0.334, c_hat[0] = 0.645
c_state[0] = 0.343, h_state[0] = 0.110
Time Step 1:
i_gate[0] = 0.474, f_gate[0] = 0.660, o_gate[0] = 0.310, c_hat[0] = 0.708
c_state[0] = 0.562, h_state[0] = 0.158
Time Step 2:
i_gate[0] = 0.441, f_gate[0] = 0.644, o_gate[0] = 0.300, c_hat[0] = 0.725
c_state[0] = 0.681, h_state[0] = 0.177
Time Step 3:
i_gate[0] = 0.410, f_gate[0] = 0.626, o_gate[0] = 0.298, c_hat[0] = 0.793
c_state[0] = 0.752, h_state[0] = 0.190
Time Step 4:
i_gate[0] = 0.386, f_gate[0] = 0.620, o_gate[0] = 0.300, c_hat[0] = 0.791
c_state[0] = 0.772, h_state[0] = 0.194
Backward Time Step 4:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.010
Gradient do_[0] = 0.147
Backward Time Step 3:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.011
Gradient do_[0] = 0.183
Backward Time Step 2:
Gradient di[0] = 0.032, df[0] = 0.027, dc_hat[0] = 0.038
Gradient do_[0] = 0.323
Backward Time Step 1:
Gradient di[0] = 0.046, df[0] = 0.039, dc_hat[0] = 0.062
Gradient do_[0] = 0.379
Backward Time Step 0:
Gradient di[0] = 0.058, df[0] = 0.053, dc_hat[0] = 0.111
Gradient do_[0] = 0.257
Time Step 0:
i_gate[0] = 0.532, f_gate[0] = 0.682, o_gate[0] = 0.326, c_hat[0] = 0.646
c_state[0] = 0.344, h_state[0] = 0.108
Time Step 1:
i_gate[0] = 0.473, f_gate[0] = 0.659, o_gate[0] = 0.301, c_hat[0] = 0.710
c_state[0] = 0.562, h_state[0] = 0.154
Time Step 2:
i_gate[0] = 0.440, f_gate[0] = 0.644, o_gate[0] = 0.291, c_hat[0] = 0.727
c_state[0] = 0.682, h_state[0] = 0.172
Time Step 3:
i_gate[0] = 0.409, f_gate[0] = 0.625, o_gate[0] = 0.289, c_hat[0] = 0.796
c_state[0] = 0.752, h_state[0] = 0.184
Time Step 4:
i_gate[0] = 0.385, f_gate[0] = 0.619, o_gate[0] = 0.290, c_hat[0] = 0.794
c_state[0] = 0.772, h_state[0] = 0.188
Backward Time Step 4:
Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.009
Gradient do_[0] = 0.143
Backward Time Step 3:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.011
Gradient do_[0] = 0.179
Backward Time Step 2:
Gradient di[0] = 0.031, df[0] = 0.026, dc_hat[0] = 0.036
Gradient do_[0] = 0.319
Backward Time Step 1:
Gradient di[0] = 0.045, df[0] = 0.038, dc_hat[0] = 0.060
Gradient do_[0] = 0.376
Backward Time Step 0:
Gradient di[0] = 0.056, df[0] = 0.052, dc_hat[0] = 0.108
Gradient do_[0] = 0.256
Time Step 0:
i_gate[0] = 0.531, f_gate[0] = 0.682, o_gate[0] = 0.319, c_hat[0] = 0.648
c_state[0] = 0.344, h_state[0] = 0.106
Time Step 1:
i_gate[0] = 0.473, f_gate[0] = 0.659, o_gate[0] = 0.294, c_hat[0] = 0.712
c_state[0] = 0.564, h_state[0] = 0.150
Time Step 2:
i_gate[0] = 0.439, f_gate[0] = 0.643, o_gate[0] = 0.283, c_hat[0] = 0.730
c_state[0] = 0.683, h_state[0] = 0.168
Time Step 3:
i_gate[0] = 0.409, f_gate[0] = 0.624, o_gate[0] = 0.280, c_hat[0] = 0.798
c_state[0] = 0.752, h_state[0] = 0.179
Time Step 4:
i_gate[0] = 0.384, f_gate[0] = 0.619, o_gate[0] = 0.281, c_hat[0] = 0.797
c_state[0] = 0.772, h_state[0] = 0.182
Backward Time Step 4:
Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.009
Gradient do_[0] = 0.141
Backward Time Step 3:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.010
Gradient do_[0] = 0.177
Backward Time Step 2:
Gradient di[0] = 0.030, df[0] = 0.025, dc_hat[0] = 0.034
Gradient do_[0] = 0.318
Backward Time Step 1:
Gradient di[0] = 0.044, df[0] = 0.036, dc_hat[0] = 0.057
Gradient do_[0] = 0.374
Backward Time Step 0:
Gradient di[0] = 0.055, df[0] = 0.050, dc_hat[0] = 0.105
Gradient do_[0] = 0.256
Time Step 0:
i_gate[0] = 0.531, f_gate[0] = 0.683, o_gate[0] = 0.312, c_hat[0] = 0.651
c_state[0] = 0.345, h_state[0] = 0.104
Time Step 1:
i_gate[0] = 0.472, f_gate[0] = 0.659, o_gate[0] = 0.286, c_hat[0] = 0.715
c_state[0] = 0.565, h_state[0] = 0.147
Time Step 2:
i_gate[0] = 0.438, f_gate[0] = 0.643, o_gate[0] = 0.275, c_hat[0] = 0.733
c_state[0] = 0.684, h_state[0] = 0.164
Time Step 3:
i_gate[0] = 0.408, f_gate[0] = 0.624, o_gate[0] = 0.273, c_hat[0] = 0.801
c_state[0] = 0.753, h_state[0] = 0.174
Time Step 4:
i_gate[0] = 0.383, f_gate[0] = 0.618, o_gate[0] = 0.272, c_hat[0] = 0.800
c_state[0] = 0.772, h_state[0] = 0.177
Backward Time Step 4:
Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.008
Gradient do_[0] = 0.140
Backward Time Step 3:
Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.009
Gradient do_[0] = 0.175
Backward Time Step 2:
Gradient di[0] = 0.029, df[0] = 0.024, dc_hat[0] = 0.033
Gradient do_[0] = 0.317
Backward Time Step 1:
Gradient di[0] = 0.043, df[0] = 0.035, dc_hat[0] = 0.055
Gradient do_[0] = 0.374
Backward Time Step 0:
Gradient di[0] = 0.054, df[0] = 0.049, dc_hat[0] = 0.102
Gradient do_[0] = 0.257
Time Step 0:
i_gate[0] = 0.531, f_gate[0] = 0.683, o_gate[0] = 0.305, c_hat[0] = 0.653
c_state[0] = 0.347, h_state[0] = 0.102
Time Step 1:
i_gate[0] = 0.471, f_gate[0] = 0.659, o_gate[0] = 0.279, c_hat[0] = 0.717
c_state[0] = 0.567, h_state[0] = 0.143
Time Step 2:
i_gate[0] = 0.437, f_gate[0] = 0.643, o_gate[0] = 0.268, c_hat[0] = 0.736
c_state[0] = 0.686, h_state[0] = 0.159
Time Step 3:
i_gate[0] = 0.407, f_gate[0] = 0.623, o_gate[0] = 0.265, c_hat[0] = 0.804
c_state[0] = 0.755, h_state[0] = 0.169
Time Step 4:
i_gate[0] = 0.382, f_gate[0] = 0.617, o_gate[0] = 0.264, c_hat[0] = 0.803
c_state[0] = 0.773, h_state[0] = 0.171
Backward Time Step 4:
Gradient di[0] = 0.011, df[0] = 0.008, dc_hat[0] = 0.008
Gradient do_[0] = 0.139
Backward Time Step 3:
Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.009
Gradient do_[0] = 0.174
Backward Time Step 2:
Gradient di[0] = 0.028, df[0] = 0.023, dc_hat[0] = 0.031
Gradient do_[0] = 0.317
Backward Time Step 1:
Gradient di[0] = 0.041, df[0] = 0.034, dc_hat[0] = 0.053
Gradient do_[0] = 0.374
Backward Time Step 0:
Gradient di[0] = 0.053, df[0] = 0.048, dc_hat[0] = 0.099
Gradient do_[0] = 0.258
Time Step 0:
i_gate[0] = 0.530, f_gate[0] = 0.683, o_gate[0] = 0.299, c_hat[0] = 0.656
c_state[0] = 0.348, h_state[0] = 0.100
Time Step 1:
i_gate[0] = 0.471, f_gate[0] = 0.659, o_gate[0] = 0.272, c_hat[0] = 0.720
c_state[0] = 0.568, h_state[0] = 0.140
Time Step 2:
i_gate[0] = 0.437, f_gate[0] = 0.642, o_gate[0] = 0.260, c_hat[0] = 0.739
c_state[0] = 0.688, h_state[0] = 0.155
Time Step 3:
i_gate[0] = 0.406, f_gate[0] = 0.623, o_gate[0] = 0.257, c_hat[0] = 0.807
c_state[0] = 0.756, h_state[0] = 0.164
Time Step 4:
i_gate[0] = 0.381, f_gate[0] = 0.616, o_gate[0] = 0.256, c_hat[0] = 0.806
c_state[0] = 0.773, h_state[0] = 0.166
Backward Time Step 4:
Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.007
Gradient do_[0] = 0.138
Backward Time Step 3:
Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.008
Gradient do_[0] = 0.173
Backward Time Step 2:
Gradient di[0] = 0.028, df[0] = 0.022, dc_hat[0] = 0.030
Gradient do_[0] = 0.316
Backward Time Step 1:
Gradient di[0] = 0.040, df[0] = 0.033, dc_hat[0] = 0.051
Gradient do_[0] = 0.374
Backward Time Step 0:
Gradient di[0] = 0.052, df[0] = 0.047, dc_hat[0] = 0.096
Gradient do_[0] = 0.259
Time Step 0:
i_gate[0] = 0.530, f_gate[0] = 0.683, o_gate[0] = 0.292, c_hat[0] = 0.659
c_state[0] = 0.349, h_state[0] = 0.098
Time Step 1:
i_gate[0] = 0.470, f_gate[0] = 0.659, o_gate[0] = 0.265, c_hat[0] = 0.723
c_state[0] = 0.570, h_state[0] = 0.137
Time Step 2:
i_gate[0] = 0.436, f_gate[0] = 0.642, o_gate[0] = 0.253, c_hat[0] = 0.742
c_state[0] = 0.690, h_state[0] = 0.151
Time Step 3:
i_gate[0] = 0.405, f_gate[0] = 0.622, o_gate[0] = 0.249, c_hat[0] = 0.810
c_state[0] = 0.757, h_state[0] = 0.159
Time Step 4:
i_gate[0] = 0.380, f_gate[0] = 0.615, o_gate[0] = 0.247, c_hat[0] = 0.810
c_state[0] = 0.774, h_state[0] = 0.161
Backward Time Step 4:
Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.007
Gradient do_[0] = 0.137
Backward Time Step 3:
Gradient di[0] = 0.011, df[0] = 0.008, dc_hat[0] = 0.008
Gradient do_[0] = 0.172
Backward Time Step 2:
Gradient di[0] = 0.027, df[0] = 0.022, dc_hat[0] = 0.029
Gradient do_[0] = 0.316
Backward Time Step 1:
Gradient di[0] = 0.039, df[0] = 0.032, dc_hat[0] = 0.049
Gradient do_[0] = 0.374
Backward Time Step 0:
Gradient di[0] = 0.051, df[0] = 0.046, dc_hat[0] = 0.093
Gradient do_[0] = 0.260
Time Step 0:
i_gate[0] = 0.530, f_gate[0] = 0.683, o_gate[0] = 0.285, c_hat[0] = 0.662
c_state[0] = 0.350, h_state[0] = 0.096
Time Step 1:
i_gate[0] = 0.470, f_gate[0] = 0.659, o_gate[0] = 0.258, c_hat[0] = 0.726
c_state[0] = 0.572, h_state[0] = 0.133
Time Step 2:
i_gate[0] = 0.435, f_gate[0] = 0.642, o_gate[0] = 0.245, c_hat[0] = 0.746
c_state[0] = 0.691, h_state[0] = 0.147
Time Step 3:
i_gate[0] = 0.404, f_gate[0] = 0.622, o_gate[0] = 0.241, c_hat[0] = 0.812
c_state[0] = 0.758, h_state[0] = 0.154
Time Step 4:
i_gate[0] = 0.379, f_gate[0] = 0.615, o_gate[0] = 0.239, c_hat[0] = 0.813
c_state[0] = 0.774, h_state[0] = 0.155
Backward Time Step 4:
Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.006
Gradient do_[0] = 0.135
Backward Time Step 3:
Gradient di[0] = 0.011, df[0] = 0.008, dc_hat[0] = 0.007
Gradient do_[0] = 0.170
Backward Time Step 2:
Gradient di[0] = 0.026, df[0] = 0.021, dc_hat[0] = 0.027
Gradient do_[0] = 0.314
Backward Time Step 1:
Gradient di[0] = 0.038, df[0] = 0.031, dc_hat[0] = 0.047
Gradient do_[0] = 0.374
Backward Time Step 0:
Gradient di[0] = 0.050, df[0] = 0.045, dc_hat[0] = 0.090
Gradient do_[0] = 0.260
Time Step 0:
i_gate[0] = 0.529, f_gate[0] = 0.683, o_gate[0] = 0.278, c_hat[0] = 0.664
c_state[0] = 0.351, h_state[0] = 0.094
Time Step 1:
i_gate[0] = 0.469, f_gate[0] = 0.659, o_gate[0] = 0.250, c_hat[0] = 0.729
c_state[0] = 0.573, h_state[0] = 0.130
Time Step 2:
i_gate[0] = 0.434, f_gate[0] = 0.641, o_gate[0] = 0.237, c_hat[0] = 0.749
c_state[0] = 0.693, h_state[0] = 0.142
Time Step 3:
i_gate[0] = 0.403, f_gate[0] = 0.621, o_gate[0] = 0.233, c_hat[0] = 0.815
c_state[0] = 0.759, h_state[0] = 0.149
Time Step 4:
i_gate[0] = 0.378, f_gate[0] = 0.614, o_gate[0] = 0.230, c_hat[0] = 0.816
c_state[0] = 0.774, h_state[0] = 0.150
Backward Time Step 4:
Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.006
Gradient do_[0] = 0.133
Backward Time Step 3:
Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.007
Gradient do_[0] = 0.167
Backward Time Step 2:
Gradient di[0] = 0.025, df[0] = 0.020, dc_hat[0] = 0.026
Gradient do_[0] = 0.313
Backward Time Step 1:
Gradient di[0] = 0.037, df[0] = 0.030, dc_hat[0] = 0.045
Gradient do_[0] = 0.373
Backward Time Step 0:
Gradient di[0] = 0.049, df[0] = 0.044, dc_hat[0] = 0.087
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.529, f_gate[0] = 0.683, o_gate[0] = 0.271, c_hat[0] = 0.667
c_state[0] = 0.353, h_state[0] = 0.092
Time Step 1:
i_gate[0] = 0.468, f_gate[0] = 0.659, o_gate[0] = 0.243, c_hat[0] = 0.731
c_state[0] = 0.575, h_state[0] = 0.126
Time Step 2:
i_gate[0] = 0.433, f_gate[0] = 0.641, o_gate[0] = 0.230, c_hat[0] = 0.752
c_state[0] = 0.694, h_state[0] = 0.138
Time Step 3:
i_gate[0] = 0.402, f_gate[0] = 0.621, o_gate[0] = 0.225, c_hat[0] = 0.818
c_state[0] = 0.760, h_state[0] = 0.144
Time Step 4:
i_gate[0] = 0.377, f_gate[0] = 0.613, o_gate[0] = 0.222, c_hat[0] = 0.819
c_state[0] = 0.774, h_state[0] = 0.144
Backward Time Step 4:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005
Gradient do_[0] = 0.130
Backward Time Step 3:
Gradient di[0] = 0.010, df[0] = 0.007, dc_hat[0] = 0.007
Gradient do_[0] = 0.164
Backward Time Step 2:
Gradient di[0] = 0.024, df[0] = 0.019, dc_hat[0] = 0.025
Gradient do_[0] = 0.311
Backward Time Step 1:
Gradient di[0] = 0.036, df[0] = 0.029, dc_hat[0] = 0.043
Gradient do_[0] = 0.372
Backward Time Step 0:
Gradient di[0] = 0.048, df[0] = 0.042, dc_hat[0] = 0.084
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.529, f_gate[0] = 0.684, o_gate[0] = 0.264, c_hat[0] = 0.669
c_state[0] = 0.354, h_state[0] = 0.090
Time Step 1:
i_gate[0] = 0.468, f_gate[0] = 0.659, o_gate[0] = 0.236, c_hat[0] = 0.734
c_state[0] = 0.576, h_state[0] = 0.123
Time Step 2:
i_gate[0] = 0.432, f_gate[0] = 0.641, o_gate[0] = 0.222, c_hat[0] = 0.755
c_state[0] = 0.696, h_state[0] = 0.134
Time Step 3:
i_gate[0] = 0.401, f_gate[0] = 0.620, o_gate[0] = 0.217, c_hat[0] = 0.821
c_state[0] = 0.760, h_state[0] = 0.139
Time Step 4:
i_gate[0] = 0.376, f_gate[0] = 0.612, o_gate[0] = 0.214, c_hat[0] = 0.822
c_state[0] = 0.775, h_state[0] = 0.139
Backward Time Step 4:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005
Gradient do_[0] = 0.127
Backward Time Step 3:
Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.006
Gradient do_[0] = 0.161
Backward Time Step 2:
Gradient di[0] = 0.023, df[0] = 0.018, dc_hat[0] = 0.023
Gradient do_[0] = 0.309
Backward Time Step 1:
Gradient di[0] = 0.035, df[0] = 0.028, dc_hat[0] = 0.041
Gradient do_[0] = 0.371
Backward Time Step 0:
Gradient di[0] = 0.047, df[0] = 0.041, dc_hat[0] = 0.081
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.528, f_gate[0] = 0.684, o_gate[0] = 0.257, c_hat[0] = 0.671
c_state[0] = 0.355, h_state[0] = 0.088
Time Step 1:
i_gate[0] = 0.467, f_gate[0] = 0.659, o_gate[0] = 0.229, c_hat[0] = 0.736
c_state[0] = 0.578, h_state[0] = 0.119
Time Step 2:
i_gate[0] = 0.432, f_gate[0] = 0.640, o_gate[0] = 0.215, c_hat[0] = 0.757
c_state[0] = 0.697, h_state[0] = 0.129
Time Step 3:
i_gate[0] = 0.400, f_gate[0] = 0.620, o_gate[0] = 0.209, c_hat[0] = 0.823
c_state[0] = 0.761, h_state[0] = 0.134
Time Step 4:
i_gate[0] = 0.375, f_gate[0] = 0.612, o_gate[0] = 0.206, c_hat[0] = 0.825
c_state[0] = 0.774, h_state[0] = 0.133
Backward Time Step 4:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005
Gradient do_[0] = 0.124
Backward Time Step 3:
Gradient di[0] = 0.009, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.157
Backward Time Step 2:
Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.022
Gradient do_[0] = 0.307
Backward Time Step 1:
Gradient di[0] = 0.034, df[0] = 0.027, dc_hat[0] = 0.039
Gradient do_[0] = 0.369
Backward Time Step 0:
Gradient di[0] = 0.045, df[0] = 0.040, dc_hat[0] = 0.079
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.528, f_gate[0] = 0.684, o_gate[0] = 0.251, c_hat[0] = 0.674
c_state[0] = 0.356, h_state[0] = 0.086
Time Step 1:
i_gate[0] = 0.467, f_gate[0] = 0.659, o_gate[0] = 0.222, c_hat[0] = 0.738
c_state[0] = 0.579, h_state[0] = 0.116
Time Step 2:
i_gate[0] = 0.431, f_gate[0] = 0.640, o_gate[0] = 0.208, c_hat[0] = 0.760
c_state[0] = 0.698, h_state[0] = 0.125
Time Step 3:
i_gate[0] = 0.399, f_gate[0] = 0.619, o_gate[0] = 0.202, c_hat[0] = 0.825
c_state[0] = 0.761, h_state[0] = 0.130
Time Step 4:
i_gate[0] = 0.374, f_gate[0] = 0.611, o_gate[0] = 0.198, c_hat[0] = 0.828
c_state[0] = 0.774, h_state[0] = 0.129
Backward Time Step 4:
Gradient di[0] = 0.007, df[0] = 0.005, dc_hat[0] = 0.004
Gradient do_[0] = 0.121
Backward Time Step 3:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005
Gradient do_[0] = 0.154
Backward Time Step 2:
Gradient di[0] = 0.021, df[0] = 0.017, dc_hat[0] = 0.021
Gradient do_[0] = 0.304
Backward Time Step 1:
Gradient di[0] = 0.032, df[0] = 0.026, dc_hat[0] = 0.038
Gradient do_[0] = 0.368
Backward Time Step 0:
Gradient di[0] = 0.044, df[0] = 0.039, dc_hat[0] = 0.076
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.528, f_gate[0] = 0.684, o_gate[0] = 0.244, c_hat[0] = 0.676
c_state[0] = 0.356, h_state[0] = 0.084
Time Step 1:
i_gate[0] = 0.466, f_gate[0] = 0.658, o_gate[0] = 0.215, c_hat[0] = 0.741
c_state[0] = 0.580, h_state[0] = 0.113
Time Step 2:
i_gate[0] = 0.430, f_gate[0] = 0.640, o_gate[0] = 0.201, c_hat[0] = 0.763
c_state[0] = 0.699, h_state[0] = 0.121
Time Step 3:
i_gate[0] = 0.398, f_gate[0] = 0.618, o_gate[0] = 0.195, c_hat[0] = 0.828
c_state[0] = 0.762, h_state[0] = 0.125
Time Step 4:
i_gate[0] = 0.373, f_gate[0] = 0.610, o_gate[0] = 0.191, c_hat[0] = 0.830
c_state[0] = 0.774, h_state[0] = 0.124
Backward Time Step 4:
Gradient di[0] = 0.007, df[0] = 0.005, dc_hat[0] = 0.004
Gradient do_[0] = 0.118
Backward Time Step 3:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005
Gradient do_[0] = 0.151
Backward Time Step 2:
Gradient di[0] = 0.021, df[0] = 0.016, dc_hat[0] = 0.020
Gradient do_[0] = 0.302
Backward Time Step 1:
Gradient di[0] = 0.031, df[0] = 0.025, dc_hat[0] = 0.036
Gradient do_[0] = 0.366
Backward Time Step 0:
Gradient di[0] = 0.043, df[0] = 0.038, dc_hat[0] = 0.074
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.527, f_gate[0] = 0.684, o_gate[0] = 0.238, c_hat[0] = 0.677
c_state[0] = 0.357, h_state[0] = 0.082
Time Step 1:
i_gate[0] = 0.466, f_gate[0] = 0.658, o_gate[0] = 0.209, c_hat[0] = 0.743
c_state[0] = 0.581, h_state[0] = 0.109
Time Step 2:
i_gate[0] = 0.429, f_gate[0] = 0.639, o_gate[0] = 0.195, c_hat[0] = 0.765
c_state[0] = 0.700, h_state[0] = 0.118
Time Step 3:
i_gate[0] = 0.397, f_gate[0] = 0.618, o_gate[0] = 0.189, c_hat[0] = 0.830
c_state[0] = 0.762, h_state[0] = 0.121
Time Step 4:
i_gate[0] = 0.372, f_gate[0] = 0.609, o_gate[0] = 0.184, c_hat[0] = 0.833
c_state[0] = 0.774, h_state[0] = 0.119
Backward Time Step 4:
Gradient di[0] = 0.006, df[0] = 0.005, dc_hat[0] = 0.004
Gradient do_[0] = 0.115
Backward Time Step 3:
Gradient di[0] = 0.007, df[0] = 0.005, dc_hat[0] = 0.005
Gradient do_[0] = 0.148
Backward Time Step 2:
Gradient di[0] = 0.020, df[0] = 0.016, dc_hat[0] = 0.019
Gradient do_[0] = 0.299
Backward Time Step 1:
Gradient di[0] = 0.030, df[0] = 0.024, dc_hat[0] = 0.034
Gradient do_[0] = 0.365
Backward Time Step 0:
Gradient di[0] = 0.042, df[0] = 0.037, dc_hat[0] = 0.071
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.527, f_gate[0] = 0.684, o_gate[0] = 0.233, c_hat[0] = 0.679
c_state[0] = 0.358, h_state[0] = 0.080
Time Step 1:
i_gate[0] = 0.465, f_gate[0] = 0.658, o_gate[0] = 0.203, c_hat[0] = 0.745
c_state[0] = 0.582, h_state[0] = 0.107
Time Step 2:
i_gate[0] = 0.429, f_gate[0] = 0.639, o_gate[0] = 0.189, c_hat[0] = 0.767
c_state[0] = 0.701, h_state[0] = 0.114
Time Step 3:
i_gate[0] = 0.397, f_gate[0] = 0.617, o_gate[0] = 0.183, c_hat[0] = 0.832
c_state[0] = 0.763, h_state[0] = 0.117
Time Step 4:
i_gate[0] = 0.371, f_gate[0] = 0.609, o_gate[0] = 0.177, c_hat[0] = 0.835
c_state[0] = 0.774, h_state[0] = 0.115
Backward Time Step 4:
Gradient di[0] = 0.006, df[0] = 0.004, dc_hat[0] = 0.003
Gradient do_[0] = 0.113
Backward Time Step 3:
Gradient di[0] = 0.007, df[0] = 0.005, dc_hat[0] = 0.004
Gradient do_[0] = 0.145
Backward Time Step 2:
Gradient di[0] = 0.019, df[0] = 0.015, dc_hat[0] = 0.018
Gradient do_[0] = 0.297
Backward Time Step 1:
Gradient di[0] = 0.029, df[0] = 0.024, dc_hat[0] = 0.033
Gradient do_[0] = 0.364
Backward Time Step 0:
Gradient di[0] = 0.041, df[0] = 0.036, dc_hat[0] = 0.069
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.527, f_gate[0] = 0.684, o_gate[0] = 0.227, c_hat[0] = 0.681
c_state[0] = 0.359, h_state[0] = 0.078
Time Step 1:
i_gate[0] = 0.465, f_gate[0] = 0.658, o_gate[0] = 0.198, c_hat[0] = 0.746
c_state[0] = 0.583, h_state[0] = 0.104
Time Step 2:
i_gate[0] = 0.428, f_gate[0] = 0.639, o_gate[0] = 0.183, c_hat[0] = 0.769
c_state[0] = 0.702, h_state[0] = 0.111
Time Step 3:
i_gate[0] = 0.396, f_gate[0] = 0.617, o_gate[0] = 0.177, c_hat[0] = 0.834
c_state[0] = 0.763, h_state[0] = 0.114
Time Step 4:
i_gate[0] = 0.370, f_gate[0] = 0.608, o_gate[0] = 0.171, c_hat[0] = 0.837
c_state[0] = 0.774, h_state[0] = 0.111
Backward Time Step 4:
Gradient di[0] = 0.006, df[0] = 0.004, dc_hat[0] = 0.003
Gradient do_[0] = 0.110
Backward Time Step 3:
Gradient di[0] = 0.007, df[0] = 0.005, dc_hat[0] = 0.004
Gradient do_[0] = 0.142
Backward Time Step 2:
Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.017
Gradient do_[0] = 0.295
Backward Time Step 1:
Gradient di[0] = 0.029, df[0] = 0.023, dc_hat[0] = 0.032
Gradient do_[0] = 0.362
Backward Time Step 0:
Gradient di[0] = 0.040, df[0] = 0.035, dc_hat[0] = 0.067
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.527, f_gate[0] = 0.684, o_gate[0] = 0.222, c_hat[0] = 0.683
c_state[0] = 0.360, h_state[0] = 0.077
Time Step 1:
i_gate[0] = 0.464, f_gate[0] = 0.658, o_gate[0] = 0.192, c_hat[0] = 0.748
c_state[0] = 0.584, h_state[0] = 0.101
Time Step 2:
i_gate[0] = 0.427, f_gate[0] = 0.638, o_gate[0] = 0.178, c_hat[0] = 0.771
c_state[0] = 0.702, h_state[0] = 0.108
Time Step 3:
i_gate[0] = 0.395, f_gate[0] = 0.616, o_gate[0] = 0.171, c_hat[0] = 0.836
c_state[0] = 0.763, h_state[0] = 0.110
Time Step 4:
i_gate[0] = 0.370, f_gate[0] = 0.607, o_gate[0] = 0.166, c_hat[0] = 0.839
c_state[0] = 0.773, h_state[0] = 0.108
Backward Time Step 4:
Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.003
Gradient do_[0] = 0.108
Backward Time Step 3:
Gradient di[0] = 0.006, df[0] = 0.005, dc_hat[0] = 0.004
Gradient do_[0] = 0.139
Backward Time Step 2:
Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.016
Gradient do_[0] = 0.293
Backward Time Step 1:
Gradient di[0] = 0.028, df[0] = 0.022, dc_hat[0] = 0.031
Gradient do_[0] = 0.361
Backward Time Step 0:
Gradient di[0] = 0.039, df[0] = 0.034, dc_hat[0] = 0.065
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.527, f_gate[0] = 0.684, o_gate[0] = 0.217, c_hat[0] = 0.684
c_state[0] = 0.360, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.464, f_gate[0] = 0.658, o_gate[0] = 0.187, c_hat[0] = 0.750
c_state[0] = 0.585, h_state[0] = 0.099
Time Step 2:
i_gate[0] = 0.427, f_gate[0] = 0.638, o_gate[0] = 0.173, c_hat[0] = 0.773
c_state[0] = 0.703, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.394, f_gate[0] = 0.616, o_gate[0] = 0.166, c_hat[0] = 0.837
c_state[0] = 0.763, h_state[0] = 0.107
Time Step 4:
i_gate[0] = 0.369, f_gate[0] = 0.607, o_gate[0] = 0.160, c_hat[0] = 0.841
c_state[0] = 0.773, h_state[0] = 0.104
Backward Time Step 4:
Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.003
Gradient do_[0] = 0.106
Backward Time Step 3:
Gradient di[0] = 0.006, df[0] = 0.004, dc_hat[0] = 0.004
Gradient do_[0] = 0.137
Backward Time Step 2:
Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.016
Gradient do_[0] = 0.291
Backward Time Step 1:
Gradient di[0] = 0.027, df[0] = 0.021, dc_hat[0] = 0.029
Gradient do_[0] = 0.360
Backward Time Step 0:
Gradient di[0] = 0.039, df[0] = 0.033, dc_hat[0] = 0.063
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.526, f_gate[0] = 0.684, o_gate[0] = 0.212, c_hat[0] = 0.685
c_state[0] = 0.361, h_state[0] = 0.073
Time Step 1:
i_gate[0] = 0.463, f_gate[0] = 0.658, o_gate[0] = 0.183, c_hat[0] = 0.751
c_state[0] = 0.585, h_state[0] = 0.096
Time Step 2:
i_gate[0] = 0.426, f_gate[0] = 0.637, o_gate[0] = 0.168, c_hat[0] = 0.775
c_state[0] = 0.704, h_state[0] = 0.102
Time Step 3:
i_gate[0] = 0.394, f_gate[0] = 0.615, o_gate[0] = 0.161, c_hat[0] = 0.839
c_state[0] = 0.763, h_state[0] = 0.104
Time Step 4:
i_gate[0] = 0.368, f_gate[0] = 0.606, o_gate[0] = 0.155, c_hat[0] = 0.843
c_state[0] = 0.773, h_state[0] = 0.101
Backward Time Step 4:
Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.003
Gradient do_[0] = 0.104
Backward Time Step 3:
Gradient di[0] = 0.006, df[0] = 0.004, dc_hat[0] = 0.003
Gradient do_[0] = 0.134
Backward Time Step 2:
Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.015
Gradient do_[0] = 0.289
Backward Time Step 1:
Gradient di[0] = 0.026, df[0] = 0.021, dc_hat[0] = 0.028
Gradient do_[0] = 0.359
Backward Time Step 0:
Gradient di[0] = 0.038, df[0] = 0.033, dc_hat[0] = 0.062
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.526, f_gate[0] = 0.685, o_gate[0] = 0.208, c_hat[0] = 0.687
c_state[0] = 0.361, h_state[0] = 0.072
Time Step 1:
i_gate[0] = 0.463, f_gate[0] = 0.657, o_gate[0] = 0.178, c_hat[0] = 0.753
c_state[0] = 0.586, h_state[0] = 0.094
Time Step 2:
i_gate[0] = 0.426, f_gate[0] = 0.637, o_gate[0] = 0.164, c_hat[0] = 0.777
c_state[0] = 0.704, h_state[0] = 0.099
Time Step 3:
i_gate[0] = 0.393, f_gate[0] = 0.615, o_gate[0] = 0.157, c_hat[0] = 0.841
c_state[0] = 0.763, h_state[0] = 0.101
Time Step 4:
i_gate[0] = 0.367, f_gate[0] = 0.605, o_gate[0] = 0.151, c_hat[0] = 0.844
c_state[0] = 0.772, h_state[0] = 0.098
Backward Time Step 4:
Gradient di[0] = 0.005, df[0] = 0.003, dc_hat[0] = 0.003
Gradient do_[0] = 0.102
Backward Time Step 3:
Gradient di[0] = 0.006, df[0] = 0.004, dc_hat[0] = 0.003
Gradient do_[0] = 0.132
Backward Time Step 2:
Gradient di[0] = 0.016, df[0] = 0.013, dc_hat[0] = 0.014
Gradient do_[0] = 0.288
Backward Time Step 1:
Gradient di[0] = 0.026, df[0] = 0.020, dc_hat[0] = 0.027
Gradient do_[0] = 0.358
Backward Time Step 0:
Gradient di[0] = 0.037, df[0] = 0.032, dc_hat[0] = 0.060
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.526, f_gate[0] = 0.685, o_gate[0] = 0.203, c_hat[0] = 0.688
c_state[0] = 0.362, h_state[0] = 0.071
Time Step 1:
i_gate[0] = 0.462, f_gate[0] = 0.657, o_gate[0] = 0.174, c_hat[0] = 0.754
c_state[0] = 0.587, h_state[0] = 0.092
Time Step 2:
i_gate[0] = 0.425, f_gate[0] = 0.637, o_gate[0] = 0.159, c_hat[0] = 0.779
c_state[0] = 0.705, h_state[0] = 0.097
Time Step 3:
i_gate[0] = 0.393, f_gate[0] = 0.614, o_gate[0] = 0.152, c_hat[0] = 0.842
c_state[0] = 0.763, h_state[0] = 0.098
Time Step 4:
i_gate[0] = 0.367, f_gate[0] = 0.605, o_gate[0] = 0.146, c_hat[0] = 0.846
c_state[0] = 0.772, h_state[0] = 0.095
Backward Time Step 4:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002
Gradient do_[0] = 0.100
Backward Time Step 3:
Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.003
Gradient do_[0] = 0.130
Backward Time Step 2:
Gradient di[0] = 0.016, df[0] = 0.012, dc_hat[0] = 0.014
Gradient do_[0] = 0.286
Backward Time Step 1:
Gradient di[0] = 0.025, df[0] = 0.020, dc_hat[0] = 0.026
Gradient do_[0] = 0.357
Backward Time Step 0:
Gradient di[0] = 0.036, df[0] = 0.031, dc_hat[0] = 0.058
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.526, f_gate[0] = 0.685, o_gate[0] = 0.199, c_hat[0] = 0.689
c_state[0] = 0.363, h_state[0] = 0.069
Time Step 1:
i_gate[0] = 0.462, f_gate[0] = 0.657, o_gate[0] = 0.170, c_hat[0] = 0.756
c_state[0] = 0.587, h_state[0] = 0.090
Time Step 2:
i_gate[0] = 0.424, f_gate[0] = 0.636, o_gate[0] = 0.155, c_hat[0] = 0.780
c_state[0] = 0.705, h_state[0] = 0.094
Time Step 3:
i_gate[0] = 0.392, f_gate[0] = 0.614, o_gate[0] = 0.148, c_hat[0] = 0.844
c_state[0] = 0.763, h_state[0] = 0.095
Time Step 4:
i_gate[0] = 0.366, f_gate[0] = 0.604, o_gate[0] = 0.142, c_hat[0] = 0.848
c_state[0] = 0.771, h_state[0] = 0.092
Backward Time Step 4:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002
Gradient do_[0] = 0.098
Backward Time Step 3:
Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.003
Gradient do_[0] = 0.127
Backward Time Step 2:
Gradient di[0] = 0.015, df[0] = 0.012, dc_hat[0] = 0.013
Gradient do_[0] = 0.284
Backward Time Step 1:
Gradient di[0] = 0.024, df[0] = 0.019, dc_hat[0] = 0.026
Gradient do_[0] = 0.355
Backward Time Step 0:
Gradient di[0] = 0.036, df[0] = 0.031, dc_hat[0] = 0.057
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.526, f_gate[0] = 0.685, o_gate[0] = 0.195, c_hat[0] = 0.691
c_state[0] = 0.363, h_state[0] = 0.068
Time Step 1:
i_gate[0] = 0.462, f_gate[0] = 0.657, o_gate[0] = 0.166, c_hat[0] = 0.757
c_state[0] = 0.588, h_state[0] = 0.088
Time Step 2:
i_gate[0] = 0.424, f_gate[0] = 0.636, o_gate[0] = 0.151, c_hat[0] = 0.782
c_state[0] = 0.705, h_state[0] = 0.092
Time Step 3:
i_gate[0] = 0.391, f_gate[0] = 0.613, o_gate[0] = 0.144, c_hat[0] = 0.845
c_state[0] = 0.763, h_state[0] = 0.093
Time Step 4:
i_gate[0] = 0.366, f_gate[0] = 0.603, o_gate[0] = 0.138, c_hat[0] = 0.849
c_state[0] = 0.771, h_state[0] = 0.089
Backward Time Step 4:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002
Gradient do_[0] = 0.097
Backward Time Step 3:
Gradient di[0] = 0.005, df[0] = 0.003, dc_hat[0] = 0.003
Gradient do_[0] = 0.125
Backward Time Step 2:
Gradient di[0] = 0.015, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.282
Backward Time Step 1:
Gradient di[0] = 0.024, df[0] = 0.019, dc_hat[0] = 0.025
Gradient do_[0] = 0.354
Backward Time Step 0:
Gradient di[0] = 0.035, df[0] = 0.030, dc_hat[0] = 0.056
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.526, f_gate[0] = 0.685, o_gate[0] = 0.192, c_hat[0] = 0.692
c_state[0] = 0.364, h_state[0] = 0.067
Time Step 1:
i_gate[0] = 0.461, f_gate[0] = 0.657, o_gate[0] = 0.162, c_hat[0] = 0.758
c_state[0] = 0.589, h_state[0] = 0.086
Time Step 2:
i_gate[0] = 0.423, f_gate[0] = 0.636, o_gate[0] = 0.148, c_hat[0] = 0.783
c_state[0] = 0.706, h_state[0] = 0.090
Time Step 3:
i_gate[0] = 0.391, f_gate[0] = 0.613, o_gate[0] = 0.141, c_hat[0] = 0.846
c_state[0] = 0.763, h_state[0] = 0.090
Time Step 4:
i_gate[0] = 0.365, f_gate[0] = 0.603, o_gate[0] = 0.134, c_hat[0] = 0.851
c_state[0] = 0.770, h_state[0] = 0.087
Backward Time Step 4:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002
Gradient do_[0] = 0.095
Backward Time Step 3:
Gradient di[0] = 0.005, df[0] = 0.003, dc_hat[0] = 0.003
Gradient do_[0] = 0.123
Backward Time Step 2:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.012
Gradient do_[0] = 0.281
Backward Time Step 1:
Gradient di[0] = 0.023, df[0] = 0.018, dc_hat[0] = 0.024
Gradient do_[0] = 0.353
Backward Time Step 0:
Gradient di[0] = 0.034, df[0] = 0.029, dc_hat[0] = 0.054
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.526, f_gate[0] = 0.685, o_gate[0] = 0.188, c_hat[0] = 0.693
c_state[0] = 0.364, h_state[0] = 0.066
Time Step 1:
i_gate[0] = 0.461, f_gate[0] = 0.656, o_gate[0] = 0.159, c_hat[0] = 0.759
c_state[0] = 0.589, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.423, f_gate[0] = 0.635, o_gate[0] = 0.144, c_hat[0] = 0.785
c_state[0] = 0.706, h_state[0] = 0.088
Time Step 3:
i_gate[0] = 0.390, f_gate[0] = 0.612, o_gate[0] = 0.137, c_hat[0] = 0.847
c_state[0] = 0.763, h_state[0] = 0.088
Time Step 4:
i_gate[0] = 0.364, f_gate[0] = 0.602, o_gate[0] = 0.130, c_hat[0] = 0.852
c_state[0] = 0.770, h_state[0] = 0.084
Backward Time Step 4:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002
Gradient do_[0] = 0.094
Backward Time Step 3:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002
Gradient do_[0] = 0.121
Backward Time Step 2:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.012
Gradient do_[0] = 0.279
Backward Time Step 1:
Gradient di[0] = 0.023, df[0] = 0.018, dc_hat[0] = 0.023
Gradient do_[0] = 0.352
Backward Time Step 0:
Gradient di[0] = 0.034, df[0] = 0.029, dc_hat[0] = 0.053
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.185, c_hat[0] = 0.694
c_state[0] = 0.364, h_state[0] = 0.065
Time Step 1:
i_gate[0] = 0.461, f_gate[0] = 0.656, o_gate[0] = 0.155, c_hat[0] = 0.761
c_state[0] = 0.589, h_state[0] = 0.082
Time Step 2:
i_gate[0] = 0.422, f_gate[0] = 0.635, o_gate[0] = 0.141, c_hat[0] = 0.786
c_state[0] = 0.706, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.390, f_gate[0] = 0.612, o_gate[0] = 0.134, c_hat[0] = 0.849
c_state[0] = 0.763, h_state[0] = 0.086
Time Step 4:
i_gate[0] = 0.364, f_gate[0] = 0.601, o_gate[0] = 0.127, c_hat[0] = 0.853
c_state[0] = 0.769, h_state[0] = 0.082
Backward Time Step 4:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002
Gradient do_[0] = 0.092
Backward Time Step 3:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002
Gradient do_[0] = 0.119
Backward Time Step 2:
Gradient di[0] = 0.014, df[0] = 0.010, dc_hat[0] = 0.011
Gradient do_[0] = 0.278
Backward Time Step 1:
Gradient di[0] = 0.022, df[0] = 0.017, dc_hat[0] = 0.023
Gradient do_[0] = 0.351
Backward Time Step 0:
Gradient di[0] = 0.033, df[0] = 0.028, dc_hat[0] = 0.052
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.182, c_hat[0] = 0.695
c_state[0] = 0.365, h_state[0] = 0.063
Time Step 1:
i_gate[0] = 0.460, f_gate[0] = 0.656, o_gate[0] = 0.152, c_hat[0] = 0.762
c_state[0] = 0.590, h_state[0] = 0.081
Time Step 2:
i_gate[0] = 0.422, f_gate[0] = 0.634, o_gate[0] = 0.138, c_hat[0] = 0.787
c_state[0] = 0.706, h_state[0] = 0.084
Time Step 3:
i_gate[0] = 0.389, f_gate[0] = 0.611, o_gate[0] = 0.131, c_hat[0] = 0.850
c_state[0] = 0.762, h_state[0] = 0.084
Time Step 4:
i_gate[0] = 0.363, f_gate[0] = 0.601, o_gate[0] = 0.124, c_hat[0] = 0.854
c_state[0] = 0.768, h_state[0] = 0.080
Backward Time Step 4:
Gradient di[0] = 0.003, df[0] = 0.003, dc_hat[0] = 0.002
Gradient do_[0] = 0.091
Backward Time Step 3:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002
Gradient do_[0] = 0.118
Backward Time Step 2:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.011
Gradient do_[0] = 0.276
Backward Time Step 1:
Gradient di[0] = 0.022, df[0] = 0.017, dc_hat[0] = 0.022
Gradient do_[0] = 0.350
Backward Time Step 0:
Gradient di[0] = 0.032, df[0] = 0.028, dc_hat[0] = 0.051
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.179, c_hat[0] = 0.696
c_state[0] = 0.365, h_state[0] = 0.062
Time Step 1:
i_gate[0] = 0.460, f_gate[0] = 0.656, o_gate[0] = 0.149, c_hat[0] = 0.763
c_state[0] = 0.590, h_state[0] = 0.079
Time Step 2:
i_gate[0] = 0.421, f_gate[0] = 0.634, o_gate[0] = 0.135, c_hat[0] = 0.788
c_state[0] = 0.707, h_state[0] = 0.082
Time Step 3:
i_gate[0] = 0.389, f_gate[0] = 0.611, o_gate[0] = 0.128, c_hat[0] = 0.851
c_state[0] = 0.762, h_state[0] = 0.082
Time Step 4:
i_gate[0] = 0.363, f_gate[0] = 0.600, o_gate[0] = 0.121, c_hat[0] = 0.855
c_state[0] = 0.768, h_state[0] = 0.078
Backward Time Step 4:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = 0.090
Backward Time Step 3:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002
Gradient do_[0] = 0.116
Backward Time Step 2:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.011
Gradient do_[0] = 0.275
Backward Time Step 1:
Gradient di[0] = 0.021, df[0] = 0.016, dc_hat[0] = 0.021
Gradient do_[0] = 0.349
Backward Time Step 0:
Gradient di[0] = 0.032, df[0] = 0.027, dc_hat[0] = 0.050
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.176, c_hat[0] = 0.696
c_state[0] = 0.366, h_state[0] = 0.062
Time Step 1:
i_gate[0] = 0.460, f_gate[0] = 0.656, o_gate[0] = 0.146, c_hat[0] = 0.764
c_state[0] = 0.591, h_state[0] = 0.078
Time Step 2:
i_gate[0] = 0.421, f_gate[0] = 0.634, o_gate[0] = 0.132, c_hat[0] = 0.790
c_state[0] = 0.707, h_state[0] = 0.080
Time Step 3:
i_gate[0] = 0.388, f_gate[0] = 0.610, o_gate[0] = 0.125, c_hat[0] = 0.852
c_state[0] = 0.762, h_state[0] = 0.080
Time Step 4:
i_gate[0] = 0.362, f_gate[0] = 0.600, o_gate[0] = 0.118, c_hat[0] = 0.857
c_state[0] = 0.767, h_state[0] = 0.076
Backward Time Step 4:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = 0.088
Backward Time Step 3:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002
Gradient do_[0] = 0.114
Backward Time Step 2:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.010
Gradient do_[0] = 0.273
Backward Time Step 1:
Gradient di[0] = 0.021, df[0] = 0.016, dc_hat[0] = 0.021
Gradient do_[0] = 0.348
Backward Time Step 0:
Gradient di[0] = 0.031, df[0] = 0.027, dc_hat[0] = 0.049
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.173, c_hat[0] = 0.697
c_state[0] = 0.366, h_state[0] = 0.061
Time Step 1:
i_gate[0] = 0.459, f_gate[0] = 0.656, o_gate[0] = 0.144, c_hat[0] = 0.764
c_state[0] = 0.591, h_state[0] = 0.076
Time Step 2:
i_gate[0] = 0.421, f_gate[0] = 0.633, o_gate[0] = 0.130, c_hat[0] = 0.791
c_state[0] = 0.707, h_state[0] = 0.079
Time Step 3:
i_gate[0] = 0.388, f_gate[0] = 0.610, o_gate[0] = 0.122, c_hat[0] = 0.853
c_state[0] = 0.761, h_state[0] = 0.079
Time Step 4:
i_gate[0] = 0.362, f_gate[0] = 0.599, o_gate[0] = 0.116, c_hat[0] = 0.858
c_state[0] = 0.766, h_state[0] = 0.074
Backward Time Step 4:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = 0.087
Backward Time Step 3:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002
Gradient do_[0] = 0.113
Backward Time Step 2:
Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.010
Gradient do_[0] = 0.272
Backward Time Step 1:
Gradient di[0] = 0.020, df[0] = 0.016, dc_hat[0] = 0.020
Gradient do_[0] = 0.348
Backward Time Step 0:
Gradient di[0] = 0.031, df[0] = 0.026, dc_hat[0] = 0.048
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.170, c_hat[0] = 0.698
c_state[0] = 0.366, h_state[0] = 0.060
Time Step 1:
i_gate[0] = 0.459, f_gate[0] = 0.655, o_gate[0] = 0.141, c_hat[0] = 0.765
c_state[0] = 0.591, h_state[0] = 0.075
Time Step 2:
i_gate[0] = 0.420, f_gate[0] = 0.633, o_gate[0] = 0.127, c_hat[0] = 0.792
c_state[0] = 0.707, h_state[0] = 0.077
Time Step 3:
i_gate[0] = 0.387, f_gate[0] = 0.609, o_gate[0] = 0.120, c_hat[0] = 0.854
c_state[0] = 0.761, h_state[0] = 0.077
Time Step 4:
i_gate[0] = 0.361, f_gate[0] = 0.599, o_gate[0] = 0.113, c_hat[0] = 0.859
c_state[0] = 0.766, h_state[0] = 0.073
Backward Time Step 4:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.086
Backward Time Step 3:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002
Gradient do_[0] = 0.111
Backward Time Step 2:
Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.010
Gradient do_[0] = 0.271
Backward Time Step 1:
Gradient di[0] = 0.020, df[0] = 0.015, dc_hat[0] = 0.020
Gradient do_[0] = 0.347
Backward Time Step 0:
Gradient di[0] = 0.031, df[0] = 0.026, dc_hat[0] = 0.047
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.168, c_hat[0] = 0.699
c_state[0] = 0.367, h_state[0] = 0.059
Time Step 1:
i_gate[0] = 0.459, f_gate[0] = 0.655, o_gate[0] = 0.139, c_hat[0] = 0.766
c_state[0] = 0.592, h_state[0] = 0.074
Time Step 2:
i_gate[0] = 0.420, f_gate[0] = 0.633, o_gate[0] = 0.125, c_hat[0] = 0.793
c_state[0] = 0.707, h_state[0] = 0.076
Time Step 3:
i_gate[0] = 0.387, f_gate[0] = 0.609, o_gate[0] = 0.118, c_hat[0] = 0.855
c_state[0] = 0.761, h_state[0] = 0.075
Time Step 4:
i_gate[0] = 0.361, f_gate[0] = 0.598, o_gate[0] = 0.111, c_hat[0] = 0.859
c_state[0] = 0.765, h_state[0] = 0.071
Backward Time Step 4:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.085
Backward Time Step 3:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002
Gradient do_[0] = 0.110
Backward Time Step 2:
Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.010
Gradient do_[0] = 0.270
Backward Time Step 1:
Gradient di[0] = 0.020, df[0] = 0.015, dc_hat[0] = 0.019
Gradient do_[0] = 0.346
Backward Time Step 0:
Gradient di[0] = 0.030, df[0] = 0.026, dc_hat[0] = 0.046
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.166, c_hat[0] = 0.700
c_state[0] = 0.367, h_state[0] = 0.058
Time Step 1:
i_gate[0] = 0.458, f_gate[0] = 0.655, o_gate[0] = 0.137, c_hat[0] = 0.767
c_state[0] = 0.592, h_state[0] = 0.073
Time Step 2:
i_gate[0] = 0.419, f_gate[0] = 0.632, o_gate[0] = 0.123, c_hat[0] = 0.794
c_state[0] = 0.707, h_state[0] = 0.075
Time Step 3:
i_gate[0] = 0.386, f_gate[0] = 0.608, o_gate[0] = 0.116, c_hat[0] = 0.855
c_state[0] = 0.760, h_state[0] = 0.074
Time Step 4:
i_gate[0] = 0.360, f_gate[0] = 0.598, o_gate[0] = 0.109, c_hat[0] = 0.860
c_state[0] = 0.764, h_state[0] = 0.070
Backward Time Step 4:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.085
Backward Time Step 3:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = 0.109
Backward Time Step 2:
Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.009
Gradient do_[0] = 0.269
Backward Time Step 1:
Gradient di[0] = 0.019, df[0] = 0.015, dc_hat[0] = 0.019
Gradient do_[0] = 0.345
Backward Time Step 0:
Gradient di[0] = 0.030, df[0] = 0.025, dc_hat[0] = 0.046
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.163, c_hat[0] = 0.700
c_state[0] = 0.367, h_state[0] = 0.057
Time Step 1:
i_gate[0] = 0.458, f_gate[0] = 0.655, o_gate[0] = 0.135, c_hat[0] = 0.768
c_state[0] = 0.592, h_state[0] = 0.072
Time Step 2:
i_gate[0] = 0.419, f_gate[0] = 0.632, o_gate[0] = 0.121, c_hat[0] = 0.795
c_state[0] = 0.707, h_state[0] = 0.074
Time Step 3:
i_gate[0] = 0.386, f_gate[0] = 0.608, o_gate[0] = 0.114, c_hat[0] = 0.856
c_state[0] = 0.760, h_state[0] = 0.073
Time Step 4:
i_gate[0] = 0.359, f_gate[0] = 0.597, o_gate[0] = 0.107, c_hat[0] = 0.861
c_state[0] = 0.763, h_state[0] = 0.069
Backward Time Step 4:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.084
Backward Time Step 3:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = 0.108
Backward Time Step 2:
Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.009
Gradient do_[0] = 0.268
Backward Time Step 1:
Gradient di[0] = 0.019, df[0] = 0.015, dc_hat[0] = 0.019
Gradient do_[0] = 0.345
Backward Time Step 0:
Gradient di[0] = 0.029, df[0] = 0.025, dc_hat[0] = 0.045
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.161, c_hat[0] = 0.701
c_state[0] = 0.368, h_state[0] = 0.057
Time Step 1:
i_gate[0] = 0.458, f_gate[0] = 0.655, o_gate[0] = 0.133, c_hat[0] = 0.769
c_state[0] = 0.593, h_state[0] = 0.071
Time Step 2:
i_gate[0] = 0.418, f_gate[0] = 0.632, o_gate[0] = 0.119, c_hat[0] = 0.795
c_state[0] = 0.707, h_state[0] = 0.072
Time Step 3:
i_gate[0] = 0.385, f_gate[0] = 0.607, o_gate[0] = 0.112, c_hat[0] = 0.857
c_state[0] = 0.759, h_state[0] = 0.072
Time Step 4:
i_gate[0] = 0.359, f_gate[0] = 0.597, o_gate[0] = 0.105, c_hat[0] = 0.862
c_state[0] = 0.763, h_state[0] = 0.067
Backward Time Step 4:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.083
Backward Time Step 3:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = 0.107
Backward Time Step 2:
Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.009
Gradient do_[0] = 0.267
Backward Time Step 1:
Gradient di[0] = 0.019, df[0] = 0.014, dc_hat[0] = 0.018
Gradient do_[0] = 0.344
Backward Time Step 0:
Gradient di[0] = 0.029, df[0] = 0.025, dc_hat[0] = 0.044
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.685, o_gate[0] = 0.160, c_hat[0] = 0.702
c_state[0] = 0.368, h_state[0] = 0.056
Time Step 1:
i_gate[0] = 0.457, f_gate[0] = 0.655, o_gate[0] = 0.131, c_hat[0] = 0.769
c_state[0] = 0.593, h_state[0] = 0.070
Time Step 2:
i_gate[0] = 0.418, f_gate[0] = 0.632, o_gate[0] = 0.117, c_hat[0] = 0.796
c_state[0] = 0.707, h_state[0] = 0.071
Time Step 3:
i_gate[0] = 0.385, f_gate[0] = 0.607, o_gate[0] = 0.110, c_hat[0] = 0.858
c_state[0] = 0.759, h_state[0] = 0.070
Time Step 4:
i_gate[0] = 0.358, f_gate[0] = 0.596, o_gate[0] = 0.103, c_hat[0] = 0.863
c_state[0] = 0.762, h_state[0] = 0.066
Backward Time Step 4:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.083
Backward Time Step 3:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = 0.106
Backward Time Step 2:
Gradient di[0] = 0.011, df[0] = 0.008, dc_hat[0] = 0.009
Gradient do_[0] = 0.266
Backward Time Step 1:
Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.018
Gradient do_[0] = 0.344
Backward Time Step 0:
Gradient di[0] = 0.029, df[0] = 0.024, dc_hat[0] = 0.044
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.685, o_gate[0] = 0.158, c_hat[0] = 0.702
c_state[0] = 0.368, h_state[0] = 0.056
Time Step 1:
i_gate[0] = 0.457, f_gate[0] = 0.655, o_gate[0] = 0.129, c_hat[0] = 0.770
c_state[0] = 0.593, h_state[0] = 0.069
Time Step 2:
i_gate[0] = 0.418, f_gate[0] = 0.631, o_gate[0] = 0.115, c_hat[0] = 0.797
c_state[0] = 0.707, h_state[0] = 0.070
Time Step 3:
i_gate[0] = 0.384, f_gate[0] = 0.607, o_gate[0] = 0.108, c_hat[0] = 0.858
c_state[0] = 0.759, h_state[0] = 0.069
Time Step 4:
i_gate[0] = 0.358, f_gate[0] = 0.596, o_gate[0] = 0.101, c_hat[0] = 0.864
c_state[0] = 0.761, h_state[0] = 0.065
Backward Time Step 4:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.082
Backward Time Step 3:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = 0.105
Backward Time Step 2:
Gradient di[0] = 0.011, df[0] = 0.008, dc_hat[0] = 0.009
Gradient do_[0] = 0.266
Backward Time Step 1:
Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.018
Gradient do_[0] = 0.343
Backward Time Step 0:
Gradient di[0] = 0.028, df[0] = 0.024, dc_hat[0] = 0.043
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.156, c_hat[0] = 0.703
c_state[0] = 0.369, h_state[0] = 0.055
Time Step 1:
i_gate[0] = 0.457, f_gate[0] = 0.654, o_gate[0] = 0.128, c_hat[0] = 0.770
c_state[0] = 0.593, h_state[0] = 0.068
Time Step 2:
i_gate[0] = 0.417, f_gate[0] = 0.631, o_gate[0] = 0.114, c_hat[0] = 0.798
c_state[0] = 0.707, h_state[0] = 0.069
Time Step 3:
i_gate[0] = 0.384, f_gate[0] = 0.606, o_gate[0] = 0.107, c_hat[0] = 0.859
c_state[0] = 0.758, h_state[0] = 0.068
Time Step 4:
i_gate[0] = 0.358, f_gate[0] = 0.595, o_gate[0] = 0.100, c_hat[0] = 0.864
c_state[0] = 0.760, h_state[0] = 0.064
Backward Time Step 4:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.082
Backward Time Step 3:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = 0.104
Backward Time Step 2:
Gradient di[0] = 0.011, df[0] = 0.008, dc_hat[0] = 0.008
Gradient do_[0] = 0.265
Backward Time Step 1:
Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.017
Gradient do_[0] = 0.343
Backward Time Step 0:
Gradient di[0] = 0.028, df[0] = 0.024, dc_hat[0] = 0.043
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.154, c_hat[0] = 0.703
c_state[0] = 0.369, h_state[0] = 0.054
Time Step 1:
i_gate[0] = 0.457, f_gate[0] = 0.654, o_gate[0] = 0.126, c_hat[0] = 0.771
c_state[0] = 0.593, h_state[0] = 0.067
Time Step 2:
i_gate[0] = 0.417, f_gate[0] = 0.631, o_gate[0] = 0.112, c_hat[0] = 0.798
c_state[0] = 0.707, h_state[0] = 0.068
Time Step 3:
i_gate[0] = 0.383, f_gate[0] = 0.606, o_gate[0] = 0.105, c_hat[0] = 0.860
c_state[0] = 0.758, h_state[0] = 0.067
Time Step 4:
i_gate[0] = 0.357, f_gate[0] = 0.595, o_gate[0] = 0.098, c_hat[0] = 0.865
c_state[0] = 0.759, h_state[0] = 0.063
Backward Time Step 4:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.081
Backward Time Step 3:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.103
Backward Time Step 2:
Gradient di[0] = 0.011, df[0] = 0.008, dc_hat[0] = 0.008
Gradient do_[0] = 0.264
Backward Time Step 1:
Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.017
Gradient do_[0] = 0.342
Backward Time Step 0:
Gradient di[0] = 0.028, df[0] = 0.023, dc_hat[0] = 0.042
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.153, c_hat[0] = 0.704
c_state[0] = 0.369, h_state[0] = 0.054
Time Step 1:
i_gate[0] = 0.456, f_gate[0] = 0.654, o_gate[0] = 0.125, c_hat[0] = 0.772
c_state[0] = 0.594, h_state[0] = 0.066
Time Step 2:
i_gate[0] = 0.416, f_gate[0] = 0.630, o_gate[0] = 0.111, c_hat[0] = 0.799
c_state[0] = 0.707, h_state[0] = 0.068
Time Step 3:
i_gate[0] = 0.383, f_gate[0] = 0.605, o_gate[0] = 0.104, c_hat[0] = 0.860
c_state[0] = 0.757, h_state[0] = 0.066
Time Step 4:
i_gate[0] = 0.357, f_gate[0] = 0.594, o_gate[0] = 0.097, c_hat[0] = 0.866
c_state[0] = 0.758, h_state[0] = 0.062
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.081
Backward Time Step 3:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.102
Backward Time Step 2:
Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.008
Gradient do_[0] = 0.263
Backward Time Step 1:
Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.017
Gradient do_[0] = 0.341
Backward Time Step 0:
Gradient di[0] = 0.028, df[0] = 0.023, dc_hat[0] = 0.041
Gradient do_[0] = 0.264
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.151, c_hat[0] = 0.704
c_state[0] = 0.369, h_state[0] = 0.053
Time Step 1:
i_gate[0] = 0.456, f_gate[0] = 0.654, o_gate[0] = 0.123, c_hat[0] = 0.772
c_state[0] = 0.594, h_state[0] = 0.066
Time Step 2:
i_gate[0] = 0.416, f_gate[0] = 0.630, o_gate[0] = 0.110, c_hat[0] = 0.800
c_state[0] = 0.707, h_state[0] = 0.067
Time Step 3:
i_gate[0] = 0.382, f_gate[0] = 0.605, o_gate[0] = 0.103, c_hat[0] = 0.861
c_state[0] = 0.757, h_state[0] = 0.066
Time Step 4:
i_gate[0] = 0.356, f_gate[0] = 0.594, o_gate[0] = 0.096, c_hat[0] = 0.866
c_state[0] = 0.758, h_state[0] = 0.061
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.080
Backward Time Step 3:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.101
Backward Time Step 2:
Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.008
Gradient do_[0] = 0.263
Backward Time Step 1:
Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.016
Gradient do_[0] = 0.341
Backward Time Step 0:
Gradient di[0] = 0.027, df[0] = 0.023, dc_hat[0] = 0.041
Gradient do_[0] = 0.264
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.150, c_hat[0] = 0.705
c_state[0] = 0.369, h_state[0] = 0.053
Time Step 1:
i_gate[0] = 0.456, f_gate[0] = 0.654, o_gate[0] = 0.122, c_hat[0] = 0.773
c_state[0] = 0.594, h_state[0] = 0.065
Time Step 2:
i_gate[0] = 0.416, f_gate[0] = 0.630, o_gate[0] = 0.108, c_hat[0] = 0.800
c_state[0] = 0.707, h_state[0] = 0.066
Time Step 3:
i_gate[0] = 0.382, f_gate[0] = 0.605, o_gate[0] = 0.101, c_hat[0] = 0.861
c_state[0] = 0.756, h_state[0] = 0.065
Time Step 4:
i_gate[0] = 0.356, f_gate[0] = 0.593, o_gate[0] = 0.094, c_hat[0] = 0.867
c_state[0] = 0.757, h_state[0] = 0.060
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.079
Backward Time Step 3:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.101
Backward Time Step 2:
Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.008
Gradient do_[0] = 0.262
Backward Time Step 1:
Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.016
Gradient do_[0] = 0.340
Backward Time Step 0:
Gradient di[0] = 0.027, df[0] = 0.023, dc_hat[0] = 0.041
Gradient do_[0] = 0.264
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.148, c_hat[0] = 0.705
c_state[0] = 0.370, h_state[0] = 0.052
Time Step 1:
i_gate[0] = 0.456, f_gate[0] = 0.654, o_gate[0] = 0.120, c_hat[0] = 0.773
c_state[0] = 0.594, h_state[0] = 0.064
Time Step 2:
i_gate[0] = 0.415, f_gate[0] = 0.630, o_gate[0] = 0.107, c_hat[0] = 0.801
c_state[0] = 0.707, h_state[0] = 0.065
Time Step 3:
i_gate[0] = 0.381, f_gate[0] = 0.604, o_gate[0] = 0.100, c_hat[0] = 0.862
c_state[0] = 0.756, h_state[0] = 0.064
Time Step 4:
i_gate[0] = 0.355, f_gate[0] = 0.593, o_gate[0] = 0.093, c_hat[0] = 0.867
c_state[0] = 0.756, h_state[0] = 0.059
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.079
Backward Time Step 3:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.100
Backward Time Step 2:
Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.008
Gradient do_[0] = 0.261
Backward Time Step 1:
Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.016
Gradient do_[0] = 0.340
Backward Time Step 0:
Gradient di[0] = 0.027, df[0] = 0.022, dc_hat[0] = 0.040
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.147, c_hat[0] = 0.706
c_state[0] = 0.370, h_state[0] = 0.052
Time Step 1:
i_gate[0] = 0.455, f_gate[0] = 0.654, o_gate[0] = 0.119, c_hat[0] = 0.774
c_state[0] = 0.594, h_state[0] = 0.063
Time Step 2:
i_gate[0] = 0.415, f_gate[0] = 0.629, o_gate[0] = 0.106, c_hat[0] = 0.802
c_state[0] = 0.706, h_state[0] = 0.064
Time Step 3:
i_gate[0] = 0.381, f_gate[0] = 0.604, o_gate[0] = 0.099, c_hat[0] = 0.862
c_state[0] = 0.755, h_state[0] = 0.063
Time Step 4:
i_gate[0] = 0.355, f_gate[0] = 0.592, o_gate[0] = 0.092, c_hat[0] = 0.868
c_state[0] = 0.755, h_state[0] = 0.059
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.078
Backward Time Step 3:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.099
Backward Time Step 2:
Gradient di[0] = 0.010, df[0] = 0.007, dc_hat[0] = 0.008
Gradient do_[0] = 0.260
Backward Time Step 1:
Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.016
Gradient do_[0] = 0.339
Backward Time Step 0:
Gradient di[0] = 0.027, df[0] = 0.022, dc_hat[0] = 0.040
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.146, c_hat[0] = 0.706
c_state[0] = 0.370, h_state[0] = 0.052
Time Step 1:
i_gate[0] = 0.455, f_gate[0] = 0.654, o_gate[0] = 0.118, c_hat[0] = 0.774
c_state[0] = 0.594, h_state[0] = 0.063
Time Step 2:
i_gate[0] = 0.415, f_gate[0] = 0.629, o_gate[0] = 0.105, c_hat[0] = 0.802
c_state[0] = 0.706, h_state[0] = 0.064
Time Step 3:
i_gate[0] = 0.381, f_gate[0] = 0.603, o_gate[0] = 0.098, c_hat[0] = 0.863
c_state[0] = 0.755, h_state[0] = 0.062
Time Step 4:
i_gate[0] = 0.354, f_gate[0] = 0.592, o_gate[0] = 0.091, c_hat[0] = 0.868
c_state[0] = 0.754, h_state[0] = 0.058
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.078
Backward Time Step 3:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.098
Backward Time Step 2:
Gradient di[0] = 0.010, df[0] = 0.007, dc_hat[0] = 0.007
Gradient do_[0] = 0.259
Backward Time Step 1:
Gradient di[0] = 0.016, df[0] = 0.013, dc_hat[0] = 0.016
Gradient do_[0] = 0.338
Backward Time Step 0:
Gradient di[0] = 0.026, df[0] = 0.022, dc_hat[0] = 0.039
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.144, c_hat[0] = 0.707
c_state[0] = 0.370, h_state[0] = 0.051
Time Step 1:
i_gate[0] = 0.455, f_gate[0] = 0.654, o_gate[0] = 0.117, c_hat[0] = 0.775
c_state[0] = 0.594, h_state[0] = 0.062
Time Step 2:
i_gate[0] = 0.414, f_gate[0] = 0.629, o_gate[0] = 0.103, c_hat[0] = 0.803
c_state[0] = 0.706, h_state[0] = 0.063
Time Step 3:
i_gate[0] = 0.380, f_gate[0] = 0.603, o_gate[0] = 0.096, c_hat[0] = 0.863
c_state[0] = 0.754, h_state[0] = 0.062
Time Step 4:
i_gate[0] = 0.354, f_gate[0] = 0.591, o_gate[0] = 0.090, c_hat[0] = 0.869
c_state[0] = 0.753, h_state[0] = 0.057
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.077
Backward Time Step 3:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.097
Backward Time Step 2:
Gradient di[0] = 0.010, df[0] = 0.007, dc_hat[0] = 0.007
Gradient do_[0] = 0.258
Backward Time Step 1:
Gradient di[0] = 0.016, df[0] = 0.012, dc_hat[0] = 0.015
Gradient do_[0] = 0.338
Backward Time Step 0:
Gradient di[0] = 0.026, df[0] = 0.022, dc_hat[0] = 0.039
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.143, c_hat[0] = 0.707
c_state[0] = 0.370, h_state[0] = 0.051
Time Step 1:
i_gate[0] = 0.455, f_gate[0] = 0.653, o_gate[0] = 0.116, c_hat[0] = 0.775
c_state[0] = 0.594, h_state[0] = 0.062
Time Step 2:
i_gate[0] = 0.414, f_gate[0] = 0.629, o_gate[0] = 0.102, c_hat[0] = 0.803
c_state[0] = 0.706, h_state[0] = 0.062
Time Step 3:
i_gate[0] = 0.380, f_gate[0] = 0.603, o_gate[0] = 0.095, c_hat[0] = 0.864
c_state[0] = 0.753, h_state[0] = 0.061
Time Step 4:
i_gate[0] = 0.353, f_gate[0] = 0.591, o_gate[0] = 0.088, c_hat[0] = 0.869
c_state[0] = 0.752, h_state[0] = 0.056
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.076
Backward Time Step 3:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.096
Backward Time Step 2:
Gradient di[0] = 0.010, df[0] = 0.007, dc_hat[0] = 0.007
Gradient do_[0] = 0.257
Backward Time Step 1:
Gradient di[0] = 0.016, df[0] = 0.012, dc_hat[0] = 0.015
Gradient do_[0] = 0.337
Backward Time Step 0:
Gradient di[0] = 0.026, df[0] = 0.022, dc_hat[0] = 0.038
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.142, c_hat[0] = 0.708
c_state[0] = 0.371, h_state[0] = 0.050
Time Step 1:
i_gate[0] = 0.454, f_gate[0] = 0.653, o_gate[0] = 0.114, c_hat[0] = 0.775
c_state[0] = 0.594, h_state[0] = 0.061
Time Step 2:
i_gate[0] = 0.414, f_gate[0] = 0.628, o_gate[0] = 0.101, c_hat[0] = 0.804
c_state[0] = 0.706, h_state[0] = 0.062
Time Step 3:
i_gate[0] = 0.379, f_gate[0] = 0.602, o_gate[0] = 0.094, c_hat[0] = 0.864
c_state[0] = 0.753, h_state[0] = 0.060
Time Step 4:
i_gate[0] = 0.353, f_gate[0] = 0.590, o_gate[0] = 0.087, c_hat[0] = 0.870
c_state[0] = 0.751, h_state[0] = 0.056
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.076
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.095
Backward Time Step 2:
Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.007
Gradient do_[0] = 0.256
Backward Time Step 1:
Gradient di[0] = 0.016, df[0] = 0.012, dc_hat[0] = 0.015
Gradient do_[0] = 0.336
Backward Time Step 0:
Gradient di[0] = 0.026, df[0] = 0.021, dc_hat[0] = 0.038
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.141, c_hat[0] = 0.708
c_state[0] = 0.371, h_state[0] = 0.050
Time Step 1:
i_gate[0] = 0.454, f_gate[0] = 0.653, o_gate[0] = 0.113, c_hat[0] = 0.776
c_state[0] = 0.594, h_state[0] = 0.060
Time Step 2:
i_gate[0] = 0.413, f_gate[0] = 0.628, o_gate[0] = 0.100, c_hat[0] = 0.804
c_state[0] = 0.706, h_state[0] = 0.061
Time Step 3:
i_gate[0] = 0.379, f_gate[0] = 0.602, o_gate[0] = 0.093, c_hat[0] = 0.864
c_state[0] = 0.752, h_state[0] = 0.059
Time Step 4:
i_gate[0] = 0.352, f_gate[0] = 0.590, o_gate[0] = 0.086, c_hat[0] = 0.870
c_state[0] = 0.750, h_state[0] = 0.055
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.075
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.094
Backward Time Step 2:
Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.007
Gradient do_[0] = 0.256
Backward Time Step 1:
Gradient di[0] = 0.016, df[0] = 0.012, dc_hat[0] = 0.015
Gradient do_[0] = 0.336
Backward Time Step 0:
Gradient di[0] = 0.025, df[0] = 0.021, dc_hat[0] = 0.038
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.140, c_hat[0] = 0.708
c_state[0] = 0.371, h_state[0] = 0.050
Time Step 1:
i_gate[0] = 0.454, f_gate[0] = 0.653, o_gate[0] = 0.112, c_hat[0] = 0.776
c_state[0] = 0.594, h_state[0] = 0.060
Time Step 2:
i_gate[0] = 0.413, f_gate[0] = 0.628, o_gate[0] = 0.099, c_hat[0] = 0.804
c_state[0] = 0.705, h_state[0] = 0.060
Time Step 3:
i_gate[0] = 0.379, f_gate[0] = 0.602, o_gate[0] = 0.092, c_hat[0] = 0.865
c_state[0] = 0.752, h_state[0] = 0.059
Time Step 4:
i_gate[0] = 0.352, f_gate[0] = 0.589, o_gate[0] = 0.085, c_hat[0] = 0.871
c_state[0] = 0.749, h_state[0] = 0.054
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.074
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.093
Backward Time Step 2:
Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.007
Gradient do_[0] = 0.255
Backward Time Step 1:
Gradient di[0] = 0.015, df[0] = 0.012, dc_hat[0] = 0.015
Gradient do_[0] = 0.335
Backward Time Step 0:
Gradient di[0] = 0.025, df[0] = 0.021, dc_hat[0] = 0.037
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.138, c_hat[0] = 0.709
c_state[0] = 0.371, h_state[0] = 0.049
Time Step 1:
i_gate[0] = 0.454, f_gate[0] = 0.653, o_gate[0] = 0.111, c_hat[0] = 0.776
c_state[0] = 0.595, h_state[0] = 0.059
Time Step 2:
i_gate[0] = 0.413, f_gate[0] = 0.628, o_gate[0] = 0.098, c_hat[0] = 0.805
c_state[0] = 0.705, h_state[0] = 0.060
Time Step 3:
i_gate[0] = 0.378, f_gate[0] = 0.601, o_gate[0] = 0.091, c_hat[0] = 0.865
c_state[0] = 0.751, h_state[0] = 0.058
Time Step 4:
i_gate[0] = 0.351, f_gate[0] = 0.589, o_gate[0] = 0.085, c_hat[0] = 0.871
c_state[0] = 0.748, h_state[0] = 0.054
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.074
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.092
Backward Time Step 2:
Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.007
Gradient do_[0] = 0.254
Backward Time Step 1:
Gradient di[0] = 0.015, df[0] = 0.012, dc_hat[0] = 0.014
Gradient do_[0] = 0.334
Backward Time Step 0:
Gradient di[0] = 0.025, df[0] = 0.021, dc_hat[0] = 0.037
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.137, c_hat[0] = 0.709
c_state[0] = 0.371, h_state[0] = 0.049
Time Step 1:
i_gate[0] = 0.453, f_gate[0] = 0.653, o_gate[0] = 0.110, c_hat[0] = 0.777
c_state[0] = 0.595, h_state[0] = 0.059
Time Step 2:
i_gate[0] = 0.412, f_gate[0] = 0.627, o_gate[0] = 0.097, c_hat[0] = 0.805
c_state[0] = 0.705, h_state[0] = 0.059
Time Step 3:
i_gate[0] = 0.378, f_gate[0] = 0.601, o_gate[0] = 0.091, c_hat[0] = 0.866
c_state[0] = 0.750, h_state[0] = 0.058
Time Step 4:
i_gate[0] = 0.351, f_gate[0] = 0.588, o_gate[0] = 0.084, c_hat[0] = 0.871
c_state[0] = 0.747, h_state[0] = 0.053
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.073
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.091
Backward Time Step 2:
Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.007
Gradient do_[0] = 0.253
Backward Time Step 1:
Gradient di[0] = 0.015, df[0] = 0.012, dc_hat[0] = 0.014
Gradient do_[0] = 0.333
Backward Time Step 0:
Gradient di[0] = 0.025, df[0] = 0.021, dc_hat[0] = 0.036
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.136, c_hat[0] = 0.709
c_state[0] = 0.371, h_state[0] = 0.048
Time Step 1:
i_gate[0] = 0.453, f_gate[0] = 0.653, o_gate[0] = 0.109, c_hat[0] = 0.777
c_state[0] = 0.595, h_state[0] = 0.058
Time Step 2:
i_gate[0] = 0.412, f_gate[0] = 0.627, o_gate[0] = 0.096, c_hat[0] = 0.806
c_state[0] = 0.705, h_state[0] = 0.059
Time Step 3:
i_gate[0] = 0.377, f_gate[0] = 0.600, o_gate[0] = 0.090, c_hat[0] = 0.866
c_state[0] = 0.750, h_state[0] = 0.057
Time Step 4:
i_gate[0] = 0.350, f_gate[0] = 0.588, o_gate[0] = 0.083, c_hat[0] = 0.872
c_state[0] = 0.746, h_state[0] = 0.052
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.073
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.090
Backward Time Step 2:
Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.007
Gradient do_[0] = 0.252
Backward Time Step 1:
Gradient di[0] = 0.015, df[0] = 0.012, dc_hat[0] = 0.014
Gradient do_[0] = 0.333
Backward Time Step 0:
Gradient di[0] = 0.025, df[0] = 0.021, dc_hat[0] = 0.036
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.135, c_hat[0] = 0.710
c_state[0] = 0.372, h_state[0] = 0.048
Time Step 1:
i_gate[0] = 0.453, f_gate[0] = 0.653, o_gate[0] = 0.108, c_hat[0] = 0.777
c_state[0] = 0.595, h_state[0] = 0.058
Time Step 2:
i_gate[0] = 0.412, f_gate[0] = 0.627, o_gate[0] = 0.096, c_hat[0] = 0.806
c_state[0] = 0.704, h_state[0] = 0.058
Time Step 3:
i_gate[0] = 0.377, f_gate[0] = 0.600, o_gate[0] = 0.089, c_hat[0] = 0.866
c_state[0] = 0.749, h_state[0] = 0.056
Time Step 4:
i_gate[0] = 0.350, f_gate[0] = 0.587, o_gate[0] = 0.082, c_hat[0] = 0.872
c_state[0] = 0.745, h_state[0] = 0.052
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.072
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.089
Backward Time Step 2:
Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.006
Gradient do_[0] = 0.251
Backward Time Step 1:
Gradient di[0] = 0.015, df[0] = 0.011, dc_hat[0] = 0.014
Gradient do_[0] = 0.332
Backward Time Step 0:
Gradient di[0] = 0.024, df[0] = 0.020, dc_hat[0] = 0.036
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.134, c_hat[0] = 0.710
c_state[0] = 0.372, h_state[0] = 0.048
Time Step 1:
i_gate[0] = 0.453, f_gate[0] = 0.653, o_gate[0] = 0.107, c_hat[0] = 0.778
c_state[0] = 0.595, h_state[0] = 0.057
Time Step 2:
i_gate[0] = 0.411, f_gate[0] = 0.627, o_gate[0] = 0.095, c_hat[0] = 0.806
c_state[0] = 0.704, h_state[0] = 0.057
Time Step 3:
i_gate[0] = 0.376, f_gate[0] = 0.600, o_gate[0] = 0.088, c_hat[0] = 0.867
c_state[0] = 0.749, h_state[0] = 0.056
Time Step 4:
i_gate[0] = 0.349, f_gate[0] = 0.587, o_gate[0] = 0.081, c_hat[0] = 0.873
c_state[0] = 0.744, h_state[0] = 0.051
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.071
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.089
Backward Time Step 2:
Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.006
Gradient do_[0] = 0.250
Backward Time Step 1:
Gradient di[0] = 0.015, df[0] = 0.011, dc_hat[0] = 0.014
Gradient do_[0] = 0.331
Backward Time Step 0:
Gradient di[0] = 0.024, df[0] = 0.020, dc_hat[0] = 0.035
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.133, c_hat[0] = 0.710
c_state[0] = 0.372, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.453, f_gate[0] = 0.653, o_gate[0] = 0.107, c_hat[0] = 0.778
c_state[0] = 0.595, h_state[0] = 0.057
Time Step 2:
i_gate[0] = 0.411, f_gate[0] = 0.626, o_gate[0] = 0.094, c_hat[0] = 0.807
c_state[0] = 0.704, h_state[0] = 0.057
Time Step 3:
i_gate[0] = 0.376, f_gate[0] = 0.599, o_gate[0] = 0.087, c_hat[0] = 0.867
c_state[0] = 0.748, h_state[0] = 0.055
Time Step 4:
i_gate[0] = 0.349, f_gate[0] = 0.586, o_gate[0] = 0.080, c_hat[0] = 0.873
c_state[0] = 0.743, h_state[0] = 0.051
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.071
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001
Gradient do_[0] = 0.088
Backward Time Step 2:
Gradient di[0] = 0.009, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.249
Backward Time Step 1:
Gradient di[0] = 0.015, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.331
Backward Time Step 0:
Gradient di[0] = 0.024, df[0] = 0.020, dc_hat[0] = 0.035
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.132, c_hat[0] = 0.711
c_state[0] = 0.372, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.452, f_gate[0] = 0.652, o_gate[0] = 0.106, c_hat[0] = 0.778
c_state[0] = 0.595, h_state[0] = 0.056
Time Step 2:
i_gate[0] = 0.411, f_gate[0] = 0.626, o_gate[0] = 0.093, c_hat[0] = 0.807
c_state[0] = 0.704, h_state[0] = 0.056
Time Step 3:
i_gate[0] = 0.376, f_gate[0] = 0.599, o_gate[0] = 0.086, c_hat[0] = 0.867
c_state[0] = 0.747, h_state[0] = 0.055
Time Step 4:
i_gate[0] = 0.348, f_gate[0] = 0.586, o_gate[0] = 0.080, c_hat[0] = 0.873
c_state[0] = 0.742, h_state[0] = 0.050
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.070
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.087
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.249
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.330
Backward Time Step 0:
Gradient di[0] = 0.024, df[0] = 0.020, dc_hat[0] = 0.035
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.131, c_hat[0] = 0.711
c_state[0] = 0.372, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.452, f_gate[0] = 0.652, o_gate[0] = 0.105, c_hat[0] = 0.778
c_state[0] = 0.595, h_state[0] = 0.056
Time Step 2:
i_gate[0] = 0.410, f_gate[0] = 0.626, o_gate[0] = 0.092, c_hat[0] = 0.807
c_state[0] = 0.704, h_state[0] = 0.056
Time Step 3:
i_gate[0] = 0.375, f_gate[0] = 0.599, o_gate[0] = 0.086, c_hat[0] = 0.867
c_state[0] = 0.747, h_state[0] = 0.054
Time Step 4:
i_gate[0] = 0.348, f_gate[0] = 0.585, o_gate[0] = 0.079, c_hat[0] = 0.874
c_state[0] = 0.741, h_state[0] = 0.050
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.070
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.086
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.248
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.330
Backward Time Step 0:
Gradient di[0] = 0.024, df[0] = 0.020, dc_hat[0] = 0.035
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.131, c_hat[0] = 0.711
c_state[0] = 0.372, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.452, f_gate[0] = 0.652, o_gate[0] = 0.104, c_hat[0] = 0.779
c_state[0] = 0.595, h_state[0] = 0.056
Time Step 2:
i_gate[0] = 0.410, f_gate[0] = 0.626, o_gate[0] = 0.092, c_hat[0] = 0.808
c_state[0] = 0.703, h_state[0] = 0.056
Time Step 3:
i_gate[0] = 0.375, f_gate[0] = 0.598, o_gate[0] = 0.085, c_hat[0] = 0.868
c_state[0] = 0.746, h_state[0] = 0.054
Time Step 4:
i_gate[0] = 0.347, f_gate[0] = 0.585, o_gate[0] = 0.078, c_hat[0] = 0.874
c_state[0] = 0.740, h_state[0] = 0.049
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.069
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.085
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.247
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.329
Backward Time Step 0:
Gradient di[0] = 0.024, df[0] = 0.020, dc_hat[0] = 0.034
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.130, c_hat[0] = 0.712
c_state[0] = 0.372, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.452, f_gate[0] = 0.652, o_gate[0] = 0.103, c_hat[0] = 0.779
c_state[0] = 0.595, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.410, f_gate[0] = 0.626, o_gate[0] = 0.091, c_hat[0] = 0.808
c_state[0] = 0.703, h_state[0] = 0.055
Time Step 3:
i_gate[0] = 0.374, f_gate[0] = 0.598, o_gate[0] = 0.084, c_hat[0] = 0.868
c_state[0] = 0.745, h_state[0] = 0.053
Time Step 4:
i_gate[0] = 0.347, f_gate[0] = 0.584, o_gate[0] = 0.077, c_hat[0] = 0.874
c_state[0] = 0.739, h_state[0] = 0.049
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.069
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.085
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.246
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.328
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.034
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.129, c_hat[0] = 0.712
c_state[0] = 0.373, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.452, f_gate[0] = 0.652, o_gate[0] = 0.103, c_hat[0] = 0.779
c_state[0] = 0.595, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.409, f_gate[0] = 0.625, o_gate[0] = 0.090, c_hat[0] = 0.808
c_state[0] = 0.703, h_state[0] = 0.055
Time Step 3:
i_gate[0] = 0.374, f_gate[0] = 0.598, o_gate[0] = 0.084, c_hat[0] = 0.868
c_state[0] = 0.745, h_state[0] = 0.053
Time Step 4:
i_gate[0] = 0.346, f_gate[0] = 0.584, o_gate[0] = 0.077, c_hat[0] = 0.874
c_state[0] = 0.738, h_state[0] = 0.048
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.068
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.084
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.246
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.328
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.034
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.128, c_hat[0] = 0.712
c_state[0] = 0.373, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.451, f_gate[0] = 0.652, o_gate[0] = 0.102, c_hat[0] = 0.779
c_state[0] = 0.595, h_state[0] = 0.054
Time Step 2:
i_gate[0] = 0.409, f_gate[0] = 0.625, o_gate[0] = 0.090, c_hat[0] = 0.808
c_state[0] = 0.703, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.374, f_gate[0] = 0.597, o_gate[0] = 0.083, c_hat[0] = 0.868
c_state[0] = 0.744, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.346, f_gate[0] = 0.584, o_gate[0] = 0.076, c_hat[0] = 0.875
c_state[0] = 0.737, h_state[0] = 0.048
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.068
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.083
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.245
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.327
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.034
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.128, c_hat[0] = 0.713
c_state[0] = 0.373, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.451, f_gate[0] = 0.652, o_gate[0] = 0.101, c_hat[0] = 0.779
c_state[0] = 0.595, h_state[0] = 0.054
Time Step 2:
i_gate[0] = 0.409, f_gate[0] = 0.625, o_gate[0] = 0.089, c_hat[0] = 0.809
c_state[0] = 0.702, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.373, f_gate[0] = 0.597, o_gate[0] = 0.082, c_hat[0] = 0.869
c_state[0] = 0.743, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.345, f_gate[0] = 0.583, o_gate[0] = 0.076, c_hat[0] = 0.875
c_state[0] = 0.736, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.067
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.083
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.244
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.327
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.127, c_hat[0] = 0.713
c_state[0] = 0.373, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.451, f_gate[0] = 0.652, o_gate[0] = 0.101, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.054
Time Step 2:
i_gate[0] = 0.408, f_gate[0] = 0.625, o_gate[0] = 0.088, c_hat[0] = 0.809
c_state[0] = 0.702, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.373, f_gate[0] = 0.597, o_gate[0] = 0.082, c_hat[0] = 0.869
c_state[0] = 0.743, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.345, f_gate[0] = 0.583, o_gate[0] = 0.075, c_hat[0] = 0.875
c_state[0] = 0.735, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.067
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.082
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.244
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.326
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.126, c_hat[0] = 0.713
c_state[0] = 0.373, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.451, f_gate[0] = 0.652, o_gate[0] = 0.100, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.408, f_gate[0] = 0.625, o_gate[0] = 0.088, c_hat[0] = 0.809
c_state[0] = 0.702, h_state[0] = 0.053
Time Step 3:
i_gate[0] = 0.372, f_gate[0] = 0.597, o_gate[0] = 0.081, c_hat[0] = 0.869
c_state[0] = 0.742, h_state[0] = 0.051
Time Step 4:
i_gate[0] = 0.344, f_gate[0] = 0.582, o_gate[0] = 0.075, c_hat[0] = 0.876
c_state[0] = 0.734, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.067
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.081
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.243
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.326
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.126, c_hat[0] = 0.713
c_state[0] = 0.373, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.451, f_gate[0] = 0.652, o_gate[0] = 0.100, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.408, f_gate[0] = 0.625, o_gate[0] = 0.087, c_hat[0] = 0.809
c_state[0] = 0.702, h_state[0] = 0.053
Time Step 3:
i_gate[0] = 0.372, f_gate[0] = 0.596, o_gate[0] = 0.081, c_hat[0] = 0.869
c_state[0] = 0.742, h_state[0] = 0.051
Time Step 4:
i_gate[0] = 0.344, f_gate[0] = 0.582, o_gate[0] = 0.074, c_hat[0] = 0.876
c_state[0] = 0.733, h_state[0] = 0.046
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.066
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.081
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.242
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.326
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.125, c_hat[0] = 0.714
c_state[0] = 0.373, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.450, f_gate[0] = 0.652, o_gate[0] = 0.099, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.407, f_gate[0] = 0.625, o_gate[0] = 0.087, c_hat[0] = 0.809
c_state[0] = 0.701, h_state[0] = 0.053
Time Step 3:
i_gate[0] = 0.371, f_gate[0] = 0.596, o_gate[0] = 0.081, c_hat[0] = 0.870
c_state[0] = 0.741, h_state[0] = 0.051
Time Step 4:
i_gate[0] = 0.343, f_gate[0] = 0.581, o_gate[0] = 0.074, c_hat[0] = 0.876
c_state[0] = 0.732, h_state[0] = 0.046
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.066
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.080
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.242
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.325
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.125, c_hat[0] = 0.714
c_state[0] = 0.373, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.450, f_gate[0] = 0.652, o_gate[0] = 0.099, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.407, f_gate[0] = 0.624, o_gate[0] = 0.087, c_hat[0] = 0.810
c_state[0] = 0.701, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.371, f_gate[0] = 0.596, o_gate[0] = 0.080, c_hat[0] = 0.870
c_state[0] = 0.740, h_state[0] = 0.051
Time Step 4:
i_gate[0] = 0.343, f_gate[0] = 0.581, o_gate[0] = 0.074, c_hat[0] = 0.876
c_state[0] = 0.730, h_state[0] = 0.046
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.066
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.080
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.241
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.325
Backward Time Step 0:
Gradient di[0] = 0.022, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.124, c_hat[0] = 0.714
c_state[0] = 0.374, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.450, f_gate[0] = 0.652, o_gate[0] = 0.098, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.407, f_gate[0] = 0.624, o_gate[0] = 0.086, c_hat[0] = 0.810
c_state[0] = 0.701, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.370, f_gate[0] = 0.596, o_gate[0] = 0.080, c_hat[0] = 0.870
c_state[0] = 0.740, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.342, f_gate[0] = 0.581, o_gate[0] = 0.073, c_hat[0] = 0.876
c_state[0] = 0.729, h_state[0] = 0.046
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.066
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.079
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.241
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.325
Backward Time Step 0:
Gradient di[0] = 0.022, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.124, c_hat[0] = 0.714
c_state[0] = 0.374, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.450, f_gate[0] = 0.652, o_gate[0] = 0.098, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.407, f_gate[0] = 0.624, o_gate[0] = 0.086, c_hat[0] = 0.810
c_state[0] = 0.701, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.370, f_gate[0] = 0.595, o_gate[0] = 0.080, c_hat[0] = 0.870
c_state[0] = 0.739, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.342, f_gate[0] = 0.580, o_gate[0] = 0.073, c_hat[0] = 0.877
c_state[0] = 0.728, h_state[0] = 0.045
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.065
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.079
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.241
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.325
Backward Time Step 0:
Gradient di[0] = 0.022, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.124, c_hat[0] = 0.715
c_state[0] = 0.374, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.450, f_gate[0] = 0.652, o_gate[0] = 0.098, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.406, f_gate[0] = 0.624, o_gate[0] = 0.086, c_hat[0] = 0.810
c_state[0] = 0.700, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.370, f_gate[0] = 0.595, o_gate[0] = 0.080, c_hat[0] = 0.870
c_state[0] = 0.738, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.341, f_gate[0] = 0.580, o_gate[0] = 0.073, c_hat[0] = 0.877
c_state[0] = 0.727, h_state[0] = 0.045
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.065
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.079
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.240
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.324
Backward Time Step 0:
Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.032
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.123, c_hat[0] = 0.715
c_state[0] = 0.374, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.450, f_gate[0] = 0.652, o_gate[0] = 0.098, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.406, f_gate[0] = 0.624, o_gate[0] = 0.086, c_hat[0] = 0.810
c_state[0] = 0.700, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.369, f_gate[0] = 0.595, o_gate[0] = 0.079, c_hat[0] = 0.870
c_state[0] = 0.738, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.340, f_gate[0] = 0.580, o_gate[0] = 0.073, c_hat[0] = 0.877
c_state[0] = 0.726, h_state[0] = 0.045
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.065
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.078
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.240
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.324
Backward Time Step 0:
Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.032
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.123, c_hat[0] = 0.715
c_state[0] = 0.374, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.449, f_gate[0] = 0.652, o_gate[0] = 0.098, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.406, f_gate[0] = 0.624, o_gate[0] = 0.086, c_hat[0] = 0.810
c_state[0] = 0.700, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.369, f_gate[0] = 0.595, o_gate[0] = 0.079, c_hat[0] = 0.870
c_state[0] = 0.737, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.340, f_gate[0] = 0.580, o_gate[0] = 0.073, c_hat[0] = 0.877
c_state[0] = 0.725, h_state[0] = 0.045
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.065
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.078
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005
Gradient do_[0] = 0.240
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.324
Backward Time Step 0:
Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.032
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.123, c_hat[0] = 0.716
c_state[0] = 0.374, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.449, f_gate[0] = 0.653, o_gate[0] = 0.098, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.405, f_gate[0] = 0.625, o_gate[0] = 0.086, c_hat[0] = 0.810
c_state[0] = 0.700, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.368, f_gate[0] = 0.595, o_gate[0] = 0.079, c_hat[0] = 0.870
c_state[0] = 0.737, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.339, f_gate[0] = 0.579, o_gate[0] = 0.073, c_hat[0] = 0.877
c_state[0] = 0.724, h_state[0] = 0.045
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.065
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.078
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005
Gradient do_[0] = 0.240
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.325
Backward Time Step 0:
Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.032
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.123, c_hat[0] = 0.716
c_state[0] = 0.374, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.449, f_gate[0] = 0.653, o_gate[0] = 0.098, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.405, f_gate[0] = 0.625, o_gate[0] = 0.086, c_hat[0] = 0.810
c_state[0] = 0.699, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.368, f_gate[0] = 0.595, o_gate[0] = 0.080, c_hat[0] = 0.870
c_state[0] = 0.736, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.338, f_gate[0] = 0.579, o_gate[0] = 0.073, c_hat[0] = 0.877
c_state[0] = 0.723, h_state[0] = 0.045
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.065
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.078
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005
Gradient do_[0] = 0.240
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.325
Backward Time Step 0:
Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.032
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.124, c_hat[0] = 0.716
c_state[0] = 0.374, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.449, f_gate[0] = 0.653, o_gate[0] = 0.098, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.405, f_gate[0] = 0.625, o_gate[0] = 0.086, c_hat[0] = 0.810
c_state[0] = 0.699, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.367, f_gate[0] = 0.595, o_gate[0] = 0.080, c_hat[0] = 0.870
c_state[0] = 0.735, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.337, f_gate[0] = 0.579, o_gate[0] = 0.073, c_hat[0] = 0.877
c_state[0] = 0.722, h_state[0] = 0.045
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.064
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.078
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.240
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.325
Backward Time Step 0:
Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.032
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.124, c_hat[0] = 0.716
c_state[0] = 0.375, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.449, f_gate[0] = 0.653, o_gate[0] = 0.098, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.404, f_gate[0] = 0.625, o_gate[0] = 0.086, c_hat[0] = 0.809
c_state[0] = 0.699, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.366, f_gate[0] = 0.595, o_gate[0] = 0.080, c_hat[0] = 0.870
c_state[0] = 0.735, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.337, f_gate[0] = 0.579, o_gate[0] = 0.073, c_hat[0] = 0.877
c_state[0] = 0.721, h_state[0] = 0.045
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.064
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.077
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.240
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.325
Backward Time Step 0:
Gradient di[0] = 0.022, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.124, c_hat[0] = 0.717
c_state[0] = 0.375, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.449, f_gate[0] = 0.654, o_gate[0] = 0.098, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.404, f_gate[0] = 0.625, o_gate[0] = 0.086, c_hat[0] = 0.809
c_state[0] = 0.699, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.366, f_gate[0] = 0.595, o_gate[0] = 0.080, c_hat[0] = 0.870
c_state[0] = 0.734, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.336, f_gate[0] = 0.579, o_gate[0] = 0.073, c_hat[0] = 0.877
c_state[0] = 0.720, h_state[0] = 0.045
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.064
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.077
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.240
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.325
Backward Time Step 0:
Gradient di[0] = 0.022, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.124, c_hat[0] = 0.717
c_state[0] = 0.375, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.449, f_gate[0] = 0.654, o_gate[0] = 0.099, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.403, f_gate[0] = 0.626, o_gate[0] = 0.087, c_hat[0] = 0.809
c_state[0] = 0.699, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.365, f_gate[0] = 0.596, o_gate[0] = 0.081, c_hat[0] = 0.870
c_state[0] = 0.734, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.335, f_gate[0] = 0.579, o_gate[0] = 0.074, c_hat[0] = 0.877
c_state[0] = 0.718, h_state[0] = 0.045
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.064
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.077
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.240
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.326
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.125, c_hat[0] = 0.717
c_state[0] = 0.375, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.448, f_gate[0] = 0.654, o_gate[0] = 0.099, c_hat[0] = 0.780
c_state[0] = 0.595, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.403, f_gate[0] = 0.626, o_gate[0] = 0.087, c_hat[0] = 0.809
c_state[0] = 0.698, h_state[0] = 0.053
Time Step 3:
i_gate[0] = 0.364, f_gate[0] = 0.596, o_gate[0] = 0.081, c_hat[0] = 0.870
c_state[0] = 0.733, h_state[0] = 0.051
Time Step 4:
i_gate[0] = 0.334, f_gate[0] = 0.579, o_gate[0] = 0.074, c_hat[0] = 0.877
c_state[0] = 0.717, h_state[0] = 0.046
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.063
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.077
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.239
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.326
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.125, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.448, f_gate[0] = 0.654, o_gate[0] = 0.100, c_hat[0] = 0.779
c_state[0] = 0.595, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.403, f_gate[0] = 0.627, o_gate[0] = 0.088, c_hat[0] = 0.808
c_state[0] = 0.698, h_state[0] = 0.053
Time Step 3:
i_gate[0] = 0.363, f_gate[0] = 0.596, o_gate[0] = 0.081, c_hat[0] = 0.869
c_state[0] = 0.732, h_state[0] = 0.051
Time Step 4:
i_gate[0] = 0.333, f_gate[0] = 0.579, o_gate[0] = 0.075, c_hat[0] = 0.877
c_state[0] = 0.715, h_state[0] = 0.046
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.063
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.077
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.239
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.326
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.126, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.448, f_gate[0] = 0.655, o_gate[0] = 0.100, c_hat[0] = 0.779
c_state[0] = 0.595, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.402, f_gate[0] = 0.627, o_gate[0] = 0.088, c_hat[0] = 0.808
c_state[0] = 0.698, h_state[0] = 0.053
Time Step 3:
i_gate[0] = 0.363, f_gate[0] = 0.596, o_gate[0] = 0.082, c_hat[0] = 0.869
c_state[0] = 0.731, h_state[0] = 0.051
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.579, o_gate[0] = 0.075, c_hat[0] = 0.876
c_state[0] = 0.714, h_state[0] = 0.046
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.063
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.076
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.239
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.326
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.126, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.448, f_gate[0] = 0.655, o_gate[0] = 0.101, c_hat[0] = 0.779
c_state[0] = 0.595, h_state[0] = 0.054
Time Step 2:
i_gate[0] = 0.402, f_gate[0] = 0.627, o_gate[0] = 0.089, c_hat[0] = 0.808
c_state[0] = 0.698, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.362, f_gate[0] = 0.597, o_gate[0] = 0.083, c_hat[0] = 0.869
c_state[0] = 0.730, h_state[0] = 0.051
Time Step 4:
i_gate[0] = 0.330, f_gate[0] = 0.579, o_gate[0] = 0.076, c_hat[0] = 0.876
c_state[0] = 0.712, h_state[0] = 0.046
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.062
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.076
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.239
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.327
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.127, c_hat[0] = 0.718
c_state[0] = 0.376, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.448, f_gate[0] = 0.656, o_gate[0] = 0.101, c_hat[0] = 0.779
c_state[0] = 0.595, h_state[0] = 0.054
Time Step 2:
i_gate[0] = 0.401, f_gate[0] = 0.628, o_gate[0] = 0.089, c_hat[0] = 0.807
c_state[0] = 0.697, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.361, f_gate[0] = 0.597, o_gate[0] = 0.083, c_hat[0] = 0.869
c_state[0] = 0.729, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.328, f_gate[0] = 0.579, o_gate[0] = 0.076, c_hat[0] = 0.876
c_state[0] = 0.710, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.061
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.075
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.239
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.327
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.127, c_hat[0] = 0.719
c_state[0] = 0.376, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.447, f_gate[0] = 0.656, o_gate[0] = 0.102, c_hat[0] = 0.778
c_state[0] = 0.595, h_state[0] = 0.054
Time Step 2:
i_gate[0] = 0.400, f_gate[0] = 0.628, o_gate[0] = 0.090, c_hat[0] = 0.807
c_state[0] = 0.697, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.359, f_gate[0] = 0.597, o_gate[0] = 0.084, c_hat[0] = 0.869
c_state[0] = 0.728, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.327, f_gate[0] = 0.579, o_gate[0] = 0.077, c_hat[0] = 0.876
c_state[0] = 0.708, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.060
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.075
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.238
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.327
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.128, c_hat[0] = 0.719
c_state[0] = 0.376, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.447, f_gate[0] = 0.656, o_gate[0] = 0.102, c_hat[0] = 0.778
c_state[0] = 0.595, h_state[0] = 0.054
Time Step 2:
i_gate[0] = 0.400, f_gate[0] = 0.629, o_gate[0] = 0.090, c_hat[0] = 0.807
c_state[0] = 0.696, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.358, f_gate[0] = 0.597, o_gate[0] = 0.084, c_hat[0] = 0.868
c_state[0] = 0.727, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.325, f_gate[0] = 0.579, o_gate[0] = 0.077, c_hat[0] = 0.876
c_state[0] = 0.706, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.059
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.074
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.237
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.327
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033
Gradient do_[0] = 0.263
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.128, c_hat[0] = 0.719
c_state[0] = 0.376, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.447, f_gate[0] = 0.657, o_gate[0] = 0.102, c_hat[0] = 0.778
c_state[0] = 0.594, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.399, f_gate[0] = 0.629, o_gate[0] = 0.090, c_hat[0] = 0.806
c_state[0] = 0.696, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.357, f_gate[0] = 0.598, o_gate[0] = 0.084, c_hat[0] = 0.868
c_state[0] = 0.726, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.324, f_gate[0] = 0.579, o_gate[0] = 0.078, c_hat[0] = 0.876
c_state[0] = 0.704, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.058
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.072
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.236
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.326
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.128, c_hat[0] = 0.719
c_state[0] = 0.376, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.447, f_gate[0] = 0.657, o_gate[0] = 0.102, c_hat[0] = 0.778
c_state[0] = 0.594, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.398, f_gate[0] = 0.629, o_gate[0] = 0.091, c_hat[0] = 0.806
c_state[0] = 0.695, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.356, f_gate[0] = 0.598, o_gate[0] = 0.085, c_hat[0] = 0.868
c_state[0] = 0.724, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.322, f_gate[0] = 0.579, o_gate[0] = 0.078, c_hat[0] = 0.876
c_state[0] = 0.702, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.056
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.070
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.234
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.325
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033
Gradient do_[0] = 0.262
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.128, c_hat[0] = 0.719
c_state[0] = 0.376, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.446, f_gate[0] = 0.657, o_gate[0] = 0.103, c_hat[0] = 0.777
c_state[0] = 0.594, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.398, f_gate[0] = 0.629, o_gate[0] = 0.091, c_hat[0] = 0.806
c_state[0] = 0.694, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.355, f_gate[0] = 0.597, o_gate[0] = 0.085, c_hat[0] = 0.868
c_state[0] = 0.723, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.321, f_gate[0] = 0.579, o_gate[0] = 0.078, c_hat[0] = 0.876
c_state[0] = 0.699, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.054
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.068
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.232
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.324
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033
Gradient do_[0] = 0.261
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.128, c_hat[0] = 0.719
c_state[0] = 0.376, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.446, f_gate[0] = 0.657, o_gate[0] = 0.103, c_hat[0] = 0.777
c_state[0] = 0.594, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.397, f_gate[0] = 0.629, o_gate[0] = 0.091, c_hat[0] = 0.806
c_state[0] = 0.693, h_state[0] = 0.055
Time Step 3:
i_gate[0] = 0.354, f_gate[0] = 0.597, o_gate[0] = 0.085, c_hat[0] = 0.868
c_state[0] = 0.721, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.319, f_gate[0] = 0.578, o_gate[0] = 0.078, c_hat[0] = 0.875
c_state[0] = 0.697, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.051
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.066
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.230
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.322
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033
Gradient do_[0] = 0.260
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.128, c_hat[0] = 0.719
c_state[0] = 0.376, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.446, f_gate[0] = 0.657, o_gate[0] = 0.103, c_hat[0] = 0.777
c_state[0] = 0.593, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.396, f_gate[0] = 0.629, o_gate[0] = 0.091, c_hat[0] = 0.805
c_state[0] = 0.693, h_state[0] = 0.055
Time Step 3:
i_gate[0] = 0.353, f_gate[0] = 0.597, o_gate[0] = 0.085, c_hat[0] = 0.868
c_state[0] = 0.720, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.318, f_gate[0] = 0.578, o_gate[0] = 0.078, c_hat[0] = 0.875
c_state[0] = 0.694, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.049
Backward Time Step 3:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.063
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.228
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.321
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.260
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.129, c_hat[0] = 0.720
c_state[0] = 0.376, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.657, o_gate[0] = 0.103, c_hat[0] = 0.777
c_state[0] = 0.593, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.396, f_gate[0] = 0.629, o_gate[0] = 0.091, c_hat[0] = 0.805
c_state[0] = 0.692, h_state[0] = 0.055
Time Step 3:
i_gate[0] = 0.352, f_gate[0] = 0.597, o_gate[0] = 0.085, c_hat[0] = 0.867
c_state[0] = 0.718, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.316, f_gate[0] = 0.578, o_gate[0] = 0.079, c_hat[0] = 0.875
c_state[0] = 0.692, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.000
Gradient do_[0] = 0.045
Backward Time Step 3:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.059
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.225
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013
Gradient do_[0] = 0.319
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.259
Epoch 100, Train Loss=0.009895, Weight Norm=12.102183
Sample Predictions at Epoch 100:
-------------------------------------------------------------
| Day | Date | Predicted Close | Actual Close | Error |
-------------------------------------------------------------
| 192 | 2024-10-11 | 65.10 | 63.87 | 1.23 |
| 193 | 2024-10-14 | 65.13 | 66.55 | 1.42 |
| 194 | 2024-10-15 | 65.40 | 66.00 | 0.60 |
| 195 | 2024-10-16 | 65.47 | 67.20 | 1.73 |
| 196 | 2024-10-17 | 65.64 | 66.76 | 1.12 |
-------------------------------------------------------------
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.129, c_hat[0] = 0.720
c_state[0] = 0.376, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.658, o_gate[0] = 0.103, c_hat[0] = 0.776
c_state[0] = 0.593, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.395, f_gate[0] = 0.630, o_gate[0] = 0.091, c_hat[0] = 0.805
c_state[0] = 0.691, h_state[0] = 0.055
Time Step 3:
i_gate[0] = 0.350, f_gate[0] = 0.597, o_gate[0] = 0.085, c_hat[0] = 0.867
c_state[0] = 0.716, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.315, f_gate[0] = 0.577, o_gate[0] = 0.079, c_hat[0] = 0.875
c_state[0] = 0.689, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.000
Gradient do_[0] = 0.039
Backward Time Step 3:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.054
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.221
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.010, dc_hat[0] = 0.013
Gradient do_[0] = 0.318
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.258
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.129, c_hat[0] = 0.720
c_state[0] = 0.376, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.658, o_gate[0] = 0.103, c_hat[0] = 0.776
c_state[0] = 0.592, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.394, f_gate[0] = 0.630, o_gate[0] = 0.092, c_hat[0] = 0.804
c_state[0] = 0.690, h_state[0] = 0.055
Time Step 3:
i_gate[0] = 0.349, f_gate[0] = 0.597, o_gate[0] = 0.086, c_hat[0] = 0.867
c_state[0] = 0.715, h_state[0] = 0.053
Time Step 4:
i_gate[0] = 0.313, f_gate[0] = 0.577, o_gate[0] = 0.079, c_hat[0] = 0.875
c_state[0] = 0.687, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.000
Gradient do_[0] = 0.029
Backward Time Step 3:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.046
Backward Time Step 2:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006
Gradient do_[0] = 0.215
Backward Time Step 1:
Gradient di[0] = 0.014, df[0] = 0.010, dc_hat[0] = 0.013
Gradient do_[0] = 0.315
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.257
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.129, c_hat[0] = 0.720
c_state[0] = 0.376, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.444, f_gate[0] = 0.658, o_gate[0] = 0.104, c_hat[0] = 0.775
c_state[0] = 0.592, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.393, f_gate[0] = 0.630, o_gate[0] = 0.092, c_hat[0] = 0.804
c_state[0] = 0.689, h_state[0] = 0.055
Time Step 3:
i_gate[0] = 0.348, f_gate[0] = 0.597, o_gate[0] = 0.086, c_hat[0] = 0.866
c_state[0] = 0.713, h_state[0] = 0.053
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.577, o_gate[0] = 0.080, c_hat[0] = 0.874
c_state[0] = 0.684, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.000
Gradient do_[0] = 0.011
Backward Time Step 3:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.000
Gradient do_[0] = 0.032
Backward Time Step 2:
Gradient di[0] = 0.007, df[0] = 0.006, dc_hat[0] = 0.005
Gradient do_[0] = 0.205
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.312
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.256
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.130, c_hat[0] = 0.720
c_state[0] = 0.376, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.444, f_gate[0] = 0.658, o_gate[0] = 0.104, c_hat[0] = 0.775
c_state[0] = 0.592, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.393, f_gate[0] = 0.631, o_gate[0] = 0.093, c_hat[0] = 0.803
c_state[0] = 0.688, h_state[0] = 0.055
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.598, o_gate[0] = 0.087, c_hat[0] = 0.866
c_state[0] = 0.711, h_state[0] = 0.053
Time Step 4:
i_gate[0] = 0.310, f_gate[0] = 0.578, o_gate[0] = 0.080, c_hat[0] = 0.874
c_state[0] = 0.682, h_state[0] = 0.048
Backward Time Step 4:
Gradient di[0] = 0.000, df[0] = 0.000, dc_hat[0] = 0.000
Gradient do_[0] = -0.020
Backward Time Step 3:
Gradient di[0] = 0.001, df[0] = 0.000, dc_hat[0] = 0.000
Gradient do_[0] = 0.004
Backward Time Step 2:
Gradient di[0] = 0.007, df[0] = 0.005, dc_hat[0] = 0.005
Gradient do_[0] = 0.185
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.305
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.254
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.131, c_hat[0] = 0.720
c_state[0] = 0.376, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.444, f_gate[0] = 0.659, o_gate[0] = 0.105, c_hat[0] = 0.774
c_state[0] = 0.591, h_state[0] = 0.056
Time Step 2:
i_gate[0] = 0.392, f_gate[0] = 0.631, o_gate[0] = 0.093, c_hat[0] = 0.802
c_state[0] = 0.687, h_state[0] = 0.056
Time Step 3:
i_gate[0] = 0.345, f_gate[0] = 0.599, o_gate[0] = 0.088, c_hat[0] = 0.865
c_state[0] = 0.710, h_state[0] = 0.053
Time Step 4:
i_gate[0] = 0.308, f_gate[0] = 0.579, o_gate[0] = 0.081, c_hat[0] = 0.872
c_state[0] = 0.680, h_state[0] = 0.048
Backward Time Step 4:
Gradient di[0] = -0.001, df[0] = -0.001, dc_hat[0] = -0.000
Gradient do_[0] = -0.076
Backward Time Step 3:
Gradient di[0] = -0.000, df[0] = -0.000, dc_hat[0] = -0.000
Gradient do_[0] = -0.047
Backward Time Step 2:
Gradient di[0] = 0.006, df[0] = 0.005, dc_hat[0] = 0.005
Gradient do_[0] = 0.146
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.290
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.250
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.132, c_hat[0] = 0.720
c_state[0] = 0.376, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.443, f_gate[0] = 0.660, o_gate[0] = 0.106, c_hat[0] = 0.773
c_state[0] = 0.591, h_state[0] = 0.056
Time Step 2:
i_gate[0] = 0.391, f_gate[0] = 0.633, o_gate[0] = 0.095, c_hat[0] = 0.800
c_state[0] = 0.686, h_state[0] = 0.056
Time Step 3:
i_gate[0] = 0.344, f_gate[0] = 0.600, o_gate[0] = 0.089, c_hat[0] = 0.863
c_state[0] = 0.709, h_state[0] = 0.054
Time Step 4:
i_gate[0] = 0.306, f_gate[0] = 0.581, o_gate[0] = 0.083, c_hat[0] = 0.871
c_state[0] = 0.678, h_state[0] = 0.049
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.001
Gradient do_[0] = -0.160
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.001
Gradient do_[0] = -0.135
Backward Time Step 2:
Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.004
Gradient do_[0] = 0.075
Backward Time Step 1:
Gradient di[0] = 0.012, df[0] = 0.010, dc_hat[0] = 0.011
Gradient do_[0] = 0.259
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.242
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.134, c_hat[0] = 0.720
c_state[0] = 0.376, h_state[0] = 0.048
Time Step 1:
i_gate[0] = 0.443, f_gate[0] = 0.660, o_gate[0] = 0.108, c_hat[0] = 0.771
c_state[0] = 0.590, h_state[0] = 0.057
Time Step 2:
i_gate[0] = 0.390, f_gate[0] = 0.634, o_gate[0] = 0.096, c_hat[0] = 0.798
c_state[0] = 0.685, h_state[0] = 0.057
Time Step 3:
i_gate[0] = 0.342, f_gate[0] = 0.603, o_gate[0] = 0.091, c_hat[0] = 0.861
c_state[0] = 0.708, h_state[0] = 0.055
Time Step 4:
i_gate[0] = 0.304, f_gate[0] = 0.585, o_gate[0] = 0.085, c_hat[0] = 0.868
c_state[0] = 0.678, h_state[0] = 0.050
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002
Gradient do_[0] = -0.248
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002
Gradient do_[0] = -0.247
Backward Time Step 2:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = -0.024
Backward Time Step 1:
Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.010
Gradient do_[0] = 0.207
Backward Time Step 0:
Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.031
Gradient do_[0] = 0.228
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.136, c_hat[0] = 0.720
c_state[0] = 0.376, h_state[0] = 0.049
Time Step 1:
i_gate[0] = 0.442, f_gate[0] = 0.661, o_gate[0] = 0.110, c_hat[0] = 0.770
c_state[0] = 0.589, h_state[0] = 0.058
Time Step 2:
i_gate[0] = 0.389, f_gate[0] = 0.636, o_gate[0] = 0.098, c_hat[0] = 0.795
c_state[0] = 0.684, h_state[0] = 0.058
Time Step 3:
i_gate[0] = 0.341, f_gate[0] = 0.606, o_gate[0] = 0.093, c_hat[0] = 0.859
c_state[0] = 0.707, h_state[0] = 0.056
Time Step 4:
i_gate[0] = 0.302, f_gate[0] = 0.589, o_gate[0] = 0.087, c_hat[0] = 0.865
c_state[0] = 0.678, h_state[0] = 0.051
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002
Gradient do_[0] = -0.264
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.002
Gradient do_[0] = -0.300
Backward Time Step 2:
Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = -0.088
Backward Time Step 1:
Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.009
Gradient do_[0] = 0.160
Backward Time Step 0:
Gradient di[0] = 0.021, df[0] = 0.018, dc_hat[0] = 0.030
Gradient do_[0] = 0.213
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.138, c_hat[0] = 0.720
c_state[0] = 0.376, h_state[0] = 0.050
Time Step 1:
i_gate[0] = 0.442, f_gate[0] = 0.662, o_gate[0] = 0.111, c_hat[0] = 0.768
c_state[0] = 0.589, h_state[0] = 0.059
Time Step 2:
i_gate[0] = 0.388, f_gate[0] = 0.638, o_gate[0] = 0.100, c_hat[0] = 0.793
c_state[0] = 0.683, h_state[0] = 0.059
Time Step 3:
i_gate[0] = 0.339, f_gate[0] = 0.609, o_gate[0] = 0.095, c_hat[0] = 0.856
c_state[0] = 0.706, h_state[0] = 0.057
Time Step 4:
i_gate[0] = 0.300, f_gate[0] = 0.594, o_gate[0] = 0.089, c_hat[0] = 0.861
c_state[0] = 0.678, h_state[0] = 0.053
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = -0.001, dc_hat[0] = -0.001
Gradient do_[0] = -0.121
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.001
Gradient do_[0] = -0.152
Backward Time Step 2:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.003
Gradient do_[0] = 0.027
Backward Time Step 1:
Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.011
Gradient do_[0] = 0.208
Backward Time Step 0:
Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.031
Gradient do_[0] = 0.224
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.138, c_hat[0] = 0.720
c_state[0] = 0.376, h_state[0] = 0.050
Time Step 1:
i_gate[0] = 0.442, f_gate[0] = 0.662, o_gate[0] = 0.111, c_hat[0] = 0.767
c_state[0] = 0.588, h_state[0] = 0.059
Time Step 2:
i_gate[0] = 0.387, f_gate[0] = 0.638, o_gate[0] = 0.100, c_hat[0] = 0.791
c_state[0] = 0.682, h_state[0] = 0.059
Time Step 3:
i_gate[0] = 0.338, f_gate[0] = 0.609, o_gate[0] = 0.095, c_hat[0] = 0.855
c_state[0] = 0.704, h_state[0] = 0.058
Time Step 4:
i_gate[0] = 0.300, f_gate[0] = 0.594, o_gate[0] = 0.090, c_hat[0] = 0.860
c_state[0] = 0.676, h_state[0] = 0.053
Backward Time Step 4:
Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.002
Gradient do_[0] = 0.238
Backward Time Step 3:
Gradient di[0] = 0.006, df[0] = 0.005, dc_hat[0] = 0.003
Gradient do_[0] = 0.318
Backward Time Step 2:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.010
Gradient do_[0] = 0.458
Backward Time Step 1:
Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.018
Gradient do_[0] = 0.453
Backward Time Step 0:
Gradient di[0] = 0.027, df[0] = 0.022, dc_hat[0] = 0.037
Gradient do_[0] = 0.294
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.136, c_hat[0] = 0.719
c_state[0] = 0.376, h_state[0] = 0.049
Time Step 1:
i_gate[0] = 0.442, f_gate[0] = 0.661, o_gate[0] = 0.110, c_hat[0] = 0.768
c_state[0] = 0.588, h_state[0] = 0.058
Time Step 2:
i_gate[0] = 0.388, f_gate[0] = 0.635, o_gate[0] = 0.098, c_hat[0] = 0.793
c_state[0] = 0.681, h_state[0] = 0.058
Time Step 3:
i_gate[0] = 0.340, f_gate[0] = 0.603, o_gate[0] = 0.093, c_hat[0] = 0.857
c_state[0] = 0.702, h_state[0] = 0.056
Time Step 4:
i_gate[0] = 0.302, f_gate[0] = 0.585, o_gate[0] = 0.087, c_hat[0] = 0.864
c_state[0] = 0.672, h_state[0] = 0.051
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002
Gradient do_[0] = -0.241
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.002
Gradient do_[0] = -0.264
Backward Time Step 2:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = -0.053
Backward Time Step 1:
Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.010
Gradient do_[0] = 0.176
Backward Time Step 0:
Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.030
Gradient do_[0] = 0.216
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.142, c_hat[0] = 0.720
c_state[0] = 0.377, h_state[0] = 0.051
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.662, o_gate[0] = 0.116, c_hat[0] = 0.771
c_state[0] = 0.588, h_state[0] = 0.061
Time Step 2:
i_gate[0] = 0.384, f_gate[0] = 0.638, o_gate[0] = 0.105, c_hat[0] = 0.797
c_state[0] = 0.681, h_state[0] = 0.062
Time Step 3:
i_gate[0] = 0.334, f_gate[0] = 0.608, o_gate[0] = 0.100, c_hat[0] = 0.860
c_state[0] = 0.701, h_state[0] = 0.060
Time Step 4:
i_gate[0] = 0.295, f_gate[0] = 0.593, o_gate[0] = 0.095, c_hat[0] = 0.865
c_state[0] = 0.671, h_state[0] = 0.056
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002
Gradient do_[0] = -0.242
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = -0.005, dc_hat[0] = -0.003
Gradient do_[0] = -0.337
Backward Time Step 2:
Gradient di[0] = 0.000, df[0] = 0.000, dc_hat[0] = 0.000
Gradient do_[0] = -0.159
Backward Time Step 1:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.008
Gradient do_[0] = 0.090
Backward Time Step 0:
Gradient di[0] = 0.021, df[0] = 0.017, dc_hat[0] = 0.029
Gradient do_[0] = 0.187
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.144, c_hat[0] = 0.720
c_state[0] = 0.377, h_state[0] = 0.052
Time Step 1:
i_gate[0] = 0.439, f_gate[0] = 0.663, o_gate[0] = 0.117, c_hat[0] = 0.769
c_state[0] = 0.587, h_state[0] = 0.062
Time Step 2:
i_gate[0] = 0.382, f_gate[0] = 0.640, o_gate[0] = 0.106, c_hat[0] = 0.794
c_state[0] = 0.679, h_state[0] = 0.063
Time Step 3:
i_gate[0] = 0.330, f_gate[0] = 0.613, o_gate[0] = 0.102, c_hat[0] = 0.857
c_state[0] = 0.699, h_state[0] = 0.062
Time Step 4:
i_gate[0] = 0.292, f_gate[0] = 0.602, o_gate[0] = 0.098, c_hat[0] = 0.860
c_state[0] = 0.672, h_state[0] = 0.058
Backward Time Step 4:
Gradient di[0] = -0.001, df[0] = -0.001, dc_hat[0] = -0.000
Gradient do_[0] = -0.055
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = -0.001, dc_hat[0] = -0.001
Gradient do_[0] = -0.099
Backward Time Step 2:
Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.004
Gradient do_[0] = 0.052
Backward Time Step 1:
Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.011
Gradient do_[0] = 0.208
Backward Time Step 0:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032
Gradient do_[0] = 0.220
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.143, c_hat[0] = 0.720
c_state[0] = 0.377, h_state[0] = 0.051
Time Step 1:
i_gate[0] = 0.438, f_gate[0] = 0.664, o_gate[0] = 0.116, c_hat[0] = 0.768
c_state[0] = 0.586, h_state[0] = 0.061
Time Step 2:
i_gate[0] = 0.380, f_gate[0] = 0.641, o_gate[0] = 0.105, c_hat[0] = 0.792
c_state[0] = 0.676, h_state[0] = 0.062
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.615, o_gate[0] = 0.101, c_hat[0] = 0.854
c_state[0] = 0.696, h_state[0] = 0.061
Time Step 4:
i_gate[0] = 0.289, f_gate[0] = 0.606, o_gate[0] = 0.098, c_hat[0] = 0.857
c_state[0] = 0.669, h_state[0] = 0.057
Backward Time Step 4:
Gradient di[0] = 0.008, df[0] = 0.007, dc_hat[0] = 0.004
Gradient do_[0] = 0.402
Backward Time Step 3:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.006
Gradient do_[0] = 0.643
Backward Time Step 2:
Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.017
Gradient do_[0] = 0.855
Backward Time Step 1:
Gradient di[0] = 0.028, df[0] = 0.022, dc_hat[0] = 0.027
Gradient do_[0] = 0.766
Backward Time Step 0:
Gradient di[0] = 0.034, df[0] = 0.028, dc_hat[0] = 0.048
Gradient do_[0] = 0.405
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.137, c_hat[0] = 0.719
c_state[0] = 0.375, h_state[0] = 0.049
Time Step 1:
i_gate[0] = 0.438, f_gate[0] = 0.662, o_gate[0] = 0.110, c_hat[0] = 0.766
c_state[0] = 0.584, h_state[0] = 0.058
Time Step 2:
i_gate[0] = 0.382, f_gate[0] = 0.637, o_gate[0] = 0.099, c_hat[0] = 0.791
c_state[0] = 0.674, h_state[0] = 0.058
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.607, o_gate[0] = 0.094, c_hat[0] = 0.855
c_state[0] = 0.692, h_state[0] = 0.057
Time Step 4:
i_gate[0] = 0.292, f_gate[0] = 0.594, o_gate[0] = 0.090, c_hat[0] = 0.860
c_state[0] = 0.663, h_state[0] = 0.052
Backward Time Step 4:
Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.005
Gradient do_[0] = 0.606
Backward Time Step 3:
Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.008
Gradient do_[0] = 0.892
Backward Time Step 2:
Gradient di[0] = 0.025, df[0] = 0.020, dc_hat[0] = 0.019
Gradient do_[0] = 1.045
Backward Time Step 1:
Gradient di[0] = 0.029, df[0] = 0.023, dc_hat[0] = 0.028
Gradient do_[0] = 0.845
Backward Time Step 0:
Gradient di[0] = 0.034, df[0] = 0.028, dc_hat[0] = 0.047
Gradient do_[0] = 0.420
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.132, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.660, o_gate[0] = 0.105, c_hat[0] = 0.765
c_state[0] = 0.585, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.386, f_gate[0] = 0.634, o_gate[0] = 0.094, c_hat[0] = 0.791
c_state[0] = 0.676, h_state[0] = 0.055
Time Step 3:
i_gate[0] = 0.337, f_gate[0] = 0.602, o_gate[0] = 0.089, c_hat[0] = 0.856
c_state[0] = 0.695, h_state[0] = 0.054
Time Step 4:
i_gate[0] = 0.299, f_gate[0] = 0.585, o_gate[0] = 0.084, c_hat[0] = 0.862
c_state[0] = 0.665, h_state[0] = 0.049
Backward Time Step 4:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.005
Gradient do_[0] = 0.732
Backward Time Step 3:
Gradient di[0] = 0.016, df[0] = 0.012, dc_hat[0] = 0.008
Gradient do_[0] = 0.900
Backward Time Step 2:
Gradient di[0] = 0.022, df[0] = 0.017, dc_hat[0] = 0.017
Gradient do_[0] = 0.953
Backward Time Step 1:
Gradient di[0] = 0.024, df[0] = 0.019, dc_hat[0] = 0.023
Gradient do_[0] = 0.698
Backward Time Step 0:
Gradient di[0] = 0.029, df[0] = 0.024, dc_hat[0] = 0.041
Gradient do_[0] = 0.359
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.130, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.660, o_gate[0] = 0.105, c_hat[0] = 0.769
c_state[0] = 0.586, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.385, f_gate[0] = 0.632, o_gate[0] = 0.094, c_hat[0] = 0.797
c_state[0] = 0.677, h_state[0] = 0.055
Time Step 3:
i_gate[0] = 0.335, f_gate[0] = 0.599, o_gate[0] = 0.089, c_hat[0] = 0.862
c_state[0] = 0.694, h_state[0] = 0.053
Time Step 4:
i_gate[0] = 0.296, f_gate[0] = 0.582, o_gate[0] = 0.084, c_hat[0] = 0.869
c_state[0] = 0.662, h_state[0] = 0.048
Backward Time Step 4:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.006
Gradient do_[0] = 0.814
Backward Time Step 3:
Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.008
Gradient do_[0] = 0.998
Backward Time Step 2:
Gradient di[0] = 0.024, df[0] = 0.019, dc_hat[0] = 0.018
Gradient do_[0] = 1.057
Backward Time Step 1:
Gradient di[0] = 0.025, df[0] = 0.020, dc_hat[0] = 0.024
Gradient do_[0] = 0.757
Backward Time Step 0:
Gradient di[0] = 0.030, df[0] = 0.025, dc_hat[0] = 0.042
Gradient do_[0] = 0.381
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.125, c_hat[0] = 0.718
c_state[0] = 0.374, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.441, f_gate[0] = 0.657, o_gate[0] = 0.099, c_hat[0] = 0.769
c_state[0] = 0.585, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.388, f_gate[0] = 0.628, o_gate[0] = 0.089, c_hat[0] = 0.797
c_state[0] = 0.677, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.340, f_gate[0] = 0.593, o_gate[0] = 0.083, c_hat[0] = 0.862
c_state[0] = 0.694, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.302, f_gate[0] = 0.572, o_gate[0] = 0.077, c_hat[0] = 0.871
c_state[0] = 0.660, h_state[0] = 0.045
Backward Time Step 4:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.000
Gradient do_[0] = 0.044
Backward Time Step 3:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.000
Gradient do_[0] = 0.045
Backward Time Step 2:
Gradient di[0] = 0.007, df[0] = 0.005, dc_hat[0] = 0.005
Gradient do_[0] = 0.200
Backward Time Step 1:
Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.011
Gradient do_[0] = 0.291
Backward Time Step 0:
Gradient di[0] = 0.021, df[0] = 0.017, dc_hat[0] = 0.030
Gradient do_[0] = 0.243
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.128, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.659, o_gate[0] = 0.102, c_hat[0] = 0.770
c_state[0] = 0.586, h_state[0] = 0.054
Time Step 2:
i_gate[0] = 0.385, f_gate[0] = 0.630, o_gate[0] = 0.091, c_hat[0] = 0.799
c_state[0] = 0.677, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.336, f_gate[0] = 0.597, o_gate[0] = 0.086, c_hat[0] = 0.864
c_state[0] = 0.694, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.297, f_gate[0] = 0.579, o_gate[0] = 0.081, c_hat[0] = 0.872
c_state[0] = 0.661, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.005
Gradient do_[0] = 0.809
Backward Time Step 3:
Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.008
Gradient do_[0] = 0.947
Backward Time Step 2:
Gradient di[0] = 0.022, df[0] = 0.017, dc_hat[0] = 0.016
Gradient do_[0] = 0.989
Backward Time Step 1:
Gradient di[0] = 0.023, df[0] = 0.018, dc_hat[0] = 0.022
Gradient do_[0] = 0.703
Backward Time Step 0:
Gradient di[0] = 0.028, df[0] = 0.023, dc_hat[0] = 0.040
Gradient do_[0] = 0.364
Time Step 0:
i_gate[0] = 0.521, f_gate[0] = 0.686, o_gate[0] = 0.122, c_hat[0] = 0.717
c_state[0] = 0.374, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.441, f_gate[0] = 0.656, o_gate[0] = 0.097, c_hat[0] = 0.770
c_state[0] = 0.585, h_state[0] = 0.051
Time Step 2:
i_gate[0] = 0.388, f_gate[0] = 0.626, o_gate[0] = 0.086, c_hat[0] = 0.799
c_state[0] = 0.676, h_state[0] = 0.051
Time Step 3:
i_gate[0] = 0.340, f_gate[0] = 0.590, o_gate[0] = 0.081, c_hat[0] = 0.865
c_state[0] = 0.693, h_state[0] = 0.049
Time Step 4:
i_gate[0] = 0.303, f_gate[0] = 0.568, o_gate[0] = 0.075, c_hat[0] = 0.873
c_state[0] = 0.658, h_state[0] = 0.043
Backward Time Step 4:
Gradient di[0] = -0.007, df[0] = -0.005, dc_hat[0] = -0.003
Gradient do_[0] = -0.428
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = -0.004, dc_hat[0] = -0.003
Gradient do_[0] = -0.370
Backward Time Step 2:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = -0.096
Backward Time Step 1:
Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.009
Gradient do_[0] = 0.189
Backward Time Step 0:
Gradient di[0] = 0.019, df[0] = 0.016, dc_hat[0] = 0.028
Gradient do_[0] = 0.219
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.127, c_hat[0] = 0.718
c_state[0] = 0.374, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.658, o_gate[0] = 0.102, c_hat[0] = 0.770
c_state[0] = 0.585, h_state[0] = 0.054
Time Step 2:
i_gate[0] = 0.385, f_gate[0] = 0.630, o_gate[0] = 0.091, c_hat[0] = 0.800
c_state[0] = 0.676, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.336, f_gate[0] = 0.596, o_gate[0] = 0.086, c_hat[0] = 0.864
c_state[0] = 0.693, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.297, f_gate[0] = 0.577, o_gate[0] = 0.081, c_hat[0] = 0.873
c_state[0] = 0.660, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.007, df[0] = 0.006, dc_hat[0] = 0.003
Gradient do_[0] = 0.411
Backward Time Step 3:
Gradient di[0] = 0.009, df[0] = 0.006, dc_hat[0] = 0.004
Gradient do_[0] = 0.480
Backward Time Step 2:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.011
Gradient do_[0] = 0.582
Backward Time Step 1:
Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.017
Gradient do_[0] = 0.494
Backward Time Step 0:
Gradient di[0] = 0.025, df[0] = 0.021, dc_hat[0] = 0.035
Gradient do_[0] = 0.303
Time Step 0:
i_gate[0] = 0.521, f_gate[0] = 0.686, o_gate[0] = 0.123, c_hat[0] = 0.717
c_state[0] = 0.374, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.656, o_gate[0] = 0.098, c_hat[0] = 0.771
c_state[0] = 0.585, h_state[0] = 0.051
Time Step 2:
i_gate[0] = 0.387, f_gate[0] = 0.625, o_gate[0] = 0.087, c_hat[0] = 0.801
c_state[0] = 0.675, h_state[0] = 0.051
Time Step 3:
i_gate[0] = 0.338, f_gate[0] = 0.589, o_gate[0] = 0.082, c_hat[0] = 0.866
c_state[0] = 0.690, h_state[0] = 0.049
Time Step 4:
i_gate[0] = 0.301, f_gate[0] = 0.567, o_gate[0] = 0.076, c_hat[0] = 0.875
c_state[0] = 0.655, h_state[0] = 0.044
Backward Time Step 4:
Gradient di[0] = -0.012, df[0] = -0.009, dc_hat[0] = -0.005
Gradient do_[0] = -0.762
Backward Time Step 3:
Gradient di[0] = -0.011, df[0] = -0.008, dc_hat[0] = -0.005
Gradient do_[0] = -0.678
Backward Time Step 2:
Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.002
Gradient do_[0] = -0.326
Backward Time Step 1:
Gradient di[0] = 0.007, df[0] = 0.006, dc_hat[0] = 0.007
Gradient do_[0] = 0.102
Backward Time Step 0:
Gradient di[0] = 0.018, df[0] = 0.015, dc_hat[0] = 0.026
Gradient do_[0] = 0.197
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.128, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.439, f_gate[0] = 0.659, o_gate[0] = 0.103, c_hat[0] = 0.771
c_state[0] = 0.585, h_state[0] = 0.054
Time Step 2:
i_gate[0] = 0.384, f_gate[0] = 0.630, o_gate[0] = 0.092, c_hat[0] = 0.801
c_state[0] = 0.675, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.334, f_gate[0] = 0.596, o_gate[0] = 0.088, c_hat[0] = 0.865
c_state[0] = 0.691, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.295, f_gate[0] = 0.578, o_gate[0] = 0.082, c_hat[0] = 0.873
c_state[0] = 0.657, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = -0.007, df[0] = -0.005, dc_hat[0] = -0.003
Gradient do_[0] = -0.387
Backward Time Step 3:
Gradient di[0] = -0.008, df[0] = -0.006, dc_hat[0] = -0.004
Gradient do_[0] = -0.469
Backward Time Step 2:
Gradient di[0] = -0.002, df[0] = -0.001, dc_hat[0] = -0.001
Gradient do_[0] = -0.255
Backward Time Step 1:
Gradient di[0] = 0.006, df[0] = 0.005, dc_hat[0] = 0.006
Gradient do_[0] = 0.057
Backward Time Step 0:
Gradient di[0] = 0.018, df[0] = 0.015, dc_hat[0] = 0.025
Gradient do_[0] = 0.174
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.130, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.438, f_gate[0] = 0.660, o_gate[0] = 0.104, c_hat[0] = 0.769
c_state[0] = 0.584, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.382, f_gate[0] = 0.633, o_gate[0] = 0.094, c_hat[0] = 0.798
c_state[0] = 0.674, h_state[0] = 0.055
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.601, o_gate[0] = 0.090, c_hat[0] = 0.862
c_state[0] = 0.690, h_state[0] = 0.054
Time Step 4:
i_gate[0] = 0.292, f_gate[0] = 0.586, o_gate[0] = 0.085, c_hat[0] = 0.869
c_state[0] = 0.659, h_state[0] = 0.049
Backward Time Step 4:
Gradient di[0] = 0.054, df[0] = 0.043, dc_hat[0] = 0.021
Gradient do_[0] = 3.052
Backward Time Step 3:
Gradient di[0] = 0.082, df[0] = 0.062, dc_hat[0] = 0.036
Gradient do_[0] = 4.449
Backward Time Step 2:
Gradient di[0] = 0.094, df[0] = 0.073, dc_hat[0] = 0.069
Gradient do_[0] = 4.583
Backward Time Step 1:
Gradient di[0] = 0.088, df[0] = 0.069, dc_hat[0] = 0.083
Gradient do_[0] = 3.049
Backward Time Step 0:
Gradient di[0] = 0.074, df[0] = 0.061, dc_hat[0] = 0.104
Gradient do_[0] = 1.150
Time Step 0:
i_gate[0] = 0.520, f_gate[0] = 0.685, o_gate[0] = 0.125, c_hat[0] = 0.715
c_state[0] = 0.372, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.657, o_gate[0] = 0.099, c_hat[0] = 0.766
c_state[0] = 0.582, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.387, f_gate[0] = 0.629, o_gate[0] = 0.089, c_hat[0] = 0.794
c_state[0] = 0.674, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.339, f_gate[0] = 0.596, o_gate[0] = 0.084, c_hat[0] = 0.860
c_state[0] = 0.693, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.302, f_gate[0] = 0.578, o_gate[0] = 0.078, c_hat[0] = 0.868
c_state[0] = 0.663, h_state[0] = 0.045
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = -0.003, dc_hat[0] = -0.001
Gradient do_[0] = -0.212
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002
Gradient do_[0] = -0.236
Backward Time Step 2:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = -0.033
Backward Time Step 1:
Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.009
Gradient do_[0] = 0.180
Backward Time Step 0:
Gradient di[0] = 0.019, df[0] = 0.016, dc_hat[0] = 0.028
Gradient do_[0] = 0.212
Time Step 0:
i_gate[0] = 0.520, f_gate[0] = 0.685, o_gate[0] = 0.126, c_hat[0] = 0.715
c_state[0] = 0.372, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.658, o_gate[0] = 0.101, c_hat[0] = 0.764
c_state[0] = 0.581, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.387, f_gate[0] = 0.631, o_gate[0] = 0.090, c_hat[0] = 0.792
c_state[0] = 0.673, h_state[0] = 0.053
Time Step 3:
i_gate[0] = 0.339, f_gate[0] = 0.598, o_gate[0] = 0.085, c_hat[0] = 0.858
c_state[0] = 0.693, h_state[0] = 0.051
Time Step 4:
i_gate[0] = 0.301, f_gate[0] = 0.581, o_gate[0] = 0.080, c_hat[0] = 0.865
c_state[0] = 0.664, h_state[0] = 0.046
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.001
Gradient do_[0] = -0.177
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002
Gradient do_[0] = -0.211
Backward Time Step 2:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = -0.020
Backward Time Step 1:
Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.009
Gradient do_[0] = 0.179
Backward Time Step 0:
Gradient di[0] = 0.020, df[0] = 0.016, dc_hat[0] = 0.028
Gradient do_[0] = 0.210
Time Step 0:
i_gate[0] = 0.521, f_gate[0] = 0.686, o_gate[0] = 0.127, c_hat[0] = 0.715
c_state[0] = 0.372, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.658, o_gate[0] = 0.102, c_hat[0] = 0.764
c_state[0] = 0.581, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.387, f_gate[0] = 0.631, o_gate[0] = 0.091, c_hat[0] = 0.791
c_state[0] = 0.673, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.339, f_gate[0] = 0.599, o_gate[0] = 0.086, c_hat[0] = 0.857
c_state[0] = 0.693, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.301, f_gate[0] = 0.583, o_gate[0] = 0.081, c_hat[0] = 0.863
c_state[0] = 0.664, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = -0.018, df[0] = -0.014, dc_hat[0] = -0.007
Gradient do_[0] = -1.049
Backward Time Step 3:
Gradient di[0] = -0.023, df[0] = -0.018, dc_hat[0] = -0.011
Gradient do_[0] = -1.330
Backward Time Step 2:
Gradient di[0] = -0.016, df[0] = -0.013, dc_hat[0] = -0.013
Gradient do_[0] = -1.026
Backward Time Step 1:
Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.005
Gradient do_[0] = -0.377
Backward Time Step 0:
Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.015
Gradient do_[0] = 0.045
Time Step 0:
i_gate[0] = 0.521, f_gate[0] = 0.686, o_gate[0] = 0.133, c_hat[0] = 0.716
c_state[0] = 0.373, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.438, f_gate[0] = 0.661, o_gate[0] = 0.107, c_hat[0] = 0.763
c_state[0] = 0.581, h_state[0] = 0.056
Time Step 2:
i_gate[0] = 0.384, f_gate[0] = 0.636, o_gate[0] = 0.096, c_hat[0] = 0.790
c_state[0] = 0.673, h_state[0] = 0.057
Time Step 3:
i_gate[0] = 0.334, f_gate[0] = 0.607, o_gate[0] = 0.092, c_hat[0] = 0.855
c_state[0] = 0.693, h_state[0] = 0.055
Time Step 4:
i_gate[0] = 0.296, f_gate[0] = 0.594, o_gate[0] = 0.087, c_hat[0] = 0.860
c_state[0] = 0.667, h_state[0] = 0.051
Backward Time Step 4:
Gradient di[0] = 0.036, df[0] = 0.029, dc_hat[0] = 0.016
Gradient do_[0] = 2.047
Backward Time Step 3:
Gradient di[0] = 0.061, df[0] = 0.047, dc_hat[0] = 0.029
Gradient do_[0] = 3.284
Backward Time Step 2:
Gradient di[0] = 0.075, df[0] = 0.059, dc_hat[0] = 0.058
Gradient do_[0] = 3.545
Backward Time Step 1:
Gradient di[0] = 0.075, df[0] = 0.059, dc_hat[0] = 0.073
Gradient do_[0] = 2.526
Backward Time Step 0:
Gradient di[0] = 0.066, df[0] = 0.055, dc_hat[0] = 0.094
Gradient do_[0] = 0.993
Time Step 0:
i_gate[0] = 0.520, f_gate[0] = 0.685, o_gate[0] = 0.127, c_hat[0] = 0.714
c_state[0] = 0.371, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.441, f_gate[0] = 0.659, o_gate[0] = 0.101, c_hat[0] = 0.760
c_state[0] = 0.580, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.390, f_gate[0] = 0.633, o_gate[0] = 0.091, c_hat[0] = 0.786
c_state[0] = 0.673, h_state[0] = 0.053
Time Step 3:
i_gate[0] = 0.342, f_gate[0] = 0.603, o_gate[0] = 0.086, c_hat[0] = 0.852
c_state[0] = 0.698, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.306, f_gate[0] = 0.587, o_gate[0] = 0.080, c_hat[0] = 0.859
c_state[0] = 0.672, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = -0.008, df[0] = -0.007, dc_hat[0] = -0.004
Gradient do_[0] = -0.516
Backward Time Step 3:
Gradient di[0] = -0.011, df[0] = -0.009, dc_hat[0] = -0.006
Gradient do_[0] = -0.654
Backward Time Step 2:
Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.004
Gradient do_[0] = -0.415
Backward Time Step 1:
Gradient di[0] = 0.003, df[0] = 0.003, dc_hat[0] = 0.003
Gradient do_[0] = -0.041
Backward Time Step 0:
Gradient di[0] = 0.016, df[0] = 0.013, dc_hat[0] = 0.023
Gradient do_[0] = 0.147
Time Step 0:
i_gate[0] = 0.520, f_gate[0] = 0.685, o_gate[0] = 0.131, c_hat[0] = 0.714
c_state[0] = 0.371, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.661, o_gate[0] = 0.105, c_hat[0] = 0.759
c_state[0] = 0.580, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.388, f_gate[0] = 0.637, o_gate[0] = 0.094, c_hat[0] = 0.784
c_state[0] = 0.674, h_state[0] = 0.055
Time Step 3:
i_gate[0] = 0.341, f_gate[0] = 0.609, o_gate[0] = 0.090, c_hat[0] = 0.849
c_state[0] = 0.699, h_state[0] = 0.054
Time Step 4:
i_gate[0] = 0.304, f_gate[0] = 0.597, o_gate[0] = 0.084, c_hat[0] = 0.854
c_state[0] = 0.677, h_state[0] = 0.050
Backward Time Step 4:
Gradient di[0] = 0.032, df[0] = 0.025, dc_hat[0] = 0.015
Gradient do_[0] = 1.885
Backward Time Step 3:
Gradient di[0] = 0.051, df[0] = 0.039, dc_hat[0] = 0.025
Gradient do_[0] = 2.845
Backward Time Step 2:
Gradient di[0] = 0.061, df[0] = 0.048, dc_hat[0] = 0.049
Gradient do_[0] = 2.960
Backward Time Step 1:
Gradient di[0] = 0.060, df[0] = 0.048, dc_hat[0] = 0.060
Gradient do_[0] = 2.036
Backward Time Step 0:
Gradient di[0] = 0.054, df[0] = 0.045, dc_hat[0] = 0.077
Gradient do_[0] = 0.797
Time Step 0:
i_gate[0] = 0.519, f_gate[0] = 0.685, o_gate[0] = 0.125, c_hat[0] = 0.713
c_state[0] = 0.370, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.443, f_gate[0] = 0.659, o_gate[0] = 0.100, c_hat[0] = 0.757
c_state[0] = 0.579, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.394, f_gate[0] = 0.634, o_gate[0] = 0.089, c_hat[0] = 0.780
c_state[0] = 0.675, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.349, f_gate[0] = 0.605, o_gate[0] = 0.084, c_hat[0] = 0.847
c_state[0] = 0.704, h_state[0] = 0.051
Time Step 4:
i_gate[0] = 0.313, f_gate[0] = 0.590, o_gate[0] = 0.078, c_hat[0] = 0.853
c_state[0] = 0.682, h_state[0] = 0.046
Backward Time Step 4:
Gradient di[0] = -0.051, df[0] = -0.040, dc_hat[0] = -0.024
Gradient do_[0] = -3.259
Backward Time Step 3:
Gradient di[0] = -0.066, df[0] = -0.050, dc_hat[0] = -0.034
Gradient do_[0] = -3.942
Backward Time Step 2:
Gradient di[0] = -0.055, df[0] = -0.043, dc_hat[0] = -0.045
Gradient do_[0] = -3.142
Backward Time Step 1:
Gradient di[0] = -0.032, df[0] = -0.026, dc_hat[0] = -0.033
Gradient do_[0] = -1.414
Backward Time Step 0:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.006
Gradient do_[0] = -0.215
Time Step 0:
i_gate[0] = 0.520, f_gate[0] = 0.685, o_gate[0] = 0.131, c_hat[0] = 0.715
c_state[0] = 0.372, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.441, f_gate[0] = 0.662, o_gate[0] = 0.105, c_hat[0] = 0.758
c_state[0] = 0.581, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.390, f_gate[0] = 0.640, o_gate[0] = 0.095, c_hat[0] = 0.782
c_state[0] = 0.676, h_state[0] = 0.056
Time Step 3:
i_gate[0] = 0.343, f_gate[0] = 0.613, o_gate[0] = 0.090, c_hat[0] = 0.847
c_state[0] = 0.705, h_state[0] = 0.055
Time Step 4:
i_gate[0] = 0.306, f_gate[0] = 0.602, o_gate[0] = 0.085, c_hat[0] = 0.851
c_state[0] = 0.684, h_state[0] = 0.050
Backward Time Step 4:
Gradient di[0] = 0.030, df[0] = 0.024, dc_hat[0] = 0.014
Gradient do_[0] = 1.828
Backward Time Step 3:
Gradient di[0] = 0.050, df[0] = 0.038, dc_hat[0] = 0.025
Gradient do_[0] = 2.803
Backward Time Step 2:
Gradient di[0] = 0.061, df[0] = 0.048, dc_hat[0] = 0.050
Gradient do_[0] = 2.949
Backward Time Step 1:
Gradient di[0] = 0.061, df[0] = 0.048, dc_hat[0] = 0.061
Gradient do_[0] = 2.054
Backward Time Step 0:
Gradient di[0] = 0.054, df[0] = 0.045, dc_hat[0] = 0.077
Gradient do_[0] = 0.805
Time Step 0:
i_gate[0] = 0.520, f_gate[0] = 0.685, o_gate[0] = 0.125, c_hat[0] = 0.713
c_state[0] = 0.370, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.444, f_gate[0] = 0.660, o_gate[0] = 0.100, c_hat[0] = 0.755
c_state[0] = 0.580, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.396, f_gate[0] = 0.637, o_gate[0] = 0.089, c_hat[0] = 0.778
c_state[0] = 0.678, h_state[0] = 0.053
Time Step 3:
i_gate[0] = 0.351, f_gate[0] = 0.609, o_gate[0] = 0.084, c_hat[0] = 0.845
c_state[0] = 0.709, h_state[0] = 0.051
Time Step 4:
i_gate[0] = 0.316, f_gate[0] = 0.595, o_gate[0] = 0.078, c_hat[0] = 0.850
c_state[0] = 0.690, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = -0.000, df[0] = -0.000, dc_hat[0] = -0.000
Gradient do_[0] = -0.023
Backward Time Step 3:
Gradient di[0] = -0.000, df[0] = -0.000, dc_hat[0] = -0.000
Gradient do_[0] = -0.018
Backward Time Step 2:
Gradient di[0] = 0.006, df[0] = 0.005, dc_hat[0] = 0.005
Gradient do_[0] = 0.152
Backward Time Step 1:
Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.012
Gradient do_[0] = 0.267
Backward Time Step 0:
Gradient di[0] = 0.021, df[0] = 0.017, dc_hat[0] = 0.030
Gradient do_[0] = 0.236
Time Step 0:
i_gate[0] = 0.520, f_gate[0] = 0.685, o_gate[0] = 0.125, c_hat[0] = 0.713
c_state[0] = 0.370, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.444, f_gate[0] = 0.660, o_gate[0] = 0.100, c_hat[0] = 0.756
c_state[0] = 0.580, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.397, f_gate[0] = 0.636, o_gate[0] = 0.090, c_hat[0] = 0.779
c_state[0] = 0.678, h_state[0] = 0.053
Time Step 3:
i_gate[0] = 0.353, f_gate[0] = 0.607, o_gate[0] = 0.084, c_hat[0] = 0.845
c_state[0] = 0.710, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.318, f_gate[0] = 0.592, o_gate[0] = 0.078, c_hat[0] = 0.850
c_state[0] = 0.691, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = -0.118, df[0] = -0.091, dc_hat[0] = -0.056
Gradient do_[0] = -7.581
Backward Time Step 3:
Gradient di[0] = -0.147, df[0] = -0.110, dc_hat[0] = -0.077
Gradient do_[0] = -8.762
Backward Time Step 2:
Gradient di[0] = -0.124, df[0] = -0.098, dc_hat[0] = -0.104
Gradient do_[0] = -6.873
Backward Time Step 1:
Gradient di[0] = -0.076, df[0] = -0.060, dc_hat[0] = -0.078
Gradient do_[0] = -3.099
Backward Time Step 0:
Gradient di[0] = -0.025, df[0] = -0.021, dc_hat[0] = -0.036
Gradient do_[0] = -0.600
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.130, c_hat[0] = 0.719
c_state[0] = 0.375, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.444, f_gate[0] = 0.665, o_gate[0] = 0.105, c_hat[0] = 0.762
c_state[0] = 0.588, h_state[0] = 0.056
Time Step 2:
i_gate[0] = 0.394, f_gate[0] = 0.644, o_gate[0] = 0.095, c_hat[0] = 0.784
c_state[0] = 0.687, h_state[0] = 0.057
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.617, o_gate[0] = 0.091, c_hat[0] = 0.849
c_state[0] = 0.719, h_state[0] = 0.056
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.606, o_gate[0] = 0.085, c_hat[0] = 0.852
c_state[0] = 0.701, h_state[0] = 0.052
Backward Time Step 4:
Gradient di[0] = -0.020, df[0] = -0.016, dc_hat[0] = -0.009
Gradient do_[0] = -1.201
Backward Time Step 3:
Gradient di[0] = -0.031, df[0] = -0.023, dc_hat[0] = -0.016
Gradient do_[0] = -1.741
Backward Time Step 2:
Gradient di[0] = -0.026, df[0] = -0.020, dc_hat[0] = -0.021
Gradient do_[0] = -1.501
Backward Time Step 1:
Gradient di[0] = -0.015, df[0] = -0.012, dc_hat[0] = -0.015
Gradient do_[0] = -0.740
Backward Time Step 0:
Gradient di[0] = 0.004, df[0] = 0.004, dc_hat[0] = 0.006
Gradient do_[0] = -0.069
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.136, c_hat[0] = 0.719
c_state[0] = 0.376, h_state[0] = 0.049
Time Step 1:
i_gate[0] = 0.442, f_gate[0] = 0.667, o_gate[0] = 0.111, c_hat[0] = 0.762
c_state[0] = 0.588, h_state[0] = 0.058
Time Step 2:
i_gate[0] = 0.390, f_gate[0] = 0.647, o_gate[0] = 0.101, c_hat[0] = 0.784
c_state[0] = 0.686, h_state[0] = 0.060
Time Step 3:
i_gate[0] = 0.342, f_gate[0] = 0.623, o_gate[0] = 0.097, c_hat[0] = 0.847
c_state[0] = 0.717, h_state[0] = 0.060
Time Step 4:
i_gate[0] = 0.306, f_gate[0] = 0.616, o_gate[0] = 0.092, c_hat[0] = 0.848
c_state[0] = 0.702, h_state[0] = 0.056
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.002
Gradient do_[0] = -0.297
Backward Time Step 3:
Gradient di[0] = -0.009, df[0] = -0.007, dc_hat[0] = -0.005
Gradient do_[0] = -0.498
Backward Time Step 2:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.003
Gradient do_[0] = -0.368
Backward Time Step 1:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = -0.088
Backward Time Step 0:
Gradient di[0] = 0.015, df[0] = 0.013, dc_hat[0] = 0.022
Gradient do_[0] = 0.118
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.137, c_hat[0] = 0.719
c_state[0] = 0.376, h_state[0] = 0.049
Time Step 1:
i_gate[0] = 0.441, f_gate[0] = 0.668, o_gate[0] = 0.111, c_hat[0] = 0.760
c_state[0] = 0.586, h_state[0] = 0.059
Time Step 2:
i_gate[0] = 0.387, f_gate[0] = 0.650, o_gate[0] = 0.101, c_hat[0] = 0.781
c_state[0] = 0.684, h_state[0] = 0.060
Time Step 3:
i_gate[0] = 0.338, f_gate[0] = 0.628, o_gate[0] = 0.098, c_hat[0] = 0.844
c_state[0] = 0.715, h_state[0] = 0.060
Time Step 4:
i_gate[0] = 0.303, f_gate[0] = 0.625, o_gate[0] = 0.094, c_hat[0] = 0.843
c_state[0] = 0.702, h_state[0] = 0.057
Backward Time Step 4:
Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.009
Gradient do_[0] = 1.004
Backward Time Step 3:
Gradient di[0] = 0.033, df[0] = 0.026, dc_hat[0] = 0.017
Gradient do_[0] = 1.789
Backward Time Step 2:
Gradient di[0] = 0.048, df[0] = 0.038, dc_hat[0] = 0.039
Gradient do_[0] = 2.196
Backward Time Step 1:
Gradient di[0] = 0.056, df[0] = 0.044, dc_hat[0] = 0.055
Gradient do_[0] = 1.787
Backward Time Step 0:
Gradient di[0] = 0.055, df[0] = 0.045, dc_hat[0] = 0.077
Gradient do_[0] = 0.781
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.131, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.443, f_gate[0] = 0.667, o_gate[0] = 0.106, c_hat[0] = 0.758
c_state[0] = 0.586, h_state[0] = 0.056
Time Step 2:
i_gate[0] = 0.393, f_gate[0] = 0.648, o_gate[0] = 0.096, c_hat[0] = 0.778
c_state[0] = 0.686, h_state[0] = 0.057
Time Step 3:
i_gate[0] = 0.346, f_gate[0] = 0.625, o_gate[0] = 0.091, c_hat[0] = 0.842
c_state[0] = 0.720, h_state[0] = 0.056
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.618, o_gate[0] = 0.086, c_hat[0] = 0.843
c_state[0] = 0.707, h_state[0] = 0.052
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.001
Gradient do_[0] = -0.115
Backward Time Step 3:
Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.002
Gradient do_[0] = -0.167
Backward Time Step 2:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = -0.006
Backward Time Step 1:
Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.009
Gradient do_[0] = 0.166
Backward Time Step 0:
Gradient di[0] = 0.020, df[0] = 0.016, dc_hat[0] = 0.028
Gradient do_[0] = 0.203
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.130, c_hat[0] = 0.717
c_state[0] = 0.375, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.444, f_gate[0] = 0.666, o_gate[0] = 0.105, c_hat[0] = 0.757
c_state[0] = 0.586, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.395, f_gate[0] = 0.648, o_gate[0] = 0.095, c_hat[0] = 0.777
c_state[0] = 0.686, h_state[0] = 0.056
Time Step 3:
i_gate[0] = 0.349, f_gate[0] = 0.623, o_gate[0] = 0.090, c_hat[0] = 0.841
c_state[0] = 0.721, h_state[0] = 0.056
Time Step 4:
i_gate[0] = 0.314, f_gate[0] = 0.615, o_gate[0] = 0.085, c_hat[0] = 0.842
c_state[0] = 0.708, h_state[0] = 0.052
Backward Time Step 4:
Gradient di[0] = -0.010, df[0] = -0.008, dc_hat[0] = -0.005
Gradient do_[0] = -0.587
Backward Time Step 3:
Gradient di[0] = -0.015, df[0] = -0.011, dc_hat[0] = -0.008
Gradient do_[0] = -0.848
Backward Time Step 2:
Gradient di[0] = -0.010, df[0] = -0.008, dc_hat[0] = -0.008
Gradient do_[0] = -0.663
Backward Time Step 1:
Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.002
Gradient do_[0] = -0.249
Backward Time Step 0:
Gradient di[0] = 0.012, df[0] = 0.010, dc_hat[0] = 0.017
Gradient do_[0] = 0.075
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.134, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.048
Time Step 1:
i_gate[0] = 0.444, f_gate[0] = 0.668, o_gate[0] = 0.108, c_hat[0] = 0.756
c_state[0] = 0.586, h_state[0] = 0.057
Time Step 2:
i_gate[0] = 0.395, f_gate[0] = 0.651, o_gate[0] = 0.098, c_hat[0] = 0.774
c_state[0] = 0.687, h_state[0] = 0.059
Time Step 3:
i_gate[0] = 0.349, f_gate[0] = 0.629, o_gate[0] = 0.094, c_hat[0] = 0.837
c_state[0] = 0.725, h_state[0] = 0.058
Time Step 4:
i_gate[0] = 0.315, f_gate[0] = 0.624, o_gate[0] = 0.089, c_hat[0] = 0.836
c_state[0] = 0.715, h_state[0] = 0.055
Backward Time Step 4:
Gradient di[0] = -0.008, df[0] = -0.006, dc_hat[0] = -0.004
Gradient do_[0] = -0.465
Backward Time Step 3:
Gradient di[0] = -0.013, df[0] = -0.010, dc_hat[0] = -0.007
Gradient do_[0] = -0.690
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = -0.006, dc_hat[0] = -0.006
Gradient do_[0] = -0.531
Backward Time Step 1:
Gradient di[0] = -0.000, df[0] = -0.000, dc_hat[0] = -0.000
Gradient do_[0] = -0.180
Backward Time Step 0:
Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.020
Gradient do_[0] = 0.095
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.136, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.049
Time Step 1:
i_gate[0] = 0.443, f_gate[0] = 0.669, o_gate[0] = 0.110, c_hat[0] = 0.755
c_state[0] = 0.585, h_state[0] = 0.058
Time Step 2:
i_gate[0] = 0.394, f_gate[0] = 0.654, o_gate[0] = 0.101, c_hat[0] = 0.771
c_state[0] = 0.686, h_state[0] = 0.060
Time Step 3:
i_gate[0] = 0.348, f_gate[0] = 0.634, o_gate[0] = 0.097, c_hat[0] = 0.833
c_state[0] = 0.725, h_state[0] = 0.060
Time Step 4:
i_gate[0] = 0.314, f_gate[0] = 0.631, o_gate[0] = 0.092, c_hat[0] = 0.830
c_state[0] = 0.717, h_state[0] = 0.057
Backward Time Step 4:
Gradient di[0] = -0.006, df[0] = -0.005, dc_hat[0] = -0.003
Gradient do_[0] = -0.322
Backward Time Step 3:
Gradient di[0] = -0.009, df[0] = -0.007, dc_hat[0] = -0.005
Gradient do_[0] = -0.489
Backward Time Step 2:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.003
Gradient do_[0] = -0.350
Backward Time Step 1:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.003
Gradient do_[0] = -0.073
Backward Time Step 0:
Gradient di[0] = 0.016, df[0] = 0.013, dc_hat[0] = 0.022
Gradient do_[0] = 0.124
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.138, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.049
Time Step 1:
i_gate[0] = 0.443, f_gate[0] = 0.669, o_gate[0] = 0.111, c_hat[0] = 0.753
c_state[0] = 0.585, h_state[0] = 0.059
Time Step 2:
i_gate[0] = 0.392, f_gate[0] = 0.655, o_gate[0] = 0.102, c_hat[0] = 0.769
c_state[0] = 0.684, h_state[0] = 0.060
Time Step 3:
i_gate[0] = 0.346, f_gate[0] = 0.636, o_gate[0] = 0.098, c_hat[0] = 0.830
c_state[0] = 0.723, h_state[0] = 0.061
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.635, o_gate[0] = 0.094, c_hat[0] = 0.825
c_state[0] = 0.716, h_state[0] = 0.058
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.003
Gradient do_[0] = -0.307
Backward Time Step 3:
Gradient di[0] = -0.009, df[0] = -0.007, dc_hat[0] = -0.005
Gradient do_[0] = -0.475
Backward Time Step 2:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.004
Gradient do_[0] = -0.352
Backward Time Step 1:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = -0.086
Backward Time Step 0:
Gradient di[0] = 0.015, df[0] = 0.013, dc_hat[0] = 0.022
Gradient do_[0] = 0.115
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.139, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.050
Time Step 1:
i_gate[0] = 0.441, f_gate[0] = 0.670, o_gate[0] = 0.112, c_hat[0] = 0.752
c_state[0] = 0.584, h_state[0] = 0.059
Time Step 2:
i_gate[0] = 0.391, f_gate[0] = 0.657, o_gate[0] = 0.102, c_hat[0] = 0.766
c_state[0] = 0.682, h_state[0] = 0.061
Time Step 3:
i_gate[0] = 0.344, f_gate[0] = 0.639, o_gate[0] = 0.099, c_hat[0] = 0.827
c_state[0] = 0.721, h_state[0] = 0.061
Time Step 4:
i_gate[0] = 0.310, f_gate[0] = 0.640, o_gate[0] = 0.095, c_hat[0] = 0.820
c_state[0] = 0.715, h_state[0] = 0.058
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.001
Gradient do_[0] = -0.116
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002
Gradient do_[0] = -0.184
Backward Time Step 2:
Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.002
Gradient do_[0] = -0.045
Backward Time Step 1:
Gradient di[0] = 0.008, df[0] = 0.007, dc_hat[0] = 0.009
Gradient do_[0] = 0.131
Backward Time Step 0:
Gradient di[0] = 0.020, df[0] = 0.016, dc_hat[0] = 0.028
Gradient do_[0] = 0.187
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.137, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.049
Time Step 1:
i_gate[0] = 0.442, f_gate[0] = 0.670, o_gate[0] = 0.111, c_hat[0] = 0.751
c_state[0] = 0.583, h_state[0] = 0.058
Time Step 2:
i_gate[0] = 0.392, f_gate[0] = 0.656, o_gate[0] = 0.101, c_hat[0] = 0.765
c_state[0] = 0.682, h_state[0] = 0.060
Time Step 3:
i_gate[0] = 0.346, f_gate[0] = 0.637, o_gate[0] = 0.097, c_hat[0] = 0.826
c_state[0] = 0.720, h_state[0] = 0.060
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.637, o_gate[0] = 0.093, c_hat[0] = 0.819
c_state[0] = 0.714, h_state[0] = 0.057
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.003
Gradient do_[0] = -0.265
Backward Time Step 3:
Gradient di[0] = -0.007, df[0] = -0.006, dc_hat[0] = -0.004
Gradient do_[0] = -0.391
Backward Time Step 2:
Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.002
Gradient do_[0] = -0.260
Backward Time Step 1:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.004
Gradient do_[0] = -0.023
Backward Time Step 0:
Gradient di[0] = 0.016, df[0] = 0.014, dc_hat[0] = 0.023
Gradient do_[0] = 0.134
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.139, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.050
Time Step 1:
i_gate[0] = 0.441, f_gate[0] = 0.670, o_gate[0] = 0.112, c_hat[0] = 0.751
c_state[0] = 0.583, h_state[0] = 0.059
Time Step 2:
i_gate[0] = 0.391, f_gate[0] = 0.658, o_gate[0] = 0.103, c_hat[0] = 0.763
c_state[0] = 0.682, h_state[0] = 0.061
Time Step 3:
i_gate[0] = 0.344, f_gate[0] = 0.641, o_gate[0] = 0.099, c_hat[0] = 0.824
c_state[0] = 0.721, h_state[0] = 0.061
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.642, o_gate[0] = 0.095, c_hat[0] = 0.816
c_state[0] = 0.717, h_state[0] = 0.059
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002
Gradient do_[0] = -0.223
Backward Time Step 3:
Gradient di[0] = -0.007, df[0] = -0.005, dc_hat[0] = -0.004
Gradient do_[0] = -0.346
Backward Time Step 2:
Gradient di[0] = -0.002, df[0] = -0.001, dc_hat[0] = -0.001
Gradient do_[0] = -0.227
Backward Time Step 1:
Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.005
Gradient do_[0] = -0.003
Backward Time Step 0:
Gradient di[0] = 0.017, df[0] = 0.014, dc_hat[0] = 0.024
Gradient do_[0] = 0.139
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.140, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.050
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.671, o_gate[0] = 0.113, c_hat[0] = 0.750
c_state[0] = 0.582, h_state[0] = 0.059
Time Step 2:
i_gate[0] = 0.389, f_gate[0] = 0.659, o_gate[0] = 0.103, c_hat[0] = 0.761
c_state[0] = 0.679, h_state[0] = 0.061
Time Step 3:
i_gate[0] = 0.342, f_gate[0] = 0.643, o_gate[0] = 0.100, c_hat[0] = 0.821
c_state[0] = 0.718, h_state[0] = 0.061
Time Step 4:
i_gate[0] = 0.309, f_gate[0] = 0.646, o_gate[0] = 0.096, c_hat[0] = 0.811
c_state[0] = 0.715, h_state[0] = 0.059
Backward Time Step 4:
Gradient di[0] = -0.001, df[0] = -0.001, dc_hat[0] = -0.001
Gradient do_[0] = -0.066
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.001
Gradient do_[0] = -0.111
Backward Time Step 2:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.003
Gradient do_[0] = 0.030
Backward Time Step 1:
Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.010
Gradient do_[0] = 0.184
Backward Time Step 0:
Gradient di[0] = 0.021, df[0] = 0.017, dc_hat[0] = 0.030
Gradient do_[0] = 0.204
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.137, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.049
Time Step 1:
i_gate[0] = 0.441, f_gate[0] = 0.670, o_gate[0] = 0.111, c_hat[0] = 0.749
c_state[0] = 0.581, h_state[0] = 0.058
Time Step 2:
i_gate[0] = 0.390, f_gate[0] = 0.658, o_gate[0] = 0.101, c_hat[0] = 0.760
c_state[0] = 0.679, h_state[0] = 0.060
Time Step 3:
i_gate[0] = 0.344, f_gate[0] = 0.641, o_gate[0] = 0.098, c_hat[0] = 0.820
c_state[0] = 0.717, h_state[0] = 0.060
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.642, o_gate[0] = 0.094, c_hat[0] = 0.811
c_state[0] = 0.713, h_state[0] = 0.057
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002
Gradient do_[0] = -0.214
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = -0.005, dc_hat[0] = -0.004
Gradient do_[0] = -0.316
Backward Time Step 2:
Gradient di[0] = -0.001, df[0] = -0.001, dc_hat[0] = -0.001
Gradient do_[0] = -0.189
Backward Time Step 1:
Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.005
Gradient do_[0] = 0.023
Backward Time Step 0:
Gradient di[0] = 0.017, df[0] = 0.014, dc_hat[0] = 0.024
Gradient do_[0] = 0.146
Time Step 0:
i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.140, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.050
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.671, o_gate[0] = 0.113, c_hat[0] = 0.749
c_state[0] = 0.581, h_state[0] = 0.059
Time Step 2:
i_gate[0] = 0.388, f_gate[0] = 0.660, o_gate[0] = 0.103, c_hat[0] = 0.760
c_state[0] = 0.679, h_state[0] = 0.061
Time Step 3:
i_gate[0] = 0.342, f_gate[0] = 0.646, o_gate[0] = 0.100, c_hat[0] = 0.819
c_state[0] = 0.718, h_state[0] = 0.062
Time Step 4:
i_gate[0] = 0.309, f_gate[0] = 0.650, o_gate[0] = 0.097, c_hat[0] = 0.808
c_state[0] = 0.716, h_state[0] = 0.059
Backward Time Step 4:
Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.001
Gradient do_[0] = -0.132
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.003
Gradient do_[0] = -0.222
Backward Time Step 2:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = -0.102
Backward Time Step 1:
Gradient di[0] = 0.007, df[0] = 0.006, dc_hat[0] = 0.007
Gradient do_[0] = 0.083
Backward Time Step 0:
Gradient di[0] = 0.019, df[0] = 0.015, dc_hat[0] = 0.026
Gradient do_[0] = 0.166
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.139, c_hat[0] = 0.718
c_state[0] = 0.375, h_state[0] = 0.050
Time Step 1:
i_gate[0] = 0.439, f_gate[0] = 0.671, o_gate[0] = 0.111, c_hat[0] = 0.747
c_state[0] = 0.580, h_state[0] = 0.058
Time Step 2:
i_gate[0] = 0.387, f_gate[0] = 0.660, o_gate[0] = 0.102, c_hat[0] = 0.757
c_state[0] = 0.676, h_state[0] = 0.060
Time Step 3:
i_gate[0] = 0.341, f_gate[0] = 0.645, o_gate[0] = 0.099, c_hat[0] = 0.817
c_state[0] = 0.715, h_state[0] = 0.060
Time Step 4:
i_gate[0] = 0.308, f_gate[0] = 0.650, o_gate[0] = 0.095, c_hat[0] = 0.805
c_state[0] = 0.712, h_state[0] = 0.058
Backward Time Step 4:
Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.003
Gradient do_[0] = 0.280
Backward Time Step 3:
Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005
Gradient do_[0] = 0.426
Backward Time Step 2:
Gradient di[0] = 0.016, df[0] = 0.013, dc_hat[0] = 0.015
Gradient do_[0] = 0.655
Backward Time Step 1:
Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.025
Gradient do_[0] = 0.663
Backward Time Step 0:
Gradient di[0] = 0.032, df[0] = 0.026, dc_hat[0] = 0.045
Gradient do_[0] = 0.385
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.133, c_hat[0] = 0.717
c_state[0] = 0.374, h_state[0] = 0.048
Time Step 1:
i_gate[0] = 0.442, f_gate[0] = 0.670, o_gate[0] = 0.106, c_hat[0] = 0.745
c_state[0] = 0.580, h_state[0] = 0.056
Time Step 2:
i_gate[0] = 0.392, f_gate[0] = 0.658, o_gate[0] = 0.097, c_hat[0] = 0.755
c_state[0] = 0.677, h_state[0] = 0.057
Time Step 3:
i_gate[0] = 0.346, f_gate[0] = 0.640, o_gate[0] = 0.093, c_hat[0] = 0.817
c_state[0] = 0.716, h_state[0] = 0.057
Time Step 4:
i_gate[0] = 0.313, f_gate[0] = 0.642, o_gate[0] = 0.088, c_hat[0] = 0.807
c_state[0] = 0.712, h_state[0] = 0.054
Backward Time Step 4:
Gradient di[0] = -0.001, df[0] = -0.001, dc_hat[0] = -0.001
Gradient do_[0] = -0.065
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.001
Gradient do_[0] = -0.087
Backward Time Step 2:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.004
Gradient do_[0] = 0.070
Backward Time Step 1:
Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.011
Gradient do_[0] = 0.213
Backward Time Step 0:
Gradient di[0] = 0.020, df[0] = 0.017, dc_hat[0] = 0.029
Gradient do_[0] = 0.214
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.134, c_hat[0] = 0.717
c_state[0] = 0.375, h_state[0] = 0.048
Time Step 1:
i_gate[0] = 0.442, f_gate[0] = 0.671, o_gate[0] = 0.107, c_hat[0] = 0.746
c_state[0] = 0.581, h_state[0] = 0.056
Time Step 2:
i_gate[0] = 0.393, f_gate[0] = 0.660, o_gate[0] = 0.097, c_hat[0] = 0.756
c_state[0] = 0.680, h_state[0] = 0.058
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.645, o_gate[0] = 0.094, c_hat[0] = 0.816
c_state[0] = 0.722, h_state[0] = 0.058
Time Step 4:
i_gate[0] = 0.315, f_gate[0] = 0.647, o_gate[0] = 0.089, c_hat[0] = 0.805
c_state[0] = 0.721, h_state[0] = 0.055
Backward Time Step 4:
Gradient di[0] = -0.006, df[0] = -0.005, dc_hat[0] = -0.004
Gradient do_[0] = -0.379
Backward Time Step 3:
Gradient di[0] = -0.010, df[0] = -0.008, dc_hat[0] = -0.006
Gradient do_[0] = -0.550
Backward Time Step 2:
Gradient di[0] = -0.006, df[0] = -0.005, dc_hat[0] = -0.005
Gradient do_[0] = -0.447
Backward Time Step 1:
Gradient di[0] = -0.000, df[0] = -0.000, dc_hat[0] = -0.000
Gradient do_[0] = -0.167
Backward Time Step 0:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.018
Gradient do_[0] = 0.077
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.135, c_hat[0] = 0.717
c_state[0] = 0.375, h_state[0] = 0.048
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.672, o_gate[0] = 0.108, c_hat[0] = 0.745
c_state[0] = 0.580, h_state[0] = 0.056
Time Step 2:
i_gate[0] = 0.390, f_gate[0] = 0.662, o_gate[0] = 0.098, c_hat[0] = 0.754
c_state[0] = 0.677, h_state[0] = 0.058
Time Step 3:
i_gate[0] = 0.344, f_gate[0] = 0.648, o_gate[0] = 0.095, c_hat[0] = 0.813
c_state[0] = 0.718, h_state[0] = 0.058
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.653, o_gate[0] = 0.091, c_hat[0] = 0.800
c_state[0] = 0.718, h_state[0] = 0.056
Backward Time Step 4:
Gradient di[0] = -0.001, df[0] = -0.001, dc_hat[0] = -0.001
Gradient do_[0] = -0.072
Backward Time Step 3:
Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.002
Gradient do_[0] = -0.118
Backward Time Step 2:
Gradient di[0] = 0.003, df[0] = 0.003, dc_hat[0] = 0.003
Gradient do_[0] = 0.022
Backward Time Step 1:
Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.010
Gradient do_[0] = 0.176
Backward Time Step 0:
Gradient di[0] = 0.020, df[0] = 0.016, dc_hat[0] = 0.028
Gradient do_[0] = 0.199
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.132, c_hat[0] = 0.717
c_state[0] = 0.374, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.441, f_gate[0] = 0.671, o_gate[0] = 0.106, c_hat[0] = 0.744
c_state[0] = 0.579, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.391, f_gate[0] = 0.660, o_gate[0] = 0.096, c_hat[0] = 0.752
c_state[0] = 0.676, h_state[0] = 0.056
Time Step 3:
i_gate[0] = 0.346, f_gate[0] = 0.645, o_gate[0] = 0.092, c_hat[0] = 0.812
c_state[0] = 0.717, h_state[0] = 0.057
Time Step 4:
i_gate[0] = 0.314, f_gate[0] = 0.649, o_gate[0] = 0.088, c_hat[0] = 0.799
c_state[0] = 0.716, h_state[0] = 0.054
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = -0.003, dc_hat[0] = -0.002
Gradient do_[0] = -0.200
Backward Time Step 3:
Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.003
Gradient do_[0] = -0.284
Backward Time Step 2:
Gradient di[0] = -0.000, df[0] = -0.000, dc_hat[0] = -0.000
Gradient do_[0] = -0.153
Backward Time Step 1:
Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.006
Gradient do_[0] = 0.047
Backward Time Step 0:
Gradient di[0] = 0.017, df[0] = 0.014, dc_hat[0] = 0.024
Gradient do_[0] = 0.151
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.133, c_hat[0] = 0.717
c_state[0] = 0.374, h_state[0] = 0.048
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.672, o_gate[0] = 0.106, c_hat[0] = 0.743
c_state[0] = 0.579, h_state[0] = 0.056
Time Step 2:
i_gate[0] = 0.390, f_gate[0] = 0.662, o_gate[0] = 0.097, c_hat[0] = 0.751
c_state[0] = 0.676, h_state[0] = 0.057
Time Step 3:
i_gate[0] = 0.344, f_gate[0] = 0.648, o_gate[0] = 0.093, c_hat[0] = 0.810
c_state[0] = 0.717, h_state[0] = 0.057
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.652, o_gate[0] = 0.089, c_hat[0] = 0.797
c_state[0] = 0.716, h_state[0] = 0.055
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.003
Gradient do_[0] = -0.237
Backward Time Step 3:
Gradient di[0] = -0.006, df[0] = -0.005, dc_hat[0] = -0.004
Gradient do_[0] = -0.348
Backward Time Step 2:
Gradient di[0] = -0.002, df[0] = -0.001, dc_hat[0] = -0.002
Gradient do_[0] = -0.234
Backward Time Step 1:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.004
Gradient do_[0] = -0.016
Backward Time Step 0:
Gradient di[0] = 0.015, df[0] = 0.013, dc_hat[0] = 0.022
Gradient do_[0] = 0.126
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.134, c_hat[0] = 0.717
c_state[0] = 0.374, h_state[0] = 0.048
Time Step 1:
i_gate[0] = 0.439, f_gate[0] = 0.672, o_gate[0] = 0.107, c_hat[0] = 0.743
c_state[0] = 0.578, h_state[0] = 0.056
Time Step 2:
i_gate[0] = 0.388, f_gate[0] = 0.663, o_gate[0] = 0.097, c_hat[0] = 0.749
c_state[0] = 0.674, h_state[0] = 0.057
Time Step 3:
i_gate[0] = 0.343, f_gate[0] = 0.650, o_gate[0] = 0.093, c_hat[0] = 0.807
c_state[0] = 0.715, h_state[0] = 0.057
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.656, o_gate[0] = 0.090, c_hat[0] = 0.793
c_state[0] = 0.715, h_state[0] = 0.055
Backward Time Step 4:
Gradient di[0] = 0.012, df[0] = 0.010, dc_hat[0] = 0.008
Gradient do_[0] = 0.773
Backward Time Step 3:
Gradient di[0] = 0.019, df[0] = 0.015, dc_hat[0] = 0.012
Gradient do_[0] = 1.095
Backward Time Step 2:
Gradient di[0] = 0.030, df[0] = 0.025, dc_hat[0] = 0.028
Gradient do_[0] = 1.401
Backward Time Step 1:
Gradient di[0] = 0.038, df[0] = 0.030, dc_hat[0] = 0.040
Gradient do_[0] = 1.224
Backward Time Step 0:
Gradient di[0] = 0.043, df[0] = 0.036, dc_hat[0] = 0.061
Gradient do_[0] = 0.599
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.128, c_hat[0] = 0.715
c_state[0] = 0.373, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.442, f_gate[0] = 0.671, o_gate[0] = 0.101, c_hat[0] = 0.740
c_state[0] = 0.577, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.393, f_gate[0] = 0.660, o_gate[0] = 0.091, c_hat[0] = 0.747
c_state[0] = 0.674, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.349, f_gate[0] = 0.645, o_gate[0] = 0.087, c_hat[0] = 0.807
c_state[0] = 0.716, h_state[0] = 0.053
Time Step 4:
i_gate[0] = 0.317, f_gate[0] = 0.647, o_gate[0] = 0.082, c_hat[0] = 0.795
c_state[0] = 0.715, h_state[0] = 0.051
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.002
Gradient do_[0] = -0.166
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.003
Gradient do_[0] = -0.225
Backward Time Step 2:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = -0.080
Backward Time Step 1:
Gradient di[0] = 0.007, df[0] = 0.005, dc_hat[0] = 0.007
Gradient do_[0] = 0.102
Backward Time Step 0:
Gradient di[0] = 0.017, df[0] = 0.014, dc_hat[0] = 0.025
Gradient do_[0] = 0.172
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.129, c_hat[0] = 0.715
c_state[0] = 0.373, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.442, f_gate[0] = 0.672, o_gate[0] = 0.102, c_hat[0] = 0.740
c_state[0] = 0.577, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.393, f_gate[0] = 0.662, o_gate[0] = 0.092, c_hat[0] = 0.746
c_state[0] = 0.675, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.349, f_gate[0] = 0.648, o_gate[0] = 0.088, c_hat[0] = 0.806
c_state[0] = 0.719, h_state[0] = 0.054
Time Step 4:
i_gate[0] = 0.317, f_gate[0] = 0.651, o_gate[0] = 0.083, c_hat[0] = 0.793
c_state[0] = 0.719, h_state[0] = 0.051
Backward Time Step 4:
Gradient di[0] = -0.008, df[0] = -0.007, dc_hat[0] = -0.005
Gradient do_[0] = -0.522
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = -0.010, dc_hat[0] = -0.008
Gradient do_[0] = -0.710
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = -0.007, dc_hat[0] = -0.008
Gradient do_[0] = -0.602
Backward Time Step 1:
Gradient di[0] = -0.003, df[0] = -0.003, dc_hat[0] = -0.003
Gradient do_[0] = -0.274
Backward Time Step 0:
Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.014
Gradient do_[0] = 0.039
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.132, c_hat[0] = 0.715
c_state[0] = 0.373, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.672, o_gate[0] = 0.104, c_hat[0] = 0.739
c_state[0] = 0.576, h_state[0] = 0.054
Time Step 2:
i_gate[0] = 0.390, f_gate[0] = 0.664, o_gate[0] = 0.094, c_hat[0] = 0.744
c_state[0] = 0.673, h_state[0] = 0.055
Time Step 3:
i_gate[0] = 0.345, f_gate[0] = 0.652, o_gate[0] = 0.090, c_hat[0] = 0.803
c_state[0] = 0.716, h_state[0] = 0.056
Time Step 4:
i_gate[0] = 0.313, f_gate[0] = 0.658, o_gate[0] = 0.086, c_hat[0] = 0.787
c_state[0] = 0.718, h_state[0] = 0.053
Backward Time Step 4:
Gradient di[0] = -0.000, df[0] = -0.000, dc_hat[0] = -0.000
Gradient do_[0] = 0.002
Backward Time Step 3:
Gradient di[0] = -0.001, df[0] = -0.000, dc_hat[0] = -0.000
Gradient do_[0] = -0.012
Backward Time Step 2:
Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.005
Gradient do_[0] = 0.140
Backward Time Step 1:
Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.012
Gradient do_[0] = 0.262
Backward Time Step 0:
Gradient di[0] = 0.021, df[0] = 0.017, dc_hat[0] = 0.030
Gradient do_[0] = 0.229
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.128, c_hat[0] = 0.715
c_state[0] = 0.373, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.441, f_gate[0] = 0.672, o_gate[0] = 0.101, c_hat[0] = 0.738
c_state[0] = 0.576, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.393, f_gate[0] = 0.662, o_gate[0] = 0.091, c_hat[0] = 0.743
c_state[0] = 0.673, h_state[0] = 0.053
Time Step 3:
i_gate[0] = 0.348, f_gate[0] = 0.648, o_gate[0] = 0.087, c_hat[0] = 0.803
c_state[0] = 0.715, h_state[0] = 0.053
Time Step 4:
i_gate[0] = 0.316, f_gate[0] = 0.652, o_gate[0] = 0.082, c_hat[0] = 0.789
c_state[0] = 0.716, h_state[0] = 0.050
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.003
Gradient do_[0] = -0.312
Backward Time Step 3:
Gradient di[0] = -0.007, df[0] = -0.006, dc_hat[0] = -0.005
Gradient do_[0] = -0.416
Backward Time Step 2:
Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.003
Gradient do_[0] = -0.286
Backward Time Step 1:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.003
Gradient do_[0] = -0.049
Backward Time Step 0:
Gradient di[0] = 0.014, df[0] = 0.012, dc_hat[0] = 0.020
Gradient do_[0] = 0.116
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.131, c_hat[0] = 0.715
c_state[0] = 0.373, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.673, o_gate[0] = 0.103, c_hat[0] = 0.738
c_state[0] = 0.576, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.391, f_gate[0] = 0.665, o_gate[0] = 0.093, c_hat[0] = 0.742
c_state[0] = 0.673, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.346, f_gate[0] = 0.653, o_gate[0] = 0.089, c_hat[0] = 0.801
c_state[0] = 0.716, h_state[0] = 0.055
Time Step 4:
i_gate[0] = 0.314, f_gate[0] = 0.659, o_gate[0] = 0.085, c_hat[0] = 0.784
c_state[0] = 0.718, h_state[0] = 0.052
Backward Time Step 4:
Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.002
Gradient do_[0] = -0.165
Backward Time Step 3:
Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.003
Gradient do_[0] = -0.237
Backward Time Step 2:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = -0.103
Backward Time Step 1:
Gradient di[0] = 0.006, df[0] = 0.005, dc_hat[0] = 0.007
Gradient do_[0] = 0.083
Backward Time Step 0:
Gradient di[0] = 0.017, df[0] = 0.014, dc_hat[0] = 0.024
Gradient do_[0] = 0.163
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.129, c_hat[0] = 0.715
c_state[0] = 0.373, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.672, o_gate[0] = 0.101, c_hat[0] = 0.737
c_state[0] = 0.575, h_state[0] = 0.052
Time Step 2:
i_gate[0] = 0.391, f_gate[0] = 0.664, o_gate[0] = 0.091, c_hat[0] = 0.740
c_state[0] = 0.672, h_state[0] = 0.053
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.652, o_gate[0] = 0.087, c_hat[0] = 0.799
c_state[0] = 0.715, h_state[0] = 0.053
Time Step 4:
i_gate[0] = 0.316, f_gate[0] = 0.657, o_gate[0] = 0.082, c_hat[0] = 0.783
c_state[0] = 0.717, h_state[0] = 0.051
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = -0.004, dc_hat[0] = -0.003
Gradient do_[0] = -0.295
Backward Time Step 3:
Gradient di[0] = -0.007, df[0] = -0.006, dc_hat[0] = -0.005
Gradient do_[0] = -0.400
Backward Time Step 2:
Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.003
Gradient do_[0] = -0.274
Backward Time Step 1:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.003
Gradient do_[0] = -0.040
Backward Time Step 0:
Gradient di[0] = 0.014, df[0] = 0.012, dc_hat[0] = 0.020
Gradient do_[0] = 0.118
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.131, c_hat[0] = 0.715
c_state[0] = 0.373, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.439, f_gate[0] = 0.673, o_gate[0] = 0.103, c_hat[0] = 0.737
c_state[0] = 0.574, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.389, f_gate[0] = 0.666, o_gate[0] = 0.093, c_hat[0] = 0.739
c_state[0] = 0.670, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.345, f_gate[0] = 0.655, o_gate[0] = 0.089, c_hat[0] = 0.797
c_state[0] = 0.714, h_state[0] = 0.054
Time Step 4:
i_gate[0] = 0.313, f_gate[0] = 0.663, o_gate[0] = 0.085, c_hat[0] = 0.778
c_state[0] = 0.717, h_state[0] = 0.052
Backward Time Step 4:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.074
Backward Time Step 3:
Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001
Gradient do_[0] = 0.082
Backward Time Step 2:
Gradient di[0] = 0.007, df[0] = 0.006, dc_hat[0] = 0.007
Gradient do_[0] = 0.239
Backward Time Step 1:
Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.014
Gradient do_[0] = 0.332
Backward Time Step 0:
Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.032
Gradient do_[0] = 0.253
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.126, c_hat[0] = 0.714
c_state[0] = 0.373, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.672, o_gate[0] = 0.099, c_hat[0] = 0.736
c_state[0] = 0.574, h_state[0] = 0.051
Time Step 2:
i_gate[0] = 0.392, f_gate[0] = 0.664, o_gate[0] = 0.089, c_hat[0] = 0.739
c_state[0] = 0.671, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.348, f_gate[0] = 0.651, o_gate[0] = 0.085, c_hat[0] = 0.798
c_state[0] = 0.714, h_state[0] = 0.052
Time Step 4:
i_gate[0] = 0.316, f_gate[0] = 0.656, o_gate[0] = 0.080, c_hat[0] = 0.781
c_state[0] = 0.716, h_state[0] = 0.049
Backward Time Step 4:
Gradient di[0] = -0.004, df[0] = -0.004, dc_hat[0] = -0.003
Gradient do_[0] = -0.298
Backward Time Step 3:
Gradient di[0] = -0.007, df[0] = -0.005, dc_hat[0] = -0.005
Gradient do_[0] = -0.393
Backward Time Step 2:
Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.002
Gradient do_[0] = -0.261
Backward Time Step 1:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.003
Gradient do_[0] = -0.032
Backward Time Step 0:
Gradient di[0] = 0.014, df[0] = 0.012, dc_hat[0] = 0.020
Gradient do_[0] = 0.120
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.130, c_hat[0] = 0.715
c_state[0] = 0.373, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.439, f_gate[0] = 0.674, o_gate[0] = 0.102, c_hat[0] = 0.737
c_state[0] = 0.574, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.389, f_gate[0] = 0.667, o_gate[0] = 0.092, c_hat[0] = 0.739
c_state[0] = 0.671, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.345, f_gate[0] = 0.657, o_gate[0] = 0.088, c_hat[0] = 0.797
c_state[0] = 0.715, h_state[0] = 0.054
Time Step 4:
i_gate[0] = 0.314, f_gate[0] = 0.664, o_gate[0] = 0.084, c_hat[0] = 0.778
c_state[0] = 0.719, h_state[0] = 0.052
Backward Time Step 4:
Gradient di[0] = -0.006, df[0] = -0.005, dc_hat[0] = -0.004
Gradient do_[0] = -0.396
Backward Time Step 3:
Gradient di[0] = -0.009, df[0] = -0.008, dc_hat[0] = -0.006
Gradient do_[0] = -0.548
Backward Time Step 2:
Gradient di[0] = -0.005, df[0] = -0.005, dc_hat[0] = -0.006
Gradient do_[0] = -0.439
Backward Time Step 1:
Gradient di[0] = -0.000, df[0] = -0.000, dc_hat[0] = -0.000
Gradient do_[0] = -0.158
Backward Time Step 0:
Gradient di[0] = 0.012, df[0] = 0.010, dc_hat[0] = 0.017
Gradient do_[0] = 0.075
Time Step 0:
i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.133, c_hat[0] = 0.715
c_state[0] = 0.373, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.437, f_gate[0] = 0.674, o_gate[0] = 0.104, c_hat[0] = 0.736
c_state[0] = 0.573, h_state[0] = 0.054
Time Step 2:
i_gate[0] = 0.386, f_gate[0] = 0.669, o_gate[0] = 0.094, c_hat[0] = 0.737
c_state[0] = 0.668, h_state[0] = 0.055
Time Step 3:
i_gate[0] = 0.341, f_gate[0] = 0.660, o_gate[0] = 0.090, c_hat[0] = 0.792
c_state[0] = 0.711, h_state[0] = 0.055
Time Step 4:
i_gate[0] = 0.310, f_gate[0] = 0.670, o_gate[0] = 0.086, c_hat[0] = 0.772
c_state[0] = 0.715, h_state[0] = 0.053
Backward Time Step 4:
Gradient di[0] = 0.131, df[0] = 0.118, dc_hat[0] = 0.099
Gradient do_[0] = 9.057
Backward Time Step 3:
Gradient di[0] = 0.196, df[0] = 0.163, dc_hat[0] = 0.140
Gradient do_[0] = 11.981
Backward Time Step 2:
Gradient di[0] = 0.241, df[0] = 0.204, dc_hat[0] = 0.243
Gradient do_[0] = 12.870
Backward Time Step 1:
Gradient di[0] = 0.255, df[0] = 0.209, dc_hat[0] = 0.283
Gradient do_[0] = 9.439
Backward Time Step 0:
Gradient di[0] = 0.214, df[0] = 0.178, dc_hat[0] = 0.306
Gradient do_[0] = 3.567
Time Step 0:
i_gate[0] = 0.517, f_gate[0] = 0.682, o_gate[0] = 0.127, c_hat[0] = 0.699
c_state[0] = 0.362, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.435, f_gate[0] = 0.669, o_gate[0] = 0.099, c_hat[0] = 0.721
c_state[0] = 0.556, h_state[0] = 0.050
Time Step 2:
i_gate[0] = 0.387, f_gate[0] = 0.661, o_gate[0] = 0.089, c_hat[0] = 0.723
c_state[0] = 0.647, h_state[0] = 0.051
Time Step 3:
i_gate[0] = 0.342, f_gate[0] = 0.650, o_gate[0] = 0.085, c_hat[0] = 0.783
c_state[0] = 0.688, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.310, f_gate[0] = 0.656, o_gate[0] = 0.080, c_hat[0] = 0.763
c_state[0] = 0.688, h_state[0] = 0.048
Backward Time Step 4:
Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.003
Gradient do_[0] = 0.284
Backward Time Step 3:
Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.004
Gradient do_[0] = 0.351
Backward Time Step 2:
Gradient di[0] = 0.012, df[0] = 0.010, dc_hat[0] = 0.012
Gradient do_[0] = 0.508
Backward Time Step 1:
Gradient di[0] = 0.017, df[0] = 0.014, dc_hat[0] = 0.020
Gradient do_[0] = 0.509
Backward Time Step 0:
Gradient di[0] = 0.025, df[0] = 0.021, dc_hat[0] = 0.037
Gradient do_[0] = 0.306
Time Step 0:
i_gate[0] = 0.517, f_gate[0] = 0.682, o_gate[0] = 0.122, c_hat[0] = 0.698
c_state[0] = 0.361, h_state[0] = 0.042
Time Step 1:
i_gate[0] = 0.437, f_gate[0] = 0.668, o_gate[0] = 0.095, c_hat[0] = 0.719
c_state[0] = 0.555, h_state[0] = 0.048
Time Step 2:
i_gate[0] = 0.390, f_gate[0] = 0.659, o_gate[0] = 0.085, c_hat[0] = 0.722
c_state[0] = 0.648, h_state[0] = 0.048
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.646, o_gate[0] = 0.080, c_hat[0] = 0.784
c_state[0] = 0.690, h_state[0] = 0.048
Time Step 4:
i_gate[0] = 0.315, f_gate[0] = 0.649, o_gate[0] = 0.075, c_hat[0] = 0.767
c_state[0] = 0.690, h_state[0] = 0.045
Backward Time Step 4:
Gradient di[0] = -0.008, df[0] = -0.007, dc_hat[0] = -0.006
Gradient do_[0] = -0.590
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = -0.010, dc_hat[0] = -0.009
Gradient do_[0] = -0.736
Backward Time Step 2:
Gradient di[0] = -0.008, df[0] = -0.007, dc_hat[0] = -0.009
Gradient do_[0] = -0.587
Backward Time Step 1:
Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.003
Gradient do_[0] = -0.250
Backward Time Step 0:
Gradient di[0] = 0.009, df[0] = 0.008, dc_hat[0] = 0.014
Gradient do_[0] = 0.046
Time Step 0:
i_gate[0] = 0.517, f_gate[0] = 0.682, o_gate[0] = 0.126, c_hat[0] = 0.699
c_state[0] = 0.361, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.435, f_gate[0] = 0.669, o_gate[0] = 0.099, c_hat[0] = 0.720
c_state[0] = 0.555, h_state[0] = 0.050
Time Step 2:
i_gate[0] = 0.387, f_gate[0] = 0.661, o_gate[0] = 0.088, c_hat[0] = 0.721
c_state[0] = 0.647, h_state[0] = 0.050
Time Step 3:
i_gate[0] = 0.344, f_gate[0] = 0.651, o_gate[0] = 0.084, c_hat[0] = 0.782
c_state[0] = 0.689, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.656, o_gate[0] = 0.079, c_hat[0] = 0.762
c_state[0] = 0.690, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.004
Gradient do_[0] = -0.324
Backward Time Step 3:
Gradient di[0] = -0.007, df[0] = -0.006, dc_hat[0] = -0.006
Gradient do_[0] = -0.425
Backward Time Step 2:
Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.003
Gradient do_[0] = -0.283
Backward Time Step 1:
Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.003
Gradient do_[0] = -0.041
Backward Time Step 0:
Gradient di[0] = 0.014, df[0] = 0.012, dc_hat[0] = 0.021
Gradient do_[0] = 0.117
Time Step 0:
i_gate[0] = 0.517, f_gate[0] = 0.682, o_gate[0] = 0.127, c_hat[0] = 0.699
c_state[0] = 0.361, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.435, f_gate[0] = 0.669, o_gate[0] = 0.099, c_hat[0] = 0.719
c_state[0] = 0.554, h_state[0] = 0.050
Time Step 2:
i_gate[0] = 0.386, f_gate[0] = 0.663, o_gate[0] = 0.088, c_hat[0] = 0.719
c_state[0] = 0.645, h_state[0] = 0.050
Time Step 3:
i_gate[0] = 0.342, f_gate[0] = 0.653, o_gate[0] = 0.084, c_hat[0] = 0.779
c_state[0] = 0.687, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.310, f_gate[0] = 0.659, o_gate[0] = 0.079, c_hat[0] = 0.758
c_state[0] = 0.688, h_state[0] = 0.047
Backward Time Step 4:
Gradient di[0] = 0.057, df[0] = 0.052, dc_hat[0] = 0.047
Gradient do_[0] = 4.169
Backward Time Step 3:
Gradient di[0] = 0.083, df[0] = 0.070, dc_hat[0] = 0.064
Gradient do_[0] = 5.294
Backward Time Step 2:
Gradient di[0] = 0.101, df[0] = 0.088, dc_hat[0] = 0.111
Gradient do_[0] = 5.538
Backward Time Step 1:
Gradient di[0] = 0.107, df[0] = 0.090, dc_hat[0] = 0.127
Gradient do_[0] = 4.011
Backward Time Step 0:
Gradient di[0] = 0.093, df[0] = 0.078, dc_hat[0] = 0.140
Gradient do_[0] = 1.518
Time Step 0:
i_gate[0] = 0.515, f_gate[0] = 0.680, o_gate[0] = 0.121, c_hat[0] = 0.692
c_state[0] = 0.356, h_state[0] = 0.042
Time Step 1:
i_gate[0] = 0.436, f_gate[0] = 0.666, o_gate[0] = 0.094, c_hat[0] = 0.711
c_state[0] = 0.547, h_state[0] = 0.047
Time Step 2:
i_gate[0] = 0.390, f_gate[0] = 0.658, o_gate[0] = 0.084, c_hat[0] = 0.711
c_state[0] = 0.637, h_state[0] = 0.047
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.646, o_gate[0] = 0.079, c_hat[0] = 0.775
c_state[0] = 0.680, h_state[0] = 0.046
Time Step 4:
i_gate[0] = 0.315, f_gate[0] = 0.649, o_gate[0] = 0.073, c_hat[0] = 0.755
c_state[0] = 0.679, h_state[0] = 0.043
Backward Time Step 4:
Gradient di[0] = -0.008, df[0] = -0.007, dc_hat[0] = -0.007
Gradient do_[0] = -0.589
Backward Time Step 3:
Gradient di[0] = -0.012, df[0] = -0.010, dc_hat[0] = -0.009
Gradient do_[0] = -0.727
Backward Time Step 2:
Gradient di[0] = -0.007, df[0] = -0.007, dc_hat[0] = -0.008
Gradient do_[0] = -0.557
Backward Time Step 1:
Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.002
Gradient do_[0] = -0.216
Backward Time Step 0:
Gradient di[0] = 0.010, df[0] = 0.009, dc_hat[0] = 0.016
Gradient do_[0] = 0.061
Time Step 0:
i_gate[0] = 0.515, f_gate[0] = 0.680, o_gate[0] = 0.123, c_hat[0] = 0.692
c_state[0] = 0.357, h_state[0] = 0.042
Time Step 1:
i_gate[0] = 0.434, f_gate[0] = 0.667, o_gate[0] = 0.095, c_hat[0] = 0.710
c_state[0] = 0.546, h_state[0] = 0.047
Time Step 2:
i_gate[0] = 0.388, f_gate[0] = 0.661, o_gate[0] = 0.085, c_hat[0] = 0.708
c_state[0] = 0.635, h_state[0] = 0.048
Time Step 3:
i_gate[0] = 0.345, f_gate[0] = 0.650, o_gate[0] = 0.080, c_hat[0] = 0.770
c_state[0] = 0.679, h_state[0] = 0.047
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.656, o_gate[0] = 0.074, c_hat[0] = 0.748
c_state[0] = 0.679, h_state[0] = 0.044
Backward Time Step 4:
Gradient di[0] = 0.196, df[0] = 0.180, dc_hat[0] = 0.168
Gradient do_[0] = 14.917
Backward Time Step 3:
Gradient di[0] = 0.279, df[0] = 0.238, dc_hat[0] = 0.225
Gradient do_[0] = 18.334
Backward Time Step 2:
Gradient di[0] = 0.313, df[0] = 0.276, dc_hat[0] = 0.360
Gradient do_[0] = 17.925
Backward Time Step 1:
Gradient di[0] = 0.306, df[0] = 0.260, dc_hat[0] = 0.378
Gradient do_[0] = 12.037
Backward Time Step 0:
Gradient di[0] = 0.230, df[0] = 0.197, dc_hat[0] = 0.357
Gradient do_[0] = 4.052
Time Step 0:
i_gate[0] = 0.509, f_gate[0] = 0.675, o_gate[0] = 0.118, c_hat[0] = 0.671
c_state[0] = 0.342, h_state[0] = 0.039
Time Step 1:
i_gate[0] = 0.432, f_gate[0] = 0.660, o_gate[0] = 0.091, c_hat[0] = 0.686
c_state[0] = 0.521, h_state[0] = 0.044
Time Step 2:
i_gate[0] = 0.387, f_gate[0] = 0.651, o_gate[0] = 0.081, c_hat[0] = 0.684
c_state[0] = 0.604, h_state[0] = 0.044
Time Step 3:
i_gate[0] = 0.345, f_gate[0] = 0.638, o_gate[0] = 0.075, c_hat[0] = 0.752
c_state[0] = 0.645, h_state[0] = 0.043
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.641, o_gate[0] = 0.069, c_hat[0] = 0.730
c_state[0] = 0.641, h_state[0] = 0.039
Backward Time Step 4:
Gradient di[0] = 0.117, df[0] = 0.110, dc_hat[0] = 0.109
Gradient do_[0] = 8.973
Backward Time Step 3:
Gradient di[0] = 0.161, df[0] = 0.140, dc_hat[0] = 0.142
Gradient do_[0] = 10.594
Backward Time Step 2:
Gradient di[0] = 0.171, df[0] = 0.156, dc_hat[0] = 0.217
Gradient do_[0] = 9.845
Backward Time Step 1:
Gradient di[0] = 0.157, df[0] = 0.139, dc_hat[0] = 0.214
Gradient do_[0] = 6.245
Backward Time Step 0:
Gradient di[0] = 0.115, df[0] = 0.101, dc_hat[0] = 0.192
Gradient do_[0] = 2.017
Time Step 0:
i_gate[0] = 0.505, f_gate[0] = 0.670, o_gate[0] = 0.113, c_hat[0] = 0.652
c_state[0] = 0.329, h_state[0] = 0.036
Time Step 1:
i_gate[0] = 0.431, f_gate[0] = 0.654, o_gate[0] = 0.087, c_hat[0] = 0.664
c_state[0] = 0.502, h_state[0] = 0.040
Time Step 2:
i_gate[0] = 0.388, f_gate[0] = 0.644, o_gate[0] = 0.077, c_hat[0] = 0.660
c_state[0] = 0.579, h_state[0] = 0.040
Time Step 3:
i_gate[0] = 0.348, f_gate[0] = 0.629, o_gate[0] = 0.072, c_hat[0] = 0.735
c_state[0] = 0.620, h_state[0] = 0.040
Time Step 4:
i_gate[0] = 0.316, f_gate[0] = 0.629, o_gate[0] = 0.065, c_hat[0] = 0.715
c_state[0] = 0.616, h_state[0] = 0.036
Backward Time Step 4:
Gradient di[0] = -56.088, df[0] = -53.271, dc_hat[0] = -55.975
Gradient do_[0] = -4363.406
Backward Time Step 3:
Gradient di[0] = -74.442, df[0] = -65.558, dc_hat[0] = -71.455
Gradient do_[0] = -4935.278
Backward Time Step 2:
Gradient di[0] = -70.799, df[0] = -66.648, dc_hat[0] = -98.798
Gradient do_[0] = -4198.427
Backward Time Step 1:
Gradient di[0] = -57.482, df[0] = -52.277, dc_hat[0] = -85.194
Gradient do_[0] = -2388.240
Backward Time Step 0:
Gradient di[0] = -34.825, df[0] = -31.658, dc_hat[0] = -62.105
Gradient do_[0] = -669.594
Time Step 0:
i_gate[0] = 0.518, f_gate[0] = 0.681, o_gate[0] = 0.118, c_hat[0] = 0.680
c_state[0] = 0.352, h_state[0] = 0.040
Time Step 1:
i_gate[0] = 0.441, f_gate[0] = 0.668, o_gate[0] = 0.092, c_hat[0] = 0.697
c_state[0] = 0.542, h_state[0] = 0.046
Time Step 2:
i_gate[0] = 0.396, f_gate[0] = 0.662, o_gate[0] = 0.082, c_hat[0] = 0.697
c_state[0] = 0.635, h_state[0] = 0.046
Time Step 3:
i_gate[0] = 0.355, f_gate[0] = 0.651, o_gate[0] = 0.077, c_hat[0] = 0.763
c_state[0] = 0.685, h_state[0] = 0.046
Time Step 4:
i_gate[0] = 0.324, f_gate[0] = 0.655, o_gate[0] = 0.071, c_hat[0] = 0.744
c_state[0] = 0.689, h_state[0] = 0.042
Backward Time Step 4:
Gradient di[0] = 0.109, df[0] = 0.099, dc_hat[0] = 0.097
Gradient do_[0] = 8.840
Backward Time Step 3:
Gradient di[0] = 0.148, df[0] = 0.125, dc_hat[0] = 0.125
Gradient do_[0] = 10.154
Backward Time Step 2:
Gradient di[0] = 0.158, df[0] = 0.140, dc_hat[0] = 0.193
Gradient do_[0] = 9.353
Backward Time Step 1:
Gradient di[0] = 0.145, df[0] = 0.125, dc_hat[0] = 0.191
Gradient do_[0] = 5.876
Backward Time Step 0:
Gradient di[0] = 0.105, df[0] = 0.092, dc_hat[0] = 0.173
Gradient do_[0] = 1.881
Time Step 0:
i_gate[0] = 0.514, f_gate[0] = 0.677, o_gate[0] = 0.113, c_hat[0] = 0.661
c_state[0] = 0.340, h_state[0] = 0.037
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.663, o_gate[0] = 0.088, c_hat[0] = 0.676
c_state[0] = 0.523, h_state[0] = 0.042
Time Step 2:
i_gate[0] = 0.397, f_gate[0] = 0.655, o_gate[0] = 0.078, c_hat[0] = 0.675
c_state[0] = 0.611, h_state[0] = 0.043
Time Step 3:
i_gate[0] = 0.358, f_gate[0] = 0.642, o_gate[0] = 0.073, c_hat[0] = 0.748
c_state[0] = 0.660, h_state[0] = 0.042
Time Step 4:
i_gate[0] = 0.326, f_gate[0] = 0.644, o_gate[0] = 0.066, c_hat[0] = 0.729
c_state[0] = 0.663, h_state[0] = 0.039
Backward Time Step 4:
Gradient di[0] = 0.072, df[0] = 0.067, dc_hat[0] = 0.069
Gradient do_[0] = 5.966
Backward Time Step 3:
Gradient di[0] = 0.093, df[0] = 0.080, dc_hat[0] = 0.086
Gradient do_[0] = 6.507
Backward Time Step 2:
Gradient di[0] = 0.094, df[0] = 0.086, dc_hat[0] = 0.125
Gradient do_[0] = 5.621
Backward Time Step 1:
Gradient di[0] = 0.081, df[0] = 0.072, dc_hat[0] = 0.116
Gradient do_[0] = 3.310
Backward Time Step 0:
Gradient di[0] = 0.059, df[0] = 0.053, dc_hat[0] = 0.103
Gradient do_[0] = 1.045
Time Step 0:
i_gate[0] = 0.512, f_gate[0] = 0.674, o_gate[0] = 0.111, c_hat[0] = 0.645
c_state[0] = 0.330, h_state[0] = 0.035
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.658, o_gate[0] = 0.086, c_hat[0] = 0.658
c_state[0] = 0.506, h_state[0] = 0.040
Time Step 2:
i_gate[0] = 0.399, f_gate[0] = 0.647, o_gate[0] = 0.076, c_hat[0] = 0.657
c_state[0] = 0.590, h_state[0] = 0.040
Time Step 3:
i_gate[0] = 0.360, f_gate[0] = 0.631, o_gate[0] = 0.071, c_hat[0] = 0.735
c_state[0] = 0.637, h_state[0] = 0.040
Time Step 4:
i_gate[0] = 0.329, f_gate[0] = 0.629, o_gate[0] = 0.064, c_hat[0] = 0.720
c_state[0] = 0.637, h_state[0] = 0.036
Backward Time Step 4:
Gradient di[0] = -8.832, df[0] = -8.159, dc_hat[0] = -8.814
Gradient do_[0] = -713.266
Backward Time Step 3:
Gradient di[0] = -11.515, df[0] = -9.987, dc_hat[0] = -11.228
Gradient do_[0] = -788.504
Backward Time Step 2:
Gradient di[0] = -10.567, df[0] = -9.907, dc_hat[0] = -15.177
Gradient do_[0] = -645.772
Backward Time Step 1:
Gradient di[0] = -8.282, df[0] = -7.570, dc_hat[0] = -12.760
Gradient do_[0] = -354.749
Backward Time Step 0:
Gradient di[0] = -4.946, df[0] = -4.548, dc_hat[0] = -9.188
Gradient do_[0] = -98.564
Time Step 0:
i_gate[0] = 0.500, f_gate[0] = 0.663, o_gate[0] = 0.106, c_hat[0] = 0.615
c_state[0] = 0.307, h_state[0] = 0.032
Time Step 1:
i_gate[0] = 0.427, f_gate[0] = 0.647, o_gate[0] = 0.081, c_hat[0] = 0.619
c_state[0] = 0.463, h_state[0] = 0.035
Time Step 2:
i_gate[0] = 0.387, f_gate[0] = 0.636, o_gate[0] = 0.072, c_hat[0] = 0.610
c_state[0] = 0.531, h_state[0] = 0.035
Time Step 3:
i_gate[0] = 0.349, f_gate[0] = 0.621, o_gate[0] = 0.067, c_hat[0] = 0.688
c_state[0] = 0.570, h_state[0] = 0.034
Time Step 4:
i_gate[0] = 0.319, f_gate[0] = 0.620, o_gate[0] = 0.060, c_hat[0] = 0.660
c_state[0] = 0.564, h_state[0] = 0.031
Backward Time Step 4:
Gradient di[0] = -10177084.000, df[0] = -10364855.000, dc_hat[0] = -12777321.000
Gradient do_[0] = -811651072.000
Backward Time Step 3:
Gradient di[0] = -13232801.000, df[0] = -12364564.000, dc_hat[0] = -15529369.000
Gradient do_[0] = -888755200.000
Backward Time Step 2:
Gradient di[0] = -12179485.000, df[0] = -12395052.000, dc_hat[0] = -20466140.000
Gradient do_[0] = -744923136.000
Backward Time Step 1:
Gradient di[0] = -9939050.000, df[0] = -9687769.000, dc_hat[0] = -17272750.000
Gradient do_[0] = -429558976.000
Backward Time Step 0:
Gradient di[0] = -6349600.000, df[0] = -6121459.000, dc_hat[0] = -12844551.000
Gradient do_[0] = -127568376.000
Time Step 0:
i_gate[0] = 0.512, f_gate[0] = 0.674, o_gate[0] = 0.111, c_hat[0] = 0.645
c_state[0] = 0.330, h_state[0] = 0.035
Time Step 1:
i_gate[0] = 0.438, f_gate[0] = 0.664, o_gate[0] = 0.086, c_hat[0] = 0.649
c_state[0] = 0.504, h_state[0] = 0.040
Time Step 2:
i_gate[0] = 0.398, f_gate[0] = 0.658, o_gate[0] = 0.077, c_hat[0] = 0.635
c_state[0] = 0.585, h_state[0] = 0.040
Time Step 3:
i_gate[0] = 0.360, f_gate[0] = 0.648, o_gate[0] = 0.072, c_hat[0] = 0.702
c_state[0] = 0.632, h_state[0] = 0.040
Time Step 4:
i_gate[0] = 0.332, f_gate[0] = 0.652, o_gate[0] = 0.066, c_hat[0] = 0.666
c_state[0] = 0.633, h_state[0] = 0.037
Backward Time Step 4:
Gradient di[0] = 1830121.000, df[0] = 1833165.750, dc_hat[0] = 2289939.750
Gradient do_[0] = 153543648.000
Backward Time Step 3:
Gradient di[0] = 2171240.000, df[0] = 1982507.250, dc_hat[0] = 2450665.000
Gradient do_[0] = 152030768.000
Backward Time Step 2:
Gradient di[0] = 2050281.875, df[0] = 1994436.375, dc_hat[0] = 3197066.500
Gradient do_[0] = 127804216.000
Backward Time Step 1:
Gradient di[0] = 1785827.250, df[0] = 1654820.375, dc_hat[0] = 2832199.250
Gradient do_[0] = 77136624.000
Backward Time Step 0:
Gradient di[0] = 1196132.250, df[0] = 1099798.750, dc_hat[0] = 2222246.250
Gradient do_[0] = 23799746.000
Time Step 0:
i_gate[0] = 0.500, f_gate[0] = 0.663, o_gate[0] = 0.106, c_hat[0] = 0.615
c_state[0] = 0.307, h_state[0] = 0.032
Time Step 1:
i_gate[0] = 0.428, f_gate[0] = 0.648, o_gate[0] = 0.082, c_hat[0] = 0.615
c_state[0] = 0.462, h_state[0] = 0.035
Time Step 2:
i_gate[0] = 0.389, f_gate[0] = 0.639, o_gate[0] = 0.073, c_hat[0] = 0.601
c_state[0] = 0.529, h_state[0] = 0.035
Time Step 3:
i_gate[0] = 0.352, f_gate[0] = 0.626, o_gate[0] = 0.068, c_hat[0] = 0.676
c_state[0] = 0.569, h_state[0] = 0.035
Time Step 4:
i_gate[0] = 0.323, f_gate[0] = 0.627, o_gate[0] = 0.062, c_hat[0] = 0.640
c_state[0] = 0.564, h_state[0] = 0.032
Backward Time Step 4:
Gradient di[0] = -14507731.000, df[0] = -15202469.000, dc_hat[0] = -19762446.000
Gradient do_[0] = -1152882304.000
Backward Time Step 3:
Gradient di[0] = -17390244.000, df[0] = -16539071.000, dc_hat[0] = -21584336.000
Gradient do_[0] = -1158783104.000
Backward Time Step 2:
Gradient di[0] = -15275506.000, df[0] = -15774069.000, dc_hat[0] = -26616044.000
Gradient do_[0] = -929913600.000
Backward Time Step 1:
Gradient di[0] = -12382044.000, df[0] = -12159019.000, dc_hat[0] = -21929236.000
Gradient do_[0] = -533257888.000
Backward Time Step 0:
Gradient di[0] = -7962733.000, df[0] = -7676632.500, dc_hat[0] = -16107743.000
Gradient do_[0] = -159977440.000
Time Step 0:
i_gate[0] = 0.512, f_gate[0] = 0.674, o_gate[0] = 0.111, c_hat[0] = 0.645
c_state[0] = 0.330, h_state[0] = 0.035
Time Step 1:
i_gate[0] = 0.440, f_gate[0] = 0.667, o_gate[0] = 0.087, c_hat[0] = 0.643
c_state[0] = 0.503, h_state[0] = 0.040
Time Step 2:
i_gate[0] = 0.401, f_gate[0] = 0.663, o_gate[0] = 0.078, c_hat[0] = 0.622
c_state[0] = 0.583, h_state[0] = 0.041
Time Step 3:
i_gate[0] = 0.366, f_gate[0] = 0.657, o_gate[0] = 0.074, c_hat[0] = 0.683
c_state[0] = 0.633, h_state[0] = 0.042
Time Step 4:
i_gate[0] = 0.339, f_gate[0] = 0.665, o_gate[0] = 0.069, c_hat[0] = 0.635
c_state[0] = 0.636, h_state[0] = 0.039
Backward Time Step 4:
Gradient di[0] = -5284627.500, df[0] = -5499826.000, dc_hat[0] = -7516888.500
Gradient do_[0] = -443141696.000
Backward Time Step 3:
Gradient di[0] = -5776481.500, df[0] = -5396417.000, dc_hat[0] = -7108135.500
Gradient do_[0] = -401221184.000
Backward Time Step 2:
Gradient di[0] = -4912044.500, df[0] = -4865726.000, dc_hat[0] = -8071287.500
Gradient do_[0] = -304769280.000
Backward Time Step 1:
Gradient di[0] = -4016747.000, df[0] = -3753353.750, dc_hat[0] = -6529669.500
Gradient do_[0] = -172796720.000
Backward Time Step 0:
Gradient di[0] = -2605996.250, df[0] = -2396115.750, dc_hat[0] = -4841576.500
Gradient do_[0] = -51852164.000
Time Step 0:
i_gate[0] = 0.524, f_gate[0] = 0.685, o_gate[0] = 0.116, c_hat[0] = 0.673
c_state[0] = 0.353, h_state[0] = 0.039
Time Step 1:
i_gate[0] = 0.450, f_gate[0] = 0.682, o_gate[0] = 0.091, c_hat[0] = 0.674
c_state[0] = 0.544, h_state[0] = 0.045
Time Step 2:
i_gate[0] = 0.410, f_gate[0] = 0.682, o_gate[0] = 0.082, c_hat[0] = 0.653
c_state[0] = 0.639, h_state[0] = 0.046
Time Step 3:
i_gate[0] = 0.375, f_gate[0] = 0.678, o_gate[0] = 0.078, c_hat[0] = 0.707
c_state[0] = 0.699, h_state[0] = 0.047
Time Step 4:
i_gate[0] = 0.350, f_gate[0] = 0.689, o_gate[0] = 0.073, c_hat[0] = 0.661
c_state[0] = 0.712, h_state[0] = 0.045
Backward Time Step 4:
Gradient di[0] = -394027.438, df[0] = -387097.312, dc_hat[0] = -516322.500
Gradient do_[0] = -34995756.000
Backward Time Step 3:
Gradient di[0] = -386826.500, df[0] = -345401.062, dc_hat[0] = -437262.750
Gradient do_[0] = -28305248.000
Backward Time Step 2:
Gradient di[0] = -327092.719, df[0] = -306033.969, dc_hat[0] = -486670.875
Gradient do_[0] = -20845446.000
Backward Time Step 1:
Gradient di[0] = -276198.375, df[0] = -244745.938, dc_hat[0] = -405795.938
Gradient do_[0] = -11949717.000
Backward Time Step 0:
Gradient di[0] = -190172.344, df[0] = -167523.609, dc_hat[0] = -325352.812
Gradient do_[0] = -3752702.250
Time Step 0:
i_gate[0] = 0.537, f_gate[0] = 0.696, o_gate[0] = 0.121, c_hat[0] = 0.699
c_state[0] = 0.375, h_state[0] = 0.043
Time Step 1:
i_gate[0] = 0.460, f_gate[0] = 0.698, o_gate[0] = 0.096, c_hat[0] = 0.705
c_state[0] = 0.586, h_state[0] = 0.051
Time Step 2:
i_gate[0] = 0.420, f_gate[0] = 0.701, o_gate[0] = 0.087, c_hat[0] = 0.686
c_state[0] = 0.698, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.384, f_gate[0] = 0.700, o_gate[0] = 0.083, c_hat[0] = 0.734
c_state[0] = 0.771, h_state[0] = 0.054
Time Step 4:
i_gate[0] = 0.360, f_gate[0] = 0.713, o_gate[0] = 0.079, c_hat[0] = 0.693
c_state[0] = 0.799, h_state[0] = 0.052
Backward Time Step 4:
Gradient di[0] = -612.691, df[0] = -559.560, dc_hat[0] = -717.083
Gradient do_[0] = -57677.473
Backward Time Step 3:
Gradient di[0] = -840.684, df[0] = -711.329, dc_hat[0] = -856.875
Gradient do_[0] = -64818.344
Backward Time Step 2:
Gradient di[0] = -862.133, df[0] = -758.825, dc_hat[0] = -1148.269
Gradient do_[0] = -56389.848
Backward Time Step 1:
Gradient di[0] = -830.599, df[0] = -697.583, dc_hat[0] = -1095.456
Gradient do_[0] = -36096.195
Backward Time Step 0:
Gradient di[0] = -627.929, df[0] = -532.063, dc_hat[0] = -991.501
Gradient do_[0] = -12305.578
Time Step 0:
i_gate[0] = 0.527, f_gate[0] = 0.686, o_gate[0] = 0.116, c_hat[0] = 0.673
c_state[0] = 0.355, h_state[0] = 0.040
Time Step 1:
i_gate[0] = 0.448, f_gate[0] = 0.689, o_gate[0] = 0.090, c_hat[0] = 0.672
c_state[0] = 0.546, h_state[0] = 0.045
Time Step 2:
i_gate[0] = 0.408, f_gate[0] = 0.694, o_gate[0] = 0.081, c_hat[0] = 0.639
c_state[0] = 0.640, h_state[0] = 0.046
Time Step 3:
i_gate[0] = 0.374, f_gate[0] = 0.696, o_gate[0] = 0.078, c_hat[0] = 0.680
c_state[0] = 0.700, h_state[0] = 0.047
Time Step 4:
i_gate[0] = 0.352, f_gate[0] = 0.712, o_gate[0] = 0.074, c_hat[0] = 0.617
c_state[0] = 0.715, h_state[0] = 0.046
Backward Time Step 4:
Gradient di[0] = -30159064.000, df[0] = -31308278.000, dc_hat[0] = -46714880.000
Gradient do_[0] = -2836994048.000
Backward Time Step 3:
Gradient di[0] = -33148728.000, df[0] = -30640866.000, dc_hat[0] = -41779908.000
Gradient do_[0] = -2531370496.000
Backward Time Step 2:
Gradient di[0] = -29900568.000, df[0] = -28534246.000, dc_hat[0] = -46689516.000
Gradient do_[0] = -1971881728.000
Backward Time Step 1:
Gradient di[0] = -27403146.000, df[0] = -24341240.000, dc_hat[0] = -40497740.000
Gradient do_[0] = -1205203968.000
Backward Time Step 0:
Gradient di[0] = -21644432.000, df[0] = -19071374.000, dc_hat[0] = -37251524.000
Gradient do_[0] = -427794048.000
Time Step 0:
i_gate[0] = 0.540, f_gate[0] = 0.697, o_gate[0] = 0.121, c_hat[0] = 0.699
c_state[0] = 0.377, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.460, f_gate[0] = 0.706, o_gate[0] = 0.095, c_hat[0] = 0.700
c_state[0] = 0.588, h_state[0] = 0.050
Time Step 2:
i_gate[0] = 0.419, f_gate[0] = 0.714, o_gate[0] = 0.086, c_hat[0] = 0.670
c_state[0] = 0.701, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.386, f_gate[0] = 0.718, o_gate[0] = 0.083, c_hat[0] = 0.710
c_state[0] = 0.777, h_state[0] = 0.054
Time Step 4:
i_gate[0] = 0.364, f_gate[0] = 0.734, o_gate[0] = 0.080, c_hat[0] = 0.659
c_state[0] = 0.810, h_state[0] = 0.054
Backward Time Step 4:
Gradient di[0] = 23767.682, df[0] = 22323.666, dc_hat[0] = 32096.670
Gradient do_[0] = 2357644.750
Backward Time Step 3:
Gradient di[0] = 41180.008, df[0] = 35585.730, dc_hat[0] = 46778.484
Gradient do_[0] = 3311088.000
Backward Time Step 2:
Gradient di[0] = 46190.203, df[0] = 41303.711, dc_hat[0] = 65493.723
Gradient do_[0] = 3133149.250
Backward Time Step 1:
Gradient di[0] = 48082.953, df[0] = 40545.488, dc_hat[0] = 64964.941
Gradient do_[0] = 2134384.000
Backward Time Step 0:
Gradient di[0] = 41183.887, df[0] = 34908.141, dc_hat[0] = 65427.656
Gradient do_[0] = 808568.750
Time Step 0:
i_gate[0] = 0.552, f_gate[0] = 0.707, o_gate[0] = 0.127, c_hat[0] = 0.724
c_state[0] = 0.399, h_state[0] = 0.048
Time Step 1:
i_gate[0] = 0.472, f_gate[0] = 0.715, o_gate[0] = 0.102, c_hat[0] = 0.735
c_state[0] = 0.632, h_state[0] = 0.057
Time Step 2:
i_gate[0] = 0.431, f_gate[0] = 0.722, o_gate[0] = 0.093, c_hat[0] = 0.718
c_state[0] = 0.765, h_state[0] = 0.060
Time Step 3:
i_gate[0] = 0.396, f_gate[0] = 0.725, o_gate[0] = 0.090, c_hat[0] = 0.761
c_state[0] = 0.856, h_state[0] = 0.062
Time Step 4:
i_gate[0] = 0.373, f_gate[0] = 0.740, o_gate[0] = 0.087, c_hat[0] = 0.726
c_state[0] = 0.905, h_state[0] = 0.062
Backward Time Step 4:
Gradient di[0] = -29233.355, df[0] = -24491.543, dc_hat[0] = -30383.992
Gradient do_[0] = -2954344.000
Backward Time Step 3:
Gradient di[0] = -52481.242, df[0] = -41677.879, dc_hat[0] = -48051.535
Gradient do_[0] = -4294995.500
Backward Time Step 2:
Gradient di[0] = -61363.242, df[0] = -50566.457, dc_hat[0] = -72869.398
Gradient do_[0] = -4136427.750
Backward Time Step 1:
Gradient di[0] = -65615.867, df[0] = -52181.082, dc_hat[0] = -77656.203
Gradient do_[0] = -2870320.500
Backward Time Step 0:
Gradient di[0] = -54277.484, df[0] = -44412.965, dc_hat[0] = -79750.430
Gradient do_[0] = -1059924.625
Time Step 0:
i_gate[0] = 0.540, f_gate[0] = 0.697, o_gate[0] = 0.121, c_hat[0] = 0.699
c_state[0] = 0.377, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.459, f_gate[0] = 0.704, o_gate[0] = 0.095, c_hat[0] = 0.701
c_state[0] = 0.588, h_state[0] = 0.050
Time Step 2:
i_gate[0] = 0.417, f_gate[0] = 0.711, o_gate[0] = 0.087, c_hat[0] = 0.674
c_state[0] = 0.699, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.383, f_gate[0] = 0.714, o_gate[0] = 0.084, c_hat[0] = 0.716
c_state[0] = 0.773, h_state[0] = 0.054
Time Step 4:
i_gate[0] = 0.361, f_gate[0] = 0.730, o_gate[0] = 0.080, c_hat[0] = 0.666
c_state[0] = 0.804, h_state[0] = 0.054
Backward Time Step 4:
Gradient di[0] = 2297.680, df[0] = 2152.850, dc_hat[0] = 3001.092
Gradient do_[0] = 223294.453
Backward Time Step 3:
Gradient di[0] = 3262.886, df[0] = 2814.169, dc_hat[0] = 3602.709
Gradient do_[0] = 257844.609
Backward Time Step 2:
Gradient di[0] = 3457.547, df[0] = 3084.812, dc_hat[0] = 4812.047
Gradient do_[0] = 231493.328
Backward Time Step 1:
Gradient di[0] = 3578.472, df[0] = 3014.284, dc_hat[0] = 4794.924
Gradient do_[0] = 157911.969
Backward Time Step 0:
Gradient di[0] = 3146.943, df[0] = 2667.401, dc_hat[0] = 4999.458
Gradient do_[0] = 61784.242
Time Step 0:
i_gate[0] = 0.549, f_gate[0] = 0.707, o_gate[0] = 0.126, c_hat[0] = 0.724
c_state[0] = 0.397, h_state[0] = 0.048
Time Step 1:
i_gate[0] = 0.469, f_gate[0] = 0.714, o_gate[0] = 0.101, c_hat[0] = 0.736
c_state[0] = 0.629, h_state[0] = 0.056
Time Step 2:
i_gate[0] = 0.427, f_gate[0] = 0.719, o_gate[0] = 0.093, c_hat[0] = 0.720
c_state[0] = 0.759, h_state[0] = 0.059
Time Step 3:
i_gate[0] = 0.392, f_gate[0] = 0.720, o_gate[0] = 0.090, c_hat[0] = 0.766
c_state[0] = 0.846, h_state[0] = 0.062
Time Step 4:
i_gate[0] = 0.368, f_gate[0] = 0.734, o_gate[0] = 0.086, c_hat[0] = 0.733
c_state[0] = 0.892, h_state[0] = 0.061
Backward Time Step 4:
Gradient di[0] = -60609.941, df[0] = -50891.809, dc_hat[0] = -60457.770
Gradient do_[0] = -5991625.000
Backward Time Step 3:
Gradient di[0] = -102356.219, df[0] = -81417.055, dc_hat[0] = -90722.039
Gradient do_[0] = -8224900.500
Backward Time Step 2:
Gradient di[0] = -119492.172, df[0] = -98569.125, dc_hat[0] = -139545.172
Gradient do_[0] = -7957394.500
Backward Time Step 1:
Gradient di[0] = -130363.156, df[0] = -103762.203, dc_hat[0] = -153146.953
Gradient do_[0] = -5673442.000
Backward Time Step 0:
Gradient di[0] = -112263.250, df[0] = -91755.070, dc_hat[0] = -163922.516
Gradient do_[0] = -2187659.000
Time Step 0:
i_gate[0] = 0.537, f_gate[0] = 0.697, o_gate[0] = 0.121, c_hat[0] = 0.699
c_state[0] = 0.375, h_state[0] = 0.043
Time Step 1:
i_gate[0] = 0.455, f_gate[0] = 0.703, o_gate[0] = 0.095, c_hat[0] = 0.702
c_state[0] = 0.583, h_state[0] = 0.050
Time Step 2:
i_gate[0] = 0.412, f_gate[0] = 0.708, o_gate[0] = 0.086, c_hat[0] = 0.676
c_state[0] = 0.692, h_state[0] = 0.052
Time Step 3:
i_gate[0] = 0.377, f_gate[0] = 0.710, o_gate[0] = 0.084, c_hat[0] = 0.720
c_state[0] = 0.762, h_state[0] = 0.054
Time Step 4:
i_gate[0] = 0.354, f_gate[0] = 0.725, o_gate[0] = 0.080, c_hat[0] = 0.672
c_state[0] = 0.790, h_state[0] = 0.053
Backward Time Step 4:
Gradient di[0] = 92738.383, df[0] = 87251.102, dc_hat[0] = 117006.883
Gradient do_[0] = 8777577.000
Backward Time Step 3:
Gradient di[0] = 129194.812, df[0] = 111758.961, dc_hat[0] = 138556.125
Gradient do_[0] = 9997700.000
Backward Time Step 2:
Gradient di[0] = 137160.281, df[0] = 122586.156, dc_hat[0] = 187496.391
Gradient do_[0] = 9057979.000
Backward Time Step 1:
Gradient di[0] = 144032.594, df[0] = 121417.914, dc_hat[0] = 190921.922
Gradient do_[0] = 6316864.500
Backward Time Step 0:
Gradient di[0] = 129841.461, df[0] = 109960.500, dc_hat[0] = 205019.844
Gradient do_[0] = 2544487.750
Time Step 0:
i_gate[0] = 0.546, f_gate[0] = 0.706, o_gate[0] = 0.125, c_hat[0] = 0.724
c_state[0] = 0.395, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.466, f_gate[0] = 0.711, o_gate[0] = 0.100, c_hat[0] = 0.735
c_state[0] = 0.623, h_state[0] = 0.056
Time Step 2:
i_gate[0] = 0.423, f_gate[0] = 0.714, o_gate[0] = 0.092, c_hat[0] = 0.720
c_state[0] = 0.750, h_state[0] = 0.058
Time Step 3:
i_gate[0] = 0.386, f_gate[0] = 0.713, o_gate[0] = 0.089, c_hat[0] = 0.769
c_state[0] = 0.831, h_state[0] = 0.060
Time Step 4:
i_gate[0] = 0.361, f_gate[0] = 0.726, o_gate[0] = 0.085, c_hat[0] = 0.738
c_state[0] = 0.870, h_state[0] = 0.059
Backward Time Step 4:
Gradient di[0] = -73739.633, df[0] = -62558.195, dc_hat[0] = -71340.320
Gradient do_[0] = -7072098.500
Backward Time Step 3:
Gradient di[0] = -115174.148, df[0] = -92261.594, dc_hat[0] = -99852.391
Gradient do_[0] = -9059656.000
Backward Time Step 2:
Gradient di[0] = -133115.000, df[0] = -110395.594, dc_hat[0] = -153889.094
Gradient do_[0] = -8771943.000
Backward Time Step 1:
Gradient di[0] = -146943.406, df[0] = -117314.648, dc_hat[0] = -171686.094
Gradient do_[0] = -6387727.500
Backward Time Step 0:
Gradient di[0] = -130555.727, df[0] = -106661.094, dc_hat[0] = -189395.234
Gradient do_[0] = -2550051.000
Time Step 0:
i_gate[0] = 0.534, f_gate[0] = 0.696, o_gate[0] = 0.120, c_hat[0] = 0.699
c_state[0] = 0.373, h_state[0] = 0.043
Time Step 1:
i_gate[0] = 0.451, f_gate[0] = 0.700, o_gate[0] = 0.094, c_hat[0] = 0.706
c_state[0] = 0.579, h_state[0] = 0.049
Time Step 2:
i_gate[0] = 0.408, f_gate[0] = 0.704, o_gate[0] = 0.086, c_hat[0] = 0.681
c_state[0] = 0.685, h_state[0] = 0.051
Time Step 3:
i_gate[0] = 0.372, f_gate[0] = 0.705, o_gate[0] = 0.083, c_hat[0] = 0.725
c_state[0] = 0.753, h_state[0] = 0.053
Time Step 4:
i_gate[0] = 0.349, f_gate[0] = 0.719, o_gate[0] = 0.079, c_hat[0] = 0.677
c_state[0] = 0.777, h_state[0] = 0.051
Backward Time Step 4:
Gradient di[0] = 146083.219, df[0] = 138027.844, dc_hat[0] = 179603.078
Gradient do_[0] = 13608191.000
Backward Time Step 3:
Gradient di[0] = 214381.219, df[0] = 185719.750, dc_hat[0] = 223562.547
Gradient do_[0] = 16385177.000
Backward Time Step 2:
Gradient di[0] = 233747.078, df[0] = 208624.250, dc_hat[0] = 311168.469
Gradient do_[0] = 15302966.000
Backward Time Step 1:
Gradient di[0] = 251770.547, df[0] = 211850.641, dc_hat[0] = 326147.375
Gradient do_[0] = 11002393.000
Backward Time Step 0:
Gradient di[0] = 232949.734, df[0] = 197219.656, dc_hat[0] = 365494.250
Gradient do_[0] = 4577073.000
Time Step 0:
i_gate[0] = 0.543, f_gate[0] = 0.706, o_gate[0] = 0.124, c_hat[0] = 0.724
c_state[0] = 0.393, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.460, f_gate[0] = 0.710, o_gate[0] = 0.100, c_hat[0] = 0.740
c_state[0] = 0.620, h_state[0] = 0.055
Time Step 2:
i_gate[0] = 0.417, f_gate[0] = 0.712, o_gate[0] = 0.092, c_hat[0] = 0.727
c_state[0] = 0.745, h_state[0] = 0.058
Time Step 3:
i_gate[0] = 0.380, f_gate[0] = 0.710, o_gate[0] = 0.089, c_hat[0] = 0.776
c_state[0] = 0.823, h_state[0] = 0.060
Time Step 4:
i_gate[0] = 0.354, f_gate[0] = 0.722, o_gate[0] = 0.085, c_hat[0] = 0.746
c_state[0] = 0.859, h_state[0] = 0.059
Backward Time Step 4:
Gradient di[0] = -23692.381, df[0] = -20102.199, dc_hat[0] = -21781.342
Gradient do_[0] = -2212562.750
Backward Time Step 3:
Gradient di[0] = -32930.461, df[0] = -26356.469, dc_hat[0] = -27275.008
Gradient do_[0] = -2536505.500
Backward Time Step 2:
Gradient di[0] = -37235.426, df[0] = -30745.373, dc_hat[0] = -41349.879
Gradient do_[0] = -2411869.250
Backward Time Step 1:
Gradient di[0] = -40695.391, df[0] = -32348.102, dc_hat[0] = -46010.855
Gradient do_[0] = -1748439.625
Backward Time Step 0:
Gradient di[0] = -35967.871, df[0] = -29355.129, dc_hat[0] = -51856.578
Gradient do_[0] = -701117.938
Time Step 0:
i_gate[0] = 0.531, f_gate[0] = 0.696, o_gate[0] = 0.119, c_hat[0] = 0.699
c_state[0] = 0.371, h_state[0] = 0.042
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.700, o_gate[0] = 0.094, c_hat[0] = 0.712
c_state[0] = 0.577, h_state[0] = 0.049
Time Step 2:
i_gate[0] = 0.402, f_gate[0] = 0.704, o_gate[0] = 0.086, c_hat[0] = 0.690
c_state[0] = 0.683, h_state[0] = 0.051
Time Step 3:
i_gate[0] = 0.366, f_gate[0] = 0.703, o_gate[0] = 0.083, c_hat[0] = 0.733
c_state[0] = 0.749, h_state[0] = 0.053
Time Step 4:
i_gate[0] = 0.343, f_gate[0] = 0.717, o_gate[0] = 0.079, c_hat[0] = 0.688
c_state[0] = 0.773, h_state[0] = 0.051
Backward Time Step 4:
Gradient di[0] = 16726.080, df[0] = 15708.189, dc_hat[0] = 19520.246
Gradient do_[0] = 1530749.875
Backward Time Step 3:
Gradient di[0] = 22060.404, df[0] = 19020.312, dc_hat[0] = 21953.279
Gradient do_[0] = 1664023.250
Backward Time Step 2:
Gradient di[0] = 23266.469, df[0] = 20592.277, dc_hat[0] = 29578.615
Gradient do_[0] = 1502980.625
Backward Time Step 1:
Gradient di[0] = 24542.098, df[0] = 20508.000, dc_hat[0] = 30567.152
Gradient do_[0] = 1060830.625
Backward Time Step 0:
Gradient di[0] = 22313.322, df[0] = 18877.002, dc_hat[0] = 34798.410
Gradient do_[0] = 437646.750
Time Step 0:
i_gate[0] = 0.519, f_gate[0] = 0.688, o_gate[0] = 0.115, c_hat[0] = 0.710
c_state[0] = 0.368, h_state[0] = 0.040
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.687, o_gate[0] = 0.090, c_hat[0] = 0.728
c_state[0] = 0.568, h_state[0] = 0.046
Time Step 2:
i_gate[0] = 0.391, f_gate[0] = 0.689, o_gate[0] = 0.082, c_hat[0] = 0.711
c_state[0] = 0.669, h_state[0] = 0.048
Time Step 3:
i_gate[0] = 0.356, f_gate[0] = 0.687, o_gate[0] = 0.079, c_hat[0] = 0.755
c_state[0] = 0.728, h_state[0] = 0.049
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.700, o_gate[0] = 0.075, c_hat[0] = 0.714
c_state[0] = 0.746, h_state[0] = 0.048
Backward Time Step 4:
Gradient di[0] = 56733.000, df[0] = 52717.938, dc_hat[0] = 58204.023
Gradient do_[0] = 5016909.500
Backward Time Step 3:
Gradient di[0] = 73999.547, df[0] = 63219.883, dc_hat[0] = 65488.531
Gradient do_[0] = 5461911.000
Backward Time Step 2:
Gradient di[0] = 76348.141, df[0] = 66592.289, dc_hat[0] = 87197.977
Gradient do_[0] = 4871231.500
Backward Time Step 1:
Gradient di[0] = 77240.484, df[0] = 63865.691, dc_hat[0] = 87976.352
Gradient do_[0] = 3350873.750
Backward Time Step 0:
Gradient di[0] = 67010.461, df[0] = 55816.094, dc_hat[0] = 97109.961
Gradient do_[0] = 1328409.875
Time Step 0:
i_gate[0] = 0.514, f_gate[0] = 0.697, o_gate[0] = 0.118, c_hat[0] = 0.734
c_state[0] = 0.378, h_state[0] = 0.043
Time Step 1:
i_gate[0] = 0.432, f_gate[0] = 0.695, o_gate[0] = 0.095, c_hat[0] = 0.756
c_state[0] = 0.589, h_state[0] = 0.050
Time Step 2:
i_gate[0] = 0.390, f_gate[0] = 0.694, o_gate[0] = 0.087, c_hat[0] = 0.748
c_state[0] = 0.701, h_state[0] = 0.053
Time Step 3:
i_gate[0] = 0.353, f_gate[0] = 0.689, o_gate[0] = 0.084, c_hat[0] = 0.796
c_state[0] = 0.765, h_state[0] = 0.054
Time Step 4:
i_gate[0] = 0.327, f_gate[0] = 0.700, o_gate[0] = 0.079, c_hat[0] = 0.772
c_state[0] = 0.787, h_state[0] = 0.052
Backward Time Step 4:
Gradient di[0] = 154.828, df[0] = 133.981, dc_hat[0] = 120.274
Gradient do_[0] = 13343.745
Backward Time Step 3:
Gradient di[0] = 173.806, df[0] = 140.975, dc_hat[0] = 123.466
Gradient do_[0] = 12560.574
Backward Time Step 2:
Gradient di[0] = 176.425, df[0] = 145.992, dc_hat[0] = 170.174
Gradient do_[0] = 10878.720
Backward Time Step 1:
Gradient di[0] = 174.288, df[0] = 138.468, dc_hat[0] = 174.060
Gradient do_[0] = 7293.926
Backward Time Step 0:
Gradient di[0] = 143.380, df[0] = 115.107, dc_hat[0] = 185.480
Gradient do_[0] = 2741.481
Time Step 0:
i_gate[0] = 0.508, f_gate[0] = 0.703, o_gate[0] = 0.122, c_hat[0] = 0.751
c_state[0] = 0.381, h_state[0] = 0.044
Time Step 1:
i_gate[0] = 0.428, f_gate[0] = 0.700, o_gate[0] = 0.099, c_hat[0] = 0.775
c_state[0] = 0.599, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.388, f_gate[0] = 0.698, o_gate[0] = 0.091, c_hat[0] = 0.774
c_state[0] = 0.718, h_state[0] = 0.056
Time Step 3:
i_gate[0] = 0.351, f_gate[0] = 0.690, o_gate[0] = 0.088, c_hat[0] = 0.825
c_state[0] = 0.785, h_state[0] = 0.057
Time Step 4:
i_gate[0] = 0.323, f_gate[0] = 0.698, o_gate[0] = 0.082, c_hat[0] = 0.812
c_state[0] = 0.810, h_state[0] = 0.055
Backward Time Step 4:
Gradient di[0] = 0.060, df[0] = 0.050, dc_hat[0] = 0.037
Gradient do_[0] = 5.055
Backward Time Step 3:
Gradient di[0] = 0.077, df[0] = 0.061, dc_hat[0] = 0.046
Gradient do_[0] = 5.456
Backward Time Step 2:
Gradient di[0] = 0.092, df[0] = 0.074, dc_hat[0] = 0.078
Gradient do_[0] = 5.301
Backward Time Step 1:
Gradient di[0] = 0.097, df[0] = 0.075, dc_hat[0] = 0.088
Gradient do_[0] = 3.749
Backward Time Step 0:
Gradient di[0] = 0.087, df[0] = 0.068, dc_hat[0] = 0.103
Gradient do_[0] = 1.474
Time Step 0:
i_gate[0] = 0.508, f_gate[0] = 0.703, o_gate[0] = 0.125, c_hat[0] = 0.756
c_state[0] = 0.384, h_state[0] = 0.046
Time Step 1:
i_gate[0] = 0.430, f_gate[0] = 0.699, o_gate[0] = 0.103, c_hat[0] = 0.783
c_state[0] = 0.605, h_state[0] = 0.056
Time Step 2:
i_gate[0] = 0.390, f_gate[0] = 0.695, o_gate[0] = 0.095, c_hat[0] = 0.785
c_state[0] = 0.727, h_state[0] = 0.059
Time Step 3:
i_gate[0] = 0.353, f_gate[0] = 0.685, o_gate[0] = 0.091, c_hat[0] = 0.837
c_state[0] = 0.794, h_state[0] = 0.060
Time Step 4:
i_gate[0] = 0.325, f_gate[0] = 0.689, o_gate[0] = 0.085, c_hat[0] = 0.829
c_state[0] = 0.816, h_state[0] = 0.057
Backward Time Step 4:
Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.006
Gradient do_[0] = 0.949
Backward Time Step 3:
Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.009
Gradient do_[0] = 1.188
Backward Time Step 2:
Gradient di[0] = 0.026, df[0] = 0.021, dc_hat[0] = 0.021
Gradient do_[0] = 1.362
Backward Time Step 1:
Gradient di[0] = 0.031, df[0] = 0.024, dc_hat[0] = 0.027
Gradient do_[0] = 1.057
Backward Time Step 0:
Gradient di[0] = 0.034, df[0] = 0.027, dc_hat[0] = 0.040
Gradient do_[0] = 0.490
Time Step 0:
i_gate[0] = 0.508, f_gate[0] = 0.704, o_gate[0] = 0.130, c_hat[0] = 0.758
c_state[0] = 0.385, h_state[0] = 0.048
Time Step 1:
i_gate[0] = 0.427, f_gate[0] = 0.699, o_gate[0] = 0.108, c_hat[0] = 0.787
c_state[0] = 0.606, h_state[0] = 0.058
Time Step 2:
i_gate[0] = 0.385, f_gate[0] = 0.694, o_gate[0] = 0.100, c_hat[0] = 0.792
c_state[0] = 0.726, h_state[0] = 0.062
Time Step 3:
i_gate[0] = 0.346, f_gate[0] = 0.684, o_gate[0] = 0.096, c_hat[0] = 0.844
c_state[0] = 0.788, h_state[0] = 0.063
Time Step 4:
i_gate[0] = 0.316, f_gate[0] = 0.689, o_gate[0] = 0.091, c_hat[0] = 0.836
c_state[0] = 0.807, h_state[0] = 0.061
Backward Time Step 4:
Gradient di[0] = -0.016, df[0] = -0.013, dc_hat[0] = -0.008
Gradient do_[0] = -1.133
Backward Time Step 3:
Gradient di[0] = -0.022, df[0] = -0.017, dc_hat[0] = -0.011
Gradient do_[0] = -1.317
Backward Time Step 2:
Gradient di[0] = -0.018, df[0] = -0.014, dc_hat[0] = -0.014
Gradient do_[0] = -1.112
Backward Time Step 1:
Gradient di[0] = -0.011, df[0] = -0.008, dc_hat[0] = -0.009
Gradient do_[0] = -0.534
Backward Time Step 0:
Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.005
Gradient do_[0] = -0.041
Time Step 0:
i_gate[0] = 0.509, f_gate[0] = 0.704, o_gate[0] = 0.136, c_hat[0] = 0.760
c_state[0] = 0.387, h_state[0] = 0.050
Time Step 1:
i_gate[0] = 0.425, f_gate[0] = 0.700, o_gate[0] = 0.113, c_hat[0] = 0.791
c_state[0] = 0.606, h_state[0] = 0.061
Time Step 2:
i_gate[0] = 0.380, f_gate[0] = 0.696, o_gate[0] = 0.106, c_hat[0] = 0.797
c_state[0] = 0.725, h_state[0] = 0.066
Time Step 3:
i_gate[0] = 0.338, f_gate[0] = 0.687, o_gate[0] = 0.103, c_hat[0] = 0.848
c_state[0] = 0.784, h_state[0] = 0.068
Time Step 4:
i_gate[0] = 0.307, f_gate[0] = 0.693, o_gate[0] = 0.099, c_hat[0] = 0.841
c_state[0] = 0.802, h_state[0] = 0.066
Backward Time Step 4:
Gradient di[0] = -0.170, df[0] = -0.140, dc_hat[0] = -0.085
Gradient do_[0] = -11.423
Backward Time Step 3:
Gradient di[0] = -0.344, df[0] = -0.268, dc_hat[0] = -0.172
Gradient do_[0] = -20.123
Backward Time Step 2:
Gradient di[0] = -0.466, df[0] = -0.365, dc_hat[0] = -0.343
Gradient do_[0] = -23.660
Backward Time Step 1:
Gradient di[0] = -0.480, df[0] = -0.365, dc_hat[0] = -0.394
Gradient do_[0] = -16.983
Backward Time Step 0:
Gradient di[0] = -0.362, df[0] = -0.279, dc_hat[0] = -0.410
Gradient do_[0] = -6.113
Time Step 0:
i_gate[0] = 0.508, f_gate[0] = 0.703, o_gate[0] = 0.132, c_hat[0] = 0.753
c_state[0] = 0.383, h_state[0] = 0.048
Time Step 1:
i_gate[0] = 0.419, f_gate[0] = 0.698, o_gate[0] = 0.109, c_hat[0] = 0.786
c_state[0] = 0.596, h_state[0] = 0.058
Time Step 2:
i_gate[0] = 0.371, f_gate[0] = 0.694, o_gate[0] = 0.102, c_hat[0] = 0.793
c_state[0] = 0.708, h_state[0] = 0.062
Time Step 3:
i_gate[0] = 0.327, f_gate[0] = 0.685, o_gate[0] = 0.100, c_hat[0] = 0.845
c_state[0] = 0.761, h_state[0] = 0.064
Time Step 4:
i_gate[0] = 0.295, f_gate[0] = 0.693, o_gate[0] = 0.097, c_hat[0] = 0.838
c_state[0] = 0.775, h_state[0] = 0.063
Backward Time Step 4:
Gradient di[0] = -0.461, df[0] = -0.390, dc_hat[0] = -0.232
Gradient do_[0] = -30.819
Backward Time Step 3:
Gradient di[0] = -1.192, df[0] = -0.948, dc_hat[0] = -0.600
Gradient do_[0] = -70.159
Backward Time Step 2:
Gradient di[0] = -1.779, df[0] = -1.417, dc_hat[0] = -1.322
Gradient do_[0] = -91.300
Backward Time Step 1:
Gradient di[0] = -1.916, df[0] = -1.474, dc_hat[0] = -1.605
Gradient do_[0] = -69.009
Backward Time Step 0:
Gradient di[0] = -1.529, df[0] = -1.193, dc_hat[0] = -1.789
Gradient do_[0] = -26.138
Time Step 0:
i_gate[0] = 0.508, f_gate[0] = 0.701, o_gate[0] = 0.133, c_hat[0] = 0.742
c_state[0] = 0.377, h_state[0] = 0.048
Time Step 1:
i_gate[0] = 0.412, f_gate[0] = 0.694, o_gate[0] = 0.110, c_hat[0] = 0.775
c_state[0] = 0.581, h_state[0] = 0.057
Time Step 2:
i_gate[0] = 0.360, f_gate[0] = 0.689, o_gate[0] = 0.103, c_hat[0] = 0.782
c_state[0] = 0.682, h_state[0] = 0.061
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.681, o_gate[0] = 0.102, c_hat[0] = 0.836
c_state[0] = 0.726, h_state[0] = 0.063
Time Step 4:
i_gate[0] = 0.280, f_gate[0] = 0.690, o_gate[0] = 0.100, c_hat[0] = 0.830
c_state[0] = 0.734, h_state[0] = 0.062
Backward Time Step 4:
Gradient di[0] = -2.784, df[0] = -2.454, dc_hat[0] = -1.451
Gradient do_[0] = -171.761
Backward Time Step 3:
Gradient di[0] = -6.392, df[0] = -5.257, dc_hat[0] = -3.346
Gradient do_[0] = -353.260
Backward Time Step 2:
Gradient di[0] = -9.690, df[0] = -7.939, dc_hat[0] = -7.502
Gradient do_[0] = -475.423
Backward Time Step 1:
Gradient di[0] = -10.680, df[0] = -8.384, dc_hat[0] = -9.369
Gradient do_[0] = -373.815
Backward Time Step 0:
Gradient di[0] = -8.665, df[0] = -6.864, dc_hat[0] = -10.660
Gradient do_[0] = -145.118
Epoch 200, Train Loss=0.009384, Weight Norm=12.257568
Sample Predictions at Epoch 200:
-------------------------------------------------------------
| Day | Date | Predicted Close | Actual Close | Error |
-------------------------------------------------------------
| 192 | 2024-10-11 | 56.99 | 63.87 | 6.88 |
| 193 | 2024-10-14 | 56.25 | 66.55 | 10.30 |
| 194 | 2024-10-15 | 56.34 | 66.00 | 9.66 |
| 195 | 2024-10-16 | 57.58 | 67.20 | 9.62 |
| 196 | 2024-10-17 | 57.06 | 66.76 | 9.70 |
-------------------------------------------------------------
Time Step 0:
i_gate[0] = 0.520, f_gate[0] = 0.711, o_gate[0] = 0.139, c_hat[0] = 0.763
c_state[0] = 0.397, h_state[0] = 0.052
Time Step 1:
i_gate[0] = 0.420, f_gate[0] = 0.705, o_gate[0] = 0.116, c_hat[0] = 0.793
c_state[0] = 0.613, h_state[0] = 0.063
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.701, o_gate[0] = 0.110, c_hat[0] = 0.800
c_state[0] = 0.721, h_state[0] = 0.068
Time Step 3:
i_gate[0] = 0.316, f_gate[0] = 0.695, o_gate[0] = 0.110, c_hat[0] = 0.850
c_state[0] = 0.770, h_state[0] = 0.071
Time Step 4:
i_gate[0] = 0.283, f_gate[0] = 0.707, o_gate[0] = 0.109, c_hat[0] = 0.845
c_state[0] = 0.783, h_state[0] = 0.072
Backward Time Step 4:
Gradient di[0] = 1522.780, df[0] = 1300.379, dc_hat[0] = 718.487
Gradient do_[0] = 93034.031
Backward Time Step 3:
Gradient di[0] = 2982.618, df[0] = 2393.300, dc_hat[0] = 1423.484
Gradient do_[0] = 164322.766
Backward Time Step 2:
Gradient di[0] = 4506.055, df[0] = 3574.061, dc_hat[0] = 3189.600
Gradient do_[0] = 220299.234
Backward Time Step 1:
Gradient di[0] = 5144.803, df[0] = 3903.392, dc_hat[0] = 4150.191
Gradient do_[0] = 179017.641
Backward Time Step 0:
Gradient di[0] = 4264.341, df[0] = 3270.638, dc_hat[0] = 4866.328
Gradient do_[0] = 70828.250
Time Step 0:
i_gate[0] = 0.508, f_gate[0] = 0.701, o_gate[0] = 0.133, c_hat[0] = 0.741
c_state[0] = 0.377, h_state[0] = 0.048
Time Step 1:
i_gate[0] = 0.412, f_gate[0] = 0.693, o_gate[0] = 0.110, c_hat[0] = 0.770
c_state[0] = 0.578, h_state[0] = 0.057
Time Step 2:
i_gate[0] = 0.358, f_gate[0] = 0.687, o_gate[0] = 0.103, c_hat[0] = 0.773
c_state[0] = 0.674, h_state[0] = 0.060
Time Step 3:
i_gate[0] = 0.310, f_gate[0] = 0.677, o_gate[0] = 0.101, c_hat[0] = 0.826
c_state[0] = 0.713, h_state[0] = 0.062
Time Step 4:
i_gate[0] = 0.277, f_gate[0] = 0.685, o_gate[0] = 0.098, c_hat[0] = 0.817
c_state[0] = 0.715, h_state[0] = 0.060
Backward Time Step 4:
Gradient di[0] = -0.295, df[0] = -0.267, dc_hat[0] = -0.166
Gradient do_[0] = -18.131
Backward Time Step 3:
Gradient di[0] = -0.647, df[0] = -0.542, dc_hat[0] = -0.360
Gradient do_[0] = -35.647
Backward Time Step 2:
Gradient di[0] = -0.958, df[0] = -0.796, dc_hat[0] = -0.776
Gradient do_[0] = -47.114
Backward Time Step 1:
Gradient di[0] = -1.029, df[0] = -0.813, dc_hat[0] = -0.926
Gradient do_[0] = -36.118
Backward Time Step 0:
Gradient di[0] = -0.780, df[0] = -0.618, dc_hat[0] = -0.962
Gradient do_[0] = -13.164
Time Step 0:
i_gate[0] = 0.515, f_gate[0] = 0.705, o_gate[0] = 0.139, c_hat[0] = 0.750
c_state[0] = 0.386, h_state[0] = 0.051
Time Step 1:
i_gate[0] = 0.414, f_gate[0] = 0.697, o_gate[0] = 0.116, c_hat[0] = 0.777
c_state[0] = 0.590, h_state[0] = 0.061
Time Step 2:
i_gate[0] = 0.356, f_gate[0] = 0.692, o_gate[0] = 0.110, c_hat[0] = 0.782
c_state[0] = 0.687, h_state[0] = 0.066
Time Step 3:
i_gate[0] = 0.307, f_gate[0] = 0.685, o_gate[0] = 0.109, c_hat[0] = 0.834
c_state[0] = 0.726, h_state[0] = 0.068
Time Step 4:
i_gate[0] = 0.273, f_gate[0] = 0.695, o_gate[0] = 0.109, c_hat[0] = 0.826
c_state[0] = 0.731, h_state[0] = 0.068
Backward Time Step 4:
Gradient di[0] = -1635.764, df[0] = -1467.936, dc_hat[0] = -865.277
Gradient do_[0] = -93575.961
Backward Time Step 3:
Gradient di[0] = -3132.884, df[0] = -2612.606, dc_hat[0] = -1653.787
Gradient do_[0] = -162956.734
Backward Time Step 2:
Gradient di[0] = -4653.047, df[0] = -3829.213, dc_hat[0] = -3597.654
Gradient do_[0] = -218080.344
Backward Time Step 1:
Gradient di[0] = -5223.136, df[0] = -4079.051, dc_hat[0] = -4536.766
Gradient do_[0] = -176568.062
Backward Time Step 0:
Gradient di[0] = -4193.728, df[0] = -3282.793, dc_hat[0] = -5045.143
Gradient do_[0] = -68450.875
Time Step 0:
i_gate[0] = 0.527, f_gate[0] = 0.715, o_gate[0] = 0.145, c_hat[0] = 0.771
c_state[0] = 0.406, h_state[0] = 0.056
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.709, o_gate[0] = 0.122, c_hat[0] = 0.800
c_state[0] = 0.626, h_state[0] = 0.068
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.707, o_gate[0] = 0.118, c_hat[0] = 0.808
c_state[0] = 0.735, h_state[0] = 0.074
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.704, o_gate[0] = 0.120, c_hat[0] = 0.856
c_state[0] = 0.786, h_state[0] = 0.079
Time Step 4:
i_gate[0] = 0.282, f_gate[0] = 0.719, o_gate[0] = 0.121, c_hat[0] = 0.852
c_state[0] = 0.805, h_state[0] = 0.081
Backward Time Step 4:
Gradient di[0] = 76061.062, df[0] = 63983.848, dc_hat[0] = 33997.895
Gradient do_[0] = 4362782.000
Backward Time Step 3:
Gradient di[0] = 132061.312, df[0] = 105149.242, dc_hat[0] = 59839.422
Gradient do_[0] = 6877376.000
Backward Time Step 2:
Gradient di[0] = 193161.328, df[0] = 151514.859, dc_hat[0] = 130319.930
Gradient do_[0] = 9016668.000
Backward Time Step 1:
Gradient di[0] = 226880.391, df[0] = 170017.641, dc_hat[0] = 176419.766
Gradient do_[0] = 7626913.500
Backward Time Step 0:
Gradient di[0] = 198442.781, df[0] = 150441.016, dc_hat[0] = 220976.109
Gradient do_[0] = 3218233.250
Time Step 0:
i_gate[0] = 0.515, f_gate[0] = 0.705, o_gate[0] = 0.139, c_hat[0] = 0.750
c_state[0] = 0.386, h_state[0] = 0.051
Time Step 1:
i_gate[0] = 0.414, f_gate[0] = 0.697, o_gate[0] = 0.116, c_hat[0] = 0.779
c_state[0] = 0.591, h_state[0] = 0.062
Time Step 2:
i_gate[0] = 0.356, f_gate[0] = 0.692, o_gate[0] = 0.111, c_hat[0] = 0.784
c_state[0] = 0.689, h_state[0] = 0.066
Time Step 3:
i_gate[0] = 0.306, f_gate[0] = 0.685, o_gate[0] = 0.110, c_hat[0] = 0.836
c_state[0] = 0.728, h_state[0] = 0.069
Time Step 4:
i_gate[0] = 0.273, f_gate[0] = 0.696, o_gate[0] = 0.110, c_hat[0] = 0.829
c_state[0] = 0.733, h_state[0] = 0.068
Backward Time Step 4:
Gradient di[0] = -1534.954, df[0] = -1372.850, dc_hat[0] = -794.810
Gradient do_[0] = -87191.062
Backward Time Step 3:
Gradient di[0] = -2954.472, df[0] = -2457.093, dc_hat[0] = -1531.078
Gradient do_[0] = -152687.891
Backward Time Step 2:
Gradient di[0] = -4402.364, df[0] = -3610.604, dc_hat[0] = -3353.604
Gradient do_[0] = -205079.594
Backward Time Step 1:
Gradient di[0] = -4900.543, df[0] = -3817.786, dc_hat[0] = -4219.986
Gradient do_[0] = -164928.219
Backward Time Step 0:
Gradient di[0] = -3810.322, df[0] = -2982.668, dc_hat[0] = -4583.897
Gradient do_[0] = -62192.859
Time Step 0:
i_gate[0] = 0.527, f_gate[0] = 0.715, o_gate[0] = 0.145, c_hat[0] = 0.771
c_state[0] = 0.406, h_state[0] = 0.056
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.709, o_gate[0] = 0.123, c_hat[0] = 0.802
c_state[0] = 0.627, h_state[0] = 0.068
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.707, o_gate[0] = 0.119, c_hat[0] = 0.810
c_state[0] = 0.737, h_state[0] = 0.075
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.704, o_gate[0] = 0.121, c_hat[0] = 0.859
c_state[0] = 0.788, h_state[0] = 0.079
Time Step 4:
i_gate[0] = 0.282, f_gate[0] = 0.719, o_gate[0] = 0.122, c_hat[0] = 0.855
c_state[0] = 0.807, h_state[0] = 0.082
Backward Time Step 4:
Gradient di[0] = 145734.156, df[0] = 122238.758, dc_hat[0] = 63949.379
Gradient do_[0] = 8315011.500
Backward Time Step 3:
Gradient di[0] = 252169.734, df[0] = 200293.594, dc_hat[0] = 112308.781
Gradient do_[0] = 13058593.000
Backward Time Step 2:
Gradient di[0] = 368026.375, df[0] = 287816.188, dc_hat[0] = 244754.734
Gradient do_[0] = 17082716.000
Backward Time Step 1:
Gradient di[0] = 428825.781, df[0] = 320646.031, dc_hat[0] = 330641.000
Gradient do_[0] = 14355376.000
Backward Time Step 0:
Gradient di[0] = 365769.125, df[0] = 277292.438, dc_hat[0] = 407302.500
Gradient do_[0] = 5931837.500
Time Step 0:
i_gate[0] = 0.515, f_gate[0] = 0.705, o_gate[0] = 0.139, c_hat[0] = 0.750
c_state[0] = 0.386, h_state[0] = 0.051
Time Step 1:
i_gate[0] = 0.414, f_gate[0] = 0.698, o_gate[0] = 0.117, c_hat[0] = 0.781
c_state[0] = 0.593, h_state[0] = 0.062
Time Step 2:
i_gate[0] = 0.357, f_gate[0] = 0.693, o_gate[0] = 0.111, c_hat[0] = 0.788
c_state[0] = 0.692, h_state[0] = 0.067
Time Step 3:
i_gate[0] = 0.307, f_gate[0] = 0.686, o_gate[0] = 0.111, c_hat[0] = 0.839
c_state[0] = 0.732, h_state[0] = 0.069
Time Step 4:
i_gate[0] = 0.274, f_gate[0] = 0.697, o_gate[0] = 0.110, c_hat[0] = 0.832
c_state[0] = 0.738, h_state[0] = 0.069
Backward Time Step 4:
Gradient di[0] = -878.506, df[0] = -781.224, dc_hat[0] = -446.047
Gradient do_[0] = -49924.977
Backward Time Step 3:
Gradient di[0] = -1681.915, df[0] = -1391.690, dc_hat[0] = -855.288
Gradient do_[0] = -86817.898
Backward Time Step 2:
Gradient di[0] = -2499.135, df[0] = -2038.982, dc_hat[0] = -1873.171
Gradient do_[0] = -116011.289
Backward Time Step 1:
Gradient di[0] = -2761.230, df[0] = -2143.854, dc_hat[0] = -2352.998
Gradient do_[0] = -92560.633
Backward Time Step 0:
Gradient di[0] = -2100.726, df[0] = -1644.420, dc_hat[0] = -2527.218
Gradient do_[0] = -34288.543
Time Step 0:
i_gate[0] = 0.527, f_gate[0] = 0.715, o_gate[0] = 0.145, c_hat[0] = 0.771
c_state[0] = 0.406, h_state[0] = 0.056
Time Step 1:
i_gate[0] = 0.423, f_gate[0] = 0.710, o_gate[0] = 0.123, c_hat[0] = 0.804
c_state[0] = 0.628, h_state[0] = 0.069
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.708, o_gate[0] = 0.120, c_hat[0] = 0.813
c_state[0] = 0.740, h_state[0] = 0.075
Time Step 3:
i_gate[0] = 0.314, f_gate[0] = 0.705, o_gate[0] = 0.122, c_hat[0] = 0.861
c_state[0] = 0.792, h_state[0] = 0.080
Time Step 4:
i_gate[0] = 0.283, f_gate[0] = 0.720, o_gate[0] = 0.123, c_hat[0] = 0.857
c_state[0] = 0.812, h_state[0] = 0.082
Backward Time Step 4:
Gradient di[0] = 138715.172, df[0] = 115804.641, dc_hat[0] = 59869.789
Gradient do_[0] = 7925267.500
Backward Time Step 3:
Gradient di[0] = 238509.000, df[0] = 188616.172, dc_hat[0] = 104409.516
Gradient do_[0] = 12342997.000
Backward Time Step 2:
Gradient di[0] = 346294.406, df[0] = 269584.312, dc_hat[0] = 226781.000
Gradient do_[0] = 16024729.000
Backward Time Step 1:
Gradient di[0] = 400095.531, df[0] = 298260.906, dc_hat[0] = 305391.750
Gradient do_[0] = 13344412.000
Backward Time Step 0:
Gradient di[0] = 334596.219, df[0] = 253660.000, dc_hat[0] = 372589.875
Gradient do_[0] = 5426293.000
Time Step 0:
i_gate[0] = 0.515, f_gate[0] = 0.705, o_gate[0] = 0.139, c_hat[0] = 0.750
c_state[0] = 0.386, h_state[0] = 0.051
Time Step 1:
i_gate[0] = 0.415, f_gate[0] = 0.699, o_gate[0] = 0.118, c_hat[0] = 0.785
c_state[0] = 0.595, h_state[0] = 0.063
Time Step 2:
i_gate[0] = 0.358, f_gate[0] = 0.695, o_gate[0] = 0.113, c_hat[0] = 0.793
c_state[0] = 0.698, h_state[0] = 0.068
Time Step 3:
i_gate[0] = 0.308, f_gate[0] = 0.688, o_gate[0] = 0.112, c_hat[0] = 0.844
c_state[0] = 0.740, h_state[0] = 0.071
Time Step 4:
i_gate[0] = 0.275, f_gate[0] = 0.699, o_gate[0] = 0.112, c_hat[0] = 0.838
c_state[0] = 0.748, h_state[0] = 0.071
Backward Time Step 4:
Gradient di[0] = 246.630, df[0] = 217.044, dc_hat[0] = 120.772
Gradient do_[0] = 14023.690
Backward Time Step 3:
Gradient di[0] = 465.916, df[0] = 382.035, dc_hat[0] = 228.934
Gradient do_[0] = 23992.588
Backward Time Step 2:
Gradient di[0] = 688.049, df[0] = 556.141, dc_hat[0] = 500.825
Gradient do_[0] = 31724.008
Backward Time Step 1:
Gradient di[0] = 752.028, df[0] = 580.509, dc_hat[0] = 629.454
Gradient do_[0] = 25030.568
Backward Time Step 0:
Gradient di[0] = 556.902, df[0] = 435.935, dc_hat[0] = 669.965
Gradient do_[0] = 9089.733
Time Step 0:
i_gate[0] = 0.502, f_gate[0] = 0.695, o_gate[0] = 0.133, c_hat[0] = 0.727
c_state[0] = 0.365, h_state[0] = 0.047
Time Step 1:
i_gate[0] = 0.407, f_gate[0] = 0.687, o_gate[0] = 0.112, c_hat[0] = 0.761
c_state[0] = 0.560, h_state[0] = 0.057
Time Step 2:
i_gate[0] = 0.352, f_gate[0] = 0.681, o_gate[0] = 0.105, c_hat[0] = 0.766
c_state[0] = 0.652, h_state[0] = 0.060
Time Step 3:
i_gate[0] = 0.304, f_gate[0] = 0.670, o_gate[0] = 0.103, c_hat[0] = 0.821
c_state[0] = 0.686, h_state[0] = 0.061
Time Step 4:
i_gate[0] = 0.269, f_gate[0] = 0.678, o_gate[0] = 0.100, c_hat[0] = 0.811
c_state[0] = 0.683, h_state[0] = 0.060
Backward Time Step 4:
Gradient di[0] = 10.519, df[0] = 9.771, dc_hat[0] = 6.085
Gradient do_[0] = 602.033
Backward Time Step 3:
Gradient di[0] = 22.955, df[0] = 19.585, dc_hat[0] = 13.083
Gradient do_[0] = 1183.318
Backward Time Step 2:
Gradient di[0] = 33.960, df[0] = 28.730, dc_hat[0] = 28.228
Gradient do_[0] = 1571.613
Backward Time Step 1:
Gradient di[0] = 35.046, df[0] = 28.199, dc_hat[0] = 32.717
Gradient do_[0] = 1172.555
Backward Time Step 0:
Gradient di[0] = 23.757, df[0] = 19.254, dc_hat[0] = 30.921
Gradient do_[0] = 390.645
Time Step 0:
i_gate[0] = 0.490, f_gate[0] = 0.684, o_gate[0] = 0.128, c_hat[0] = 0.703
c_state[0] = 0.344, h_state[0] = 0.042
Time Step 1:
i_gate[0] = 0.399, f_gate[0] = 0.675, o_gate[0] = 0.106, c_hat[0] = 0.738
c_state[0] = 0.527, h_state[0] = 0.051
Time Step 2:
i_gate[0] = 0.348, f_gate[0] = 0.669, o_gate[0] = 0.099, c_hat[0] = 0.743
c_state[0] = 0.611, h_state[0] = 0.054
Time Step 3:
i_gate[0] = 0.302, f_gate[0] = 0.657, o_gate[0] = 0.095, c_hat[0] = 0.803
c_state[0] = 0.644, h_state[0] = 0.054
Time Step 4:
i_gate[0] = 0.268, f_gate[0] = 0.661, o_gate[0] = 0.091, c_hat[0] = 0.790
c_state[0] = 0.637, h_state[0] = 0.051
Backward Time Step 4:
Gradient di[0] = -0.014, df[0] = -0.013, dc_hat[0] = -0.009
Gradient do_[0] = -0.823
Backward Time Step 3:
Gradient di[0] = -0.072, df[0] = -0.063, dc_hat[0] = -0.046
Gradient do_[0] = -3.736
Backward Time Step 2:
Gradient di[0] = -0.106, df[0] = -0.093, dc_hat[0] = -0.098
Gradient do_[0] = -5.060
Backward Time Step 1:
Gradient di[0] = -0.098, df[0] = -0.082, dc_hat[0] = -0.101
Gradient do_[0] = -3.422
Backward Time Step 0:
Gradient di[0] = -0.054, df[0] = -0.045, dc_hat[0] = -0.076
Gradient do_[0] = -0.993
Time Step 0:
i_gate[0] = 0.489, f_gate[0] = 0.683, o_gate[0] = 0.123, c_hat[0] = 0.698
c_state[0] = 0.341, h_state[0] = 0.040
Time Step 1:
i_gate[0] = 0.394, f_gate[0] = 0.674, o_gate[0] = 0.101, c_hat[0] = 0.734
c_state[0] = 0.519, h_state[0] = 0.048
Time Step 2:
i_gate[0] = 0.340, f_gate[0] = 0.668, o_gate[0] = 0.094, c_hat[0] = 0.740
c_state[0] = 0.598, h_state[0] = 0.050
Time Step 3:
i_gate[0] = 0.292, f_gate[0] = 0.657, o_gate[0] = 0.090, c_hat[0] = 0.800
c_state[0] = 0.627, h_state[0] = 0.050
Time Step 4:
i_gate[0] = 0.258, f_gate[0] = 0.665, o_gate[0] = 0.087, c_hat[0] = 0.788
c_state[0] = 0.620, h_state[0] = 0.048
Backward Time Step 4:
Gradient di[0] = 26.020, df[0] = 25.578, dc_hat[0] = 16.904
Gradient do_[0] = 1574.091
Backward Time Step 3:
Gradient di[0] = 48.911, df[0] = 43.744, dc_hat[0] = 31.055
Gradient do_[0] = 2629.933
Backward Time Step 2:
Gradient di[0] = 67.793, df[0] = 60.447, dc_hat[0] = 62.708
Gradient do_[0] = 3273.653
Backward Time Step 1:
Gradient di[0] = 70.116, df[0] = 59.298, dc_hat[0] = 72.830
Gradient do_[0] = 2455.016
Backward Time Step 0:
Gradient di[0] = 50.062, df[0] = 42.457, dc_hat[0] = 71.964
Gradient do_[0] = 859.918
Time Step 0:
i_gate[0] = 0.476, f_gate[0] = 0.672, o_gate[0] = 0.118, c_hat[0] = 0.671
c_state[0] = 0.320, h_state[0] = 0.036
Time Step 1:
i_gate[0] = 0.386, f_gate[0] = 0.662, o_gate[0] = 0.095, c_hat[0] = 0.705
c_state[0] = 0.484, h_state[0] = 0.043
Time Step 2:
i_gate[0] = 0.335, f_gate[0] = 0.654, o_gate[0] = 0.088, c_hat[0] = 0.708
c_state[0] = 0.554, h_state[0] = 0.044
Time Step 3:
i_gate[0] = 0.289, f_gate[0] = 0.641, o_gate[0] = 0.083, c_hat[0] = 0.771
c_state[0] = 0.578, h_state[0] = 0.044
Time Step 4:
i_gate[0] = 0.254, f_gate[0] = 0.645, o_gate[0] = 0.079, c_hat[0] = 0.752
c_state[0] = 0.564, h_state[0] = 0.040
Backward Time Step 4:
Gradient di[0] = -0.034, df[0] = -0.035, dc_hat[0] = -0.026
Gradient do_[0] = -2.070
Backward Time Step 3:
Gradient di[0] = -0.074, df[0] = -0.069, dc_hat[0] = -0.055
Gradient do_[0] = -3.993
Backward Time Step 2:
Gradient di[0] = -0.096, df[0] = -0.090, dc_hat[0] = -0.101
Gradient do_[0] = -4.752
Backward Time Step 1:
Gradient di[0] = -0.088, df[0] = -0.078, dc_hat[0] = -0.102
Gradient do_[0] = -3.214
Backward Time Step 0:
Gradient di[0] = -0.049, df[0] = -0.043, dc_hat[0] = -0.076
Gradient do_[0] = -0.938
Time Step 0:
i_gate[0] = 0.477, f_gate[0] = 0.673, o_gate[0] = 0.123, c_hat[0] = 0.674
c_state[0] = 0.321, h_state[0] = 0.038
Time Step 1:
i_gate[0] = 0.382, f_gate[0] = 0.663, o_gate[0] = 0.100, c_hat[0] = 0.708
c_state[0] = 0.484, h_state[0] = 0.045
Time Step 2:
i_gate[0] = 0.329, f_gate[0] = 0.656, o_gate[0] = 0.093, c_hat[0] = 0.712
c_state[0] = 0.551, h_state[0] = 0.047
Time Step 3:
i_gate[0] = 0.281, f_gate[0] = 0.644, o_gate[0] = 0.090, c_hat[0] = 0.775
c_state[0] = 0.573, h_state[0] = 0.046
Time Step 4:
i_gate[0] = 0.246, f_gate[0] = 0.650, o_gate[0] = 0.086, c_hat[0] = 0.757
c_state[0] = 0.559, h_state[0] = 0.044
Backward Time Step 4:
Gradient di[0] = 10.092, df[0] = 10.635, dc_hat[0] = 7.543
Gradient do_[0] = 571.171
Backward Time Step 3:
Gradient di[0] = 18.426, df[0] = 17.387, dc_hat[0] = 13.205
Gradient do_[0] = 928.326
Backward Time Step 2:
Gradient di[0] = 24.810, df[0] = 23.368, dc_hat[0] = 25.564
Gradient do_[0] = 1137.920
Backward Time Step 1:
Gradient di[0] = 25.377, df[0] = 22.479, dc_hat[0] = 28.946
Gradient do_[0] = 853.199
Backward Time Step 0:
Gradient di[0] = 17.881, df[0] = 15.756, dc_hat[0] = 27.708
Gradient do_[0] = 297.790
Time Step 0:
i_gate[0] = 0.465, f_gate[0] = 0.662, o_gate[0] = 0.118, c_hat[0] = 0.646
c_state[0] = 0.300, h_state[0] = 0.034
Time Step 1:
i_gate[0] = 0.375, f_gate[0] = 0.651, o_gate[0] = 0.095, c_hat[0] = 0.677
c_state[0] = 0.449, h_state[0] = 0.040
Time Step 2:
i_gate[0] = 0.325, f_gate[0] = 0.642, o_gate[0] = 0.087, c_hat[0] = 0.677
c_state[0] = 0.508, h_state[0] = 0.041
Time Step 3:
i_gate[0] = 0.278, f_gate[0] = 0.628, o_gate[0] = 0.083, c_hat[0] = 0.743
c_state[0] = 0.526, h_state[0] = 0.040
Time Step 4:
i_gate[0] = 0.243, f_gate[0] = 0.631, o_gate[0] = 0.078, c_hat[0] = 0.717
c_state[0] = 0.506, h_state[0] = 0.036
Backward Time Step 4:
Gradient di[0] = -0.106, df[0] = -0.118, dc_hat[0] = -0.095
Gradient do_[0] = -6.168
Backward Time Step 3:
Gradient di[0] = -0.205, df[0] = -0.202, dc_hat[0] = -0.172
Gradient do_[0] = -10.427
Backward Time Step 2:
Gradient di[0] = -0.259, df[0] = -0.258, dc_hat[0] = -0.307
Gradient do_[0] = -12.114
Backward Time Step 1:
Gradient di[0] = -0.244, df[0] = -0.227, dc_hat[0] = -0.311
Gradient do_[0] = -8.372
Backward Time Step 0:
Gradient di[0] = -0.149, df[0] = -0.137, dc_hat[0] = -0.251
Gradient do_[0] = -2.600
Time Step 0:
i_gate[0] = 0.467, f_gate[0] = 0.665, o_gate[0] = 0.123, c_hat[0] = 0.656
c_state[0] = 0.307, h_state[0] = 0.037
Time Step 1:
i_gate[0] = 0.373, f_gate[0] = 0.654, o_gate[0] = 0.100, c_hat[0] = 0.689
c_state[0] = 0.457, h_state[0] = 0.043
Time Step 2:
i_gate[0] = 0.319, f_gate[0] = 0.646, o_gate[0] = 0.092, c_hat[0] = 0.691
c_state[0] = 0.516, h_state[0] = 0.044
Time Step 3:
i_gate[0] = 0.271, f_gate[0] = 0.633, o_gate[0] = 0.089, c_hat[0] = 0.755
c_state[0] = 0.531, h_state[0] = 0.043
Time Step 4:
i_gate[0] = 0.235, f_gate[0] = 0.638, o_gate[0] = 0.085, c_hat[0] = 0.733
c_state[0] = 0.512, h_state[0] = 0.040
Backward Time Step 4:
Gradient di[0] = -13.589, df[0] = -15.173, dc_hat[0] = -11.209
Gradient do_[0] = -730.753
Backward Time Step 3:
Gradient di[0] = -24.522, df[0] = -24.182, dc_hat[0] = -19.131
Gradient do_[0] = -1175.594
Backward Time Step 2:
Gradient di[0] = -32.433, df[0] = -31.928, dc_hat[0] = -36.049
Gradient do_[0] = -1431.292
Backward Time Step 1:
Gradient di[0] = -32.940, df[0] = -30.262, dc_hat[0] = -40.094
Gradient do_[0] = -1074.297
Backward Time Step 0:
Gradient di[0] = -22.819, df[0] = -20.691, dc_hat[0] = -37.163
Gradient do_[0] = -371.204
Time Step 0:
i_gate[0] = 0.480, f_gate[0] = 0.676, o_gate[0] = 0.128, c_hat[0] = 0.684
c_state[0] = 0.328, h_state[0] = 0.041
Time Step 1:
i_gate[0] = 0.381, f_gate[0] = 0.666, o_gate[0] = 0.105, c_hat[0] = 0.719
c_state[0] = 0.492, h_state[0] = 0.048
Time Step 2:
i_gate[0] = 0.323, f_gate[0] = 0.660, o_gate[0] = 0.099, c_hat[0] = 0.725
c_state[0] = 0.559, h_state[0] = 0.050
Time Step 3:
i_gate[0] = 0.274, f_gate[0] = 0.651, o_gate[0] = 0.097, c_hat[0] = 0.787
c_state[0] = 0.580, h_state[0] = 0.051
Time Step 4:
i_gate[0] = 0.240, f_gate[0] = 0.660, o_gate[0] = 0.095, c_hat[0] = 0.772
c_state[0] = 0.568, h_state[0] = 0.049
Backward Time Step 4:
Gradient di[0] = -59007.633, df[0] = -62085.996, dc_hat[0] = -40699.586
Gradient do_[0] = -3087282.750
Backward Time Step 3:
Gradient di[0] = -101361.938, df[0] = -95751.867, dc_hat[0] = -67624.703
Gradient do_[0] = -4799500.000
Backward Time Step 2:
Gradient di[0] = -138141.906, df[0] = -128878.289, dc_hat[0] = -133345.609
Gradient do_[0] = -6019691.500
Backward Time Step 1:
Gradient di[0] = -149881.547, df[0] = -130996.055, dc_hat[0] = -162582.484
Gradient do_[0] = -4835544.000
Backward Time Step 0:
Gradient di[0] = -114608.344, df[0] = -99444.844, dc_hat[0] = -171621.484
Gradient do_[0] = -1843199.750
Time Step 0:
i_gate[0] = 0.492, f_gate[0] = 0.686, o_gate[0] = 0.134, c_hat[0] = 0.709
c_state[0] = 0.349, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.388, f_gate[0] = 0.677, o_gate[0] = 0.110, c_hat[0] = 0.742
c_state[0] = 0.524, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.327, f_gate[0] = 0.673, o_gate[0] = 0.104, c_hat[0] = 0.748
c_state[0] = 0.597, h_state[0] = 0.056
Time Step 3:
i_gate[0] = 0.278, f_gate[0] = 0.667, o_gate[0] = 0.104, c_hat[0] = 0.805
c_state[0] = 0.622, h_state[0] = 0.057
Time Step 4:
i_gate[0] = 0.246, f_gate[0] = 0.680, o_gate[0] = 0.103, c_hat[0] = 0.793
c_state[0] = 0.618, h_state[0] = 0.057
Backward Time Step 4:
Gradient di[0] = -538430848.000, df[0] = -542115904.000, dc_hat[0] = -334196576.000
Gradient do_[0] = -28003743744.000
Backward Time Step 3:
Gradient di[0] = -894501568.000, df[0] = -820606720.000, dc_hat[0] = -541224512.000
Gradient do_[0] = -42478477312.000
Backward Time Step 2:
Gradient di[0] = -1237090816.000, df[0] = -1113104640.000, dc_hat[0] = -1084785920.000
Gradient do_[0] = -54138720256.000
Backward Time Step 1:
Gradient di[0] = -1443009920.000, df[0] = -1213306496.000, dc_hat[0] = -1428586496.000
Gradient do_[0] = -46599806976.000
Backward Time Step 0:
Gradient di[0] = -1261675520.000, df[0] = -1051560896.000, dc_hat[0] = -1740872704.000
Gradient do_[0] = -20093313024.000
Time Step 0:
i_gate[0] = 0.505, f_gate[0] = 0.697, o_gate[0] = 0.140, c_hat[0] = 0.733
c_state[0] = 0.370, h_state[0] = 0.049
Time Step 1:
i_gate[0] = 0.396, f_gate[0] = 0.689, o_gate[0] = 0.115, c_hat[0] = 0.764
c_state[0] = 0.557, h_state[0] = 0.058
Time Step 2:
i_gate[0] = 0.333, f_gate[0] = 0.688, o_gate[0] = 0.110, c_hat[0] = 0.770
c_state[0] = 0.640, h_state[0] = 0.062
Time Step 3:
i_gate[0] = 0.285, f_gate[0] = 0.686, o_gate[0] = 0.111, c_hat[0] = 0.822
c_state[0] = 0.673, h_state[0] = 0.065
Time Step 4:
i_gate[0] = 0.255, f_gate[0] = 0.702, o_gate[0] = 0.111, c_hat[0] = 0.811
c_state[0] = 0.679, h_state[0] = 0.066
Backward Time Step 4:
Gradient di[0] = 230659325952.000, df[0] = 219999502336.000, dc_hat[0] = 130854019072.000
Gradient do_[0] = 12234542546944.000
Backward Time Step 3:
Gradient di[0] = 373047263232.000, df[0] = 329246375936.000, dc_hat[0] = 205306396672.000
Gradient do_[0] = 18020896342016.000
Backward Time Step 2:
Gradient di[0] = 511328616448.000, df[0] = 441360842752.000, dc_hat[0] = 405705424896.000
Gradient do_[0] = 22602133274624.000
Backward Time Step 1:
Gradient di[0] = 621603061760.000, df[0] = 502193520640.000, dc_hat[0] = 559379709952.000
Gradient do_[0] = 20122297171968.000
Backward Time Step 0:
Gradient di[0] = 605582721024.000, df[0] = 486429229056.000, dc_hat[0] = 771354132480.000
Gradient do_[0] = 9563190329344.000
Time Step 0:
i_gate[0] = 0.492, f_gate[0] = 0.686, o_gate[0] = 0.134, c_hat[0] = 0.709
c_state[0] = 0.349, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.388, f_gate[0] = 0.676, o_gate[0] = 0.110, c_hat[0] = 0.741
c_state[0] = 0.524, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.327, f_gate[0] = 0.672, o_gate[0] = 0.104, c_hat[0] = 0.748
c_state[0] = 0.596, h_state[0] = 0.056
Time Step 3:
i_gate[0] = 0.277, f_gate[0] = 0.665, o_gate[0] = 0.104, c_hat[0] = 0.805
c_state[0] = 0.620, h_state[0] = 0.057
Time Step 4:
i_gate[0] = 0.245, f_gate[0] = 0.678, o_gate[0] = 0.103, c_hat[0] = 0.794
c_state[0] = 0.615, h_state[0] = 0.056
Backward Time Step 4:
Gradient di[0] = -73820720.000, df[0] = -74399184.000, dc_hat[0] = -45636792.000
Gradient do_[0] = -3809626112.000
Backward Time Step 3:
Gradient di[0] = -124529768.000, df[0] = -114297888.000, dc_hat[0] = -75171720.000
Gradient do_[0] = -5880279040.000
Backward Time Step 2:
Gradient di[0] = -173315488.000, df[0] = -156012448.000, dc_hat[0] = -151893232.000
Gradient do_[0] = -7560016896.000
Backward Time Step 1:
Gradient di[0] = -200900496.000, df[0] = -168995280.000, dc_hat[0] = -199160464.000
Gradient do_[0] = -6482587136.000
Backward Time Step 0:
Gradient di[0] = -172885424.000, df[0] = -144093744.000, dc_hat[0] = -238549088.000
Gradient do_[0] = -2753355520.000
Time Step 0:
i_gate[0] = 0.505, f_gate[0] = 0.697, o_gate[0] = 0.140, c_hat[0] = 0.733
c_state[0] = 0.370, h_state[0] = 0.049
Time Step 1:
i_gate[0] = 0.396, f_gate[0] = 0.688, o_gate[0] = 0.115, c_hat[0] = 0.764
c_state[0] = 0.557, h_state[0] = 0.058
Time Step 2:
i_gate[0] = 0.333, f_gate[0] = 0.686, o_gate[0] = 0.110, c_hat[0] = 0.770
c_state[0] = 0.639, h_state[0] = 0.062
Time Step 3:
i_gate[0] = 0.284, f_gate[0] = 0.684, o_gate[0] = 0.111, c_hat[0] = 0.822
c_state[0] = 0.671, h_state[0] = 0.065
Time Step 4:
i_gate[0] = 0.254, f_gate[0] = 0.699, o_gate[0] = 0.111, c_hat[0] = 0.811
c_state[0] = 0.675, h_state[0] = 0.066
Backward Time Step 4:
Gradient di[0] = 5734780928.000, df[0] = 5480751104.000, dc_hat[0] = 3245264640.000
Gradient do_[0] = 301448134656.000
Backward Time Step 3:
Gradient di[0] = 9375862784.000, df[0] = 8284593664.000, dc_hat[0] = 5152923136.000
Gradient do_[0] = 449973518336.000
Backward Time Step 2:
Gradient di[0] = 12900345856.000, df[0] = 11145098240.000, dc_hat[0] = 10236901376.000
Gradient do_[0] = 568093966336.000
Backward Time Step 1:
Gradient di[0] = 15583946752.000, df[0] = 12598241280.000, dc_hat[0] = 14047940608.000
Gradient do_[0] = 504000905216.000
Backward Time Step 0:
Gradient di[0] = 14996421632.000, df[0] = 12045751296.000, dc_hat[0] = 19101521920.000
Gradient do_[0] = 236819267584.000
Time Step 0:
i_gate[0] = 0.492, f_gate[0] = 0.686, o_gate[0] = 0.134, c_hat[0] = 0.709
c_state[0] = 0.349, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.388, f_gate[0] = 0.676, o_gate[0] = 0.110, c_hat[0] = 0.741
c_state[0] = 0.524, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.327, f_gate[0] = 0.672, o_gate[0] = 0.104, c_hat[0] = 0.748
c_state[0] = 0.596, h_state[0] = 0.056
Time Step 3:
i_gate[0] = 0.277, f_gate[0] = 0.665, o_gate[0] = 0.104, c_hat[0] = 0.805
c_state[0] = 0.620, h_state[0] = 0.057
Time Step 4:
i_gate[0] = 0.245, f_gate[0] = 0.678, o_gate[0] = 0.103, c_hat[0] = 0.793
c_state[0] = 0.615, h_state[0] = 0.056
Backward Time Step 4:
Gradient di[0] = -52794228.000, df[0] = -53213832.000, dc_hat[0] = -32655412.000
Gradient do_[0] = -2722256128.000
Backward Time Step 3:
Gradient di[0] = -89103728.000, df[0] = -81785184.000, dc_hat[0] = -53805124.000
Gradient do_[0] = -4205185792.000
Backward Time Step 2:
Gradient di[0] = -123994856.000, df[0] = -111624192.000, dc_hat[0] = -108704840.000
Gradient do_[0] = -5406868480.000
Backward Time Step 1:
Gradient di[0] = -143592656.000, df[0] = -120796936.000, dc_hat[0] = -142381280.000
Gradient do_[0] = -4632870400.000
Backward Time Step 0:
Gradient di[0] = -123510528.000, df[0] = -102941560.000, dc_hat[0] = -170421088.000
Gradient do_[0] = -1967015808.000
Time Step 0:
i_gate[0] = 0.505, f_gate[0] = 0.697, o_gate[0] = 0.140, c_hat[0] = 0.733
c_state[0] = 0.370, h_state[0] = 0.049
Time Step 1:
i_gate[0] = 0.396, f_gate[0] = 0.688, o_gate[0] = 0.115, c_hat[0] = 0.764
c_state[0] = 0.557, h_state[0] = 0.058
Time Step 2:
i_gate[0] = 0.333, f_gate[0] = 0.686, o_gate[0] = 0.110, c_hat[0] = 0.770
c_state[0] = 0.639, h_state[0] = 0.062
Time Step 3:
i_gate[0] = 0.284, f_gate[0] = 0.684, o_gate[0] = 0.111, c_hat[0] = 0.822
c_state[0] = 0.670, h_state[0] = 0.065
Time Step 4:
i_gate[0] = 0.254, f_gate[0] = 0.699, o_gate[0] = 0.111, c_hat[0] = 0.811
c_state[0] = 0.675, h_state[0] = 0.066
Backward Time Step 4:
Gradient di[0] = 1483947520.000, df[0] = 1418497024.000, dc_hat[0] = 840254656.000
Gradient do_[0] = 77940498432.000
Backward Time Step 3:
Gradient di[0] = 2426511360.000, df[0] = 2144266240.000, dc_hat[0] = 1334088320.000
Gradient do_[0] = 116393222144.000
Backward Time Step 2:
Gradient di[0] = 3337781760.000, df[0] = 2883934720.000, dc_hat[0] = 2649557248.000
Gradient do_[0] = 146938953728.000
Backward Time Step 1:
Gradient di[0] = 4028459520.000, df[0] = 3256907008.000, dc_hat[0] = 3632219904.000
Gradient do_[0] = 130270339072.000
Backward Time Step 0:
Gradient di[0] = 3875390208.000, df[0] = 3112874752.000, dc_hat[0] = 4936235008.000
Gradient do_[0] = 61199065088.000
Time Step 0:
i_gate[0] = 0.492, f_gate[0] = 0.686, o_gate[0] = 0.134, c_hat[0] = 0.709
c_state[0] = 0.349, h_state[0] = 0.045
Time Step 1:
i_gate[0] = 0.388, f_gate[0] = 0.676, o_gate[0] = 0.110, c_hat[0] = 0.741
c_state[0] = 0.523, h_state[0] = 0.053
Time Step 2:
i_gate[0] = 0.327, f_gate[0] = 0.671, o_gate[0] = 0.104, c_hat[0] = 0.748
c_state[0] = 0.596, h_state[0] = 0.056
Time Step 3:
i_gate[0] = 0.277, f_gate[0] = 0.665, o_gate[0] = 0.104, c_hat[0] = 0.805
c_state[0] = 0.620, h_state[0] = 0.057
Time Step 4:
i_gate[0] = 0.245, f_gate[0] = 0.677, o_gate[0] = 0.103, c_hat[0] = 0.793
c_state[0] = 0.614, h_state[0] = 0.056
Backward Time Step 4:
Gradient di[0] = -31030956.000, df[0] = -31281024.000, dc_hat[0] = -19204178.000
Gradient do_[0] = -1598732928.000
Backward Time Step 3:
Gradient di[0] = -52398980.000, df[0] = -48096676.000, dc_hat[0] = -31651684.000
Gradient do_[0] = -2471589632.000
Backward Time Step 2:
Gradient di[0] = -72907688.000, df[0] = -65638796.000, dc_hat[0] = -63938548.000
Gradient do_[0] = -3178128384.000
Backward Time Step 1:
Gradient di[0] = -84349528.000, df[0] = -70963744.000, dc_hat[0] = -83657000.000
Gradient do_[0] = -2721141248.000
Backward Time Step 0:
Gradient di[0] = -72518024.000, df[0] = -60441156.000, dc_hat[0] = -100061128.000
Gradient do_[0] = -1154914560.000
Time Step 0:
i_gate[0] = 0.505, f_gate[0] = 0.697, o_gate[0] = 0.140, c_hat[0] = 0.733
c_state[0] = 0.370, h_state[0] = 0.049
Time Step 1:
i_gate[0] = 0.395, f_gate[0] = 0.688, o_gate[0] = 0.115, c_hat[0] = 0.764
c_state[0] = 0.557, h_state[0] = 0.058
Time Step 2:
i_gate[0] = 0.333, f_gate[0] = 0.686, o_gate[0] = 0.110, c_hat[0] = 0.770
c_state[0] = 0.638, h_state[0] = 0.062
Time Step 3:
i_gate[0] = 0.284, f_gate[0] = 0.684, o_gate[0] = 0.111, c_hat[0] = 0.822
c_state[0] = 0.670, h_state[0] = 0.065
Time Step 4:
i_gate[0] = 0.254, f_gate[0] = 0.699, o_gate[0] = 0.111, c_hat[0] = 0.811
c_state[0] = 0.674, h_state[0] = 0.065
Backward Time Step 4:
Gradient di[0] = -43258628.000, df[0] = -41372240.000, dc_hat[0] = -24508188.000
Gradient do_[0] = -2268284160.000
Backward Time Step 3:
Gradient di[0] = -70760968.000, df[0] = -62556228.000, dc_hat[0] = -38924360.000
Gradient do_[0] = -3390198016.000
Backward Time Step 2:
Gradient di[0] = -97374600.000, df[0] = -84165688.000, dc_hat[0] = -77342368.000
Gradient do_[0] = -4283812352.000
Backward Time Step 1:
Gradient di[0] = -117517400.000, df[0] = -95030048.000, dc_hat[0] = -106004696.000
Gradient do_[0] = -3799421952.000
Backward Time Step 0:
Gradient di[0] = -113085320.000, df[0] = -90834840.000, dc_hat[0] = -144041152.000
Gradient do_[0] = -1785811328.000
Time Step 0:
i_gate[0] = 0.517, f_gate[0] = 0.707, o_gate[0] = 0.146, c_hat[0] = 0.755
c_state[0] = 0.391, h_state[0] = 0.054
Time Step 1:
i_gate[0] = 0.404, f_gate[0] = 0.700, o_gate[0] = 0.120, c_hat[0] = 0.785
c_state[0] = 0.590, h_state[0] = 0.064
Time Step 2:
i_gate[0] = 0.340, f_gate[0] = 0.701, o_gate[0] = 0.116, c_hat[0] = 0.790
c_state[0] = 0.683, h_state[0] = 0.069
Time Step 3:
i_gate[0] = 0.293, f_gate[0] = 0.702, o_gate[0] = 0.119, c_hat[0] = 0.838
c_state[0] = 0.725, h_state[0] = 0.074
Time Step 4:
i_gate[0] = 0.265, f_gate[0] = 0.719, o_gate[0] = 0.120, c_hat[0] = 0.826
c_state[0] = 0.741, h_state[0] = 0.076
Backward Time Step 4:
Gradient di[0] = -2951044136960.000, df[0] = -2659334225920.000, dc_hat[0] = -1541281808384.000
Gradient do_[0] = -159024939008000.000
Backward Time Step 3:
Gradient di[0] = -4712402780160.000, df[0] = -3989636644864.000, dc_hat[0] = -2368184516608.000
Gradient do_[0] = -230211908009984.000
Backward Time Step 2:
Gradient di[0] = -6374583959552.000, df[0] = -5279795642368.000, dc_hat[0] = -4584784265216.000
Gradient do_[0] = -283295539527680.000
Backward Time Step 1:
Gradient di[0] = -7880765865984.000, df[0] = -6128523542528.000, dc_hat[0] = -6458225721344.000
Gradient do_[0] = -255434237673472.000
Backward Time Step 0:
Gradient di[0] = -8310984540160.000, df[0] = -6452621082624.000, dc_hat[0] = -9787926380544.000
Gradient do_[0] = -130307789422592.000
Time Step 0:
i_gate[0] = 0.529, f_gate[0] = 0.718, o_gate[0] = 0.152, c_hat[0] = 0.776
c_state[0] = 0.411, h_state[0] = 0.059
Time Step 1:
i_gate[0] = 0.412, f_gate[0] = 0.712, o_gate[0] = 0.126, c_hat[0] = 0.805
c_state[0] = 0.625, h_state[0] = 0.070
Time Step 2:
i_gate[0] = 0.348, f_gate[0] = 0.717, o_gate[0] = 0.123, c_hat[0] = 0.810
c_state[0] = 0.730, h_state[0] = 0.077
Time Step 3:
i_gate[0] = 0.303, f_gate[0] = 0.721, o_gate[0] = 0.127, c_hat[0] = 0.852
c_state[0] = 0.784, h_state[0] = 0.083
Time Step 4:
i_gate[0] = 0.279, f_gate[0] = 0.740, o_gate[0] = 0.129, c_hat[0] = 0.840
c_state[0] = 0.814, h_state[0] = 0.087
Backward Time Step 4:
Gradient di[0] = -18106463280955392.000, df[0] = -15275344319741952.000, dc_hat[0] = -8816178973638656.000
Gradient do_[0] = -1015781755736555520.000
Backward Time Step 3:
Gradient di[0] = -28348923047313408.000, df[0] = -22848725651030016.000, dc_hat[0] = -13108312128094208.000
Gradient do_[0] = -1421152451444080640.000
Backward Time Step 2:
Gradient di[0] = -37491860448477184.000, df[0] = -29688579181510656.000, dc_hat[0] = -24458421411512320.000
Gradient do_[0] = -1686239619659595776.000
Backward Time Step 1:
Gradient di[0] = -46854257793564672.000, df[0] = -35054387051102208.000, dc_hat[0] = -34852270185119744.000
Gradient do_[0] = -1522876930783707136.000
Backward Time Step 0:
Gradient di[0] = -53166889056075776.000, df[0] = -40003879445725184.000, dc_hat[0] = -57976685491912704.000
Gradient do_[0] = -828696210138202112.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.724, o_gate[0] = 0.132, c_hat[0] = 0.823
c_state[0] = 0.660, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.359, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.826
c_state[0] = 0.779, h_state[0] = 0.085
Time Step 3:
i_gate[0] = 0.316, f_gate[0] = 0.739, o_gate[0] = 0.136, c_hat[0] = 0.863
c_state[0] = 0.849, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.295, f_gate[0] = 0.758, o_gate[0] = 0.139, c_hat[0] = 0.850
c_state[0] = 0.894, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -101980545289814016.000, df[0] = -80244609157955584.000, dc_hat[0] = -47186275945414656.000
Gradient do_[0] = -6031987310432616448.000
Backward Time Step 3:
Gradient di[0] = -158601734808141824.000, df[0] = -121093673743024128.000, dc_hat[0] = -68639444315406336.000
Gradient do_[0] = -8224412946462146560.000
Backward Time Step 2:
Gradient di[0] = -205309693130702848.000, df[0] = -155138238820909056.000, dc_hat[0] = -122958531363078144.000
Gradient do_[0] = -9384368026951876608.000
Backward Time Step 1:
Gradient di[0] = -257790310837387264.000, df[0] = -185687258865598464.000, dc_hat[0] = -175027648253132800.000
Gradient do_[0] = -8418781863241318400.000
Backward Time Step 0:
Gradient di[0] = -310038708252311552.000, df[0] = -226607387979546624.000, dc_hat[0] = -313438741802778624.000
Gradient do_[0] = -4809923017113075712.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -2139848269889536.000, df[0] = -1573007612968960.000, dc_hat[0] = -947072903675904.000
Gradient do_[0] = -134049184893370368.000
Backward Time Step 3:
Gradient di[0] = -3328388932567040.000, df[0] = -2403894537748480.000, dc_hat[0] = -1354085747916800.000
Gradient do_[0] = -179012604809707520.000
Backward Time Step 2:
Gradient di[0] = -4193047722065920.000, df[0] = -3019229401972736.000, dc_hat[0] = -2304750854864896.000
Gradient do_[0] = -194846139185364992.000
Backward Time Step 1:
Gradient di[0] = -5244740794580992.000, df[0] = -3637416124481536.000, dc_hat[0] = -3249862166446080.000
Gradient do_[0] = -172178504387919872.000
Backward Time Step 0:
Gradient di[0] = -6547313644273664.000, df[0] = -4658376290074624.000, dc_hat[0] = -6143588865933312.000
Gradient do_[0] = -101220877244301312.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 12787719864320.000, df[0] = 8799122358272.000, dc_hat[0] = 5441820557312.000
Gradient do_[0] = 857433714982912.000
Backward Time Step 3:
Gradient di[0] = 19931185807360.000, df[0] = 13601958002688.000, dc_hat[0] = 7692902989824.000
Gradient do_[0] = 1121597691789312.000
Backward Time Step 2:
Gradient di[0] = 24235414126592.000, df[0] = 16606877974528.000, dc_hat[0] = 12365238108160.000
Gradient do_[0] = 1151787553783808.000
Backward Time Step 1:
Gradient di[0] = 30172784885760.000, df[0] = 20160184320000.000, dc_hat[0] = 17275903016960.000
Gradient do_[0] = 1000161920679936.000
Backward Time Step 0:
Gradient di[0] = 38972403220480.000, df[0] = 27043716661248.000, dc_hat[0] = 33977505153024.000
Gradient do_[0] = 601104026959872.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -1907859369492480.000, df[0] = -1402466675458048.000, dc_hat[0] = -844360237187072.000
Gradient do_[0] = -119515032743575552.000
Backward Time Step 3:
Gradient di[0] = -2967483904425984.000, df[0] = -2143231361941504.000, dc_hat[0] = -1207197597958144.000
Gradient do_[0] = -159599799308386304.000
Backward Time Step 2:
Gradient di[0] = -3737943323705344.000, df[0] = -2691525007900672.000, dc_hat[0] = -2054534650134528.000
Gradient do_[0] = -173697101744570368.000
Backward Time Step 1:
Gradient di[0] = -4675552401162240.000, df[0] = -3242663801257984.000, dc_hat[0] = -2897169115774976.000
Gradient do_[0] = -153492750950465536.000
Backward Time Step 0:
Gradient di[0] = -5838666052141056.000, df[0] = -4154177462730752.000, dc_hat[0] = -5478638271070208.000
Gradient do_[0] = -90265257485795328.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 14036002406400.000, df[0] = 9658014105600.000, dc_hat[0] = 5972785889280.000
Gradient do_[0] = 941123166011392.000
Backward Time Step 3:
Gradient di[0] = 21876579500032.000, df[0] = 14929554833408.000, dc_hat[0] = 8443375124480.000
Gradient do_[0] = 1231058288771072.000
Backward Time Step 2:
Gradient di[0] = 26597925060608.000, df[0] = 18225731469312.000, dc_hat[0] = 13570188247040.000
Gradient do_[0] = 1264058938425344.000
Backward Time Step 1:
Gradient di[0] = 33114329972736.000, df[0] = 22125601619968.000, dc_hat[0] = 18960131031040.000
Gradient do_[0] = 1097667811737600.000
Backward Time Step 0:
Gradient di[0] = 42785822474240.000, df[0] = 29689915637760.000, dc_hat[0] = 37302172647424.000
Gradient do_[0] = 659921523703808.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -1682640008642560.000, df[0] = -1236903605043200.000, dc_hat[0] = -744653041172480.000
Gradient do_[0] = -105405300672036864.000
Backward Time Step 3:
Gradient di[0] = -2617116846981120.000, df[0] = -1890179807707136.000, dc_hat[0] = -1064612132814848.000
Gradient do_[0] = -140754573735231488.000
Backward Time Step 2:
Gradient di[0] = -3296226875277312.000, df[0] = -2373461473230848.000, dc_hat[0] = -1811692635815936.000
Gradient do_[0] = -153170336345489408.000
Backward Time Step 1:
Gradient di[0] = -4123089147265024.000, df[0] = -2859510574088192.000, dc_hat[0] = -2554839552753664.000
Gradient do_[0] = -135356048952262656.000
Backward Time Step 0:
Gradient di[0] = -5150450156306432.000, df[0] = -3664515757506560.000, dc_hat[0] = -4832859336474624.000
Gradient do_[0] = -79625489622237184.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 15324808216576.000, df[0] = 10544774905856.000, dc_hat[0] = 6520949440512.000
Gradient do_[0] = 1027527640350720.000
Backward Time Step 3:
Gradient di[0] = 23885059719168.000, df[0] = 16300200951808.000, dc_hat[0] = 9218120744960.000
Gradient do_[0] = 1344066528739328.000
Backward Time Step 2:
Gradient di[0] = 29036600360960.000, df[0] = 19896765251584.000, dc_hat[0] = 14813919969280.000
Gradient do_[0] = 1379948967231488.000
Backward Time Step 1:
Gradient di[0] = 36150781673472.000, df[0] = 24154428407808.000, dc_hat[0] = 20698697302016.000
Gradient do_[0] = 1198319497904128.000
Backward Time Step 0:
Gradient di[0] = 46724408147968.000, df[0] = 32422980747264.000, dc_hat[0] = 40735973834752.000
Gradient do_[0] = 720669675356160.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -1464533918941184.000, df[0] = -1076573079863296.000, dc_hat[0] = -648092110028800.000
Gradient do_[0] = -91741145687654400.000
Backward Time Step 3:
Gradient di[0] = -2277830536724480.000, df[0] = -1645135146254336.000, dc_hat[0] = -926534906937344.000
Gradient do_[0] = -122505420903284736.000
Backward Time Step 2:
Gradient di[0] = -2868630668705792.000, df[0] = -2065566810505216.000, dc_hat[0] = -1576621559513088.000
Gradient do_[0] = -133299867719041024.000
Backward Time Step 1:
Gradient di[0] = -3588270961197056.000, df[0] = -2488594782486528.000, dc_hat[0] = -2223443634290688.000
Gradient do_[0] = -117798600603336704.000
Backward Time Step 0:
Gradient di[0] = -4483704265113600.000, df[0] = -3190129640341504.000, dc_hat[0] = -4207226751287296.000
Gradient do_[0] = -69317662601117696.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 16724554416128.000, df[0] = 11507913981952.000, dc_hat[0] = 7116144246784.000
Gradient do_[0] = 1121362810765312.000
Backward Time Step 3:
Gradient di[0] = 26066057953280.000, df[0] = 17788634660864.000, dc_hat[0] = 10059216060416.000
Gradient do_[0] = 1466779113095168.000
Backward Time Step 2:
Gradient di[0] = 31685013929984.000, df[0] = 21711531540480.000, dc_hat[0] = 16164497391616.000
Gradient do_[0] = 1505804393906176.000
Backward Time Step 1:
Gradient di[0] = 39448033099776.000, df[0] = 26357513846784.000, dc_hat[0] = 22586591281152.000
Gradient do_[0] = 1307616215040000.000
Backward Time Step 0:
Gradient di[0] = 51000404606976.000, df[0] = 35390178197504.000, dc_hat[0] = 44463934144512.000
Gradient do_[0] = 786621985193984.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -1259865708167168.000, df[0] = -926120912355328.000, dc_hat[0] = -557488936058880.000
Gradient do_[0] = -78919173660475392.000
Backward Time Step 3:
Gradient di[0] = -1959452932571136.000, df[0] = -1415191724032000.000, dc_hat[0] = -796981043658752.000
Gradient do_[0] = -105381377704198144.000
Backward Time Step 2:
Gradient di[0] = -2467449316311040.000, df[0] = -1776693081538560.000, dc_hat[0] = -1356083713015808.000
Gradient do_[0] = -114657089724350464.000
Backward Time Step 1:
Gradient di[0] = -3086476241797120.000, df[0] = -2140582172426240.000, dc_hat[0] = -1912510416420864.000
Gradient do_[0] = -101325287899267072.000
Backward Time Step 0:
Gradient di[0] = -3857843778224128.000, df[0] = -2744833605107712.000, dc_hat[0] = -3619958693036032.000
Gradient do_[0] = -59641917327015936.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 18136527011840.000, df[0] = 12479464734720.000, dc_hat[0] = 7716471832576.000
Gradient do_[0] = 1216015971123200.000
Backward Time Step 3:
Gradient di[0] = 28265978789888.000, df[0] = 19289985777664.000, dc_hat[0] = 10907500412928.000
Gradient do_[0] = 1590553493897216.000
Backward Time Step 2:
Gradient di[0] = 34355816693760.000, df[0] = 23541640593408.000, dc_hat[0] = 17526407823360.000
Gradient do_[0] = 1632723764510720.000
Backward Time Step 1:
Gradient di[0] = 42773214396416.000, df[0] = 28579257647104.000, dc_hat[0] = 24490476044288.000
Gradient do_[0] = 1417838631845888.000
Backward Time Step 0:
Gradient di[0] = 55314976079872.000, df[0] = 38384147890176.000, dc_hat[0] = 48225532772352.000
Gradient do_[0] = 853169215111168.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -1067033957498880.000, df[0] = -784370415697920.000, dc_hat[0] = -472133977243648.000
Gradient do_[0] = -66838998319890432.000
Backward Time Step 3:
Gradient di[0] = -1659505167302656.000, df[0] = -1198558808113152.000, dc_hat[0] = -674938339983360.000
Gradient do_[0] = -89248784755720192.000
Backward Time Step 2:
Gradient di[0] = -2089538096726016.000, df[0] = -1504575630606336.000, dc_hat[0] = -1148349566681088.000
Gradient do_[0] = -97095878984073216.000
Backward Time Step 1:
Gradient di[0] = -2613783147053056.000, df[0] = -1812752553213952.000, dc_hat[0] = -1619610021396480.000
Gradient do_[0] = -85807347720519680.000
Backward Time Step 0:
Gradient di[0] = -3267989881225216.000, df[0] = -2325155976052736.000, dc_hat[0] = -3066476995018752.000
Gradient do_[0] = -50522829879246848.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 19426460041216.000, df[0] = 13367039229952.000, dc_hat[0] = 8264806825984.000
Gradient do_[0] = 1302483997556736.000
Backward Time Step 3:
Gradient di[0] = 30275662774272.000, df[0] = 20661518991360.000, dc_hat[0] = 11682291122176.000
Gradient do_[0] = 1703621326536704.000
Backward Time Step 2:
Gradient di[0] = 36794972241920.000, df[0] = 25213014114304.000, dc_hat[0] = 18770051465216.000
Gradient do_[0] = 1748633254887424.000
Backward Time Step 1:
Gradient di[0] = 45809936629760.000, df[0] = 30608266887168.000, dc_hat[0] = 26229197504512.000
Gradient do_[0] = 1518499511926784.000
Backward Time Step 0:
Gradient di[0] = 59258825605120.000, df[0] = 41120859947008.000, dc_hat[0] = 51663918333952.000
Gradient do_[0] = 913998501380096.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -876562224250880.000, df[0] = -644354750283776.000, dc_hat[0] = -387832896028672.000
Gradient do_[0] = -54907016530886656.000
Backward Time Step 3:
Gradient di[0] = -1363240739143680.000, df[0] = -984585684910080.000, dc_hat[0] = -554409712943104.000
Gradient do_[0] = -73314679425859584.000
Backward Time Step 2:
Gradient di[0] = -1716341677490176.000, df[0] = -1235853619757056.000, dc_hat[0] = -943220183793664.000
Gradient do_[0] = -79753857604780032.000
Backward Time Step 1:
Gradient di[0] = -2146979123560448.000, df[0] = -1489007045246976.000, dc_hat[0] = -1330358603743232.000
Gradient do_[0] = -70482732609699840.000
Backward Time Step 0:
Gradient di[0] = -2685149665820672.000, df[0] = -1910468427907072.000, dc_hat[0] = -2519575992205312.000
Gradient do_[0] = -41512168880865280.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 20528391782400.000, df[0] = 14125249855488.000, dc_hat[0] = 8733094576128.000
Gradient do_[0] = 1376343207968768.000
Backward Time Step 3:
Gradient di[0] = 31992202657792.000, df[0] = 21832992292864.000, dc_hat[0] = 12343847157760.000
Gradient do_[0] = 1800188431695872.000
Backward Time Step 2:
Gradient di[0] = 38877486120960.000, df[0] = 26640006512640.000, dc_hat[0] = 19831680139264.000
Gradient do_[0] = 1847591717306368.000
Backward Time Step 1:
Gradient di[0] = 48402654035968.000, df[0] = 32340617199616.000, dc_hat[0] = 27713702199296.000
Gradient do_[0] = 1604442478608384.000
Backward Time Step 0:
Gradient di[0] = 62630307823616.000, df[0] = 43460396580864.000, dc_hat[0] = 54603299160064.000
Gradient do_[0] = 965999683698688.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -703317604827136.000, df[0] = -517003198595072.000, dc_hat[0] = -311163199946752.000
Gradient do_[0] = -44054471692517376.000
Backward Time Step 3:
Gradient di[0] = -1093780665008128.000, df[0] = -789971522813952.000, dc_hat[0] = -444795973533696.000
Gradient do_[0] = -58822523466285056.000
Backward Time Step 2:
Gradient di[0] = -1376959267340288.000, df[0] = -991479778508800.000, dc_hat[0] = -756685861814272.000
Gradient do_[0] = -63983338989289472.000
Backward Time Step 1:
Gradient di[0] = -1722462811193344.000, df[0] = -1194589855678464.000, dc_hat[0] = -1067310446018560.000
Gradient do_[0] = -56546375482998784.000
Backward Time Step 0:
Gradient di[0] = -2154866562564096.000, df[0] = -1533175146741760.000, dc_hat[0] = -2021991682932736.000
Gradient do_[0] = -33314042680442880.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 21468396126208.000, df[0] = 14772044038144.000, dc_hat[0] = 9132448940032.000
Gradient do_[0] = 1439345143709696.000
Backward Time Step 3:
Gradient di[0] = 33456362881024.000, df[0] = 22832239083520.000, dc_hat[0] = 12907967414272.000
Gradient do_[0] = 1882554898579456.000
Backward Time Step 2:
Gradient di[0] = 40652876283904.000, df[0] = 27856541319168.000, dc_hat[0] = 20736565575680.000
Gradient do_[0] = 1931953599152128.000
Backward Time Step 1:
Gradient di[0] = 50613039661056.000, df[0] = 33817498746880.000, dc_hat[0] = 28979291488256.000
Gradient do_[0] = 1677711667888128.000
Backward Time Step 0:
Gradient di[0] = 65508825104384.000, df[0] = 45457854169088.000, dc_hat[0] = 57112885460992.000
Gradient do_[0] = 1010397431726080.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -552834298806272.000, df[0] = -406383530868736.000, dc_hat[0] = -244571896283136.000
Gradient do_[0] = -34627974108020736.000
Backward Time Step 3:
Gradient di[0] = -859732596228096.000, df[0] = -620933152768000.000, dc_hat[0] = -349596043706368.000
Gradient do_[0] = -46235091013206016.000
Backward Time Step 2:
Gradient di[0] = -1082215190495232.000, df[0] = -779248331653120.000, dc_hat[0] = -594694392250368.000
Gradient do_[0] = -50287156433780736.000
Backward Time Step 1:
Gradient di[0] = -1353777047142400.000, df[0] = -938893004242944.000, dc_hat[0] = -838857243230208.000
Gradient do_[0] = -44442869880061952.000
Backward Time Step 0:
Gradient di[0] = -1694133206908928.000, df[0] = -1205365928624128.000, dc_hat[0] = -1589668327981056.000
Gradient do_[0] = -26191144362704896.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 22265764773888.000, df[0] = 15320690458624.000, dc_hat[0] = 9471071879168.000
Gradient do_[0] = 1492781247758336.000
Backward Time Step 3:
Gradient di[0] = 34698109648896.000, df[0] = 23679702401024.000, dc_hat[0] = 13386208247808.000
Gradient do_[0] = 1952403414843392.000
Backward Time Step 2:
Gradient di[0] = 42157687701504.000, df[0] = 28887677403136.000, dc_hat[0] = 21503368232960.000
Gradient do_[0] = 2003456214695936.000
Backward Time Step 1:
Gradient di[0] = 52486526205952.000, df[0] = 35069286678528.000, dc_hat[0] = 30051984736256.000
Gradient do_[0] = 1739813807980544.000
Backward Time Step 0:
Gradient di[0] = 67952825073664.000, df[0] = 47153795825664.000, dc_hat[0] = 59243654807552.000
Gradient do_[0] = 1048093420158976.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -425947677327360.000, df[0] = -313109927428096.000, dc_hat[0] = -188426657726464.000
Gradient do_[0] = -26679757022167040.000
Backward Time Step 3:
Gradient di[0] = -662390257483776.000, df[0] = -478405032148992.000, dc_hat[0] = -269333037056000.000
Gradient do_[0] = -35621898259791872.000
Backward Time Step 2:
Gradient di[0] = -833725763944448.000, df[0] = -600323148218368.000, dc_hat[0] = -458129934385152.000
Gradient do_[0] = -38740411736391680.000
Backward Time Step 1:
Gradient di[0] = -1042945566310400.000, df[0] = -723320207048704.000, dc_hat[0] = -646252924502016.000
Gradient do_[0] = -34238641765089280.000
Backward Time Step 0:
Gradient di[0] = -1305544832843776.000, df[0] = -928887609491456.000, dc_hat[0] = -1225041442242560.000
Gradient do_[0] = -20183606102065152.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 22935249092608.000, df[0] = 15781346672640.000, dc_hat[0] = 9755283161088.000
Gradient do_[0] = 1537642718035968.000
Backward Time Step 3:
Gradient di[0] = 35740524216320.000, df[0] = 24391136051200.000, dc_hat[0] = 13787495137280.000
Gradient do_[0] = 2011033610747904.000
Backward Time Step 2:
Gradient di[0] = 43420064153600.000, df[0] = 29752685494272.000, dc_hat[0] = 22146470379520.000
Gradient do_[0] = 2063437580468224.000
Backward Time Step 1:
Gradient di[0] = 54058215800832.000, df[0] = 36119420862464.000, dc_hat[0] = 30951881048064.000
Gradient do_[0] = 1791911895498752.000
Backward Time Step 0:
Gradient di[0] = 70007392305152.000, df[0] = 48579498475520.000, dc_hat[0] = 61034895245312.000
Gradient do_[0] = 1079782628392960.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -321297208836096.000, df[0] = -236182029991936.000, dc_hat[0] = -142124192366592.000
Gradient do_[0] = -20124541711810560.000
Backward Time Step 3:
Gradient di[0] = -499636229898240.000, df[0] = -360857649283072.000, dc_hat[0] = -203143027621888.000
Gradient do_[0] = -26869014756065280.000
Backward Time Step 2:
Gradient di[0] = -628814820409344.000, df[0] = -452776828076032.000, dc_hat[0] = -345520354623488.000
Gradient do_[0] = -29218752709001216.000
Backward Time Step 1:
Gradient di[0] = -786620777234432.000, df[0] = -545549765836800.000, dc_hat[0] = -487423322619904.000
Gradient do_[0] = -25823808694779904.000
Backward Time Step 0:
Gradient di[0] = -984973976797184.000, df[0] = -700803371237376.000, dc_hat[0] = -924237837631488.000
Gradient do_[0] = -15227608979472384.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 23487601180672.000, df[0] = 16161392558080.000, dc_hat[0] = 9989622071296.000
Gradient do_[0] = 1574649632653312.000
Backward Time Step 3:
Gradient di[0] = 36600385896448.000, df[0] = 24977986289664.000, dc_hat[0] = 14118300942336.000
Gradient do_[0] = 2059392392364032.000
Backward Time Step 2:
Gradient di[0] = 44460444483584.000, df[0] = 30465568276480.000, dc_hat[0] = 22676294860800.000
Gradient do_[0] = 2112866748465152.000
Backward Time Step 1:
Gradient di[0] = 55353433653248.000, df[0] = 36984831606784.000, dc_hat[0] = 31693478035456.000
Gradient do_[0] = 1834845596549120.000
Backward Time Step 0:
Gradient di[0] = 71705032654848.000, df[0] = 49757527474176.000, dc_hat[0] = 62514956075008.000
Gradient do_[0] = 1105966829404160.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -236550558318592.000, df[0] = -173885509074944.000, dc_hat[0] = -104630830759936.000
Gradient do_[0] = -14816185472253952.000
Backward Time Step 3:
Gradient di[0] = -367841098334208.000, df[0] = -265670000377856.000, dc_hat[0] = -149548076892160.000
Gradient do_[0] = -19781208468619264.000
Backward Time Step 2:
Gradient di[0] = -462900703526912.000, df[0] = -333310366580736.000, dc_hat[0] = -254345547350016.000
Gradient do_[0] = -21509204808302592.000
Backward Time Step 1:
Gradient di[0] = -579076146331648.000, df[0] = -401610110926848.000, dc_hat[0] = -358819955736576.000
Gradient do_[0] = -19010371360653312.000
Backward Time Step 0:
Gradient di[0] = -725313407418368.000, df[0] = -516056292524032.000, dc_hat[0] = -680588570787840.000
Gradient do_[0] = -11213279141560320.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 23930437894144.000, df[0] = 16466093015040.000, dc_hat[0] = 10177367506944.000
Gradient do_[0] = 1604314032242688.000
Backward Time Step 3:
Gradient di[0] = 37289610706944.000, df[0] = 25448383774720.000, dc_hat[0] = 14383248834560.000
Gradient do_[0] = 2098146956017664.000
Backward Time Step 2:
Gradient di[0] = 45293332594688.000, df[0] = 31036278833152.000, dc_hat[0] = 23100267692032.000
Gradient do_[0] = 2152437087469568.000
Backward Time Step 1:
Gradient di[0] = 56390374653952.000, df[0] = 37677676101632.000, dc_hat[0] = 32287198543872.000
Gradient do_[0] = 1869217951383552.000
Backward Time Step 0:
Gradient di[0] = 73068760268800.000, df[0] = 50703842148352.000, dc_hat[0] = 63703902846976.000
Gradient do_[0] = 1127000626429952.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -169105864785920.000, df[0] = -124307543949312.000, dc_hat[0] = -74794464706560.000
Gradient do_[0] = -10591673893519360.000
Backward Time Step 3:
Gradient di[0] = -262956604456960.000, df[0] = -189918185783296.000, dc_hat[0] = -106899856949248.000
Gradient do_[0] = -14140721334321152.000
Backward Time Step 2:
Gradient di[0] = -330881327693824.000, df[0] = -238249888972800.000, dc_hat[0] = -181800060059648.000
Gradient do_[0] = -15374687987040256.000
Backward Time Step 1:
Gradient di[0] = -413928144240640.000, df[0] = -287074003451904.000, dc_hat[0] = -256487276412928.000
Gradient do_[0] = -13588761128468480.000
Backward Time Step 0:
Gradient di[0] = -518613576450048.000, df[0] = -368990572511232.000, dc_hat[0] = -486634424369152.000
Gradient do_[0] = -8017718246637568.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 24270430273536.000, df[0] = 16700024029184.000, dc_hat[0] = 10321342234624.000
Gradient do_[0] = 1627082996056064.000
Backward Time Step 3:
Gradient di[0] = 37818449526784.000, df[0] = 25809330896896.000, dc_hat[0] = 14586305576960.000
Gradient do_[0] = 2127877390729216.000
Backward Time Step 2:
Gradient di[0] = 45931277844480.000, df[0] = 31473402904576.000, dc_hat[0] = 23424764215296.000
Gradient do_[0] = 2182741302968320.000
Backward Time Step 1:
Gradient di[0] = 57184629030912.000, df[0] = 38208360415232.000, dc_hat[0] = 32741959663616.000
Gradient do_[0] = 1895545698254848.000
Backward Time Step 0:
Gradient di[0] = 74118946881536.000, df[0] = 51432589885440.000, dc_hat[0] = 64619490050048.000
Gradient do_[0] = 1143198558715904.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -116380284223488.000, df[0] = -85549532577792.000, dc_hat[0] = -51471232008192.000
Gradient do_[0] = -7289182271569920.000
Backward Time Step 3:
Gradient di[0] = -180964789583872.000, df[0] = -130700393054208.000, dc_hat[0] = -73563000274944.000
Gradient do_[0] = -9731426303868928.000
Backward Time Step 2:
Gradient di[0] = -227688430174208.000, df[0] = -163946065559552.000, dc_hat[0] = -125097364946944.000
Gradient do_[0] = -10579685566054400.000
Backward Time Step 1:
Gradient di[0] = -284838137430016.000, df[0] = -197545477275648.000, dc_hat[0] = -176497688051712.000
Gradient do_[0] = -9350892201443328.000
Backward Time Step 0:
Gradient di[0] = -356982447931392.000, df[0] = -253990960889856.000, dc_hat[0] = -334969935233024.000
Gradient do_[0] = -5518915937501184.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 24511982338048.000, df[0] = 16866221228032.000, dc_hat[0] = 10423453614080.000
Gradient do_[0] = 1643250997788672.000
Backward Time Step 3:
Gradient di[0] = 38193873289216.000, df[0] = 26065577705472.000, dc_hat[0] = 14730166009856.000
Gradient do_[0] = 2148976014917632.000
Backward Time Step 2:
Gradient di[0] = 46382899527680.000, df[0] = 31782858653696.000, dc_hat[0] = 23654232489984.000
Gradient do_[0] = 2204190772297728.000
Backward Time Step 1:
Gradient di[0] = 57746883870720.000, df[0] = 38584035835904.000, dc_hat[0] = 33063885078528.000
Gradient do_[0] = 1914183171964928.000
Backward Time Step 0:
Gradient di[0] = 74868754219008.000, df[0] = 51952893296640.000, dc_hat[0] = 65273201688576.000
Gradient do_[0] = 1154763429249024.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -75988130070528.000, df[0] = -55857765154816.000, dc_hat[0] = -33605178884096.000
Gradient do_[0] = -4759250576343040.000
Backward Time Step 3:
Gradient di[0] = -118154416095232.000, df[0] = -85336151556096.000, dc_hat[0] = -48027238662144.000
Gradient do_[0] = -6353705108504576.000
Backward Time Step 2:
Gradient di[0] = -148647056506880.000, df[0] = -107032531173376.000, dc_hat[0] = -81667410624512.000
Gradient do_[0] = -6906943067127808.000
Backward Time Step 1:
Gradient di[0] = -185959568113664.000, df[0] = -128969655451648.000, dc_hat[0] = -115228377350144.000
Gradient do_[0] = -6104828933570560.000
Backward Time Step 0:
Gradient di[0] = -233129281323008.000, df[0] = -165870143799296.000, dc_hat[0] = -218753908342784.000
Gradient do_[0] = -3604157508354048.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 24660034977792.000, df[0] = 16968085143552.000, dc_hat[0] = 10485792505856.000
Gradient do_[0] = 1653150494752768.000
Backward Time Step 3:
Gradient di[0] = 38423649845248.000, df[0] = 26222427897856.000, dc_hat[0] = 14817853177856.000
Gradient do_[0] = 2161877828239360.000
Backward Time Step 2:
Gradient di[0] = 46657437696000.000, df[0] = 31970968993792.000, dc_hat[0] = 23793384816640.000
Gradient do_[0] = 2217225058516992.000
Backward Time Step 1:
Gradient di[0] = 58088711258112.000, df[0] = 38812424077312.000, dc_hat[0] = 33259603886080.000
Gradient do_[0] = 1925513832562688.000
Backward Time Step 0:
Gradient di[0] = 75333164335104.000, df[0] = 52275154255872.000, dc_hat[0] = 65678086242304.000
Gradient do_[0] = 1161926495174656.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -45791095816192.000, df[0] = -33660310913024.000, dc_hat[0] = -20249562841088.000
Gradient do_[0] = -2867921999101952.000
Backward Time Step 3:
Gradient di[0] = -71199090540544.000, df[0] = -51423035260928.000, dc_hat[0] = -28939045044224.000
Gradient do_[0] = -3828657059528704.000
Backward Time Step 2:
Gradient di[0] = -89565343776768.000, df[0] = -64490989158400.000, dc_hat[0] = -49205993275392.000
Gradient do_[0] = -4161666275082240.000
Backward Time Step 1:
Gradient di[0] = -112048734208000.000, df[0] = -77709824753664.000, dc_hat[0] = -69430113468416.000
Gradient do_[0] = -3678424740659200.000
Backward Time Step 0:
Gradient di[0] = -140512447168512.000, df[0] = -99973794365440.000, dc_hat[0] = -131848055291904.000
Gradient do_[0] = -2172309498494976.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 24718216265728.000, df[0] = 17008106143744.000, dc_hat[0] = 10509900316672.000
Gradient do_[0] = 1657026165866496.000
Backward Time Step 3:
Gradient di[0] = 38513361813504.000, df[0] = 26283689902080.000, dc_hat[0] = 14851503030272.000
Gradient do_[0] = 2166899047661568.000
Backward Time Step 2:
Gradient di[0] = 46761817145344.000, df[0] = 32042486071296.000, dc_hat[0] = 23845748604928.000
Gradient do_[0] = 2222174068801536.000
Backward Time Step 1:
Gradient di[0] = 58218713710592.000, df[0] = 38899292307456.000, dc_hat[0] = 33334040199168.000
Gradient do_[0] = 1929823295373312.000
Backward Time Step 0:
Gradient di[0] = 75523099197440.000, df[0] = 52406956064768.000, dc_hat[0] = 65843677364224.000
Gradient do_[0] = 1164856065523712.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -23907960619008.000, df[0] = -17574345572352.000, dc_hat[0] = -10571873255424.000
Gradient do_[0] = -1497346126905344.000
Backward Time Step 3:
Gradient di[0] = -37172774174720.000, df[0] = -26847792332800.000, dc_hat[0] = -15108011982848.000
Gradient do_[0] = -1998903146708992.000
Backward Time Step 2:
Gradient di[0] = -46757308268544.000, df[0] = -33667281846272.000, dc_hat[0] = -25686964174848.000
Gradient do_[0] = -2172572028370944.000
Backward Time Step 1:
Gradient di[0] = -58495340642304.000, df[0] = -40568621105152.000, dc_hat[0] = -36246176923648.000
Gradient do_[0] = -1920331551866880.000
Backward Time Step 0:
Gradient di[0] = -73376789954560.000, df[0] = -52207164588032.000, dc_hat[0] = -68852180320256.000
Gradient do_[0] = -1134398439161856.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 24690481430528.000, df[0] = 16989010526208.000, dc_hat[0] = 10497478885376.000
Gradient do_[0] = 1655140943659008.000
Backward Time Step 3:
Gradient di[0] = 38469246124032.000, df[0] = 26253623033856.000, dc_hat[0] = 14833548263424.000
Gradient do_[0] = 2164391726284800.000
Backward Time Step 2:
Gradient di[0] = 46703843475456.000, df[0] = 32002745040896.000, dc_hat[0] = 23815310540800.000
Gradient do_[0] = 2219407036121088.000
Backward Time Step 1:
Gradient di[0] = 58146450046976.000, df[0] = 38851007479808.000, dc_hat[0] = 33292665487360.000
Gradient do_[0] = 1927427911581696.000
Backward Time Step 0:
Gradient di[0] = 75450537738240.000, df[0] = 52356607639552.000, dc_hat[0] = 65780423065600.000
Gradient do_[0] = 1163736823889920.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -8712473804800.000, df[0] = -6404386062336.000, dc_hat[0] = -3852349734912.000
Gradient do_[0] = -545650362023936.000
Backward Time Step 3:
Gradient di[0] = -13546077290496.000, df[0] = -9783570595840.000, dc_hat[0] = -5505140916224.000
Gradient do_[0] = -728408266899456.000
Backward Time Step 2:
Gradient di[0] = -17037168476160.000, df[0] = -12267484610560.000, dc_hat[0] = -9359354494976.000
Gradient do_[0] = -791625689202688.000
Backward Time Step 1:
Gradient di[0] = -21314458877952.000, df[0] = -14782342103040.000, dc_hat[0] = -13207334813696.000
Gradient do_[0] = -699727884582912.000
Backward Time Step 0:
Gradient di[0] = -26744937512960.000, df[0] = -19028869382144.000, dc_hat[0] = -25095770734592.000
Gradient do_[0] = -413474320547840.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855
c_state[0] = 0.884, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.118
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.063, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 24580613734400.000, df[0] = 16913407148032.000, dc_hat[0] = 10450155601920.000
Gradient do_[0] = 1647750915555328.000
Backward Time Step 3:
Gradient di[0] = 38297099304960.000, df[0] = 26136178327552.000, dc_hat[0] = 14766224441344.000
Gradient do_[0] = 2154681610534912.000
Backward Time Step 2:
Gradient di[0] = 46490399539200.000, df[0] = 31856474980352.000, dc_hat[0] = 23705616908288.000
Gradient do_[0] = 2209251854385152.000
Backward Time Step 1:
Gradient di[0] = 57880711528448.000, df[0] = 38673450008576.000, dc_hat[0] = 33140508721152.000
Gradient do_[0] = 1918619067875328.000
Backward Time Step 0:
Gradient di[0] = 75126972350464.000, df[0] = 52132082352128.000, dc_hat[0] = 65498326761472.000
Gradient do_[0] = 1158746340327424.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840
c_state[0] = 0.695, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.830, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873
c_state[0] = 0.916, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1187031154688.000, df[0] = 872564260864.000, dc_hat[0] = 524832538624.000
Gradient do_[0] = 74341052055552.000
Backward Time Step 3:
Gradient di[0] = 1845539766272.000, df[0] = 1332930805760.000, dc_hat[0] = 749981728768.000
Gradient do_[0] = 99238398656512.000
Backward Time Step 2:
Gradient di[0] = 2320955604992.000, df[0] = 1671184777216.000, dc_hat[0] = 1274972340224.000
Gradient do_[0] = 107841721466880.000
Backward Time Step 1:
Gradient di[0] = 2903676551168.000, df[0] = 2013804232704.000, dc_hat[0] = 1799240286208.000
Gradient do_[0] = 95324198666240.000
Backward Time Step 0:
Gradient di[0] = 3644559982592.000, df[0] = 2593083621376.000, dc_hat[0] = 3419826814976.000
Gradient do_[0] = 56344568659968.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.724, o_gate[0] = 0.133, c_hat[0] = 0.825
c_state[0] = 0.659, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.358, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.828
c_state[0] = 0.778, h_state[0] = 0.085
Time Step 3:
i_gate[0] = 0.316, f_gate[0] = 0.739, o_gate[0] = 0.137, c_hat[0] = 0.864
c_state[0] = 0.848, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.295, f_gate[0] = 0.758, o_gate[0] = 0.139, c_hat[0] = 0.851
c_state[0] = 0.893, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -4890987423858688.000, df[0] = -3847528877391872.000, dc_hat[0] = -2254875010269184.000
Gradient do_[0] = -288815694294810624.000
Backward Time Step 3:
Gradient di[0] = -7608407326457856.000, df[0] = -5808903539392512.000, dc_hat[0] = -3270609005969408.000
Gradient do_[0] = -393598946263433216.000
Backward Time Step 2:
Gradient di[0] = -9843585079836672.000, df[0] = -7434920175599616.000, dc_hat[0] = -5831557881266176.000
Gradient do_[0] = -448408432676962304.000
Backward Time Step 1:
Gradient di[0] = -12363124564819968.000, df[0] = -8897320032665600.000, dc_hat[0] = -8286589104947200.000
Gradient do_[0] = -402262548134494208.000
Backward Time Step 0:
Gradient di[0] = -14895893622816768.000, df[0] = -10887413530361856.000, dc_hat[0] = -15059250556436480.000
Gradient do_[0] = -231094065435770880.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.431, f_gate[0] = 0.736, o_gate[0] = 0.139, c_hat[0] = 0.841
c_state[0] = 0.694, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.369, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.843
c_state[0] = 0.829, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.330, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.874
c_state[0] = 0.915, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.860
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -888198791168.000, df[0] = -652712673280.000, dc_hat[0] = -391856029696.000
Gradient do_[0] = -55559172653056.000
Backward Time Step 3:
Gradient di[0] = -1381722488832.000, df[0] = -997821054976.000, dc_hat[0] = -558874361856.000
Gradient do_[0] = -74154179035136.000
Backward Time Step 2:
Gradient di[0] = -1739546034176.000, df[0] = -1252109189120.000, dc_hat[0] = -946914459648.000
Gradient do_[0] = -80582159630336.000
Backward Time Step 1:
Gradient di[0] = -2175829147648.000, df[0] = -1507879157760.000, dc_hat[0] = -1331884457984.000
Gradient do_[0] = -71178060300288.000
Backward Time Step 0:
Gradient di[0] = -2719982551040.000, df[0] = -1935251865600.000, dc_hat[0] = -2552261246976.000
Gradient do_[0] = -42050682617856.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.443, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.855
c_state[0] = 0.731, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.382, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.856
c_state[0] = 0.883, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882
c_state[0] = 0.986, h_state[0] = 0.119
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868
c_state[0] = 1.062, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 24050648743936.000, df[0] = 16542823612416.000, dc_hat[0] = 10203755970560.000
Gradient do_[0] = 1610281352429568.000
Backward Time Step 3:
Gradient di[0] = 37493109948416.000, df[0] = 25581783613440.000, dc_hat[0] = 14390737764352.000
Gradient do_[0] = 2105229625524224.000
Backward Time Step 2:
Gradient di[0] = 45563605155840.000, df[0] = 31209293873152.000, dc_hat[0] = 23017562308608.000
Gradient do_[0] = 2158281732653056.000
Backward Time Step 1:
Gradient di[0] = 56707912826880.000, df[0] = 37860912660480.000, dc_hat[0] = 32049341661184.000
Gradient do_[0] = 1872576078938112.000
Backward Time Step 0:
Gradient di[0] = 73285211521024.000, df[0] = 50854044368896.000, dc_hat[0] = 63892608778240.000
Gradient do_[0] = 1130339158196224.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.431, f_gate[0] = 0.736, o_gate[0] = 0.139, c_hat[0] = 0.841
c_state[0] = 0.694, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.369, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.843
c_state[0] = 0.829, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.330, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.874
c_state[0] = 0.915, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.860
c_state[0] = 0.976, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 5571085336576.000, df[0] = 4094030249984.000, dc_hat[0] = 2457711673344.000
Gradient do_[0] = 348480727941120.000
Backward Time Step 3:
Gradient di[0] = 8666417725440.000, df[0] = 6258521276416.000, dc_hat[0] = 3505138696192.000
Gradient do_[0] = 465102847344640.000
Backward Time Step 2:
Gradient di[0] = 10909720248320.000, df[0] = 7852707545088.000, dc_hat[0] = 5938456559616.000
Gradient do_[0] = 505375715491840.000
Backward Time Step 1:
Gradient di[0] = 13646051672064.000, df[0] = 9456899325952.000, dc_hat[0] = 8353122091008.000
Gradient do_[0] = 446404304568320.000
Backward Time Step 0:
Gradient di[0] = 17063897726976.000, df[0] = 12140864864256.000, dc_hat[0] = 16011691556864.000
Gradient do_[0] = 263806320115712.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.420, f_gate[0] = 0.724, o_gate[0] = 0.133, c_hat[0] = 0.826
c_state[0] = 0.658, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.357, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.829
c_state[0] = 0.778, h_state[0] = 0.085
Time Step 3:
i_gate[0] = 0.315, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.864
c_state[0] = 0.848, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.295, f_gate[0] = 0.758, o_gate[0] = 0.139, c_hat[0] = 0.851
c_state[0] = 0.893, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -3851269525471232.000, df[0] = -3029021122101248.000, dc_hat[0] = -1771289878462464.000
Gradient do_[0] = -227136974627012608.000
Backward Time Step 3:
Gradient di[0] = -5994380628328448.000, df[0] = -4576522803347456.000, dc_hat[0] = -2564058700054528.000
Gradient do_[0] = -309508984165892096.000
Backward Time Step 2:
Gradient di[0] = -7762617590349824.000, df[0] = -5861173828255744.000, dc_hat[0] = -4557597566828544.000
Gradient do_[0] = -352591732392591360.000
Backward Time Step 1:
Gradient di[0] = -9749418995613696.000, df[0] = -7011011735322624.000, dc_hat[0] = -6460340556529664.000
Gradient do_[0] = -316185218769747968.000
Backward Time Step 0:
Gradient di[0] = -11710066197528576.000, df[0] = -8558892178997248.000, dc_hat[0] = -11838486793420800.000
Gradient do_[0] = -181669316960190464.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.430, f_gate[0] = 0.736, o_gate[0] = 0.139, c_hat[0] = 0.842
c_state[0] = 0.694, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.368, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.844
c_state[0] = 0.829, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.330, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.874
c_state[0] = 0.914, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.860
c_state[0] = 0.975, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 4127742492672.000, df[0] = 3032606572544.000, dc_hat[0] = 1817677135872.000
Gradient do_[0] = 257935368257536.000
Backward Time Step 3:
Gradient di[0] = 6424655560704.000, df[0] = 4639119376384.000, dc_hat[0] = 2588271443968.000
Gradient do_[0] = 344224549568512.000
Backward Time Step 2:
Gradient di[0] = 8095635341312.000, df[0] = 5825458339840.000, dc_hat[0] = 4372755382272.000
Gradient do_[0] = 374045077929984.000
Backward Time Step 1:
Gradient di[0] = 10125283688448.000, df[0] = 7012514529280.000, dc_hat[0] = 6132332494848.000
Gradient do_[0] = 330213057626112.000
Backward Time Step 0:
Gradient di[0] = 12615629668352.000, df[0] = 8975947923456.000, dc_hat[0] = 11837716299776.000
Gradient do_[0] = 195036494954496.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.419, f_gate[0] = 0.724, o_gate[0] = 0.133, c_hat[0] = 0.827
c_state[0] = 0.658, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.356, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.830
c_state[0] = 0.777, h_state[0] = 0.085
Time Step 3:
i_gate[0] = 0.315, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.865
c_state[0] = 0.847, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.758, o_gate[0] = 0.139, c_hat[0] = 0.851
c_state[0] = 0.893, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -4045101131104256.000, df[0] = -3180911298347008.000, dc_hat[0] = -1856776236433408.000
Gradient do_[0] = -238319403018485760.000
Backward Time Step 3:
Gradient di[0] = -6299549328998400.000, df[0] = -4809410425651200.000, dc_hat[0] = -2683494224363520.000
Gradient do_[0] = -324736910972944384.000
Backward Time Step 2:
Gradient di[0] = -8165443646783488.000, df[0] = -6163601802919936.000, dc_hat[0] = -4757687745118208.000
Gradient do_[0] = -369970716260433920.000
Backward Time Step 1:
Gradient di[0] = -10256010388176896.000, df[0] = -7370597134761984.000, dc_hat[0] = -6728436173242368.000
Gradient do_[0] = -331670122060054528.000
Backward Time Step 0:
Gradient di[0] = -12280851245039616.000, df[0] = -8976077921714176.000, dc_hat[0] = -12415529608282112.000
Gradient do_[0] = -190524422833045504.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.429, f_gate[0] = 0.736, o_gate[0] = 0.140, c_hat[0] = 0.843
c_state[0] = 0.693, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.367, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.845
c_state[0] = 0.828, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.330, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.874
c_state[0] = 0.914, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.860
c_state[0] = 0.975, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 2645138866176.000, df[0] = 1942922461184.000, dc_hat[0] = 1162985472000.000
Gradient do_[0] = 165143656792064.000
Backward Time Step 3:
Gradient di[0] = 4119195549696.000, df[0] = 2974102847488.000, dc_hat[0] = 1653856141312.000
Gradient do_[0] = 220381684170752.000
Backward Time Step 2:
Gradient di[0] = 5195264688128.000, df[0] = 3737468534784.000, dc_hat[0] = 2787297984512.000
Gradient do_[0] = 239490362769408.000
Backward Time Step 1:
Gradient di[0] = 6497768046592.000, df[0] = 4497723097088.000, dc_hat[0] = 3898063978496.000
Gradient do_[0] = 211330158034944.000
Backward Time Step 0:
Gradient di[0] = 8069355405312.000, df[0] = 5741300154368.000, dc_hat[0] = 7571777257472.000
Gradient do_[0] = 124751502639104.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.418, f_gate[0] = 0.724, o_gate[0] = 0.133, c_hat[0] = 0.828
c_state[0] = 0.658, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.355, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.831
c_state[0] = 0.777, h_state[0] = 0.085
Time Step 3:
i_gate[0] = 0.315, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.865
c_state[0] = 0.847, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.758, o_gate[0] = 0.139, c_hat[0] = 0.851
c_state[0] = 0.892, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -4245605773737984.000, df[0] = -3338048045580288.000, dc_hat[0] = -1945491470286848.000
Gradient do_[0] = -249904699342323712.000
Backward Time Step 3:
Gradient di[0] = -6615324988276736.000, df[0] = -5050383323889664.000, dc_hat[0] = -2807932848701440.000
Gradient do_[0] = -340528096971063296.000
Backward Time Step 2:
Gradient di[0] = -8582279886536704.000, df[0] = -6476685054574592.000, dc_hat[0] = -4967296107806720.000
Gradient do_[0] = -388008513751744512.000
Backward Time Step 1:
Gradient di[0] = -10780964913414144.000, df[0] = -7743565115424768.000, dc_hat[0] = -7009881622052864.000
Gradient do_[0] = -347763398877904896.000
Backward Time Step 0:
Gradient di[0] = -12873553510662144.000, df[0] = -9409284429316096.000, dc_hat[0] = -13014732306907136.000
Gradient do_[0] = -199719569856659456.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.428, f_gate[0] = 0.736, o_gate[0] = 0.140, c_hat[0] = 0.844
c_state[0] = 0.693, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.367, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.845
c_state[0] = 0.828, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.330, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.875
c_state[0] = 0.914, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.860
c_state[0] = 0.975, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1099228053504.000, df[0] = 807249903616.000, dc_hat[0] = 482642886656.000
Gradient do_[0] = 68574987157504.000
Backward Time Step 3:
Gradient di[0] = 1712662642688.000, df[0] = 1236451196928.000, dc_hat[0] = 685610041344.000
Gradient do_[0] = 91512792678400.000
Backward Time Step 2:
Gradient di[0] = 2161880334336.000, df[0] = 1554915262464.000, dc_hat[0] = 1153043398656.000
Gradient do_[0] = 99457257439232.000
Backward Time Step 1:
Gradient di[0] = 2704064905216.000, df[0] = 1870850031616.000, dc_hat[0] = 1608437989376.000
Gradient do_[0] = 87730922979328.000
Backward Time Step 0:
Gradient di[0] = 3348076167168.000, df[0] = 2382136868864.000, dc_hat[0] = 3141624922112.000
Gradient do_[0] = 51760957751296.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.417, f_gate[0] = 0.724, o_gate[0] = 0.133, c_hat[0] = 0.829
c_state[0] = 0.657, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.355, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.831
c_state[0] = 0.776, h_state[0] = 0.085
Time Step 3:
i_gate[0] = 0.314, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.865
c_state[0] = 0.846, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.758, o_gate[0] = 0.139, c_hat[0] = 0.851
c_state[0] = 0.892, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -4451681794260992.000, df[0] = -3499564854149120.000, dc_hat[0] = -2036900990812160.000
Gradient do_[0] = -261826188526223360.000
Backward Time Step 3:
Gradient di[0] = -6940004114759680.000, df[0] = -5298142807326720.000, dc_hat[0] = -2936568461393920.000
Gradient do_[0] = -356791969889910784.000
Backward Time Step 2:
Gradient di[0] = -9010912253968384.000, df[0] = -6798739956039680.000, dc_hat[0] = -5184930153758720.000
Gradient do_[0] = -406601255377436672.000
Backward Time Step 1:
Gradient di[0] = -11321400648269824.000, df[0] = -8127821880754176.000, dc_hat[0] = -7302640551591936.000
Gradient do_[0] = -364370766101217280.000
Backward Time Step 0:
Gradient di[0] = -13484724135657472.000, df[0] = -9855990019129344.000, dc_hat[0] = -13632606302109696.000
Gradient do_[0] = -209201259918393344.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.427, f_gate[0] = 0.736, o_gate[0] = 0.140, c_hat[0] = 0.845
c_state[0] = 0.693, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.366, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.846
c_state[0] = 0.827, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.875
c_state[0] = 0.913, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.860
c_state[0] = 0.974, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -510535401472.000, df[0] = -374863298560.000, dc_hat[0] = -223917162496.000
Gradient do_[0] = -31829734195200.000
Backward Time Step 3:
Gradient di[0] = -795820687360.000, df[0] = -574497357824.000, dc_hat[0] = -317823516672.000
Gradient do_[0] = -42479063662592.000
Backward Time Step 2:
Gradient di[0] = -1005301530624.000, df[0] = -722927288320.000, dc_hat[0] = -533623799808.000
Gradient do_[0] = -46172798451712.000
Backward Time Step 1:
Gradient di[0] = -1257585901568.000, df[0] = -869745885184.000, dc_hat[0] = -742772113408.000
Gradient do_[0] = -40718743633920.000
Backward Time Step 0:
Gradient di[0] = -1553133207552.000, df[0] = -1105045487616.000, dc_hat[0] = -1457362829312.000
Gradient do_[0] = -24011297783808.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.439, f_gate[0] = 0.748, o_gate[0] = 0.147, c_hat[0] = 0.859
c_state[0] = 0.729, h_state[0] = 0.091
Time Step 2:
i_gate[0] = 0.380, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.859
c_state[0] = 0.881, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.346, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.884
c_state[0] = 0.984, h_state[0] = 0.119
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.869
c_state[0] = 1.061, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 22810523074560.000, df[0] = 15673729220608.000, dc_hat[0] = 9625830162432.000
Gradient do_[0] = 1522248347287552.000
Backward Time Step 3:
Gradient di[0] = 35631883354112.000, df[0] = 24295919058944.000, dc_hat[0] = 13512977940480.000
Gradient do_[0] = 1989686952198144.000
Backward Time Step 2:
Gradient di[0] = 43452276408320.000, df[0] = 29731766403072.000, dc_hat[0] = 21400555356160.000
Gradient do_[0] = 2039898609549312.000
Backward Time Step 1:
Gradient di[0] = 54068290519040.000, df[0] = 36024335990784.000, dc_hat[0] = 29417019539456.000
Gradient do_[0] = 1765696354648064.000
Backward Time Step 0:
Gradient di[0] = 68901035573248.000, df[0] = 47811777265664.000, dc_hat[0] = 60070331154432.000
Gradient do_[0] = 1062718320672768.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.427, f_gate[0] = 0.736, o_gate[0] = 0.140, c_hat[0] = 0.845
c_state[0] = 0.693, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.366, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.846
c_state[0] = 0.827, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.875
c_state[0] = 0.913, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.860
c_state[0] = 0.974, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 4962075017216.000, df[0] = 3643424636928.000, dc_hat[0] = 2176203751424.000
Gradient do_[0] = 309359481454592.000
Backward Time Step 3:
Gradient di[0] = 7734678257664.000, df[0] = 5583615295488.000, dc_hat[0] = 3088766992384.000
Gradient do_[0] = 412853899100160.000
Backward Time Step 2:
Gradient di[0] = 9769721004032.000, df[0] = 7025545707520.000, dc_hat[0] = 5185685946368.000
Gradient do_[0] = 448713990340608.000
Backward Time Step 1:
Gradient di[0] = 12221570613248.000, df[0] = 8452432723968.000, dc_hat[0] = 7218466914304.000
Gradient do_[0] = 395716073619456.000
Backward Time Step 0:
Gradient di[0] = 15098293780480.000, df[0] = 10742349692928.000, dc_hat[0] = 14167293558784.000
Gradient do_[0] = 233418268868608.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.416, f_gate[0] = 0.723, o_gate[0] = 0.133, c_hat[0] = 0.830
c_state[0] = 0.657, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.354, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.832
c_state[0] = 0.776, h_state[0] = 0.085
Time Step 3:
i_gate[0] = 0.314, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.866
c_state[0] = 0.846, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.758, o_gate[0] = 0.139, c_hat[0] = 0.852
c_state[0] = 0.892, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -3519205202722816.000, df[0] = -2766172613246976.000, dc_hat[0] = -1608211446628352.000
Gradient do_[0] = -206846621407772672.000
Backward Time Step 3:
Gradient di[0] = -5488868883169280.000, df[0] = -4190234786922496.000, dc_hat[0] = -2316519870562304.000
Gradient do_[0] = -281894285777829888.000
Backward Time Step 2:
Gradient di[0] = -7131400205500416.000, df[0] = -5379731616694272.000, dc_hat[0] = -4083791807119360.000
Gradient do_[0] = -321280596171030528.000
Backward Time Step 1:
Gradient di[0] = -8962138068484096.000, df[0] = -6431542800809984.000, dc_hat[0] = -5742452342259712.000
Gradient do_[0] = -287897636445356032.000
Backward Time Step 0:
Gradient di[0] = -10654938761789440.000, df[0] = -7787698588745728.000, dc_hat[0] = -10771787642044416.000
Gradient do_[0] = -165300131443245056.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.427, f_gate[0] = 0.736, o_gate[0] = 0.140, c_hat[0] = 0.846
c_state[0] = 0.692, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.366, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.846
c_state[0] = 0.827, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.875
c_state[0] = 0.913, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.861
c_state[0] = 0.974, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 3820313116672.000, df[0] = 2804691238912.000, dc_hat[0] = 1674017898496.000
Gradient do_[0] = 238061430505472.000
Backward Time Step 3:
Gradient di[0] = 5957576294400.000, df[0] = 4300474417152.000, dc_hat[0] = 2374679658496.000
Gradient do_[0] = 317738727768064.000
Backward Time Step 2:
Gradient di[0] = 7529908666368.000, df[0] = 5414112460800.000, dc_hat[0] = 3981889241088.000
Gradient do_[0] = 345392713564160.000
Backward Time Step 1:
Gradient di[0] = 9421282344960.000, df[0] = 6513787666432.000, dc_hat[0] = 5533144186880.000
Gradient do_[0] = 304554386128896.000
Backward Time Step 0:
Gradient di[0] = 11614657970176.000, df[0] = 8263762968576.000, dc_hat[0] = 10898466930688.000
Gradient do_[0] = 179561576792064.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.415, f_gate[0] = 0.723, o_gate[0] = 0.133, c_hat[0] = 0.831
c_state[0] = 0.657, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.354, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.833
c_state[0] = 0.775, h_state[0] = 0.085
Time Step 3:
i_gate[0] = 0.314, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.866
c_state[0] = 0.846, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.758, o_gate[0] = 0.139, c_hat[0] = 0.852
c_state[0] = 0.892, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -3691189651898368.000, df[0] = -2901048008114176.000, dc_hat[0] = -1685188937515008.000
Gradient do_[0] = -216846903000694784.000
Backward Time Step 3:
Gradient di[0] = -5759739585626112.000, df[0] = -4396941093896192.000, dc_hat[0] = -2426003150340096.000
Gradient do_[0] = -295565905415372800.000
Backward Time Step 2:
Gradient di[0] = -7488289976090624.000, df[0] = -5648210559238144.000, dc_hat[0] = -4272091494875136.000
Gradient do_[0] = -336936301720764416.000
Backward Time Step 1:
Gradient di[0] = -9413011387187200.000, df[0] = -6753020029173760.000, dc_hat[0] = -5999181864894464.000
Gradient do_[0] = -301926580182056960.000
Backward Time Step 0:
Gradient di[0] = -11171262550245376.000, df[0] = -8165079648305152.000, dc_hat[0] = -11293773271138304.000
Gradient do_[0] = -173310331349630976.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.426, f_gate[0] = 0.736, o_gate[0] = 0.140, c_hat[0] = 0.846
c_state[0] = 0.692, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.366, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.847
c_state[0] = 0.826, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.875
c_state[0] = 0.913, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.861
c_state[0] = 0.974, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 2607822143488.000, df[0] = 1914296336384.000, dc_hat[0] = 1141849980928.000
Gradient do_[0] = 162436988534784.000
Backward Time Step 3:
Gradient di[0] = 4068512104448.000, df[0] = 2936692015104.000, dc_hat[0] = 1619062161408.000
Gradient do_[0] = 216832531234816.000
Backward Time Step 2:
Gradient di[0] = 5145399656448.000, df[0] = 3699164577792.000, dc_hat[0] = 2712007081984.000
Gradient do_[0] = 235746023702528.000
Backward Time Step 1:
Gradient di[0] = 6439066664960.000, df[0] = 4450733260800.000, dc_hat[0] = 3762652971008.000
Gradient do_[0] = 207851486183424.000
Backward Time Step 0:
Gradient di[0] = 7923264651264.000, df[0] = 5637357436928.000, dc_hat[0] = 7434694295552.000
Gradient do_[0] = 122492962209792.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.415, f_gate[0] = 0.723, o_gate[0] = 0.133, c_hat[0] = 0.831
c_state[0] = 0.656, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.354, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.833
c_state[0] = 0.775, h_state[0] = 0.085
Time Step 3:
i_gate[0] = 0.314, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.866
c_state[0] = 0.845, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852
c_state[0] = 0.892, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -3869897872375808.000, df[0] = -3041204602142720.000, dc_hat[0] = -1765281756086272.000
Gradient do_[0] = -227246771170967552.000
Backward Time Step 3:
Gradient di[0] = -6041278449975296.000, df[0] = -4611788242944000.000, dc_hat[0] = -2540140295618560.000
Gradient do_[0] = -309791902251614208.000
Backward Time Step 2:
Gradient di[0] = -7859268313153536.000, df[0] = -5927339410063360.000, dc_hat[0] = -4468923672035328.000
Gradient do_[0] = -353236630322020352.000
Backward Time Step 1:
Gradient di[0] = -9881971350044672.000, df[0] = -7087532483280896.000, dc_hat[0] = -6268060776267776.000
Gradient do_[0] = -316543831359094784.000
Backward Time Step 0:
Gradient di[0] = -11709146000785408.000, df[0] = -8558219479744512.000, dc_hat[0] = -11837556933001216.000
Gradient do_[0] = -181655040488898560.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.426, f_gate[0] = 0.736, o_gate[0] = 0.140, c_hat[0] = 0.847
c_state[0] = 0.692, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.365, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.847
c_state[0] = 0.826, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876
c_state[0] = 0.912, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.861
c_state[0] = 0.974, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1317845860352.000, df[0] = 967262732288.000, dc_hat[0] = 576625967104.000
Gradient do_[0] = 82054981091328.000
Backward Time Step 3:
Gradient di[0] = 2056865316864.000, df[0] = 1484588711936.000, dc_hat[0] = 817315643392.000
Gradient do_[0] = 109549323288576.000
Backward Time Step 2:
Gradient di[0] = 2602805493760.000, df[0] = 1871017934848.000, dc_hat[0] = 1367761354752.000
Gradient do_[0] = 119126949363712.000
Backward Time Step 1:
Gradient di[0] = 3257881591808.000, df[0] = 2251332517888.000, dc_hat[0] = 1894889553920.000
Gradient do_[0] = 105024306806784.000
Backward Time Step 0:
Gradient di[0] = 4001863041024.000, df[0] = 2847302483968.000, dc_hat[0] = 3755097456640.000
Gradient do_[0] = 61868437667840.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.414, f_gate[0] = 0.723, o_gate[0] = 0.133, c_hat[0] = 0.832
c_state[0] = 0.656, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.353, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.833
c_state[0] = 0.775, h_state[0] = 0.085
Time Step 3:
i_gate[0] = 0.314, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.866
c_state[0] = 0.845, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -4054508988530688.000, df[0] = -3185987949690880.000, dc_hat[0] = -1848063626838016.000
Gradient do_[0] = -237993483720196096.000
Backward Time Step 3:
Gradient di[0] = -6332232956379136.000, df[0] = -4833817114181632.000, dc_hat[0] = -2658239548227584.000
Gradient do_[0] = -324499656979513344.000
Backward Time Step 2:
Gradient di[0] = -8242806208331776.000, df[0] = -6215942421872640.000, dc_hat[0] = -4672864825376768.000
Gradient do_[0] = -370097950371610624.000
Backward Time Step 1:
Gradient di[0] = -10367044956454912.000, df[0] = -7433605378736128.000, dc_hat[0] = -6546834755420160.000
Gradient do_[0] = -331671908766449664.000
Backward Time Step 0:
Gradient di[0] = -12265779131056128.000, df[0] = -8965061867470848.000, dc_hat[0] = -12400293211799552.000
Gradient do_[0] = -190290604813451264.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.425, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.847
c_state[0] = 0.692, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.365, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.847
c_state[0] = 0.826, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876
c_state[0] = 0.912, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861
c_state[0] = 0.974, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -53605457920.000, df[0] = -39340789760.000, dc_hat[0] = -23441201152.000
Gradient do_[0] = -3336640135168.000
Backward Time Step 3:
Gradient di[0] = -83700424704.000, df[0] = -60409909248.000, dc_hat[0] = -33217296384.000
Gradient do_[0] = -4455423016960.000
Backward Time Step 2:
Gradient di[0] = -105973432320.000, df[0] = -76171427840.000, dc_hat[0] = -55546757120.000
Gradient do_[0] = -4845913767936.000
Backward Time Step 1:
Gradient di[0] = -132673929216.000, df[0] = -91664515072.000, dc_hat[0] = -76858621952.000
Gradient do_[0] = -4272139010048.000
Backward Time Step 0:
Gradient di[0] = -162724757504.000, df[0] = -115777724416.000, dc_hat[0] = -152690704384.000
Gradient do_[0] = -2515709984768.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.437, f_gate[0] = 0.748, o_gate[0] = 0.147, c_hat[0] = 0.861
c_state[0] = 0.728, h_state[0] = 0.092
Time Step 2:
i_gate[0] = 0.378, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.860
c_state[0] = 0.879, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.345, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.884
c_state[0] = 0.983, h_state[0] = 0.119
Time Step 4:
i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.869
c_state[0] = 1.060, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 21692873179136.000, df[0] = 14897366695936.000, dc_hat[0] = 9129976397824.000
Gradient do_[0] = 1445326019887104.000
Backward Time Step 3:
Gradient di[0] = 33938867224576.000, df[0] = 23133291544576.000, dc_hat[0] = 12795915534336.000
Gradient do_[0] = 1889871106932736.000
Backward Time Step 2:
Gradient di[0] = 41473923874816.000, df[0] = 28363133550592.000, dc_hat[0] = 20172322635776.000
Gradient do_[0] = 1938159692677120.000
Backward Time Step 1:
Gradient di[0] = 51634054889472.000, df[0] = 34367837569024.000, dc_hat[0] = 27523704422400.000
Gradient do_[0] = 1676247620911104.000
Backward Time Step 0:
Gradient di[0] = 65273977634816.000, df[0] = 45294892875776.000, dc_hat[0] = 56908140511232.000
Gradient do_[0] = 1006775163682816.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.425, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.847
c_state[0] = 0.692, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.365, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.847
c_state[0] = 0.826, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876
c_state[0] = 0.912, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861
c_state[0] = 0.974, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 4537703202816.000, df[0] = 3330194276352.000, dc_hat[0] = 1984182091776.000
Gradient do_[0] = 282442216767488.000
Backward Time Step 3:
Gradient di[0] = 7085068648448.000, df[0] = 5113578520576.000, dc_hat[0] = 2811593752576.000
Gradient do_[0] = 377137689264128.000
Backward Time Step 2:
Gradient di[0] = 8969579921408.000, df[0] = 6447136505856.000, dc_hat[0] = 4701308846080.000
Gradient do_[0] = 410155351015424.000
Backward Time Step 1:
Gradient di[0] = 11229576822784.000, df[0] = 7758522351616.000, dc_hat[0] = 6505345056768.000
Gradient do_[0] = 361595611906048.000
Backward Time Step 0:
Gradient di[0] = 13777175052288.000, df[0] = 9802381000704.000, dc_hat[0] = 12927637651456.000
Gradient do_[0] = 212993887436800.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.414, f_gate[0] = 0.723, o_gate[0] = 0.133, c_hat[0] = 0.833
c_state[0] = 0.656, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.353, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.834
c_state[0] = 0.775, h_state[0] = 0.085
Time Step 3:
i_gate[0] = 0.314, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.866
c_state[0] = 0.845, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -3200392733130752.000, df[0] = -2514617184026624.000, dc_hat[0] = -1457710289649664.000
Gradient do_[0] = -187793476467818496.000
Backward Time Step 3:
Gradient di[0] = -5000319574474752.000, df[0] = -3817019308769280.000, dc_hat[0] = -2096133958533120.000
Gradient do_[0] = -256099210414260224.000
Backward Time Step 2:
Gradient di[0] = -6512163631923200.000, df[0] = -4910406581616640.000, dc_hat[0] = -3682146329821184.000
Gradient do_[0] = -292137112763891712.000
Backward Time Step 1:
Gradient di[0] = -8192842853777408.000, df[0] = -5873376367214592.000, dc_hat[0] = -5154170134855680.000
Gradient do_[0] = -261834005366702080.000
Backward Time Step 0:
Gradient di[0] = -9683824107585536.000, df[0] = -7077910682796032.000, dc_hat[0] = -9790022542688256.000
Gradient do_[0] = -150234296701943808.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.425, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.848
c_state[0] = 0.691, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.365, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.848
c_state[0] = 0.826, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876
c_state[0] = 0.912, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861
c_state[0] = 0.974, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 3568919379968.000, df[0] = 2618950156288.000, dc_hat[0] = 1559714988032.000
Gradient do_[0] = 222077105733632.000
Backward Time Step 3:
Gradient di[0] = 5574676709376.000, df[0] = 4023296720896.000, dc_hat[0] = 2209678622720.000
Gradient do_[0] = 296588127764480.000
Backward Time Step 2:
Gradient di[0] = 7061104492544.000, df[0] = 5074929057792.000, dc_hat[0] = 3692393660416.000
Gradient do_[0] = 322620125872128.000
Backward Time Step 1:
Gradient di[0] = 8842242949120.000, df[0] = 6107975647232.000, dc_hat[0] = 5103439839232.000
Gradient do_[0] = 284424209956864.000
Backward Time Step 0:
Gradient di[0] = 10833075634176.000, df[0] = 7707671134208.000, dc_hat[0] = 10165078196224.000
Gradient do_[0] = 167478357393408.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.414, f_gate[0] = 0.723, o_gate[0] = 0.133, c_hat[0] = 0.833
c_state[0] = 0.656, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.353, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.834
c_state[0] = 0.774, h_state[0] = 0.085
Time Step 3:
i_gate[0] = 0.314, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867
c_state[0] = 0.845, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -3357549344587776.000, df[0] = -2637887308824576.000, dc_hat[0] = -1528354851258368.000
Gradient do_[0] = -196956084819197952.000
Backward Time Step 3:
Gradient di[0] = -5248048456269824.000, df[0] = -4006064173350912.000, dc_hat[0] = -2197252386848768.000
Gradient do_[0] = -268650135645323264.000
Backward Time Step 2:
Gradient di[0] = -6838593494450176.000, df[0] = -5156123808104448.000, dc_hat[0] = -3857612923731968.000
Gradient do_[0] = -306535973443862528.000
Backward Time Step 1:
Gradient di[0] = -8605956430626816.000, df[0] = -6168352036749312.000, dc_hat[0] = -5395110283968512.000
Gradient do_[0] = -274767251406585856.000
Backward Time Step 0:
Gradient di[0] = -10159966329503744.000, df[0] = -7425922755985408.000, dc_hat[0] = -10271386371096576.000
Gradient do_[0] = -157621142234857472.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.425, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.848
c_state[0] = 0.691, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.365, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.848
c_state[0] = 0.826, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876
c_state[0] = 0.912, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861
c_state[0] = 0.974, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 2532843192320.000, df[0] = 1858481815552.000, dc_hat[0] = 1106365513728.000
Gradient do_[0] = 157565237329920.000
Backward Time Step 3:
Gradient di[0] = 3957893103616.000, df[0] = 2856335704064.000, dc_hat[0] = 1567178620928.000
Gradient do_[0] = 210472523530240.000
Backward Time Step 2:
Gradient di[0] = 5015737466880.000, df[0] = 3604605566976.000, dc_hat[0] = 2617249366016.000
Gradient do_[0] = 228995039756288.000
Backward Time Step 1:
Gradient di[0] = 6282385817600.000, df[0] = 4338962399232.000, dc_hat[0] = 3613634330624.000
Gradient do_[0] = 201887387222016.000
Backward Time Step 0:
Gradient di[0] = 7686917718016.000, df[0] = 5469197828096.000, dc_hat[0] = 7212922044416.000
Gradient do_[0] = 118839060725760.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.413, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.833
c_state[0] = 0.656, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.353, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.834
c_state[0] = 0.774, h_state[0] = 0.085
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867
c_state[0] = 0.845, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -3521024121372672.000, df[0] = -2766112752140288.000, dc_hat[0] = -1601865632448512.000
Gradient do_[0] = -206489984503382016.000
Backward Time Step 3:
Gradient di[0] = -5505831856504832.000, df[0] = -4202780386394112.000, dc_hat[0] = -2302573306445824.000
Gradient do_[0] = -281715082562371584.000
Backward Time Step 2:
Gradient di[0] = -7178386073976832.000, df[0] = -5411914343514112.000, dc_hat[0] = -4040566182510592.000
Gradient do_[0] = -321531800218238976.000
Backward Time Step 1:
Gradient di[0] = -9036138744381440.000, df[0] = -6475557088788480.000, dc_hat[0] = -5646516194639872.000
Gradient do_[0] = -288242195901710336.000
Backward Time Step 0:
Gradient di[0] = -10656046863351808.000, df[0] = -7788508190081024.000, dc_hat[0] = -10772907554766848.000
Gradient do_[0] = -165317294132559872.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.424, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.848
c_state[0] = 0.691, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.365, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.848
c_state[0] = 0.825, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876
c_state[0] = 0.912, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861
c_state[0] = 0.974, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1423825698816.000, df[0] = 1044644298752.000, dc_hat[0] = 621651099648.000
Gradient do_[0] = 88553375662080.000
Backward Time Step 3:
Gradient di[0] = 2225780817920.000, df[0] = 1606243450880.000, dc_hat[0] = 880483172352.000
Gradient do_[0] = 118311870267392.000
Backward Time Step 2:
Gradient di[0] = 2822038093824.000, df[0] = 2027936940032.000, dc_hat[0] = 1469696311296.000
Gradient do_[0] = 128751870410752.000
Backward Time Step 1:
Gradient di[0] = 3535531933696.000, df[0] = 2441456123904.000, dc_hat[0] = 2027262181376.000
Gradient do_[0] = 113515096440832.000
Backward Time Step 0:
Gradient di[0] = 4320826228736.000, df[0] = 3074242904064.000, dc_hat[0] = 4054392504320.000
Gradient do_[0] = 66799584411648.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.413, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.834
c_state[0] = 0.656, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.353, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.835
c_state[0] = 0.774, h_state[0] = 0.085
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867
c_state[0] = 0.845, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -3690022226100224.000, df[0] = -2898666717184000.000, dc_hat[0] = -1677881453314048.000
Gradient do_[0] = -216348171398283264.000
Backward Time Step 3:
Gradient di[0] = -5772436582694912.000, df[0] = -4406225202577408.000, dc_hat[0] = -2411571523354624.000
Gradient do_[0] = -295230966685761536.000
Backward Time Step 2:
Gradient di[0] = -7529948474507264.000, df[0] = -5676576133873664.000, dc_hat[0] = -4230100136493056.000
Gradient do_[0] = -337052506355924992.000
Backward Time Step 1:
Gradient di[0] = -9481401225183232.000, df[0] = -6793562372964352.000, dc_hat[0] = -5907145547579392.000
Gradient do_[0] = -302195067177664512.000
Backward Time Step 0:
Gradient di[0] = -11169719583244288.000, df[0] = -8163952219389952.000, dc_hat[0] = -11292214198009856.000
Gradient do_[0] = -173286399791857664.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.424, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.849
c_state[0] = 0.691, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.848
c_state[0] = 0.825, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876
c_state[0] = 0.912, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.312, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861
c_state[0] = 0.974, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 236308938752.000, df[0] = 173362692096.000, dc_hat[0] = 103130161152.000
Gradient do_[0] = 14693817122816.000
Backward Time Step 3:
Gradient di[0] = 369550032896.000, df[0] = 266677895168.000, dc_hat[0] = 146060115968.000
Gradient do_[0] = 19635833405440.000
Backward Time Step 2:
Gradient di[0] = 468766621696.000, df[0] = 336836853760.000, dc_hat[0] = 243695566848.000
Gradient do_[0] = 21373311254528.000
Backward Time Step 1:
Gradient di[0] = 587424399360.000, df[0] = 405587820544.000, dc_hat[0] = 335853748224.000
Gradient do_[0] = 18844984803328.000
Backward Time Step 0:
Gradient di[0] = 717116669952.000, df[0] = 510224367616.000, dc_hat[0] = 672897302528.000
Gradient do_[0] = 11086558396416.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.413, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.834
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.835
c_state[0] = 0.774, h_state[0] = 0.085
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -3863815661813760.000, df[0] = -3034979047047168.000, dc_hat[0] = -1756066501099520.000
Gradient do_[0] = -226488142507540480.000
Backward Time Step 3:
Gradient di[0] = -6046736279666688.000, df[0] = -4615540433747968.000, dc_hat[0] = -2523775933349888.000
Gradient do_[0] = -309139857496604672.000
Backward Time Step 2:
Gradient di[0] = -7891806985388032.000, df[0] = -5949000003878912.000, dc_hat[0] = -4425382904201216.000
Gradient do_[0] = -353032842713759744.000
Backward Time Step 1:
Gradient di[0] = -9939873951645696.000, df[0] = -7121029470093312.000, dc_hat[0] = -6175839775358976.000
Gradient do_[0] = -316566474426679296.000
Backward Time Step 0:
Gradient di[0] = -11698816604438528.000, df[0] = -8550669464109056.000, dc_hat[0] = -11827113720020992.000
Gradient do_[0] = -181494786669150208.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.424, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.849
c_state[0] = 0.691, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849
c_state[0] = 0.825, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876
c_state[0] = 0.912, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -1030644039680.000, df[0] = -756050165760.000, dc_hat[0] = -449622441984.000
Gradient do_[0] = -64073886597120.000
Backward Time Step 3:
Gradient di[0] = -1612381028352.000, df[0] = -1163503337472.000, dc_hat[0] = -636778446848.000
Gradient do_[0] = -85643342381056.000
Backward Time Step 2:
Gradient di[0] = -2046189109248.000, df[0] = -1470223089664.000, dc_hat[0] = -1062064029696.000
Gradient do_[0] = -93242968244224.000
Backward Time Step 1:
Gradient di[0] = -2564748476416.000, df[0] = -1770611933184.000, dc_hat[0] = -1462580674560.000
Gradient do_[0] = -82218986766336.000
Backward Time Step 0:
Gradient di[0] = -3127927111680.000, df[0] = -2225502158848.000, dc_hat[0] = -2935050731520.000
Gradient do_[0] = -48357468798976.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.435, f_gate[0] = 0.747, o_gate[0] = 0.147, c_hat[0] = 0.863
c_state[0] = 0.727, h_state[0] = 0.092
Time Step 2:
i_gate[0] = 0.378, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.861
c_state[0] = 0.878, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.345, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885
c_state[0] = 0.982, h_state[0] = 0.119
Time Step 4:
i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.869
c_state[0] = 1.060, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 20361691267072.000, df[0] = 13976444338176.000, dc_hat[0] = 8551847690240.000
Gradient do_[0] = 1354971182268416.000
Backward Time Step 3:
Gradient di[0] = 31911395196928.000, df[0] = 21744909811712.000, dc_hat[0] = 11977612066816.000
Gradient do_[0] = 1773109468200960.000
Backward Time Step 2:
Gradient di[0] = 39072747749376.000, df[0] = 26710005252096.000, dc_hat[0] = 18821840633856.000
Gradient do_[0] = 1819401028370432.000
Backward Time Step 1:
Gradient di[0] = 48686931378176.000, df[0] = 32381260005376.000, dc_hat[0] = 25525384904704.000
Gradient do_[0] = 1573046804545536.000
Backward Time Step 0:
Gradient di[0] = 61148825124864.000, df[0] = 42432368476160.000, dc_hat[0] = 53311684214784.000
Gradient do_[0] = 943149518159872.000
Epoch 300, Train Loss=0.011598, Weight Norm=12.423228
Sample Predictions at Epoch 300:
-------------------------------------------------------------
| Day | Date | Predicted Close | Actual Close | Error |
-------------------------------------------------------------
| 192 | 2024-10-11 | 57.63 | 63.87 | 6.24 |
| 193 | 2024-10-14 | 56.97 | 66.55 | 9.58 |
| 194 | 2024-10-15 | 57.12 | 66.00 | 8.88 |
| 195 | 2024-10-16 | 58.01 | 67.20 | 9.19 |
| 196 | 2024-10-17 | 57.48 | 66.76 | 9.28 |
-------------------------------------------------------------
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.424, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.849
c_state[0] = 0.691, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849
c_state[0] = 0.825, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876
c_state[0] = 0.912, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 3265897168896.000, df[0] = 2395762851840.000, dc_hat[0] = 1424677535744.000
Gradient do_[0] = 203033673728000.000
Backward Time Step 3:
Gradient di[0] = 5109171879936.000, df[0] = 3686811566080.000, dc_hat[0] = 2017637171200.000
Gradient do_[0] = 271375579217920.000
Backward Time Step 2:
Gradient di[0] = 6483168198656.000, df[0] = 4658268471296.000, dc_hat[0] = 3364937531392.000
Gradient do_[0] = 295430332088320.000
Backward Time Step 1:
Gradient di[0] = 8126215487488.000, df[0] = 5610051993600.000, dc_hat[0] = 4634078347264.000
Gradient do_[0] = 260504765333504.000
Backward Time Step 0:
Gradient di[0] = 9913542639616.000, df[0] = 7053428391936.000, dc_hat[0] = 9302246948864.000
Gradient do_[0] = 153262468628480.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.412, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.834
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.835
c_state[0] = 0.774, h_state[0] = 0.085
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -3062017745223680.000, df[0] = -2405014450470912.000, dc_hat[0] = -1390990254407680.000
Gradient do_[0] = -179451962784219136.000
Backward Time Step 3:
Gradient di[0] = -4793750974889984.000, df[0] = -3659075912990720.000, dc_hat[0] = -1999004749529088.000
Gradient do_[0] = -244993799376338944.000
Backward Time Step 2:
Gradient di[0] = -6259028996915200.000, df[0] = -4717913126731776.000, dc_hat[0] = -3504062758649856.000
Gradient do_[0] = -279837408759906304.000
Backward Time Step 1:
Gradient di[0] = -7885750544629760.000, df[0] = -5648704480477184.000, dc_hat[0] = -4887579904180224.000
Gradient do_[0] = -250975812646338560.000
Backward Time Step 0:
Gradient di[0] = -9276178024103936.000, df[0] = -6779961822150656.000, dc_hat[0] = -9377906471993344.000
Gradient do_[0] = -143910111977406464.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.423, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.849
c_state[0] = 0.691, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849
c_state[0] = 0.825, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 2370713419776.000, df[0] = 1738956210176.000, dc_hat[0] = 1033807265792.000
Gradient do_[0] = 147357173809152.000
Backward Time Step 3:
Gradient di[0] = 3710146838528.000, df[0] = 2677185970176.000, dc_hat[0] = 1464112775168.000
Gradient do_[0] = 197003606753280.000
Backward Time Step 2:
Gradient di[0] = 4709952782336.000, df[0] = 3384001167360.000, dc_hat[0] = 2441062121472.000
Gradient do_[0] = 214516101021696.000
Backward Time Step 1:
Gradient di[0] = 5905026383872.000, df[0] = 4076154912768.000, dc_hat[0] = 3359419662336.000
Gradient do_[0] = 189172790853632.000
Backward Time Step 0:
Gradient di[0] = 7197335486464.000, df[0] = 5120862978048.000, dc_hat[0] = 6753528315904.000
Gradient do_[0] = 111270162333696.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.412, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.835
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.835
c_state[0] = 0.774, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -3209887563644928.000, df[0] = -2520998968557568.000, dc_hat[0] = -1457580635324416.000
Gradient do_[0] = -188085912201068544.000
Backward Time Step 3:
Gradient di[0] = -5027208217231360.000, df[0] = -3837224982413312.000, dc_hat[0] = -2094741315387392.000
Gradient do_[0] = -256844455959592960.000
Backward Time Step 2:
Gradient di[0] = -6567025933549568.000, df[0] = -4949824180846592.000, dc_hat[0] = -3671099405500416.000
Gradient do_[0] = -293460752965042176.000
Backward Time Step 1:
Gradient di[0] = -8276148576321536.000, df[0] = -5927654553288704.000, dc_hat[0] = -5118007583965184.000
Gradient do_[0] = -263236535527145472.000
Backward Time Step 0:
Gradient di[0] = -9727818531340288.000, df[0] = -7110066029199360.000, dc_hat[0] = -9834500150263808.000
Gradient do_[0] = -150916835724754944.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.423, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.691, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849
c_state[0] = 0.825, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1411668115456.000, df[0] = 1035408834560.000, dc_hat[0] = 615390445568.000
Gradient do_[0] = 87732080607232.000
Backward Time Step 3:
Gradient di[0] = 2210073149440.000, df[0] = 1594710949888.000, dc_hat[0] = 871581351936.000
Gradient do_[0] = 117317878939648.000
Backward Time Step 2:
Gradient di[0] = 2806829547520.000, df[0] = 2016548618240.000, dc_hat[0] = 1452795625472.000
Gradient do_[0] = 127777407762432.000
Backward Time Step 1:
Gradient di[0] = 3519867256832.000, df[0] = 2429459890176.000, dc_hat[0] = 1998110064640.000
Gradient do_[0] = 112692878639104.000
Backward Time Step 0:
Gradient di[0] = 4286650253312.000, df[0] = 3049926950912.000, dc_hat[0] = 4022323904512.000
Gradient do_[0] = 66271232131072.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.412, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.835
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.835
c_state[0] = 0.774, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -3363653029986304.000, df[0] = -2641605139890176.000, dc_hat[0] = -1526838996238336.000
Gradient do_[0] = -197065932902760448.000
Backward Time Step 3:
Gradient di[0] = -5270069827338240.000, df[0] = -4022550942187520.000, dc_hat[0] = -2194387241009152.000
Gradient do_[0] = -269175289886539776.000
Backward Time Step 2:
Gradient di[0] = -6887542699851776.000, df[0] = -5191171814981632.000, dc_hat[0] = -3845111616110592.000
Gradient do_[0] = -307642597537480704.000
Backward Time Step 1:
Gradient di[0] = -8682550193029120.000, df[0] = -6218060914491392.000, dc_hat[0] = -5358225104830464.000
Gradient do_[0] = -276004940722208768.000
Backward Time Step 0:
Gradient di[0] = -10198181874761728.000, df[0] = -7453854538924032.000, dc_hat[0] = -10310020675665920.000
Gradient do_[0] = -158214002340528128.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.423, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849
c_state[0] = 0.825, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 383367249920.000, df[0] = 281169068032.000, dc_hat[0] = 167075856384.000
Gradient do_[0] = 23822675738624.000
Backward Time Step 3:
Gradient di[0] = 600409047040.000, df[0] = 433223827456.000, dc_hat[0] = 236657360896.000
Gradient do_[0] = 31864358174720.000
Backward Time Step 2:
Gradient di[0] = 762829733888.000, df[0] = 548028645376.000, dc_hat[0] = 394418323456.000
Gradient do_[0] = 34713733431296.000
Backward Time Step 1:
Gradient di[0] = 956845654016.000, df[0] = 660371931136.000, dc_hat[0] = 542212554752.000
Gradient do_[0] = 30619480358912.000
Backward Time Step 0:
Gradient di[0] = 1164518490112.000, df[0] = 828548186112.000, dc_hat[0] = 1092711153664.000
Gradient do_[0] = 18003349471232.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.412, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.835
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.835
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -3522162019270656.000, df[0] = -2765937732222976.000, dc_hat[0] = -1598293159182336.000
Gradient do_[0] = -206328974769389568.000
Backward Time Step 3:
Gradient di[0] = -5520500243562496.000, df[0] = -4213652559233024.000, dc_hat[0] = -2297348747165696.000
Gradient do_[0] = -281902102618308608.000
Backward Time Step 2:
Gradient di[0] = -7218067813695488.000, df[0] = -5440088792104960.000, dc_hat[0] = -4025295694725120.000
Gradient do_[0] = -322287714462334976.000
Backward Time Step 1:
Gradient di[0] = -9101767992147968.000, df[0] = -6517716487766016.000, dc_hat[0] = -5607547419492352.000
Gradient do_[0] = -289198152542584832.000
Backward Time Step 0:
Gradient di[0] = -10684363582734336.000, df[0] = -7809204563738624.000, dc_hat[0] = -10801534585536512.000
Gradient do_[0] = -165756600567463936.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.423, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849
c_state[0] = 0.825, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -718535720960.000, df[0] = -526958886912.000, dc_hat[0] = -313072648192.000
Gradient do_[0] = -44646289899520.000
Backward Time Step 3:
Gradient di[0] = -1125734023168.000, df[0] = -812254167040.000, dc_hat[0] = -443526709248.000
Gradient do_[0] = -59732786151424.000
Backward Time Step 2:
Gradient di[0] = -1430800433152.000, df[0] = -1027875995648.000, dc_hat[0] = -739149152256.000
Gradient do_[0] = -65090602663936.000
Backward Time Step 1:
Gradient di[0] = -1795133145088.000, df[0] = -1238833299456.000, dc_hat[0] = -1015766646784.000
Gradient do_[0] = -57421812727808.000
Backward Time Step 0:
Gradient di[0] = -2183578386432.000, df[0] = -1553603493888.000, dc_hat[0] = -2048933101568.000
Gradient do_[0] = -33757927047168.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.435, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.864
c_state[0] = 0.726, h_state[0] = 0.092
Time Step 2:
i_gate[0] = 0.377, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.862
c_state[0] = 0.878, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.345, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885
c_state[0] = 0.982, h_state[0] = 0.119
Time Step 4:
i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.869
c_state[0] = 1.060, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 19276096339968.000, df[0] = 13227745345536.000, dc_hat[0] = 8087126147072.000
Gradient do_[0] = 1282003043352576.000
Backward Time Step 3:
Gradient di[0] = 30248162820096.000, df[0] = 20608431685632.000, dc_hat[0] = 11328484802560.000
Gradient do_[0] = 1678932109688832.000
Backward Time Step 2:
Gradient di[0] = 37079778066432.000, df[0] = 25342578262016.000, dc_hat[0] = 17779222642688.000
Gradient do_[0] = 1723599366914048.000
Backward Time Step 1:
Gradient di[0] = 46236686090240.000, df[0] = 30740234371072.000, dc_hat[0] = 24042711023616.000
Gradient do_[0] = 1490389085192192.000
Backward Time Step 0:
Gradient di[0] = 57897715236864.000, df[0] = 40176361406464.000, dc_hat[0] = 50477253263360.000
Gradient do_[0] = 893004969672704.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.423, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849
c_state[0] = 0.825, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 2905319931904.000, df[0] = 2130697256960.000, dc_hat[0] = 1265802412032.000
Gradient do_[0] = 180519555825664.000
Backward Time Step 3:
Gradient di[0] = 4551673380864.000, df[0] = 3284186169344.000, dc_hat[0] = 1793194852352.000
Gradient do_[0] = 241514114449408.000
Backward Time Step 2:
Gradient di[0] = 5784588517376.000, df[0] = 4155601059840.000, dc_hat[0] = 2988203048960.000
Gradient do_[0] = 263153501863936.000
Backward Time Step 1:
Gradient di[0] = 7257576701952.000, df[0] = 5008501768192.000, dc_hat[0] = 4106661134336.000
Gradient do_[0] = 232151672946688.000
Backward Time Step 0:
Gradient di[0] = 8830624727040.000, df[0] = 6282938941440.000, dc_hat[0] = 8286104453120.000
Gradient do_[0] = 136520660942848.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.412, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.835
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2788333033881600.000, df[0] = -2189549497221120.000, dc_hat[0] = -1264897396572160.000
Gradient do_[0] = -163324961883160576.000
Backward Time Step 3:
Gradient di[0] = -4371864390467584.000, df[0] = -3336895652167680.000, dc_hat[0] = -1818381913161728.000
Gradient do_[0] = -223203884095504384.000
Backward Time Step 2:
Gradient di[0] = -5718183092682752.000, df[0] = -4309525087649792.000, dc_hat[0] = -3185945536888832.000
Gradient do_[0] = -255239546940162048.000
Backward Time Step 1:
Gradient di[0] = -7212571866169344.000, df[0] = -5164500604944384.000, dc_hat[0] = -4437506254700544.000
Gradient do_[0] = -229084604856795136.000
Backward Time Step 0:
Gradient di[0] = -8465179750694912.000, df[0] = -6187203185082368.000, dc_hat[0] = -8558014395056128.000
Gradient do_[0] = -131328323111354368.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.423, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849
c_state[0] = 0.825, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 2137365282816.000, df[0] = 1567412322304.000, dc_hat[0] = 931009724416.000
Gradient do_[0] = 132792679661568.000
Backward Time Step 3:
Gradient di[0] = 3349729771520.000, df[0] = 2416895852544.000, dc_hat[0] = 1319138754560.000
Gradient do_[0] = 177707711201280.000
Backward Time Step 2:
Gradient di[0] = 4258649079808.000, df[0] = 3059286016000.000, dc_hat[0] = 2198162767872.000
Gradient do_[0] = 193679453061120.000
Backward Time Step 1:
Gradient di[0] = 5344339689472.000, df[0] = 3687923056640.000, dc_hat[0] = 3019980406784.000
Gradient do_[0] = 170887789674496.000
Backward Time Step 0:
Gradient di[0] = 6499475652608.000, df[0] = 4624339697664.000, dc_hat[0] = 6098699943936.000
Gradient do_[0] = 100481305149440.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.835
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2924400819044352.000, df[0] = -2296283595276288.000, dc_hat[0] = -1326287780052992.000
Gradient do_[0] = -171282041634029568.000
Backward Time Step 3:
Gradient di[0] = -4586895686238208.000, df[0] = -3500986488324096.000, dc_hat[0] = -1906981585551360.000
Gradient do_[0] = -234142512963387392.000
Backward Time Step 2:
Gradient di[0] = -6002001510924288.000, df[0] = -4523296884260864.000, dc_hat[0] = -3341335574609920.000
Gradient do_[0] = -267833593642876928.000
Backward Time Step 1:
Gradient di[0] = -7572647965622272.000, df[0] = -5421964566986752.000, dc_hat[0] = -4653069321109504.000
Gradient do_[0] = -240437010873974784.000
Backward Time Step 0:
Gradient di[0] = -8883825782292480.000, df[0] = -6493191150764032.000, dc_hat[0] = -8981251209822208.000
Gradient do_[0] = -137823181246300160.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.423, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1310817910784.000, df[0] = 961223000064.000, dc_hat[0] = 570855456768.000
Gradient do_[0] = 81434131824640.000
Backward Time Step 3:
Gradient di[0] = 2055077232640.000, df[0] = 1482751475712.000, dc_hat[0] = 808994734080.000
Gradient do_[0] = 109007251439616.000
Backward Time Step 2:
Gradient di[0] = 2613649866752.000, df[0] = 1877515960320.000, dc_hat[0] = 1348065296384.000
Gradient do_[0] = 118834715426816.000
Backward Time Step 1:
Gradient di[0] = 3280743956480.000, df[0] = 2263776755712.000, dc_hat[0] = 1851550990336.000
Gradient do_[0] = 104866684862464.000
Backward Time Step 0:
Gradient di[0] = 3988013973504.000, df[0] = 2837449015296.000, dc_hat[0] = 3742102192128.000
Gradient do_[0] = 61654339420160.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -3065858519728128.000, df[0] = -2407242464755712.000, dc_hat[0] = -1390109517676544.000
Gradient do_[0] = -179555145078538240.000
Backward Time Step 3:
Gradient di[0] = -4810542149533696.000, df[0] = -3671649966620672.000, dc_hat[0] = -1999142456918016.000
Gradient do_[0] = -245520259287613440.000
Backward Time Step 2:
Gradient di[0] = -6297297692393472.000, df[0] = -4745717671264256.000, dc_hat[0] = -3503057467867136.000
Gradient do_[0] = -280938226057740288.000
Backward Time Step 1:
Gradient di[0] = -7947405873905664.000, df[0] = -5689931334680576.000, dc_hat[0] = -4877514648322048.000
Gradient do_[0] = -252253720035721216.000
Backward Time Step 0:
Gradient di[0] = -9319588332306432.000, df[0] = -6811690356178944.000, dc_hat[0] = -9421793521565696.000
Gradient do_[0] = -144583562849419264.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 419913170944.000, df[0] = 307906805760.000, dc_hat[0] = 182833905664.000
Gradient do_[0] = 26085280448512.000
Backward Time Step 3:
Gradient di[0] = 658565300224.000, df[0] = 475150614528.000, dc_hat[0] = 259157508096.000
Gradient do_[0] = 34927034761216.000
Backward Time Step 2:
Gradient di[0] = 837865111552.000, df[0] = 601864929280.000, dc_hat[0] = 431852847104.000
Gradient do_[0] = 38085723160576.000
Backward Time Step 1:
Gradient di[0] = 1051966767104.000, df[0] = 725836365824.000, dc_hat[0] = 593002692608.000
Gradient do_[0] = 33614473461760.000
Backward Time Step 0:
Gradient di[0] = 1278209032192.000, df[0] = 909438418944.000, dc_hat[0] = 1199391244288.000
Gradient do_[0] = 19760997728256.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -3212015988375552.000, df[0] = -2521882658078720.000, dc_hat[0] = -1456048539959296.000
Gradient do_[0] = -188103744905281536.000
Backward Time Step 3:
Gradient di[0] = -5041713932402688.000, df[0] = -3848055547756544.000, dc_hat[0] = -2094412213518336.000
Gradient do_[0] = -257281477471895552.000
Backward Time Step 2:
Gradient di[0] = -6602676846460928.000, df[0] = -4975733571059712.000, dc_hat[0] = -3670333559144448.000
Gradient do_[0] = -294491235878436864.000
Backward Time Step 1:
Gradient di[0] = -8335076064493568.000, df[0] = -5967133355802624.000, dc_hat[0] = -5109753730564096.000
Gradient do_[0] = -264478519810064384.000
Backward Time Step 0:
Gradient di[0] = -9770407427047424.000, df[0] = -7141193804677120.000, dc_hat[0] = -9877555049922560.000
Gradient do_[0] = -151577539133833216.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -537996034048.000, df[0] = -394475012096.000, dc_hat[0] = -234210557952.000
Gradient do_[0] = -33419373314048.000
Backward Time Step 3:
Gradient di[0] = -844050595840.000, df[0] = -608968376320.000, dc_hat[0] = -332062588928.000
Gradient do_[0] = -44759569661952.000
Backward Time Step 2:
Gradient di[0] = -1074213355520.000, df[0] = -771626893312.000, dc_hat[0] = -553391030272.000
Gradient do_[0] = -48820301856768.000
Backward Time Step 1:
Gradient di[0] = -1349022121984.000, df[0] = -930758721536.000, dc_hat[0] = -759802036224.000
Gradient do_[0] = -43096389713920.000
Backward Time Step 0:
Gradient di[0] = -1638663192576.000, df[0] = -1165899333632.000, dc_hat[0] = -1537618739200.000
Gradient do_[0] = -25333585674240.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.434, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.864
c_state[0] = 0.726, h_state[0] = 0.092
Time Step 2:
i_gate[0] = 0.377, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.862
c_state[0] = 0.878, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.345, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885
c_state[0] = 0.981, h_state[0] = 0.119
Time Step 4:
i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.869
c_state[0] = 1.060, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 18203587641344.000, df[0] = 12489363292160.000, dc_hat[0] = 7631617916928.000
Gradient do_[0] = 1210303261966336.000
Backward Time Step 3:
Gradient di[0] = 28599113482240.000, df[0] = 19483041202176.000, dc_hat[0] = 10696618147840.000
Gradient do_[0] = 1586427238285312.000
Backward Time Step 2:
Gradient di[0] = 35091046727680.000, df[0] = 23980557729792.000, dc_hat[0] = 16779778719744.000
Gradient do_[0] = 1629490459443200.000
Backward Time Step 1:
Gradient di[0] = 43787313217536.000, df[0] = 29105267408896.000, dc_hat[0] = 22657972043776.000
Gradient do_[0] = 1409489886511104.000
Backward Time Step 0:
Gradient di[0] = 54746131988480.000, df[0] = 37989413552128.000, dc_hat[0] = 47729589878784.000
Gradient do_[0] = 844395469340672.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 2521632866304.000, df[0] = 1848935186432.000, dc_hat[0] = 1097701720064.000
Gradient do_[0] = 156637021077504.000
Backward Time Step 3:
Gradient di[0] = 3956043677696.000, df[0] = 2854222299136.000, dc_hat[0] = 1556268318720.000
Gradient do_[0] = 209784271798272.000
Backward Time Step 2:
Gradient di[0] = 5034330292224.000, df[0] = 3616247906304.000, dc_hat[0] = 2593389805568.000
Gradient do_[0] = 228796414296064.000
Backward Time Step 1:
Gradient di[0] = 6322250579968.000, df[0] = 4362041556992.000, dc_hat[0] = 3560845344768.000
Gradient do_[0] = 201973118795776.000
Backward Time Step 0:
Gradient di[0] = 7681907097600.000, df[0] = 5465632145408.000, dc_hat[0] = 7208219705344.000
Gradient do_[0] = 118761583542272.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2542371262693376.000, df[0] = -1996027666104320.000, dc_hat[0] = -1152223090311168.000
Gradient do_[0] = -148880952506974208.000
Backward Time Step 3:
Gradient di[0] = -3991960842928128.000, df[0] = -3046811413512192.000, dc_hat[0] = -1657757115613184.000
Gradient do_[0] = -203689167610183680.000
Backward Time Step 2:
Gradient di[0] = -5229526946676736.000, df[0] = -3940861469523968.000, dc_hat[0] = -2905429680062464.000
Gradient do_[0] = -233204663904632832.000
Backward Time Step 1:
Gradient di[0] = -6603519733792768.000, df[0] = -4727292798435328.000, dc_hat[0] = -4044936580169728.000
Gradient do_[0] = -209488919648010240.000
Backward Time Step 0:
Gradient di[0] = -7740841032417280.000, df[0] = -5657783504470016.000, dc_hat[0] = -7825732134764544.000
Gradient do_[0] = -120090970678099968.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1863454949376.000, df[0] = 1366276440064.000, dc_hat[0] = 811048173568.000
Gradient do_[0] = 115747590242304.000
Backward Time Step 3:
Gradient di[0] = 2924486328320.000, df[0] = 2109936631808.000, dc_hat[0] = 1150133338112.000
Gradient do_[0] = 155063938973696.000
Backward Time Step 2:
Gradient di[0] = 3722880745472.000, df[0] = 2674154799104.000, dc_hat[0] = 1916736372736.000
Gradient do_[0] = 169160977940480.000
Backward Time Step 1:
Gradient di[0] = 4676391534592.000, df[0] = 3226329153536.000, dc_hat[0] = 2631365558272.000
Gradient do_[0] = 149354954358784.000
Backward Time Step 0:
Gradient di[0] = 5680200155136.000, df[0] = 4041429483520.000, dc_hat[0] = 5329943265280.000
Gradient do_[0] = 87815371096064.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2668273665572864.000, df[0] = -2094780641181696.000, dc_hat[0] = -1209045373419520.000
Gradient do_[0] = -156247697592942592.000
Backward Time Step 3:
Gradient di[0] = -4191167600132096.000, df[0] = -3198824533196800.000, dc_hat[0] = -1739941046386688.000
Gradient do_[0] = -213829344877871104.000
Backward Time Step 2:
Gradient di[0] = -5492708047060992.000, df[0] = -4139109106843648.000, dc_hat[0] = -3049908890238976.000
Gradient do_[0] = -244893984336379904.000
Backward Time Step 1:
Gradient di[0] = -6937716507803648.000, df[0] = -4966297527910400.000, dc_hat[0] = -4245832601698304.000
Gradient do_[0] = -220037496765939712.000
Backward Time Step 0:
Gradient di[0] = -8130054726877184.000, df[0] = -5942259589578752.000, dc_hat[0] = -8219213416103936.000
Gradient do_[0] = -126129196480069632.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1150981505024.000, df[0] = 843859689472.000, dc_hat[0] = 500883980288.000
Gradient do_[0] = 71491190259712.000
Backward Time Step 3:
Gradient di[0] = 1806956232704.000, df[0] = 1303653777408.000, dc_hat[0] = 710493011968.000
Gradient do_[0] = 95802324156416.000
Backward Time Step 2:
Gradient di[0] = 2301007233024.000, df[0] = 1652796424192.000, dc_hat[0] = 1184233291776.000
Gradient do_[0] = 104539797585920.000
Backward Time Step 1:
Gradient di[0] = 2891016306688.000, df[0] = 1994499293184.000, dc_hat[0] = 1625703317504.000
Gradient do_[0] = 92317377626112.000
Backward Time Step 0:
Gradient di[0] = 3510844260352.000, df[0] = 2497945534464.000, dc_hat[0] = 3294356307968.000
Gradient do_[0] = 54277338824704.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2798873688932352.000, df[0] = -2197222724730880.000, dc_hat[0] = -1268026682900480.000
Gradient do_[0] = -163893392214851584.000
Backward Time Step 3:
Gradient di[0] = -4397862364381184.000, df[0] = -3356553180610560.000, dc_hat[0] = -1825351973994496.000
Gradient do_[0] = -224358629002706944.000
Backward Time Step 2:
Gradient di[0] = -5765837365444608.000, df[0] = -4344871057883136.000, dc_hat[0] = -3200332335153152.000
Gradient do_[0] = -257038640020979712.000
Backward Time Step 1:
Gradient di[0] = -7284636585558016.000, df[0] = -5214458590789632.000, dc_hat[0] = -4455418684243968.000
Gradient do_[0] = -231002994949226496.000
Backward Time Step 0:
Gradient di[0] = -8534794728112128.000, df[0] = -6238085125767168.000, dc_hat[0] = -8628392802910208.000
Gradient do_[0] = -132408326997671936.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 379642249216.000, df[0] = 278329393152.000, dc_hat[0] = 165191122944.000
Gradient do_[0] = 23580393865216.000
Backward Time Step 3:
Gradient di[0] = 596215660544.000, df[0] = 430142750720.000, dc_hat[0] = 234386161664.000
Gradient do_[0] = 31608258166784.000
Backward Time Step 2:
Gradient di[0] = 759475601408.000, df[0] = 545518354432.000, dc_hat[0] = 390729531392.000
Gradient do_[0] = 34500272717824.000
Backward Time Step 1:
Gradient di[0] = 954434453504.000, df[0] = 658439143424.000, dc_hat[0] = 536376639488.000
Gradient do_[0] = 30472430157824.000
Backward Time Step 0:
Gradient di[0] = 1158832848896.000, df[0] = 824502910976.000, dc_hat[0] = 1087376064512.000
Gradient do_[0] = 17915451539456.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2933844378386432.000, df[0] = -2303086555037696.000, dc_hat[0] = -1328972671483904.000
Gradient do_[0] = -171795152786948096.000
Backward Time Step 3:
Gradient di[0] = -4611568662740992.000, df[0] = -3519630136049664.000, dc_hat[0] = -1913652609286144.000
Gradient do_[0] = -235244945168924672.000
Backward Time Step 2:
Gradient di[0] = -6048353334853632.000, df[0] = -4557704404140032.000, dc_hat[0] = -3355911351435264.000
Gradient do_[0] = -269600405749497856.000
Backward Time Step 1:
Gradient di[0] = -7643586866708480.000, df[0] = -5471222473162752.000, dc_hat[0] = -4672232391442432.000
Gradient do_[0] = -242348305680433152.000
Backward Time Step 0:
Gradient di[0] = -8953557260697600.000, df[0] = -6544157917052928.000, dc_hat[0] = -9051746655535104.000
Gradient do_[0] = -138904971839012864.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -191882510336.000, df[0] = -140675743744.000, dc_hat[0] = -83486957568.000
Gradient do_[0] = -11917436387328.000
Backward Time Step 3:
Gradient di[0] = -301358481408.000, df[0] = -217417580544.000, dc_hat[0] = -118452436992.000
Gradient do_[0] = -15974998736896.000
Backward Time Step 2:
Gradient di[0] = -383905431552.000, df[0] = -275750715392.000, dc_hat[0] = -197442863104.000
Gradient do_[0] = -17437188685824.000
Backward Time Step 1:
Gradient di[0] = -482473082880.000, df[0] = -332837191680.000, dc_hat[0] = -270986346496.000
Gradient do_[0] = -15401496870912.000
Backward Time Step 0:
Gradient di[0] = -585679831040.000, df[0] = -416707837952.000, dc_hat[0] = -549565202432.000
Gradient do_[0] = -9054557569024.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.434, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.865
c_state[0] = 0.726, h_state[0] = 0.092
Time Step 2:
i_gate[0] = 0.377, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.862
c_state[0] = 0.877, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.345, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885
c_state[0] = 0.981, h_state[0] = 0.119
Time Step 4:
i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.869
c_state[0] = 1.059, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 17184394838016.000, df[0] = 11788703760384.000, dc_hat[0] = 7200873906176.000
Gradient do_[0] = 1142316479807488.000
Backward Time Step 3:
Gradient di[0] = 27022548008960.000, df[0] = 18408101904384.000, dc_hat[0] = 10098611060736.000
Gradient do_[0] = 1498402554642432.000
Backward Time Step 2:
Gradient di[0] = 33178934837248.000, df[0] = 22672408838144.000, dc_hat[0] = 15839551029248.000
Gradient do_[0] = 1539751144325120.000
Backward Time Step 1:
Gradient di[0] = 41424347201536.000, df[0] = 27530889265152.000, dc_hat[0] = 21372533211136.000
Gradient do_[0] = 1332332409651200.000
Backward Time Step 0:
Gradient di[0] = 51755215749120.000, df[0] = 35913958686720.000, dc_hat[0] = 45122007859200.000
Gradient do_[0] = 798264030920704.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 2290124324864.000, df[0] = 1678967439360.000, dc_hat[0] = 996362354688.000
Gradient do_[0] = 142232774508544.000
Backward Time Step 3:
Gradient di[0] = 3596633505792.000, df[0] = 2594823995392.000, dc_hat[0] = 1413607194624.000
Gradient do_[0] = 190655007555584.000
Backward Time Step 2:
Gradient di[0] = 4581369053184.000, df[0] = 3290693369856.000, dc_hat[0] = 2356117766144.000
Gradient do_[0] = 208087038296064.000
Backward Time Step 1:
Gradient di[0] = 5757643259904.000, df[0] = 3971947429888.000, dc_hat[0] = 3233844035584.000
Gradient do_[0] = 183795407912960.000
Backward Time Step 0:
Gradient di[0] = 6991312846848.000, df[0] = 4974278868992.000, dc_hat[0] = 6560209174528.000
Gradient do_[0] = 108085058207744.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2274592936689664.000, df[0] = -1785568497238016.000, dc_hat[0] = -1030222195458048.000
Gradient do_[0] = -133181060333699072.000
Backward Time Step 3:
Gradient di[0] = -3575385254002688.000, df[0] = -2728818208931840.000, dc_hat[0] = -1483330038005760.000
Gradient do_[0] = -182368675074932736.000
Backward Time Step 2:
Gradient di[0] = -4689131477139456.000, df[0] = -3533439261212672.000, dc_hat[0] = -2600782817918976.000
Gradient do_[0] = -208986906690584576.000
Backward Time Step 1:
Gradient di[0] = -5926108365062144.000, df[0] = -4241754261815296.000, dc_hat[0] = -3620430334132224.000
Gradient do_[0] = -187865133702184960.000
Backward Time Step 0:
Gradient di[0] = -6942549419753472.000, df[0] = -5074312197308416.000, dc_hat[0] = -7018685768138752.000
Gradient do_[0] = -107706320810934272.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1908495220736.000, df[0] = 1399180886016.000, dc_hat[0] = 830275715072.000
Gradient do_[0] = 118523296743424.000
Backward Time Step 3:
Gradient di[0] = 2997418459136.000, df[0] = 2162524553216.000, dc_hat[0] = 1177918767104.000
Gradient do_[0] = 158877064626176.000
Backward Time Step 2:
Gradient di[0] = 3818344677376.000, df[0] = 2742611083264.000, dc_hat[0] = 1963081203712.000
Gradient do_[0] = 173408885145600.000
Backward Time Step 1:
Gradient di[0] = 4798899814400.000, df[0] = 3310473707520.000, dc_hat[0] = 2693878513664.000
Gradient do_[0] = 153166637170688.000
Backward Time Step 0:
Gradient di[0] = 5826056028160.000, df[0] = 4145205215232.000, dc_hat[0] = 5466806026240.000
Gradient do_[0] = 90070296035328.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2340991788908544.000, df[0] = -1837693562519552.000, dc_hat[0] = -1060221401169920.000
Gradient do_[0] = -137059793759109120.000
Backward Time Step 3:
Gradient di[0] = -3679883150491648.000, df[0] = -2808595615842304.000, dc_hat[0] = -1526435940401152.000
Gradient do_[0] = -187682322714198016.000
Backward Time Step 2:
Gradient di[0] = -4826409906208768.000, df[0] = -3636859926216704.000, dc_hat[0] = -2676053193523200.000
Gradient do_[0] = -215079334979829760.000
Backward Time Step 1:
Gradient di[0] = -6099753892839424.000, df[0] = -4365938409340928.000, dc_hat[0] = -3724584633237504.000
Gradient do_[0] = -193341801120137216.000
Backward Time Step 0:
Gradient di[0] = -7144711382892544.000, df[0] = -5222072494063616.000, dc_hat[0] = -7223065008144384.000
Gradient do_[0] = -110842643369426944.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1506172338176.000, df[0] = 1104223797248.000, dc_hat[0] = 655209070592.000
Gradient do_[0] = 93531989344256.000
Backward Time Step 3:
Gradient di[0] = 2365645389824.000, df[0] = 1706732290048.000, dc_hat[0] = 929512947712.000
Gradient do_[0] = 125379524165632.000
Backward Time Step 2:
Gradient di[0] = 3013743476736.000, df[0] = 2164674658304.000, dc_hat[0] = 1548949520384.000
Gradient do_[0] = 136851910295552.000
Backward Time Step 1:
Gradient di[0] = 3787811192832.000, df[0] = 2612926087168.000, dc_hat[0] = 2125191315456.000
Gradient do_[0] = 120877769293824.000
Backward Time Step 0:
Gradient di[0] = 4597739421696.000, df[0] = 3271265353728.000, dc_hat[0] = 4314230685696.000
Gradient do_[0] = 71080635006976.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2409957152522240.000, df[0] = -1891834175422464.000, dc_hat[0] = -1091382328426496.000
Gradient do_[0] = -141088653471383552.000
Backward Time Step 3:
Gradient di[0] = -3788422342770688.000, df[0] = -2891458956754944.000, dc_hat[0] = -1571216477388800.000
Gradient do_[0] = -193201956984979456.000
Backward Time Step 2:
Gradient di[0] = -4968996914855936.000, df[0] = -3744281084821504.000, dc_hat[0] = -2754258642403328.000
Gradient do_[0] = -221408020830093312.000
Backward Time Step 1:
Gradient di[0] = -6280103864565760.000, df[0] = -4494919766900736.000, dc_hat[0] = -3832811366645760.000
Gradient do_[0] = -199030811721203712.000
Backward Time Step 0:
Gradient di[0] = -7354723472506880.000, df[0] = -5375570330255360.000, dc_hat[0] = -7435379737100288.000
Gradient do_[0] = -114100762610499584.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1080610193408.000, df[0] = 792230297600.000, dc_hat[0] = 470056009728.000
Gradient do_[0] = 67100831907840.000
Backward Time Step 3:
Gradient di[0] = 1697315815424.000, df[0] = 1224561917952.000, dc_hat[0] = 666821787648.000
Gradient do_[0] = 89950699651072.000
Backward Time Step 2:
Gradient di[0] = 2162456264704.000, df[0] = 1553213423616.000, dc_hat[0] = 1111098654720.000
Gradient do_[0] = 98184487501824.000
Backward Time Step 1:
Gradient di[0] = 2717972168704.000, df[0] = 1874884427776.000, dc_hat[0] = 1524188315648.000
Gradient do_[0] = 86724466180096.000
Backward Time Step 0:
Gradient di[0] = 3298591768576.000, df[0] = 2346928832512.000, dc_hat[0] = 3095191355392.000
Gradient do_[0] = 50995925090304.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.844, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2480827602567168.000, df[0] = -1947471181774848.000, dc_hat[0] = -1123405873020928.000
Gradient do_[0] = -145228770016493568.000
Backward Time Step 3:
Gradient di[0] = -3899955127255040.000, df[0] = -2976608830881792.000, dc_hat[0] = -1617236380876800.000
Gradient do_[0] = -198874011055161344.000
Backward Time Step 2:
Gradient di[0] = -5115513281708032.000, df[0] = -3854663354941440.000, dc_hat[0] = -2834634391945216.000
Gradient do_[0] = -227911597748649984.000
Backward Time Step 1:
Gradient di[0] = -6465438682710016.000, df[0] = -4627468094799872.000, dc_hat[0] = -3944063367643136.000
Gradient do_[0] = -204877447622033408.000
Backward Time Step 0:
Gradient di[0] = -7570571885805568.000, df[0] = -5533333605842944.000, dc_hat[0] = -7653595214249984.000
Gradient do_[0] = -117449428352106496.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 630448586752.000, df[0] = 462203092992.000, dc_hat[0] = 274227478528.000
Gradient do_[0] = 39145871245312.000
Backward Time Step 3:
Gradient di[0] = 990286970880.000, df[0] = 714466525184.000, dc_hat[0] = 389011406848.000
Gradient do_[0] = 52477583949824.000
Backward Time Step 2:
Gradient di[0] = 1261741408256.000, df[0] = 906259922944.000, dc_hat[0] = 648153464832.000
Gradient do_[0] = 57283144843264.000
Backward Time Step 1:
Gradient di[0] = 1585926635520.000, df[0] = 1093969707008.000, dc_hat[0] = 889012682752.000
Gradient do_[0] = 50597839503360.000
Backward Time Step 0:
Gradient di[0] = 1924478664704.000, df[0] = 1369255444480.000, dc_hat[0] = 1805810139136.000
Gradient do_[0] = 29752266063872.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2553104117530624.000, df[0] = -2004215215947776.000, dc_hat[0] = -1156079299854336.000
Gradient do_[0] = -149452217517080576.000
Backward Time Step 3:
Gradient di[0] = -4013709215137792.000, df[0] = -3063455753961472.000, dc_hat[0] = -1664219028127744.000
Gradient do_[0] = -204661221788483584.000
Backward Time Step 2:
Gradient di[0] = -5264914255970304.000, df[0] = -3967226394705920.000, dc_hat[0] = -2916752824467456.000
Gradient do_[0] = -234547597098876928.000
Backward Time Step 1:
Gradient di[0] = -6654407043186688.000, df[0] = -4762635010572288.000, dc_hat[0] = -4057847520296960.000
Gradient do_[0] = -210843810031206400.000
Backward Time Step 0:
Gradient di[0] = -7790877938286592.000, df[0] = -5694355687866368.000, dc_hat[0] = -7876317185835008.000
Gradient do_[0] = -120867234477244416.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 154669252608.000, df[0] = 113393475584.000, dc_hat[0] = 67273957376.000
Gradient do_[0] = 9603261661184.000
Backward Time Step 3:
Gradient di[0] = 242958942208.000, df[0] = 175289712640.000, dc_hat[0] = 95431245824.000
Gradient do_[0] = 12874124623872.000
Backward Time Step 2:
Gradient di[0] = 309575122944.000, df[0] = 222355161088.000, dc_hat[0] = 158993727488.000
Gradient do_[0] = 14053528305664.000
Backward Time Step 1:
Gradient di[0] = 389129568256.000, df[0] = 268416696320.000, dc_hat[0] = 218050576384.000
Gradient do_[0] = 12413581656064.000
Backward Time Step 0:
Gradient di[0] = 472143495168.000, df[0] = 335927377920.000, dc_hat[0] = 443029848064.000
Gradient do_[0] = 7299295870976.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2626609798447104.000, df[0] = -2061924544020480.000, dc_hat[0] = -1189306911686656.000
Gradient do_[0] = -153747356611772416.000
Backward Time Step 3:
Gradient di[0] = -4129391474900992.000, df[0] = -3151776656130048.000, dc_hat[0] = -1711997989158912.000
Gradient do_[0] = -210546718553407488.000
Backward Time Step 2:
Gradient di[0] = -5416870198902784.000, df[0] = -4081715995738112.000, dc_hat[0] = -3000274905989120.000
Gradient do_[0] = -241296983585718272.000
Backward Time Step 1:
Gradient di[0] = -6846600924102656.000, df[0] = -4900108323782656.000, dc_hat[0] = -4173570582249472.000
Gradient do_[0] = -216911980345163776.000
Backward Time Step 0:
Gradient di[0] = -8014945845248000.000, df[0] = -5858127085830144.000, dc_hat[0] = -8102842887831552.000
Gradient do_[0] = -124343417797869568.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -347497431040.000, df[0] = -254763171840.000, dc_hat[0] = -151139188736.000
Gradient do_[0] = -21574704955392.000
Backward Time Step 3:
Gradient di[0] = -545880506368.000, df[0] = -393843736576.000, dc_hat[0] = -214394290176.000
Gradient do_[0] = -28923800846336.000
Backward Time Step 2:
Gradient di[0] = -695591895040.000, df[0] = -499613892608.000, dc_hat[0] = -357172477952.000
Gradient do_[0] = -31574584197120.000
Backward Time Step 1:
Gradient di[0] = -874374299648.000, df[0] = -603123220480.000, dc_hat[0] = -489783951360.000
Gradient do_[0] = -27890481627136.000
Backward Time Step 0:
Gradient di[0] = -1060789026816.000, df[0] = -754745278464.000, dc_hat[0] = -995377872896.000
Gradient do_[0] = -16399702425600.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.434, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.865
c_state[0] = 0.726, h_state[0] = 0.092
Time Step 2:
i_gate[0] = 0.377, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.862
c_state[0] = 0.877, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.345, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885
c_state[0] = 0.981, h_state[0] = 0.119
Time Step 4:
i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.870
c_state[0] = 1.059, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 15934027726848.000, df[0] = 10930823888896.000, dc_hat[0] = 6674673827840.000
Gradient do_[0] = 1058828288262144.000
Backward Time Step 3:
Gradient di[0] = 25063889829888.000, df[0] = 17074111905792.000, dc_hat[0] = 9359733030912.000
Gradient do_[0] = 1389152411058176.000
Backward Time Step 2:
Gradient di[0] = 30786046656512.000, df[0] = 21036533809152.000, dc_hat[0] = 14674194071552.000
Gradient do_[0] = 1427788997328896.000
Backward Time Step 1:
Gradient di[0] = 38447423160320.000, df[0] = 25549502152704.000, dc_hat[0] = 19780421550080.000
Gradient do_[0] = 1235575117971456.000
Backward Time Step 0:
Gradient di[0] = 48002995585024.000, df[0] = 33310218649600.000, dc_hat[0] = 41850694008832.000
Gradient do_[0] = 740390420348928.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1781651865600.000, df[0] = 1306192773120.000, dc_hat[0] = 774860439552.000
Gradient do_[0] = 110613871198208.000
Backward Time Step 3:
Gradient di[0] = 2798714355712.000, df[0] = 2019228254208.000, dc_hat[0] = 1099122999296.000
Gradient do_[0] = 148289634697216.000
Backward Time Step 2:
Gradient di[0] = 3565936181248.000, df[0] = 2561257504768.000, dc_hat[0] = 1830971375616.000
Gradient do_[0] = 161865422340096.000
Backward Time Step 1:
Gradient di[0] = 4482458451968.000, df[0] = 3091896729600.000, dc_hat[0] = 2510865563648.000
Gradient do_[0] = 142979880714240.000
Backward Time Step 0:
Gradient di[0] = 5439683035136.000, df[0] = 3870302928896.000, dc_hat[0] = 5104257728512.000
Gradient do_[0] = 84097003159552.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2040316563554304.000, df[0] = -1601678667153408.000, dc_hat[0] = -923754586701824.000
Gradient do_[0] = -119421548485410816.000
Backward Time Step 3:
Gradient di[0] = -3207693640663040.000, df[0] = -2448307720814592.000, dc_hat[0] = -1329655973937152.000
Gradient do_[0] = -163539933586259968.000
Backward Time Step 2:
Gradient di[0] = -4207556926898176.000, df[0] = -3170462985093120.000, dc_hat[0] = -2329901378043904.000
Gradient do_[0] = -187411636695334912.000
Backward Time Step 1:
Gradient di[0] = -5318287814557696.000, df[0] = -3806234914324480.000, dc_hat[0] = -3240834413625344.000
Gradient do_[0] = -168476397197590528.000
Backward Time Step 0:
Gradient di[0] = -6227011626336256.000, df[0] = -4551324767092736.000, dc_hat[0] = -6295300532600832.000
Gradient do_[0] = -96605496798085120.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1450424008704.000, df[0] = 1063360659456.000, dc_hat[0] = 630781509632.000
Gradient do_[0] = 90045230874624.000
Backward Time Step 3:
Gradient di[0] = 2278490898432.000, df[0] = 1643906334720.000, dc_hat[0] = 894735089664.000
Gradient do_[0] = 120718310244352.000
Backward Time Step 2:
Gradient di[0] = 2903256334336.000, df[0] = 2085278973952.000, dc_hat[0] = 1490413551616.000
Gradient do_[0] = 131774445256704.000
Backward Time Step 1:
Gradient di[0] = 3649581875200.000, df[0] = 2517360181248.000, dc_hat[0] = 2043620753408.000
Gradient do_[0] = 116401675173888.000
Backward Time Step 0:
Gradient di[0] = 4428479332352.000, df[0] = 3150837448704.000, dc_hat[0] = 4155407073280.000
Gradient do_[0] = 68463884238848.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836
c_state[0] = 0.655, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2101419184226304.000, df[0] = -1649650767495168.000, dc_hat[0] = -951377199104000.000
Gradient do_[0] = -122992100827463680.000
Backward Time Step 3:
Gradient di[0] = -3303857958420480.000, df[0] = -2521726965514240.000, dc_hat[0] = -1369380394893312.000
Gradient do_[0] = -168432725970124800.000
Backward Time Step 2:
Gradient di[0] = -4333856614252544.000, df[0] = -3265623354245120.000, dc_hat[0] = -2399350730784768.000
Gradient do_[0] = -193022238373445632.000
Backward Time Step 1:
Gradient di[0] = -5478040196874240.000, df[0] = -3920507619508224.000, dc_hat[0] = -3337092952227840.000
Gradient do_[0] = -173521248603602944.000
Backward Time Step 0:
Gradient di[0] = -6413362204246016.000, df[0] = -4687528917467136.000, dc_hat[0] = -6483695514943488.000
Gradient do_[0] = -99496550954172416.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1099196334080.000, df[0] = 805864734720.000, dc_hat[0] = 478016798720.000
Gradient do_[0] = 68237106610176.000
Backward Time Step 3:
Gradient di[0] = 1726809899008.000, df[0] = 1245883006976.000, dc_hat[0] = 678036832256.000
Gradient do_[0] = 91483944255488.000
Backward Time Step 2:
Gradient di[0] = 2200420745216.000, df[0] = 1580460146688.000, dc_hat[0] = 1129388572672.000
Gradient do_[0] = 99866025918464.000
Backward Time Step 1:
Gradient di[0] = 2766164721664.000, df[0] = 1907981549568.000, dc_hat[0] = 1548428115968.000
Gradient do_[0] = 88217185419264.000
Backward Time Step 0:
Gradient di[0] = 3356186640384.000, df[0] = 2387907444736.000, dc_hat[0] = 3149234962432.000
Gradient do_[0] = 51886333886464.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2164653450854400.000, df[0] = -1699296965558272.000, dc_hat[0] = -979964904079360.000
Gradient do_[0] = -126687232990904320.000
Backward Time Step 3:
Gradient di[0] = -3403382014345216.000, df[0] = -2597712453173248.000, dc_hat[0] = -1410494774640640.000
Gradient do_[0] = -173496372153024512.000
Backward Time Step 2:
Gradient di[0] = -4464560522133504.000, df[0] = -3364103464681472.000, dc_hat[0] = -2471228887531520.000
Gradient do_[0] = -198828707740123136.000
Backward Time Step 1:
Gradient di[0] = -5643357783064576.000, df[0] = -4038762296246272.000, dc_hat[0] = -3436721161109504.000
Gradient do_[0] = -178742142129143808.000
Backward Time Step 0:
Gradient di[0] = -6606216436383744.000, df[0] = -4828485986025472.000, dc_hat[0] = -6678663944732672.000
Gradient do_[0] = -102488459532304384.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 725991751680.000, df[0] = 532255113216.000, dc_hat[0] = 315706769408.000
Gradient do_[0] = 45066831790080.000
Backward Time Step 3:
Gradient di[0] = 1140559577088.000, df[0] = 822912614400.000, dc_hat[0] = 447805325312.000
Gradient do_[0] = 60421746720768.000
Backward Time Step 2:
Gradient di[0] = 1453454786560.000, df[0] = 1043947257856.000, dc_hat[0] = 745862070272.000
Gradient do_[0] = 65960065105920.000
Backward Time Step 1:
Gradient di[0] = 1827206070272.000, df[0] = 1260311019520.000, dc_hat[0] = 1022496473088.000
Gradient do_[0] = 58267095007232.000
Backward Time Step 0:
Gradient di[0] = 2216738947072.000, df[0] = 1577196978176.000, dc_hat[0] = 2080048807936.000
Gradient do_[0] = 34270582145024.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.891, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2229486250622976.000, df[0] = -1750199173120000.000, dc_hat[0] = -1009275975499776.000
Gradient do_[0] = -130475703383621632.000
Backward Time Step 3:
Gradient di[0] = -3505417015525376.000, df[0] = -2675615643729920.000, dc_hat[0] = -1452646791643136.000
Gradient do_[0] = -178688008361345024.000
Backward Time Step 2:
Gradient di[0] = -4598570481090560.000, df[0] = -3465074924584960.000, dc_hat[0] = -2544926231363584.000
Gradient do_[0] = -204782082168193024.000
Backward Time Step 1:
Gradient di[0] = -5812876384141312.000, df[0] = -4160022913220608.000, dc_hat[0] = -3538885548179456.000
Gradient do_[0] = -184095681424654336.000
Backward Time Step 0:
Gradient di[0] = -6803976058044416.000, df[0] = -4973028815405056.000, dc_hat[0] = -6878593061748736.000
Gradient do_[0] = -105556500700725248.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 329653846016.000, df[0] = 241683873792.000, dc_hat[0] = 143349268480.000
Gradient do_[0] = 20462753021952.000
Backward Time Step 3:
Gradient di[0] = 517918687232.000, df[0] = 373680472064.000, dc_hat[0] = 203328045056.000
Gradient do_[0] = 27435512889344.000
Backward Time Step 2:
Gradient di[0] = 660035731456.000, df[0] = 474071400448.000, dc_hat[0] = 338647089152.000
Gradient do_[0] = 29951289982976.000
Backward Time Step 1:
Gradient di[0] = 829789044736.000, df[0] = 572337618944.000, dc_hat[0] = 464204038144.000
Gradient do_[0] = 26458529464320.000
Backward Time Step 0:
Gradient di[0] = 1006597505024.000, df[0] = 716188352512.000, dc_hat[0] = 944527966208.000
Gradient do_[0] = 15561908027392.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2295570093834240.000, df[0] = -1802083720232960.000, dc_hat[0] = -1039151432466432.000
Gradient do_[0] = -134337162450567168.000
Backward Time Step 3:
Gradient di[0] = -3609430386016256.000, df[0] = -2755030931210240.000, dc_hat[0] = -1495616865697792.000
Gradient do_[0] = -183980129624522752.000
Backward Time Step 2:
Gradient di[0] = -4735173526552576.000, df[0] = -3568000594608128.000, dc_hat[0] = -2620048162160640.000
Gradient do_[0] = -210850613259403264.000
Backward Time Step 1:
Gradient di[0] = -5985658086621184.000, df[0] = -4283616502743040.000, dc_hat[0] = -3643017030270976.000
Gradient do_[0] = -189552368654745600.000
Backward Time Step 0:
Gradient di[0] = -7005554442502144.000, df[0] = -5120362299785216.000, dc_hat[0] = -7082381743751168.000
Gradient do_[0] = -108683778058092544.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -90431922176.000, df[0] = -66299969536.000, dc_hat[0] = -39323152384.000
Gradient do_[0] = -5613209255936.000
Backward Time Step 3:
Gradient di[0] = -142082686976.000, df[0] = -102514057216.000, dc_hat[0] = -55776428032.000
Gradient do_[0] = -7526154240000.000
Backward Time Step 2:
Gradient di[0] = -181078278144.000, df[0] = -130059722752.000, dc_hat[0] = -92894494720.000
Gradient do_[0] = -8216579145728.000
Backward Time Step 1:
Gradient di[0] = -227656974336.000, df[0] = -157022339072.000, dc_hat[0] = -127328116736.000
Gradient do_[0] = -7258575470592.000
Backward Time Step 0:
Gradient di[0] = -276150157312.000, df[0] = -196479254528.000, dc_hat[0] = -259121987584.000
Gradient do_[0] = -4269256474624.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.865
c_state[0] = 0.726, h_state[0] = 0.092
Time Step 2:
i_gate[0] = 0.376, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.862
c_state[0] = 0.877, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.344, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885
c_state[0] = 0.981, h_state[0] = 0.119
Time Step 4:
i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.870
c_state[0] = 1.059, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 14902563438592.000, df[0] = 10223314010112.000, dc_hat[0] = 6241249656832.000
Gradient do_[0] = 990050426814464.000
Backward Time Step 3:
Gradient di[0] = 23446775922688.000, df[0] = 15972911022080.000, dc_hat[0] = 8751943254016.000
Gradient do_[0] = 1299144794701824.000
Backward Time Step 2:
Gradient di[0] = 28806968508416.000, df[0] = 19683998695424.000, dc_hat[0] = 13718453747712.000
Gradient do_[0] = 1335499813814272.000
Backward Time Step 1:
Gradient di[0] = 35983617687552.000, df[0] = 23910695305216.000, dc_hat[0] = 18482987008000.000
Gradient do_[0] = 1155858914344960.000
Backward Time Step 0:
Gradient di[0] = 44918336651264.000, df[0] = 31169712226304.000, dc_hat[0] = 39161373392896.000
Gradient do_[0] = 692813121454080.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1584051781632.000, df[0] = 1161342877696.000, dc_hat[0] = 688765992960.000
Gradient do_[0] = 98322337497088.000
Backward Time Step 3:
Gradient di[0] = 2488726454272.000, df[0] = 1795642621952.000, dc_hat[0] = 976919592960.000
Gradient do_[0] = 131826832113664.000
Backward Time Step 2:
Gradient di[0] = 3171469754368.000, df[0] = 2277911822336.000, dc_hat[0] = 1626930413568.000
Gradient do_[0] = 143907341991936.000
Backward Time Step 1:
Gradient di[0] = 3987265552384.000, df[0] = 2750145363968.000, dc_hat[0] = 2230070280192.000
Gradient do_[0] = 127129278742528.000
Backward Time Step 0:
Gradient di[0] = 4837988106240.000, df[0] = 3442201067520.000, dc_hat[0] = 4539665088512.000
Gradient do_[0] = 74794858971136.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1779691102928896.000, df[0] = -1397108837974016.000, dc_hat[0] = -805565743759360.000
Gradient do_[0] = -104142674776293376.000
Backward Time Step 3:
Gradient di[0] = -2798314001006592.000, df[0] = -2135935286247424.000, dc_hat[0] = -1159377834737664.000
Gradient do_[0] = -142627995520073728.000
Backward Time Step 2:
Gradient di[0] = -3670845197123584.000, df[0] = -2766015310069760.000, dc_hat[0] = -2030809921880064.000
Gradient do_[0] = -163448210264686592.000
Backward Time Step 1:
Gradient di[0] = -4640398630715392.000, df[0] = -3320853412446208.000, dc_hat[0] = -2823655918665728.000
Gradient do_[0] = -146942272989036544.000
Backward Time Step 0:
Gradient di[0] = -5432328323072000.000, df[0] = -3970490838286336.000, dc_hat[0] = -5491902740692992.000
Gradient do_[0] = -84276836124590080.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1314857156608.000, df[0] = 963986718720.000, dc_hat[0] = 571697922048.000
Gradient do_[0] = 81609898328064.000
Backward Time Step 3:
Gradient di[0] = 2065876910080.000, df[0] = 1490562580480.000, dc_hat[0] = 810872537088.000
Gradient do_[0] = 109422797914112.000
Backward Time Step 2:
Gradient di[0] = 2632746795008.000, df[0] = 1890971156480.000, dc_hat[0] = 1350346080256.000
Gradient do_[0] = 119454331568128.000
Backward Time Step 1:
Gradient di[0] = 3310073413632.000, df[0] = 2283036475392.000, dc_hat[0] = 1850787627008.000
Gradient do_[0] = 105529217122304.000
Backward Time Step 0:
Gradient di[0] = 4015973203968.000, df[0] = 2857341812736.000, dc_hat[0] = 3768337563648.000
Gradient do_[0] = 62086579224576.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1835130675003392.000, df[0] = -1440636452470784.000, dc_hat[0] = -830630703136768.000
Gradient do_[0] = -107382342837862400.000
Backward Time Step 3:
Gradient di[0] = -2885571361898496.000, df[0] = -2202556939894784.000, dc_hat[0] = -1195430461308928.000
Gradient do_[0] = -147067909372379136.000
Backward Time Step 2:
Gradient di[0] = -3785431971790848.000, df[0] = -2852353279524864.000, dc_hat[0] = -2093848230625280.000
Gradient do_[0] = -168539327058411520.000
Backward Time Step 1:
Gradient di[0] = -4785336597086208.000, df[0] = -3424533386100736.000, dc_hat[0] = -2911060382187520.000
Gradient do_[0] = -151520347349319680.000
Backward Time Step 0:
Gradient di[0] = -5601504400506880.000, df[0] = -4094141336125440.000, dc_hat[0] = -5662933707128832.000
Gradient do_[0] = -86901413329764352.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.911, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1026715615232.000, df[0] = 752739614720.000, dc_hat[0] = 446403805184.000
Gradient do_[0] = 63723309891584.000
Backward Time Step 3:
Gradient di[0] = 1613210845184.000, df[0] = 1163966545920.000, dc_hat[0] = 633162629120.000
Gradient do_[0] = 85442921758720.000
Backward Time Step 2:
Gradient di[0] = 2055962099712.000, df[0] = 1476696211456.000, dc_hat[0] = 1054383144960.000
Gradient do_[0] = 93279358025728.000
Backward Time Step 1:
Gradient di[0] = 2584976556032.000, df[0] = 1782904258560.000, dc_hat[0] = 1445054644224.000
Gradient do_[0] = 82407302627328.000
Backward Time Step 0:
Gradient di[0] = 3136088965120.000, df[0] = 2231309434880.000, dc_hat[0] = 2942709268480.000
Gradient do_[0] = 48483654434816.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1861412787847168.000, df[0] = -1461266388353024.000, dc_hat[0] = -842499979476992.000
Gradient do_[0] = -108917836595855360.000
Backward Time Step 3:
Gradient di[0] = -2926869083062272.000, df[0] = -2234080389234688.000, dc_hat[0] = -1212455174799360.000
Gradient do_[0] = -149167959401562112.000
Backward Time Step 2:
Gradient di[0] = -3839605199601664.000, df[0] = -2893161911287808.000, dc_hat[0] = -2123539339542528.000
Gradient do_[0] = -170943392052674560.000
Backward Time Step 1:
Gradient di[0] = -4853743614951424.000, df[0] = -3473455479521280.000, dc_hat[0] = -2952087251976192.000
Gradient do_[0] = -153677795321446400.000
Backward Time Step 0:
Gradient di[0] = -5681064374697984.000, df[0] = -4152291972087808.000, dc_hat[0] = -5743366096551936.000
Gradient do_[0] = -88135709621223424.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 975921610752.000, df[0] = 715498192896.000, dc_hat[0] = 424307490816.000
Gradient do_[0] = 60569537216512.000
Backward Time Step 3:
Gradient di[0] = 1533385637888.000, df[0] = 1106370101248.000, dc_hat[0] = 601793953792.000
Gradient do_[0] = 81212496412672.000
Backward Time Step 2:
Gradient di[0] = 1954226110464.000, df[0] = 1403619115008.000, dc_hat[0] = 1002085351424.000
Gradient do_[0] = 88659474776064.000
Backward Time Step 1:
Gradient di[0] = 2457007816704.000, df[0] = 1694627004416.000, dc_hat[0] = 1373237280768.000
Gradient do_[0] = 78323233325056.000
Backward Time Step 0:
Gradient di[0] = 2980543201280.000, df[0] = 2120639447040.000, dc_hat[0] = 2796755091456.000
Gradient do_[0] = 46078929928192.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1888675495411712.000, df[0] = -1482666062905344.000, dc_hat[0] = -854813583605760.000
Gradient do_[0] = -110510788426334208.000
Backward Time Step 3:
Gradient di[0] = -2969706013130752.000, df[0] = -2266779049000960.000, dc_hat[0] = -1230117959368704.000
Gradient do_[0] = -151346572972523520.000
Backward Time Step 2:
Gradient di[0] = -3895798672654336.000, df[0] = -2935492572086272.000, dc_hat[0] = -2154348616351744.000
Gradient do_[0] = -173437496741330944.000
Backward Time Step 1:
Gradient di[0] = -4924703990874112.000, df[0] = -3524204813090816.000, dc_hat[0] = -2994667826184192.000
Gradient do_[0] = -155916143297560576.000
Backward Time Step 0:
Gradient di[0] = -5763609888030720.000, df[0] = -4212624451436544.000, dc_hat[0] = -5826817311113216.000
Gradient do_[0] = -89416322840002560.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 923648524288.000, df[0] = 677172346880.000, dc_hat[0] = 401568890880.000
Gradient do_[0] = 57324144164864.000
Backward Time Step 3:
Gradient di[0] = 1451238883328.000, df[0] = 1047098884096.000, dc_hat[0] = 569519374336.000
Gradient do_[0] = 76859479949312.000
Backward Time Step 2:
Gradient di[0] = 1849530515456.000, df[0] = 1328416948224.000, dc_hat[0] = 948286783488.000
Gradient do_[0] = 83905902280704.000
Backward Time Step 1:
Gradient di[0] = 2325328953344.000, df[0] = 1603793190912.000, dc_hat[0] = 1299384238080.000
Gradient do_[0] = 74121480241152.000
Backward Time Step 0:
Gradient di[0] = 2820538368000.000, df[0] = 2006796861440.000, dc_hat[0] = 2646616309760.000
Gradient do_[0] = 43605267841024.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1916727805870080.000, df[0] = -1504685689143296.000, dc_hat[0] = -867484743761920.000
Gradient do_[0] = -112149988464656384.000
Backward Time Step 3:
Gradient di[0] = -3013787946844160.000, df[0] = -2300427970281472.000, dc_hat[0] = -1248295468924928.000
Gradient do_[0] = -153588528721166336.000
Backward Time Step 2:
Gradient di[0] = -3953627186069504.000, df[0] = -2979055888498688.000, dc_hat[0] = -2186061849559040.000
Gradient do_[0] = -176004358175981568.000
Backward Time Step 1:
Gradient di[0] = -4997725951098880.000, df[0] = -3576429467926528.000, dc_hat[0] = -3038502799278080.000
Gradient do_[0] = -158219723236966400.000
Backward Time Step 0:
Gradient di[0] = -5848553603727360.000, df[0] = -4274709814312960.000, dc_hat[0] = -5912692497842176.000
Gradient do_[0] = -90734130475565056.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 869448941568.000, df[0] = 637434396672.000, dc_hat[0] = 377994444800.000
Gradient do_[0] = 53959276363776.000
Backward Time Step 3:
Gradient di[0] = 1366064496640.000, df[0] = 985643220992.000, dc_hat[0] = 536061575168.000
Gradient do_[0] = 72346408845312.000
Backward Time Step 2:
Gradient di[0] = 1740979175424.000, df[0] = 1250446016512.000, dc_hat[0] = 892525805568.000
Gradient do_[0] = 78977880293376.000
Backward Time Step 1:
Gradient di[0] = 2188803702784.000, df[0] = 1509618745344.000, dc_hat[0] = 1222859423744.000
Gradient do_[0] = 69765850726400.000
Backward Time Step 0:
Gradient di[0] = 2654689558528.000, df[0] = 1888796278784.000, dc_hat[0] = 2490994262016.000
Gradient do_[0] = 41041260445696.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1945364500316160.000, df[0] = -1527164205793280.000, dc_hat[0] = -880419574644736.000
Gradient do_[0] = -113823333492981760.000
Backward Time Step 3:
Gradient di[0] = -3058783903285248.000, df[0] = -2334774286876672.000, dc_hat[0] = -1266851606691840.000
Gradient do_[0] = -155877093454905344.000
Backward Time Step 2:
Gradient di[0] = -4012650774134784.000, df[0] = -3023519537430528.000, dc_hat[0] = -2218434628681728.000
Gradient do_[0] = -178624339766149120.000
Backward Time Step 1:
Gradient di[0] = -5072260276682752.000, df[0] = -3629735917649920.000, dc_hat[0] = -3083255553196032.000
Gradient do_[0] = -160571200651657216.000
Backward Time Step 0:
Gradient di[0] = -5935275435884544.000, df[0] = -4338094941667328.000, dc_hat[0] = -6000365128384512.000
Gradient do_[0] = -92079528981037056.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 813608599552.000, df[0] = 596493795328.000, dc_hat[0] = 353708146688.000
Gradient do_[0] = 50492772188160.000
Backward Time Step 3:
Gradient di[0] = 1278316249088.000, df[0] = 922330923008.000, dc_hat[0] = 501598945280.000
Gradient do_[0] = 67697366794240.000
Backward Time Step 2:
Gradient di[0] = 1629144743936.000, df[0] = 1170117885952.000, dc_hat[0] = 835098443776.000
Gradient do_[0] = 73901455441920.000
Backward Time Step 1:
Gradient di[0] = 2048157024256.000, df[0] = 1412603707392.000, dc_hat[0] = 1144068767744.000
Gradient do_[0] = 65279438618624.000
Backward Time Step 0:
Gradient di[0] = 2483877314560.000, df[0] = 1767264354304.000, dc_hat[0] = 2330714701824.000
Gradient do_[0] = 38400518258688.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1974529878392832.000, df[0] = -1550057321005056.000, dc_hat[0] = -893592977539072.000
Gradient do_[0] = -115527533566361600.000
Backward Time Step 3:
Gradient di[0] = -3104612009639936.000, df[0] = -2369755990196224.000, dc_hat[0] = -1285751878713344.000
Gradient do_[0] = -158208023746052096.000
Backward Time Step 2:
Gradient di[0] = -4072768505118720.000, df[0] = -3068807283212288.000, dc_hat[0] = -2251413132410880.000
Gradient do_[0] = -181292957565976576.000
Backward Time Step 1:
Gradient di[0] = -5148162012479488.000, df[0] = -3684021351481344.000, dc_hat[0] = -3128842336075776.000
Gradient do_[0] = -162965885437345792.000
Backward Time Step 0:
Gradient di[0] = -6023589090295808.000, df[0] = -4402643468288000.000, dc_hat[0] = -6089647834791936.000
Gradient do_[0] = -93449623548461056.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 755736379392.000, df[0] = 554063822848.000, dc_hat[0] = 328540061696.000
Gradient do_[0] = 46900325646336.000
Backward Time Step 3:
Gradient di[0] = 1187376660480.000, df[0] = 856715821056.000, dc_hat[0] = 465888215040.000
Gradient do_[0] = 62879592087552.000
Backward Time Step 2:
Gradient di[0] = 1513242361856.000, df[0] = 1086868357120.000, dc_hat[0] = 775601258496.000
Gradient do_[0] = 68641039056896.000
Backward Time Step 1:
Gradient di[0] = 1902402338816.000, df[0] = 1312067420160.000, dc_hat[0] = 1062460391424.000
Gradient do_[0] = 60630790832128.000
Backward Time Step 0:
Gradient di[0] = 2306911764480.000, df[0] = 1641354493952.000, dc_hat[0] = 2164661420032.000
Gradient do_[0] = 35664645062656.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2004161663074304.000, df[0] = -1573317119049728.000, dc_hat[0] = -906978041790464.000
Gradient do_[0] = -117259006682071040.000
Backward Time Step 3:
Gradient di[0] = -3151170260434944.000, df[0] = -2405295502393344.000, dc_hat[0] = -1304954677493760.000
Gradient do_[0] = -160576131274113024.000
Backward Time Step 2:
Gradient di[0] = -4133834618568704.000, df[0] = -3114810141048832.000, dc_hat[0] = -2284915353714688.000
Gradient do_[0] = -184003820664127488.000
Backward Time Step 1:
Gradient di[0] = -5225267412860928.000, df[0] = -3739168731561984.000, dc_hat[0] = -3175158994960384.000
Gradient do_[0] = -165398829791707136.000
Backward Time Step 0:
Gradient di[0] = -6113306662141952.000, df[0] = -4468218223656960.000, dc_hat[0] = -6180348954148864.000
Gradient do_[0] = -94841493600075776.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 696167694336.000, df[0] = 510390272000.000, dc_hat[0] = 302635974656.000
Gradient do_[0] = 43202782429184.000
Backward Time Step 3:
Gradient di[0] = 1093773426688.000, df[0] = 789179072512.000, dc_hat[0] = 429137756160.000
Gradient do_[0] = 57921081704448.000
Backward Time Step 2:
Gradient di[0] = 1393946787840.000, df[0] = 1001182920704.000, dc_hat[0] = 714381197312.000
Gradient do_[0] = 63227228585984.000
Backward Time Step 1:
Gradient di[0] = 1752388468736.000, df[0] = 1208595251200.000, dc_hat[0] = 978507595776.000
Gradient do_[0] = 55846973210624.000
Backward Time Step 0:
Gradient di[0] = 2124814876672.000, df[0] = 1511793491968.000, dc_hat[0] = 1993793077248.000
Gradient do_[0] = 32849447092224.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2034129931599872.000, df[0] = -1596840654929920.000, dc_hat[0] = -920515443163136.000
Gradient do_[0] = -119010279597015040.000
Backward Time Step 3:
Gradient di[0] = -3198254913159168.000, df[0] = -2441236594032640.000, dc_hat[0] = -1324374640558080.000
Gradient do_[0] = -162971005038362624.000
Backward Time Step 2:
Gradient di[0] = -4195599201075200.000, df[0] = -3161338863943680.000, dc_hat[0] = -2318801840373760.000
Gradient do_[0] = -186745779325501440.000
Backward Time Step 1:
Gradient di[0] = -5303254355279872.000, df[0] = -3794946398093312.000, dc_hat[0] = -3222011987886080.000
Gradient do_[0] = -167859519634800640.000
Backward Time Step 0:
Gradient di[0] = -6204058247364608.000, df[0] = -4534548624834560.000, dc_hat[0] = -6272095898042368.000
Gradient do_[0] = -96249418239442944.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 634406895616.000, df[0] = 465110335488.000, dc_hat[0] = 275782467584.000
Gradient do_[0] = 39369503145984.000
Backward Time Step 3:
Gradient di[0] = 996725686272.000, df[0] = 719157264384.000, dc_hat[0] = 391045578752.000
Gradient do_[0] = 52780890849280.000
Backward Time Step 2:
Gradient di[0] = 1270258860032.000, df[0] = 912343826432.000, dc_hat[0] = 650943856640.000
Gradient do_[0] = 57615312748544.000
Backward Time Step 1:
Gradient di[0] = 1596860792832.000, df[0] = 1101324615680.000, dc_hat[0] = 891557576704.000
Gradient do_[0] = 50888722874368.000
Backward Time Step 0:
Gradient di[0] = 1936113401856.000, df[0] = 1377533558784.000, dc_hat[0] = 1816727519232.000
Gradient do_[0] = 29932138790912.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2064487565754368.000, df[0] = -1620671415189504.000, dc_hat[0] = -934235716190208.000
Gradient do_[0] = -120784865594441728.000
Backward Time Step 3:
Gradient di[0] = -3245948645933056.000, df[0] = -2477642884317184.000, dc_hat[0] = -1344067870916608.000
Gradient do_[0] = -165398159776808960.000
Backward Time Step 2:
Gradient di[0] = -4258144662323200.000, df[0] = -3208460023889920.000, dc_hat[0] = -2353194327867392.000
Gradient do_[0] = -189524605986144256.000
Backward Time Step 1:
Gradient di[0] = -5382220550242304.000, df[0] = -3851433539534848.000, dc_hat[0] = -3269621163491328.000
Gradient do_[0] = -170353641503326208.000
Backward Time Step 0:
Gradient di[0] = -6296050541264896.000, df[0] = -4601785800982528.000, dc_hat[0] = -6365096972386304.000
Gradient do_[0] = -97676575742361600.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 570433798144.000, df[0] = 418208448512.000, dc_hat[0] = 247967989760.000
Gradient do_[0] = 35399044956160.000
Backward Time Step 3:
Gradient di[0] = 896206176256.000, df[0] = 646630604800.000, dc_hat[0] = 351595036672.000
Gradient do_[0] = 47457027227648.000
Backward Time Step 2:
Gradient di[0] = 1142145417216.000, df[0] = 820326694912.000, dc_hat[0] = 585249456128.000
Gradient do_[0] = 51803009843200.000
Backward Time Step 1:
Gradient di[0] = 1435774353408.000, df[0] = 990221631488.000, dc_hat[0] = 801526775808.000
Gradient do_[0] = 45753703596032.000
Backward Time Step 0:
Gradient di[0] = 1740694224896.000, df[0] = 1238493822976.000, dc_hat[0] = 1633358184448.000
Gradient do_[0] = 26910975328256.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2095196850356224.000, df[0] = -1644777992880128.000, dc_hat[0] = -948113426612224.000
Gradient do_[0] = -122579869866393600.000
Backward Time Step 3:
Gradient di[0] = -3294192134520832.000, df[0] = -2514469007654912.000, dc_hat[0] = -1363989002977280.000
Gradient do_[0] = -167853163083202560.000
Backward Time Step 2:
Gradient di[0] = -4321411409641472.000, df[0] = -3256124497199104.000, dc_hat[0] = -2387983026094080.000
Gradient do_[0] = -192335335663861760.000
Backward Time Step 1:
Gradient di[0] = -5462094057046016.000, df[0] = -3908570026344448.000, dc_hat[0] = -3317778752733184.000
Gradient do_[0] = -172876436573519872.000
Backward Time Step 0:
Gradient di[0] = -6389105302700032.000, df[0] = -4669799829340160.000, dc_hat[0] = -6459172325425152.000
Gradient do_[0] = -99120217329762304.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 504443764736.000, df[0] = 369828233216.000, dc_hat[0] = 219278180352.000
Gradient do_[0] = 31303531495424.000
Backward Time Step 3:
Gradient di[0] = 792519901184.000, df[0] = 571819098112.000, dc_hat[0] = 310905405440.000
Gradient do_[0] = 41965697630208.000
Backward Time Step 2:
Gradient di[0] = 1009997643776.000, df[0] = 725412610048.000, dc_hat[0] = 517498470400.000
Gradient do_[0] = 45808099524608.000
Backward Time Step 1:
Gradient di[0] = 1269623291904.000, df[0] = 875626758144.000, dc_hat[0] = 708691558400.000
Gradient do_[0] = 40457681764352.000
Backward Time Step 0:
Gradient di[0] = 1539161849856.000, df[0] = 1095104724992.000, dc_hat[0] = 1444252876800.000
Gradient do_[0] = 23795301613568.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2126226244239360.000, df[0] = -1669136631463936.000, dc_hat[0] = -962137501466624.000
Gradient do_[0] = -124393643145428992.000
Backward Time Step 3:
Gradient di[0] = -3342940013330432.000, df[0] = -2551680872742912.000, dc_hat[0] = -1384116427685888.000
Gradient do_[0] = -170334004912848896.000
Backward Time Step 2:
Gradient di[0] = -4385334213214208.000, df[0] = -3304282891747328.000, dc_hat[0] = -2423132770009088.000
Gradient do_[0] = -195175442917883904.000
Backward Time Step 1:
Gradient di[0] = -5542798103150592.000, df[0] = -3966299755511808.000, dc_hat[0] = -3366435363487744.000
Gradient do_[0] = -175425482483826688.000
Backward Time Step 0:
Gradient di[0] = -6483124284293120.000, df[0] = -4738517695463424.000, dc_hat[0] = -6554222636040192.000
Gradient do_[0] = -100578831173156864.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 436169572352.000, df[0] = 319773278208.000, dc_hat[0] = 189596303360.000
Gradient do_[0] = 27066395262976.000
Backward Time Step 3:
Gradient di[0] = 685247234048.000, df[0] = 494419869696.000, dc_hat[0] = 268812058624.000
Gradient do_[0] = 36284697411584.000
Backward Time Step 2:
Gradient di[0] = 873281093632.000, df[0] = 627217399808.000, dc_hat[0] = 447417090048.000
Gradient do_[0] = 39606321938432.000
Backward Time Step 1:
Gradient di[0] = 1097736847360.000, df[0] = 757077573632.000, dc_hat[0] = 612677844992.000
Gradient do_[0] = 34979262234624.000
Backward Time Step 0:
Gradient di[0] = 1330705858560.000, df[0] = 946789548032.000, dc_hat[0] = 1248650854400.000
Gradient do_[0] = 20572591357952.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2157605812174848.000, df[0] = -1693769208430592.000, dc_hat[0] = -976318141300736.000
Gradient do_[0] = -126227946368139264.000
Backward Time Step 3:
Gradient di[0] = -3392235768905728.000, df[0] = -2589310423400448.000, dc_hat[0] = -1404472022532096.000
Gradient do_[0] = -172842660950704128.000
Backward Time Step 2:
Gradient di[0] = -4449973739454464.000, df[0] = -3352981646868480.000, dc_hat[0] = -2458676845608960.000
Gradient do_[0] = -198047229850681344.000
Backward Time Step 1:
Gradient di[0] = -5624396576194560.000, df[0] = -4024671045419008.000, dc_hat[0] = -3415635287605248.000
Gradient do_[0] = -178002720559464448.000
Backward Time Step 0:
Gradient di[0] = -6578191237906432.000, df[0] = -4808002750119936.000, dc_hat[0] = -6650331656093696.000
Gradient do_[0] = -102053697172799488.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 365640876032.000, df[0] = 268065669120.000, dc_hat[0] = 158935695360.000
Gradient do_[0] = 22689462878208.000
Backward Time Step 3:
Gradient di[0] = 574435229696.000, df[0] = 414466932736.000, dc_hat[0] = 225333968896.000
Gradient do_[0] = 30416507502592.000
Backward Time Step 2:
Gradient di[0] = 732056977408.000, df[0] = 525784940544.000, dc_hat[0] = 375036477440.000
Gradient do_[0] = 33200451616768.000
Backward Time Step 1:
Gradient di[0] = 920193662976.000, df[0] = 634628210688.000, dc_hat[0] = 513529708544.000
Gradient do_[0] = 29320955297792.000
Backward Time Step 0:
Gradient di[0] = 1115418001408.000, df[0] = 793613500416.000, dc_hat[0] = 1046638362624.000
Gradient do_[0] = 17244261187584.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2189320387559424.000, df[0] = -1718665523232768.000, dc_hat[0] = -990651386691584.000
Gradient do_[0] = -128081748742373376.000
Backward Time Step 3:
Gradient di[0] = -3442061952942080.000, df[0] = -2627344237854720.000, dc_hat[0] = -1425045318533120.000
Gradient do_[0] = -175378169124093952.000
Backward Time Step 2:
Gradient di[0] = -4515309855703040.000, df[0] = -3402204119564288.000, dc_hat[0] = -2494601025814528.000
Gradient do_[0] = -200949906188271616.000
Backward Time Step 1:
Gradient di[0] = -5706874443792384.000, df[0] = -4083670474293248.000, dc_hat[0] = -3465363492700160.000
Gradient do_[0] = -180607772843311104.000
Backward Time Step 0:
Gradient di[0] = -6674270193188864.000, df[0] = -4878226539151360.000, dc_hat[0] = -6747464488976384.000
Gradient do_[0] = -103544248393007104.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 292985503744.000, df[0] = 214798974976.000, dc_hat[0] = 127351799808.000
Gradient do_[0] = 18180684644352.000
Backward Time Step 3:
Gradient di[0] = 460285870080.000, df[0] = 332105908224.000, dc_hat[0] = 180549648384.000
Gradient do_[0] = 24371817086976.000
Backward Time Step 2:
Gradient di[0] = 586581868544.000, df[0] = 421299781632.000, dc_hat[0] = 300488491008.000
Gradient do_[0] = 26602146627584.000
Backward Time Step 1:
Gradient di[0] = 737315127296.000, df[0] = 508500410368.000, dc_hat[0] = 411427241984.000
Gradient do_[0] = 23493030707200.000
Backward Time Step 0:
Gradient di[0] = 893687103488.000, df[0] = 635853144064.000, dc_hat[0] = 838579912704.000
Gradient do_[0] = 13816321540096.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2221360440934400.000, df[0] = -1743816717500416.000, dc_hat[0] = -1005131466276864.000
Gradient do_[0] = -129954646541205504.000
Backward Time Step 3:
Gradient di[0] = -3492391453458432.000, df[0] = -2665763257188352.000, dc_hat[0] = -1445826249359360.000
Gradient do_[0] = -177939464281128960.000
Backward Time Step 2:
Gradient di[0] = -4581313302495232.000, df[0] = -3451930445611008.000, dc_hat[0] = -2530895646949376.000
Gradient do_[0] = -203882423958634496.000
Backward Time Step 1:
Gradient di[0] = -5790192514367488.000, df[0] = -4143270393282560.000, dc_hat[0] = -3515599040806912.000
Gradient do_[0] = -183239402384785408.000
Backward Time Step 0:
Gradient di[0] = -6771329474756608.000, df[0] = -4949167050850304.000, dc_hat[0] = -6845587311820800.000
Gradient do_[0] = -105050020977311744.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 217997656064.000, df[0] = 159822381056.000, dc_hat[0] = 94755258368.000
Gradient do_[0] = 13527281565696.000
Backward Time Step 3:
Gradient di[0] = 342473572352.000, df[0] = 247101931520.000, dc_hat[0] = 134332383232.000
Gradient do_[0] = 18133419032576.000
Backward Time Step 2:
Gradient di[0] = 436440104960.000, df[0] = 313463177216.000, dc_hat[0] = 223560613888.000
Gradient do_[0] = 19792547282944.000
Backward Time Step 1:
Gradient di[0] = 548578721792.000, df[0] = 378333822976.000, dc_hat[0] = 306078842880.000
Gradient do_[0] = 17478806667264.000
Backward Time Step 0:
Gradient di[0] = 664885067776.000, df[0] = 473061916672.000, dc_hat[0] = 623886467072.000
Gradient do_[0] = 10279063650304.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2253760466255872.000, df[0] = -1769250976956416.000, dc_hat[0] = -1019772875571200.000
Gradient do_[0] = -131848460830769152.000
Backward Time Step 3:
Gradient di[0] = -3543285205303296.000, df[0] = -2704612846993408.000, dc_hat[0] = -1466841524338688.000
Gradient do_[0] = -180529346740486144.000
Backward Time Step 2:
Gradient di[0] = -4648047967469568.000, df[0] = -3502207601213440.000, dc_hat[0] = -2567591579090944.000
Gradient do_[0] = -206847342962278400.000
Backward Time Step 1:
Gradient di[0] = -5874432929169408.000, df[0] = -4203531468800000.000, dc_hat[0] = -3566394276839424.000
Gradient do_[0] = -185900134624657408.000
Backward Time Step 0:
Gradient di[0] = -6869462498148352.000, df[0] = -5020892467822592.000, dc_hat[0] = -6944797298262016.000
Gradient do_[0] = -106572449444790272.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 140631359488.000, df[0] = 103102103552.000, dc_hat[0] = 61126000640.000
Gradient do_[0] = 8726406758400.000
Backward Time Step 3:
Gradient di[0] = 220928671744.000, df[0] = 159404818432.000, dc_hat[0] = 86654337024.000
Gradient do_[0] = 11697616060416.000
Backward Time Step 2:
Gradient di[0] = 281544032256.000, df[0] = 202212245504.000, dc_hat[0] = 144207790080.000
Gradient do_[0] = 12767704645632.000
Backward Time Step 1:
Gradient di[0] = 353875755008.000, df[0] = 244053590016.000, dc_hat[0] = 197424316416.000
Gradient do_[0] = 11274853285888.000
Backward Time Step 0:
Gradient di[0] = 428877807616.000, df[0] = 305144102912.000, dc_hat[0] = 402432032768.000
Gradient do_[0] = 6630412386304.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2286473353101312.000, df[0] = -1794930854854656.000, dc_hat[0] = -1034557159636992.000
Gradient do_[0] = -133760674760228864.000
Backward Time Step 3:
Gradient di[0] = -3594673683693568.000, df[0] = -2743839588614144.000, dc_hat[0] = -1488058394345472.000
Gradient do_[0] = -183144466427674624.000
Backward Time Step 2:
Gradient di[0] = -4715434393731072.000, df[0] = -3552975725264896.000, dc_hat[0] = -2604645604130816.000
Gradient do_[0] = -209841175685758976.000
Backward Time Step 1:
Gradient di[0] = -5959502809530368.000, df[0] = -4264385518239744.000, dc_hat[0] = -3617688802820096.000
Gradient do_[0] = -188587117704642560.000
Backward Time Step 0:
Gradient di[0] = -6968560815439872.000, df[0] = -5093323870044160.000, dc_hat[0] = -7044982779150336.000
Gradient do_[0] = -108109858758197248.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 61110956032.000, df[0] = 44802711552.000, dc_hat[0] = 26561681408.000
Gradient do_[0] = 3791989506048.000
Backward Time Step 3:
Gradient di[0] = 96002744320.000, df[0] = 69268078592.000, dc_hat[0] = 37653594112.000
Gradient do_[0] = 5083011481600.000
Backward Time Step 2:
Gradient di[0] = 122341736448.000, df[0] = 87868899328.000, dc_hat[0] = 62659858432.000
Gradient do_[0] = 5547922817024.000
Backward Time Step 1:
Gradient di[0] = 153769033728.000, df[0] = 106047758336.000, dc_hat[0] = 85777801216.000
Gradient do_[0] = 4899103834112.000
Backward Time Step 0:
Gradient di[0] = 186349125632.000, df[0] = 132586332160.000, dc_hat[0] = 174858338304.000
Gradient do_[0] = 2880940539904.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2319541111619584.000, df[0] = -1820889234538496.000, dc_hat[0] = -1049501431234560.000
Gradient do_[0] = -135693495942774784.000
Backward Time Step 3:
Gradient di[0] = -3646617286606848.000, df[0] = -2783491531997184.000, dc_hat[0] = -1509506655715328.000
Gradient do_[0] = -185787743920455680.000
Backward Time Step 2:
Gradient di[0] = -4783540227014656.000, df[0] = -3604285820502016.000, dc_hat[0] = -2642093960855552.000
Gradient do_[0] = -212866911426314240.000
Backward Time Step 1:
Gradient di[0] = -6045463358734336.000, df[0] = -4325875759710208.000, dc_hat[0] = -3669520736583680.000
Gradient do_[0] = -191302207050612736.000
Backward Time Step 0:
Gradient di[0] = -7068695830462464.000, df[0] = -5166512260251648.000, dc_hat[0] = -7146215695187968.000
Gradient do_[0] = -109663357019095040.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -20936667136.000, df[0] = -15349449728.000, dc_hat[0] = -9099941888.000
Gradient do_[0] = -1299127730176.000
Backward Time Step 3:
Gradient di[0] = -32890167296.000, df[0] = -23730995200.000, dc_hat[0] = -12899672064.000
Gradient do_[0] = -1741399785472.000
Backward Time Step 2:
Gradient di[0] = -41913380864.000, df[0] = -30103224320.000, dc_hat[0] = -21465958400.000
Gradient do_[0] = -1900648726528.000
Backward Time Step 1:
Gradient di[0] = -52678877184.000, df[0] = -36330217472.000, dc_hat[0] = -29384400896.000
Gradient do_[0] = -1678328201216.000
Backward Time Step 0:
Gradient di[0] = -63837835264.000, df[0] = -45420249088.000, dc_hat[0] = -59901419520.000
Gradient do_[0] = -986927202304.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.865
c_state[0] = 0.725, h_state[0] = 0.092
Time Step 2:
i_gate[0] = 0.376, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.863
c_state[0] = 0.877, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.344, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885
c_state[0] = 0.981, h_state[0] = 0.119
Time Step 4:
i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.870
c_state[0] = 1.059, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 15143181221888.000, df[0] = 10388175323136.000, dc_hat[0] = 6339325591552.000
Gradient do_[0] = 1005682765594624.000
Backward Time Step 3:
Gradient di[0] = 23823973875712.000, df[0] = 16230065897472.000, dc_hat[0] = 8885159067648.000
Gradient do_[0] = 1319416369250304.000
Backward Time Step 2:
Gradient di[0] = 29269482799104.000, df[0] = 19999343247360.000, dc_hat[0] = 13915838742528.000
Gradient do_[0] = 1356046534705152.000
Backward Time Step 1:
Gradient di[0] = 36549555126272.000, df[0] = 24284053372928.000, dc_hat[0] = 18720644661248.000
Gradient do_[0] = 1173082538508288.000
Backward Time Step 0:
Gradient di[0] = 45576443920384.000, df[0] = 31626388045824.000, dc_hat[0] = 39735137402880.000
Gradient do_[0] = 702963672678400.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1347854925824.000, df[0] = 988168585216.000, dc_hat[0] = 585804611584.000
Gradient do_[0] = 83632341385216.000
Backward Time Step 3:
Gradient di[0] = 2117441159168.000, df[0] = 1527796465664.000, dc_hat[0] = 830424612864.000
Gradient do_[0] = 112107261526016.000
Backward Time Step 2:
Gradient di[0] = 2698207035392.000, df[0] = 1937926258688.000, dc_hat[0] = 1381846351872.000
Gradient do_[0] = 122354592120832.000
Backward Time Step 1:
Gradient di[0] = 3391417745408.000, df[0] = 2338906177536.000, dc_hat[0] = 1891740942336.000
Gradient do_[0] = 108049238851584.000
Backward Time Step 0:
Gradient di[0] = 4111209857024.000, df[0] = 2925102366720.000, dc_hat[0] = 3857701666816.000
Gradient do_[0] = 63558935117824.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1798014507155456.000, df[0] = -1411486777868288.000, dc_hat[0] = -813488884678656.000
Gradient do_[0] = -105180484903895040.000
Backward Time Step 3:
Gradient di[0] = -2826725746540544.000, df[0] = -2157681510973440.000, dc_hat[0] = -1170028548325376.000
Gradient do_[0] = -144011370126376960.000
Backward Time Step 2:
Gradient di[0] = -3707752924839936.000, df[0] = -2793711138242560.000, dc_hat[0] = -2047773968957440.000
Gradient do_[0] = -164990807898587136.000
Backward Time Step 1:
Gradient di[0] = -4686005277818880.000, df[0] = -3353096805679104.000, dc_hat[0] = -2844185862340608.000
Gradient do_[0] = -148281289173106688.000
Backward Time Step 0:
Gradient di[0] = -5480716498370560.000, df[0] = -4005857746485248.000, dc_hat[0] = -5540821344452608.000
Gradient do_[0] = -85027519098519552.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1306311393280.000, df[0] = 957710860288.000, dc_hat[0] = 567739351040.000
Gradient do_[0] = 81053616177152.000
Backward Time Step 3:
Gradient di[0] = 2052148559872.000, df[0] = 1480686567424.000, dc_hat[0] = 804790140928.000
Gradient do_[0] = 108648554561536.000
Backward Time Step 2:
Gradient di[0] = 2614984966144.000, df[0] = 1878151135232.000, dc_hat[0] = 1339142307840.000
Gradient do_[0] = 118577973690368.000
Backward Time Step 1:
Gradient di[0] = 3286740238336.000, df[0] = 2266705690624.000, dc_hat[0] = 1833173123072.000
Gradient do_[0] = 104711420116992.000
Backward Time Step 0:
Gradient di[0] = 3984099115008.000, df[0] = 2834663473152.000, dc_hat[0] = 3738428768256.000
Gradient do_[0] = 61593811419136.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1821990088343552.000, df[0] = -1430307995648000.000, dc_hat[0] = -824322402811904.000
Gradient do_[0] = -106581769523822592.000
Backward Time Step 3:
Gradient di[0] = -2864386704146432.000, df[0] = -2186430679875584.000, dc_hat[0] = -1185574853541888.000
Gradient do_[0] = -145927732994244608.000
Backward Time Step 2:
Gradient di[0] = -3757134042890240.000, df[0] = -2830914144960512.000, dc_hat[0] = -2074918699139072.000
Gradient do_[0] = -167184453855084544.000
Backward Time Step 1:
Gradient di[0] = -4748323105800192.000, df[0] = -3397674808115200.000, dc_hat[0] = -2881744814473216.000
Gradient do_[0] = -150249346267348992.000
Backward Time Step 0:
Gradient di[0] = -5553305203769344.000, df[0] = -4058912940621824.000, dc_hat[0] = -5614206229413888.000
Gradient do_[0] = -86153659523530752.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1264228630528.000, df[0] = 926858280960.000, dc_hat[0] = 549443338240.000
Gradient do_[0] = 78441890185216.000
Backward Time Step 3:
Gradient di[0] = 1986015526912.000, df[0] = 1432970723328.000, dc_hat[0] = 778836377600.000
Gradient do_[0] = 105146008731648.000
Backward Time Step 2:
Gradient di[0] = 2530686795776.000, df[0] = 1817605308416.000, dc_hat[0] = 1295923019776.000
Gradient do_[0] = 114753707966464.000
Backward Time Step 1:
Gradient di[0] = 3180709806080.000, df[0] = 2193575903232.000, dc_hat[0] = 1773932380160.000
Gradient do_[0] = 101331733839872.000
Backward Time Step 0:
Gradient di[0] = 3855426256896.000, df[0] = 2743113613312.000, dc_hat[0] = 3617690222592.000
Gradient do_[0] = 59604545306624.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1847368781660160.000, df[0] = -1450231275192320.000, dc_hat[0] = -835794528894976.000
Gradient do_[0] = -108065526105767936.000
Backward Time Step 3:
Gradient di[0] = -2904245074395136.000, df[0] = -2216857167724544.000, dc_hat[0] = -1202044442509312.000
Gradient do_[0] = -147956658365005824.000
Backward Time Step 2:
Gradient di[0] = -3809379098820608.000, df[0] = -2870277251792896.000, dc_hat[0] = -2103689074442240.000
Gradient do_[0] = -169506845751246848.000
Backward Time Step 1:
Gradient di[0] = -4814266423050240.000, df[0] = -3444852071071744.000, dc_hat[0] = -2921606942818304.000
Gradient do_[0] = -152333642356490240.000
Backward Time Step 0:
Gradient di[0] = -5630206056333312.000, df[0] = -4115119835447296.000, dc_hat[0] = -5691950506180608.000
Gradient do_[0] = -87346698359144448.000
Epoch 400, Train Loss=0.011534, Weight Norm=12.551555
Sample Predictions at Epoch 400:
-------------------------------------------------------------
| Day | Date | Predicted Close | Actual Close | Error |
-------------------------------------------------------------
| 192 | 2024-10-11 | 57.08 | 63.87 | 6.79 |
| 193 | 2024-10-14 | 56.50 | 66.55 | 10.05 |
| 194 | 2024-10-15 | 56.70 | 66.00 | 9.30 |
| 195 | 2024-10-16 | 57.64 | 67.20 | 9.56 |
| 196 | 2024-10-17 | 57.20 | 66.76 | 9.56 |
-------------------------------------------------------------
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1221786861568.000, df[0] = 895742443520.000, dc_hat[0] = 530991120384.000
Gradient do_[0] = 75807825330176.000
Backward Time Step 3:
Gradient di[0] = 1919313248256.000, df[0] = 1384844492800.000, dc_hat[0] = 752661430272.000
Gradient do_[0] = 101613381353472.000
Backward Time Step 2:
Gradient di[0] = 2445662486528.000, df[0] = 1756537159680.000, dc_hat[0] = 1252335157248.000
Gradient do_[0] = 110896684728320.000
Backward Time Step 1:
Gradient di[0] = 3073773928448.000, df[0] = 2119822999552.000, dc_hat[0] = 1714195267584.000
Gradient do_[0] = 97923375300608.000
Backward Time Step 0:
Gradient di[0] = 3725666287616.000, df[0] = 2650790166528.000, dc_hat[0] = 3495931412480.000
Gradient do_[0] = 57598464229376.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1873784575361024.000, df[0] = -1470968451039232.000, dc_hat[0] = -847735947264000.000
Gradient do_[0] = -109609867496390656.000
Backward Time Step 3:
Gradient di[0] = -2945735263780864.000, df[0] = -2248529196089344.000, dc_hat[0] = -1219186999164928.000
Gradient do_[0] = -150068699942879232.000
Backward Time Step 2:
Gradient di[0] = -3863764390641664.000, df[0] = -2911252850409472.000, dc_hat[0] = -2133642176364544.000
Gradient do_[0] = -171924465662296064.000
Backward Time Step 1:
Gradient di[0] = -4882906979762176.000, df[0] = -3493958848086016.000, dc_hat[0] = -2963102769348608.000
Gradient do_[0] = -154503305215606784.000
Backward Time Step 0:
Gradient di[0] = -5710258341150720.000, df[0] = -4173630174920704.000, dc_hat[0] = -5772880574939136.000
Gradient do_[0] = -88588622512521216.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1177911296000.000, df[0] = 863575605248.000, dc_hat[0] = 511916572672.000
Gradient do_[0] = 73084883173376.000
Backward Time Step 3:
Gradient di[0] = 1850363084800.000, df[0] = 1335096115200.000, dc_hat[0] = 725605679104.000
Gradient do_[0] = 97961895788544.000
Backward Time Step 2:
Gradient di[0] = 2357782380544.000, df[0] = 1693418913792.000, dc_hat[0] = 1207289511936.000
Gradient do_[0] = 106910300766208.000
Backward Time Step 1:
Gradient di[0] = 2963254542336.000, df[0] = 2043598471168.000, dc_hat[0] = 1652467564544.000
Gradient do_[0] = 94400948469760.000
Backward Time Step 0:
Gradient di[0] = 3591572029440.000, df[0] = 2555382595584.000, dc_hat[0] = 3370105700352.000
Gradient do_[0] = 55525370757120.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1900930245066752.000, df[0] = -1492279139237888.000, dc_hat[0] = -860007407026176.000
Gradient do_[0] = -111196918041804800.000
Backward Time Step 3:
Gradient di[0] = -2988370867257344.000, df[0] = -2281075652952064.000, dc_hat[0] = -1236804418142208.000
Gradient do_[0] = -152239118716239872.000
Backward Time Step 2:
Gradient di[0] = -3919655605370880.000, df[0] = -2953363125698560.000, dc_hat[0] = -2164424475410432.000
Gradient do_[0] = -174408983983947776.000
Backward Time Step 1:
Gradient di[0] = -4953444301406208.000, df[0] = -3544423371636736.000, dc_hat[0] = -3005747499630592.000
Gradient do_[0] = -156732857098698752.000
Backward Time Step 0:
Gradient di[0] = -5792513407320064.000, df[0] = -4233750053388288.000, dc_hat[0] = -5856037584240640.000
Gradient do_[0] = -89864726015639552.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1133031063552.000, df[0] = 830672207872.000, dc_hat[0] = 492406243328.000
Gradient do_[0] = 70299680768000.000
Backward Time Step 3:
Gradient di[0] = 1779837173760.000, df[0] = 1284210425856.000, dc_hat[0] = 697933889536.000
Gradient do_[0] = 94227044237312.000
Backward Time Step 2:
Gradient di[0] = 2267890319360.000, df[0] = 1628855468032.000, dc_hat[0] = 1161217441792.000
Gradient do_[0] = 102832724246528.000
Backward Time Step 1:
Gradient di[0] = 2850206253056.000, df[0] = 1965630423040.000, dc_hat[0] = 1589337653248.000
Gradient do_[0] = 90798133608448.000
Backward Time Step 0:
Gradient di[0] = 3454425890816.000, df[0] = 2457804210176.000, dc_hat[0] = 3241416851456.000
Gradient do_[0] = 53405108142080.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1928686739652608.000, df[0] = -1514069253160960.000, dc_hat[0] = -872554482892800.000
Gradient do_[0] = -112819642585579520.000
Backward Time Step 3:
Gradient di[0] = -3031964785311744.000, df[0] = -2314353864867840.000, dc_hat[0] = -1254817779417088.000
Gradient do_[0] = -154458276778475520.000
Backward Time Step 2:
Gradient di[0] = -3976798266195968.000, df[0] = -2996416951615488.000, dc_hat[0] = -2195898264190976.000
Gradient do_[0] = -176949216621363200.000
Backward Time Step 1:
Gradient di[0] = -5025573445304320.000, df[0] = -3596026329956352.000, dc_hat[0] = -3049355107893248.000
Gradient do_[0] = -159012745998499840.000
Backward Time Step 0:
Gradient di[0] = -5876630878683136.000, df[0] = -4295231436488704.000, dc_hat[0] = -5941077399830528.000
Gradient do_[0] = -91169708878856192.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1085995352064.000, df[0] = 796188475392.000, dc_hat[0] = 471959142400.000
Gradient do_[0] = 67380738785280.000
Backward Time Step 3:
Gradient di[0] = 1705926721536.000, df[0] = 1230882865152.000, dc_hat[0] = 668935716864.000
Gradient do_[0] = 90313179791360.000
Backward Time Step 2:
Gradient di[0] = 2173689135104.000, df[0] = 1561196756992.000, dc_hat[0] = 1112942313472.000
Gradient do_[0] = 98559961595904.000
Backward Time Step 1:
Gradient di[0] = 2731754389504.000, df[0] = 1883936522240.000, dc_hat[0] = 1523203178496.000
Gradient do_[0] = 87023335505920.000
Backward Time Step 0:
Gradient di[0] = 3310738210816.000, df[0] = 2355571458048.000, dc_hat[0] = 3106589376512.000
Gradient do_[0] = 51183716663296.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1956948698202112.000, df[0] = -1536256517341184.000, dc_hat[0] = -885330534203392.000
Gradient do_[0] = -114472019583565824.000
Backward Time Step 3:
Gradient di[0] = -3076352735444992.000, df[0] = -2348238472478720.000, dc_hat[0] = -1273159437254656.000
Gradient do_[0] = -156717910612508672.000
Backward Time Step 2:
Gradient di[0] = -4034983261896704.000, df[0] = -3040255145934848.000, dc_hat[0] = -2227944894234624.000
Gradient do_[0] = -179535731826360320.000
Backward Time Step 1:
Gradient di[0] = -5099009869873152.000, df[0] = -3648564785840128.000, dc_hat[0] = -3093754332315648.000
Gradient do_[0] = -161334055562903552.000
Backward Time Step 0:
Gradient di[0] = -5962280579629056.000, df[0] = -4357832732311552.000, dc_hat[0] = -6027666624872448.000
Gradient do_[0] = -92498477270958080.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1037819183104.000, df[0] = 760868634624.000, dc_hat[0] = 451017015296.000
Gradient do_[0] = 64391110197248.000
Backward Time Step 3:
Gradient di[0] = 1630229233664.000, df[0] = 1176265687040.000, dc_hat[0] = 639238602752.000
Gradient do_[0] = 86304708624384.000
Backward Time Step 2:
Gradient di[0] = 2077214900224.000, df[0] = 1491905806336.000, dc_hat[0] = 1063508049920.000
Gradient do_[0] = 94184245559296.000
Backward Time Step 1:
Gradient di[0] = 2610447777792.000, df[0] = 1800274051072.000, dc_hat[0] = 1455484698624.000
Gradient do_[0] = 83157655224320.000
Backward Time Step 0:
Gradient di[0] = 3163602550784.000, df[0] = 2250885038080.000, dc_hat[0] = 2968526258176.000
Gradient do_[0] = 48909007192064.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1985599082856448.000, df[0] = -1558748053110784.000, dc_hat[0] = -898281806757888.000
Gradient do_[0] = -116147013879332864.000
Backward Time Step 3:
Gradient di[0] = -3121350839369728.000, df[0] = -2382588547170304.000, dc_hat[0] = -1291753021767680.000
Gradient do_[0] = -159008605650026496.000
Backward Time Step 2:
Gradient di[0] = -4093972490223616.000, df[0] = -3084699735949312.000, dc_hat[0] = -2260432429514752.000
Gradient do_[0] = -182157963979390976.000
Backward Time Step 1:
Gradient di[0] = -5173457759240192.000, df[0] = -3701826943713280.000, dc_hat[0] = -3138763710529536.000
Gradient do_[0] = -163687285324251136.000
Backward Time Step 0:
Gradient di[0] = -6049099048550400.000, df[0] = -4421288726626304.000, dc_hat[0] = -6115436965920768.000
Gradient do_[0] = -93845379014983680.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 987559559168.000, df[0] = 724021149696.000, dc_hat[0] = 429169672192.000
Gradient do_[0] = 61272280268800.000
Backward Time Step 3:
Gradient di[0] = 1551257567232.000, df[0] = 1119285936128.000, dc_hat[0] = 608258883584.000
Gradient do_[0] = 82122979147776.000
Backward Time Step 2:
Gradient di[0] = 1976568250368.000, df[0] = 1419618549760.000, dc_hat[0] = 1011941834752.000
Gradient do_[0] = 89619492241408.000
Backward Time Step 1:
Gradient di[0] = 2483906412544.000, df[0] = 1713002250240.000, dc_hat[0] = 1384856682496.000
Gradient do_[0] = 79125427519488.000
Backward Time Step 0:
Gradient di[0] = 3010140045312.000, df[0] = 2141697474560.000, dc_hat[0] = 2824526888960.000
Gradient do_[0] = 46536494940160.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2014644604502016.000, df[0] = -1581549900267520.000, dc_hat[0] = -911411655999488.000
Gradient do_[0] = -117845140868956160.000
Backward Time Step 3:
Gradient di[0] = -3166971445116928.000, df[0] = -2417413484183552.000, dc_hat[0] = -1310602425270272.000
Gradient do_[0] = -161330877287104512.000
Backward Time Step 2:
Gradient di[0] = -4153769977708544.000, df[0] = -3129753137577984.000, dc_hat[0] = -2293367044046848.000
Gradient do_[0] = -184816119238885376.000
Backward Time Step 1:
Gradient di[0] = -5248924629598208.000, df[0] = -3755817903849472.000, dc_hat[0] = -3184389148114944.000
Gradient do_[0] = -166072658620841984.000
Backward Time Step 0:
Gradient di[0] = -6137097022865408.000, df[0] = -4485606398754816.000, dc_hat[0] = -6204400234135552.000
Gradient do_[0] = -95210577319690240.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 935386415104.000, df[0] = 685771128832.000, dc_hat[0] = 406492184576.000
Gradient do_[0] = 58034759925760.000
Backward Time Step 3:
Gradient di[0] = 1469285793792.000, df[0] = 1060141531136.000, dc_hat[0] = 576104497152.000
Gradient do_[0] = 77782604316672.000
Backward Time Step 2:
Gradient di[0] = 1872098754560.000, df[0] = 1344585203712.000, dc_hat[0] = 958421794816.000
Gradient do_[0] = 84881606443008.000
Backward Time Step 1:
Gradient di[0] = 2352564928512.000, df[0] = 1622420357120.000, dc_hat[0] = 1311560695808.000
Gradient do_[0] = 74940376154112.000
Backward Time Step 0:
Gradient di[0] = 2850869477376.000, df[0] = 2028377341952.000, dc_hat[0] = 2675077021696.000
Gradient do_[0] = 44074186833920.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2044089289670656.000, df[0] = -1604665548472320.000, dc_hat[0] = -924722028085248.000
Gradient do_[0] = -119566632480669696.000
Backward Time Step 3:
Gradient di[0] = -3213218579218432.000, df[0] = -2452717310050304.000, dc_hat[0] = -1329711942729728.000
Gradient do_[0] = -163685103480864768.000
Backward Time Step 2:
Gradient di[0] = -4214393709527040.000, df[0] = -3175429041029120.000, dc_hat[0] = -2326755851370496.000
Gradient do_[0] = -187510936339218432.000
Backward Time Step 1:
Gradient di[0] = -5325426050203648.000, df[0] = -3810549208973312.000, dc_hat[0] = -3230639503441920.000
Gradient do_[0] = -168490759568228352.000
Backward Time Step 0:
Gradient di[0] = -6226311546667008.000, df[0] = -4550813129113600.000, dc_hat[0] = -6294592936738816.000
Gradient do_[0] = -96594630530826240.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 881278844928.000, df[0] = 646102777856.000, dc_hat[0] = 382974099456.000
Gradient do_[0] = 54677290876928.000
Backward Time Step 3:
Gradient di[0] = 1384273674240.000, df[0] = 998803177472.000, dc_hat[0] = 542759485440.000
Gradient do_[0] = 73281319206912.000
Backward Time Step 2:
Gradient di[0] = 1763761192960.000, df[0] = 1266774310912.000, dc_hat[0] = 902926696448.000
Gradient do_[0] = 79968423903232.000
Backward Time Step 1:
Gradient di[0] = 2216371159040.000, df[0] = 1528492457984.000, dc_hat[0] = 1235567378432.000
Gradient do_[0] = 70600907292672.000
Backward Time Step 0:
Gradient di[0] = 2685730553856.000, df[0] = 1910881779712.000, dc_hat[0] = 2520121344000.000
Gradient do_[0] = 41521151737856.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2073838380646400.000, df[0] = -1628019567362048.000, dc_hat[0] = -938169369362432.000
Gradient do_[0] = -121305845127446528.000
Backward Time Step 3:
Gradient di[0] = -3259933864755200.000, df[0] = -2488378960379904.000, dc_hat[0] = -1349013928411136.000
Gradient do_[0] = -166063106613575680.000
Backward Time Step 2:
Gradient di[0] = -4275623031734272.000, df[0] = -3221561553190912.000, dc_hat[0] = -2360476713353216.000
Gradient do_[0] = -190232914812731392.000
Backward Time Step 1:
Gradient di[0] = -5402703249276928.000, df[0] = -3865835101749248.000, dc_hat[0] = -3277358278639616.000
Gradient do_[0] = -170933410548678656.000
Backward Time Step 0:
Gradient di[0] = -6316421571149824.000, df[0] = -4616674841985024.000, dc_hat[0] = -6385691340570624.000
Gradient do_[0] = -97992608025935872.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 825474154496.000, df[0] = 605190160384.000, dc_hat[0] = 358719094784.000
Gradient do_[0] = 51214595129344.000
Backward Time Step 3:
Gradient di[0] = 1296600399872.000, df[0] = 935544815616.000, dc_hat[0] = 508372746240.000
Gradient do_[0] = 68639348752384.000
Backward Time Step 2:
Gradient di[0] = 1652036337664.000, df[0] = 1186530328576.000, dc_hat[0] = 845701382144.000
Gradient do_[0] = 74901855666176.000
Backward Time Step 1:
Gradient di[0] = 2075924627456.000, df[0] = 1431632216064.000, dc_hat[0] = 1157213323264.000
Gradient do_[0] = 66126121795584.000
Backward Time Step 0:
Gradient di[0] = 2515452035072.000, df[0] = 1789729701888.000, dc_hat[0] = 2360342740992.000
Gradient do_[0] = 38888663941120.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2103900333146112.000, df[0] = -1651620010000384.000, dc_hat[0] = -951758981431296.000
Gradient do_[0] = -123063311385231360.000
Backward Time Step 3:
Gradient di[0] = -3307149782417408.000, df[0] = -2524422057492480.000, dc_hat[0] = -1368522609393664.000
Gradient do_[0] = -168466535952678912.000
Backward Time Step 2:
Gradient di[0] = -4337509483937792.000, df[0] = -3268189060333568.000, dc_hat[0] = -2394558352588800.000
Gradient do_[0] = -192983789826211840.000
Backward Time Step 1:
Gradient di[0] = -5480804545200128.000, df[0] = -3921710747222016.000, dc_hat[0] = -3324574464737280.000
Gradient do_[0] = -173402054671204352.000
Backward Time Step 0:
Gradient di[0] = -6407495278919680.000, df[0] = -4683240929492992.000, dc_hat[0] = -6477764165107712.000
Gradient do_[0] = -99405532007235584.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 767638044672.000, df[0] = 562788040704.000, dc_hat[0] = 333581877248.000
Gradient do_[0] = 47625919266816.000
Backward Time Step 3:
Gradient di[0] = 1205739061248.000, df[0] = 869985878016.000, dc_hat[0] = 472737087488.000
Gradient do_[0] = 63828658225152.000
Backward Time Step 2:
Gradient di[0] = 1536249298944.000, df[0] = 1103369076736.000, dc_hat[0] = 786400935936.000
Gradient do_[0] = 69651203620864.000
Backward Time Step 1:
Gradient di[0] = 1930382278656.000, df[0] = 1331258064896.000, dc_hat[0] = 1076026408960.000
Gradient do_[0] = 61489180311552.000
Backward Time Step 0:
Gradient di[0] = 2339013132288.000, df[0] = 1664194314240.000, dc_hat[0] = 2194783207424.000
Gradient do_[0] = 36160931889152.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2134311654391808.000, df[0] = -1675494256803840.000, dc_hat[0] = -965505762459648.000
Gradient do_[0] = -124841264637018112.000
Backward Time Step 3:
Gradient di[0] = -3354906865958912.000, df[0] = -2560878276771840.000, dc_hat[0] = -1388254360240128.000
Gradient do_[0] = -170897676420775936.000
Backward Time Step 2:
Gradient di[0] = -4400104874180608.000, df[0] = -3315350485598208.000, dc_hat[0] = -2429030565412864.000
Gradient do_[0] = -195766344518467584.000
Backward Time Step 1:
Gradient di[0] = -5559797583708160.000, df[0] = -3978224732209152.000, dc_hat[0] = -3372327790182400.000
Gradient do_[0] = -175898856599322624.000
Backward Time Step 0:
Gradient di[0] = -6499599778840576.000, df[0] = -4750559710019584.000, dc_hat[0] = -6570877982343168.000
Gradient do_[0] = -100834424676941824.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 707754983424.000, df[0] = 518885376000.000, dc_hat[0] = 307555926016.000
Gradient do_[0] = 43910298599424.000
Backward Time Step 3:
Gradient di[0] = 1111665278976.000, df[0] = 802108997632.000, dc_hat[0] = 435844251648.000
Gradient do_[0] = 58847976751104.000
Backward Time Step 2:
Gradient di[0] = 1416373075968.000, df[0] = 1017270763520.000, dc_hat[0] = 725011267584.000
Gradient do_[0] = 64215326916608.000
Backward Time Step 1:
Gradient di[0] = 1779710558208.000, df[0] = 1227347066880.000, dc_hat[0] = 991990120448.000
Gradient do_[0] = 56688954572800.000
Backward Time Step 0:
Gradient di[0] = 2156370067456.000, df[0] = 1534244945920.000, dc_hat[0] = 2023402504192.000
Gradient do_[0] = 33337286590464.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2164964701765632.000, df[0] = -1699558421692416.000, dc_hat[0] = -979361461174272.000
Gradient do_[0] = -126633356921143296.000
Backward Time Step 3:
Gradient di[0] = -3403046201589760.000, df[0] = -2597626285391872.000, dc_hat[0] = -1408143011610624.000
Gradient do_[0] = -173348058342359040.000
Backward Time Step 2:
Gradient di[0] = -4463205191516160.000, df[0] = -3362892552339456.000, dc_hat[0] = -2463781951111168.000
Gradient do_[0] = -198571267400400896.000
Backward Time Step 1:
Gradient di[0] = -5639421445537792.000, df[0] = -4035189151891456.000, dc_hat[0] = -3420461220233216.000
Gradient do_[0] = -178415604355563520.000
Backward Time Step 0:
Gradient di[0] = -6592436033814528.000, df[0] = -4818413750845440.000, dc_hat[0] = -6664732681437184.000
Gradient do_[0] = -102274681830113280.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 645849743360.000, df[0] = 473500123136.000, dc_hat[0] = 280651825152.000
Gradient do_[0] = 40069289213952.000
Backward Time Step 3:
Gradient di[0] = 1014417260544.000, df[0] = 731941437440.000, dc_hat[0] = 397708296192.000
Gradient do_[0] = 53699443425280.000
Backward Time Step 2:
Gradient di[0] = 1292455116800.000, df[0] = 928269664256.000, dc_hat[0] = 661557673984.000
Gradient do_[0] = 58596394008576.000
Backward Time Step 1:
Gradient di[0] = 1623963992064.000, df[0] = 1119936970752.000, dc_hat[0] = 905134014464.000
Gradient do_[0] = 51727256518656.000
Backward Time Step 0:
Gradient di[0] = 1967592177664.000, df[0] = 1399930617856.000, dc_hat[0] = 1846265118720.000
Gradient do_[0] = 30418797592576.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2196039595458560.000, df[0] = -1723953701715968.000, dc_hat[0] = -993408017498112.000
Gradient do_[0] = -128450076547743744.000
Backward Time Step 3:
Gradient di[0] = -3451844814700544.000, df[0] = -2634878415798272.000, dc_hat[0] = -1428304259186688.000
Gradient do_[0] = -175832129987411968.000
Backward Time Step 2:
Gradient di[0] = -4527172555374592.000, df[0] = -3411087454109696.000, dc_hat[0] = -2499006856953856.000
Gradient do_[0] = -201414604469829632.000
Backward Time Step 1:
Gradient di[0] = -5720131397222400.000, df[0] = -4092930692218880.000, dc_hat[0] = -3469251780280320.000
Gradient do_[0] = -180966729030041600.000
Backward Time Step 0:
Gradient di[0] = -6686540914753536.000, df[0] = -4887194967736320.000, dc_hat[0] = -6759869965139968.000
Gradient do_[0] = -103734609933500416.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 581724930048.000, df[0] = 426487676928.000, dc_hat[0] = 252783673344.000
Gradient do_[0] = 36090626965504.000
Backward Time Step 3:
Gradient di[0] = 913684889600.000, df[0] = 659259916288.000, dc_hat[0] = 358208176128.000
Gradient do_[0] = 48366566244352.000
Backward Time Step 2:
Gradient di[0] = 1164102729728.000, df[0] = 836083908608.000, dc_hat[0] = 595839156224.000
Gradient do_[0] = 52776537161728.000
Backward Time Step 1:
Gradient di[0] = 1462654599168.000, df[0] = 1008690724864.000, dc_hat[0] = 815186247680.000
Gradient do_[0] = 46588500115456.000
Backward Time Step 0:
Gradient di[0] = 1772088590336.000, df[0] = 1260830851072.000, dc_hat[0] = 1662816878592.000
Gradient do_[0] = 27396329701376.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2227351517659136.000, df[0] = -1748535141728256.000, dc_hat[0] = -1007561276915712.000
Gradient do_[0] = -130280548659625984.000
Backward Time Step 3:
Gradient di[0] = -3501014674046976.000, df[0] = -2672412940304384.000, dc_hat[0] = -1448619051843584.000
Gradient do_[0] = -178334858970398720.000
Backward Time Step 2:
Gradient di[0] = -4591611023458304.000, df[0] = -3459638301294592.000, dc_hat[0] = -2534492950495232.000
Gradient do_[0] = -204278918159532032.000
Backward Time Step 1:
Gradient di[0] = -5801451234263040.000, df[0] = -4151108708597760.000, dc_hat[0] = -3518409560031232.000
Gradient do_[0] = -183537026438529024.000
Backward Time Step 0:
Gradient di[0] = -6781354465296384.000, df[0] = -4956494265057280.000, dc_hat[0] = -6855723434639360.000
Gradient do_[0] = -105205541743099904.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 515513090048.000, df[0] = 377944965120.000, dc_hat[0] = 224009273344.000
Gradient do_[0] = 31982547369984.000
Backward Time Step 3:
Gradient di[0] = 809677946880.000, df[0] = 584215363584.000, dc_hat[0] = 317425778688.000
Gradient do_[0] = 42860447531008.000
Backward Time Step 2:
Gradient di[0] = 1031578058752.000, df[0] = 740901453824.000, dc_hat[0] = 527989080064.000
Gradient do_[0] = 46767693365248.000
Backward Time Step 1:
Gradient di[0] = 1296110452736.000, df[0] = 893835018240.000, dc_hat[0] = 722330779648.000
Gradient do_[0] = 41283171123200.000
Backward Time Step 0:
Gradient di[0] = 1570255273984.000, df[0] = 1117227581440.000, dc_hat[0] = 1473429110784.000
Gradient do_[0] = 24276006600704.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2258989857374208.000, df[0] = -1773373071818752.000, dc_hat[0] = -1021862914031616.000
Gradient do_[0] = -132130193505517568.000
Backward Time Step 3:
Gradient di[0] = -3550703150694400.000, df[0] = -2710344749285376.000, dc_hat[0] = -1469149263953920.000
Gradient do_[0] = -180864199570751488.000
Backward Time Step 2:
Gradient di[0] = -4656735075696640.000, df[0] = -3508704544555008.000, dc_hat[0] = -2570355390545920.000
Gradient do_[0] = -207173760476774400.000
Backward Time Step 1:
Gradient di[0] = -5883618790473728.000, df[0] = -4209892852236288.000, dc_hat[0] = -3568078709325824.000
Gradient do_[0] = -186134193162420224.000
Backward Time Step 0:
Gradient di[0] = -6877153173962752.000, df[0] = -5026513506271232.000, dc_hat[0] = -6952572262809600.000
Gradient do_[0] = -106691763636273152.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 447387828224.000, df[0] = 327999488000.000, dc_hat[0] = 194404319232.000
Gradient do_[0] = 27755836080128.000
Backward Time Step 3:
Gradient di[0] = 702669979648.000, df[0] = 507005272064.000, dc_hat[0] = 275468615680.000
Gradient do_[0] = 37195561828352.000
Backward Time Step 2:
Gradient di[0] = 895233425408.000, df[0] = 642975727616.000, dc_hat[0] = 458189373440.000
Gradient do_[0] = 40585838723072.000
Backward Time Step 1:
Gradient di[0] = 1124774969344.000, df[0] = 775675576320.000, dc_hat[0] = 626814746624.000
Gradient do_[0] = 35825383374848.000
Backward Time Step 0:
Gradient di[0] = 1362632245248.000, df[0] = 969504980992.000, dc_hat[0] = 1278608670720.000
Gradient do_[0] = 21066170761216.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2290945219362816.000, df[0] = -1798459438923776.000, dc_hat[0] = -1036307157483520.000
Gradient do_[0] = -133998298120847360.000
Backward Time Step 3:
Gradient di[0] = -3600884474839040.000, df[0] = -2748651562598400.000, dc_hat[0] = -1489879728914432.000
Gradient do_[0] = -183418450981421056.000
Backward Time Step 2:
Gradient di[0] = -4722504446771200.000, df[0] = -3558257998168064.000, dc_hat[0] = -2606571628527616.000
Gradient do_[0] = -210097138556731392.000
Backward Time Step 1:
Gradient di[0] = -5966610980405248.000, df[0] = -4269267285442560.000, dc_hat[0] = -3618244732649472.000
Gradient do_[0] = -188757267129040896.000
Backward Time Step 0:
Gradient di[0] = -6973897849176064.000, df[0] = -5097224237219840.000, dc_hat[0] = -7050377794945024.000
Gradient do_[0] = -108192648547794944.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 377028116480.000, df[0] = 276415807488.000, dc_hat[0] = 163828924416.000
Gradient do_[0] = 23390534500352.000
Backward Time Step 3:
Gradient di[0] = 592154066944.000, df[0] = 427264016384.000, dc_hat[0] = 232138178560.000
Gradient do_[0] = 31345107533824.000
Backward Time Step 2:
Gradient di[0] = 754420875264.000, df[0] = 541840998400.000, dc_hat[0] = 386107572224.000
Gradient do_[0] = 34201596329984.000
Backward Time Step 1:
Gradient di[0] = 947834191872.000, df[0] = 653651083264.000, dc_hat[0] = 528184016896.000
Gradient do_[0] = 30189205585920.000
Backward Time Step 0:
Gradient di[0] = 1148237250560.000, df[0] = 816964239360.000, dc_hat[0] = 1077433860096.000
Gradient do_[0] = 17751642996736.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2323200960626688.000, df[0] = -1823781760794624.000, dc_hat[0] = -1050886558187520.000
Gradient do_[0] = -135883986332286976.000
Backward Time Step 3:
Gradient di[0] = -3651532339806208.000, df[0] = -2787315126632448.000, dc_hat[0] = -1510804004274176.000
Gradient do_[0] = -185996548050518016.000
Backward Time Step 2:
Gradient di[0] = -4788883703201792.000, df[0] = -3608270476410880.000, dc_hat[0] = -2643123142393856.000
Gradient do_[0] = -213047712369606656.000
Backward Time Step 1:
Gradient di[0] = -6050366600773632.000, df[0] = -4329188253237248.000, dc_hat[0] = -3668872196521984.000
Gradient do_[0] = -191404530351472640.000
Backward Time Step 0:
Gradient di[0] = -7071543393779712.000, df[0] = -5168593708777472.000, dc_hat[0] = -7149094397018112.000
Gradient do_[0] = -109707517872832512.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 304252092416.000, df[0] = 223060541440.000, dc_hat[0] = 132204265472.000
Gradient do_[0] = 18875429158912.000
Backward Time Step 3:
Gradient di[0] = 477846110208.000, df[0] = 344786337792.000, dc_hat[0] = 187322957824.000
Gradient do_[0] = 25294092107776.000
Backward Time Step 2:
Gradient di[0] = 608782974976.000, df[0] = 437240561664.000, dc_hat[0] = 311560699904.000
Gradient do_[0] = 27598761492480.000
Backward Time Step 1:
Gradient di[0] = 764840574976.000, df[0] = 527452962816.000, dc_hat[0] = 426190503936.000
Gradient do_[0] = 24360412774400.000
Backward Time Step 0:
Gradient di[0] = 926521425920.000, df[0] = 659214696448.000, dc_hat[0] = 869389631488.000
Gradient do_[0] = 14323937181696.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2355782045663232.000, df[0] = -1849359633219584.000, dc_hat[0] = -1065613262848000.000
Gradient do_[0] = -137788787148193792.000
Backward Time Step 3:
Gradient di[0] = -3702696406155264.000, df[0] = -2826372485480448.000, dc_hat[0] = -1531940880515072.000
Gradient do_[0] = -188600964679204864.000
Backward Time Step 2:
Gradient di[0] = -4855946396303360.000, df[0] = -3658796740116480.000, dc_hat[0] = -2680051271204864.000
Gradient do_[0] = -216028608651591680.000
Backward Time Step 1:
Gradient di[0] = -6134972087795712.000, df[0] = -4389715885162496.000, dc_hat[0] = -3720011566809088.000
Gradient do_[0] = -194078680069177344.000
Backward Time Step 0:
Gradient di[0] = -7170167654055936.000, df[0] = -5240677755518976.000, dc_hat[0] = -7248799915311104.000
Gradient do_[0] = -111237574202228736.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 229311381504.000, df[0] = 168118321152.000, dc_hat[0] = 99639861248.000
Gradient do_[0] = 14226089312256.000
Backward Time Step 3:
Gradient di[0] = 360142438400.000, df[0] = 259858382848.000, dc_hat[0] = 141178388480.000
Gradient do_[0] = 19063419961344.000
Backward Time Step 2:
Gradient di[0] = 458821500928.000, df[0] = 329534996480.000, dc_hat[0] = 234806378496.000
Gradient do_[0] = 20800105086976.000
Backward Time Step 1:
Gradient di[0] = 576423591936.000, df[0] = 397515227136.000, dc_hat[0] = 321184563200.000
Gradient do_[0] = 18359024353280.000
Backward Time Step 0:
Gradient di[0] = 698250952704.000, df[0] = 496801546240.000, dc_hat[0] = 655194849280.000
Gradient do_[0] = 10794895933440.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2388696795971584.000, df[0] = -1875199632867328.000, dc_hat[0] = -1080491029561344.000
Gradient do_[0] = -139712932496801792.000
Backward Time Step 3:
Gradient di[0] = -3754374526402560.000, df[0] = -2865823102271488.000, dc_hat[0] = -1553290760290304.000
Gradient do_[0] = -191231477529182208.000
Backward Time Step 2:
Gradient di[0] = -4923671588110336.000, df[0] = -3709824172818432.000, dc_hat[0] = -2717343398494208.000
Gradient do_[0] = -219038985589096448.000
Backward Time Step 1:
Gradient di[0] = -6220422609633280.000, df[0] = -4450849912782848.000, dc_hat[0] = -3771662038204416.000
Gradient do_[0] = -196779527303593984.000
Backward Time Step 0:
Gradient di[0] = -7269780293681152.000, df[0] = -5313484967378944.000, dc_hat[0] = -7349505087242240.000
Gradient do_[0] = -112782946385002496.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 152166465536.000, df[0] = 111559999488.000, dc_hat[0] = 66118258688.000
Gradient do_[0] = 9440084361216.000
Backward Time Step 3:
Gradient di[0] = 238979956736.000, df[0] = 172434571264.000, dc_hat[0] = 93679968256.000
Gradient do_[0] = 12649802760192.000
Backward Time Step 2:
Gradient di[0] = 304457285632.000, df[0] = 218667417600.000, dc_hat[0] = 155804008448.000
Gradient do_[0] = 13802020012032.000
Backward Time Step 1:
Gradient di[0] = 382484709376.000, df[0] = 263769915392.000, dc_hat[0] = 213111685120.000
Gradient do_[0] = 12181939683328.000
Backward Time Step 0:
Gradient di[0] = 463308685312.000, df[0] = 329641459712.000, dc_hat[0] = 434739838976.000
Gradient do_[0] = 7162710982656.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2421923736715264.000, df[0] = -1901284445650944.000, dc_hat[0] = -1095509255127040.000
Gradient do_[0] = -141655331456417792.000
Backward Time Step 3:
Gradient di[0] = -3806551668162560.000, df[0] = -2905653823668224.000, dc_hat[0] = -1574845724753920.000
Gradient do_[0] = -193887227606990848.000
Backward Time Step 2:
Gradient di[0] = -4992049614946304.000, df[0] = -3761342842404864.000, dc_hat[0] = -2754993618681856.000
Gradient do_[0] = -222078241886699520.000
Backward Time Step 1:
Gradient di[0] = -6306690248998912.000, df[0] = -4512567519084544.000, dc_hat[0] = -3823805088661504.000
Gradient do_[0] = -199506041262571520.000
Backward Time Step 0:
Gradient di[0] = -7370338362982400.000, df[0] = -5386982595231744.000, dc_hat[0] = -7451165889396736.000
Gradient do_[0] = -114342998765993984.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 72700993536.000, df[0] = 53300350976.000, dc_hat[0] = 31589171200.000
Gradient do_[0] = 4510180179968.000
Backward Time Step 3:
Gradient di[0] = 114176606208.000, df[0] = 82383536128.000, dc_hat[0] = 44756267008.000
Gradient do_[0] = 6043591507968.000
Backward Time Step 2:
Gradient di[0] = 145457889280.000, df[0] = 104470781952.000, dc_hat[0] = 74434813952.000
Gradient do_[0] = 6593989050368.000
Backward Time Step 1:
Gradient di[0] = 182731898880.000, df[0] = 126015717376.000, dc_hat[0] = 101809463296.000
Gradient do_[0] = 5819841642496.000
Backward Time Step 0:
Gradient di[0] = 221338042368.000, df[0] = 157480747008.000, dc_hat[0] = 207689744384.000
Gradient do_[0] = 3421865771008.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2455471726264320.000, df[0] = -1927621453545472.000, dc_hat[0] = -1110672435838976.000
Gradient do_[0] = -143616550962724864.000
Backward Time Step 3:
Gradient di[0] = -3859232931708928.000, df[0] = -2945869481508864.000, dc_hat[0] = -1596607652954112.000
Gradient do_[0] = -196568627229491200.000
Backward Time Step 2:
Gradient di[0] = -5061094435454976.000, df[0] = -3813363217858560.000, dc_hat[0] = -2793011595444224.000
Gradient do_[0] = -225147270897598464.000
Backward Time Step 1:
Gradient di[0] = -6393802923180032.000, df[0] = -4574889105162240.000, dc_hat[0] = -3876457092743168.000
Gradient do_[0] = -202259441716822016.000
Backward Time Step 0:
Gradient di[0] = -7471885348503552.000, df[0] = -5461203925073920.000, dc_hat[0] = -7553827418931200.000
Gradient do_[0] = -115918418539970560.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -8865286144.000, df[0] = -6499538944.000, dc_hat[0] = -3851997184.000
Gradient do_[0] = -549975031808.000
Backward Time Step 3:
Gradient di[0] = -13922705408.000, df[0] = -10045867008.000, dc_hat[0] = -5457469952.000
Gradient do_[0] = -736948912128.000
Backward Time Step 2:
Gradient di[0] = -17736943616.000, df[0] = -12739026944.000, dc_hat[0] = -9076195328.000
Gradient do_[0] = -804052729856.000
Backward Time Step 1:
Gradient di[0] = -22281558016.000, df[0] = -15365799936.000, dc_hat[0] = -12413666304.000
Gradient do_[0] = -709638160384.000
Backward Time Step 0:
Gradient di[0] = -26988093440.000, df[0] = -19201871872.000, dc_hat[0] = -25323933696.000
Gradient do_[0] = -417233469440.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.865
c_state[0] = 0.725, h_state[0] = 0.092
Time Step 2:
i_gate[0] = 0.376, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.863
c_state[0] = 0.877, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.344, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885
c_state[0] = 0.981, h_state[0] = 0.119
Time Step 4:
i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.870
c_state[0] = 1.059, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 16122168475648.000, df[0] = 11059813416960.000, dc_hat[0] = 6746993065984.000
Gradient do_[0] = 1070457180651520.000
Backward Time Step 3:
Gradient di[0] = 25356742426624.000, df[0] = 17274792574976.000, dc_hat[0] = 9451647008768.000
Gradient do_[0] = 1403907502768128.000
Backward Time Step 2:
Gradient di[0] = 31141193056256.000, df[0] = 21278228480000.000, dc_hat[0] = 14793055404032.000
Gradient do_[0] = 1442264110858240.000
Backward Time Step 1:
Gradient di[0] = 38861535182848.000, df[0] = 25818883424256.000, dc_hat[0] = 19879405027328.000
Gradient do_[0] = 1246827663851520.000
Backward Time Step 0:
Gradient di[0] = 48432311959552.000, df[0] = 33608131674112.000, dc_hat[0] = 42224985309184.000
Gradient do_[0] = 747012186177536.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1371785396224.000, df[0] = 1005723779072.000, dc_hat[0] = 596016562176.000
Gradient do_[0] = 85098745561088.000
Backward Time Step 3:
Gradient di[0] = 2154399399936.000, df[0] = 1554512609280.000, dc_hat[0] = 844442959872.000
Gradient do_[0] = 114032832937984.000
Backward Time Step 2:
Gradient di[0] = 2744470732800.000, df[0] = 1971141476352.000, dc_hat[0] = 1404334899200.000
Gradient do_[0] = 124411403304960.000
Backward Time Step 1:
Gradient di[0] = 3447840309248.000, df[0] = 2377698508800.000, dc_hat[0] = 1920886243328.000
Gradient do_[0] = 109809152032768.000
Backward Time Step 0:
Gradient di[0] = 4177542512640.000, df[0] = 2972297461760.000, dc_hat[0] = 3919943827456.000
Gradient do_[0] = 64584429862912.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1903799283220480.000, df[0] = -1494549297889280.000, dc_hat[0] = -861094167969792.000
Gradient do_[0] = -111346331364098048.000
Backward Time Step 3:
Gradient di[0] = -2992190166925312.000, df[0] = -2284050723110912.000, dc_hat[0] = -1237817090899968.000
Gradient do_[0] = -152401485659897856.000
Backward Time Step 2:
Gradient di[0] = -3923735555866624.000, df[0] = -2956409062817792.000, dc_hat[0] = -2165228976472064.000
Gradient do_[0] = -174547402189963264.000
Backward Time Step 1:
Gradient di[0] = -4957071401287680.000, df[0] = -3546873650479104.000, dc_hat[0] = -3005265389551616.000
Gradient do_[0] = -156808517242585088.000
Backward Time Step 0:
Gradient di[0] = -5794594855845888.000, df[0] = -4235271277117440.000, dc_hat[0] = -5858141581344768.000
Gradient do_[0] = -89897015579770880.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1330531270656.000, df[0] = 975478849536.000, dc_hat[0] = 578085781504.000
Gradient do_[0] = 82538894721024.000
Backward Time Step 3:
Gradient di[0] = 2089578921984.000, df[0] = 1507742580736.000, dc_hat[0] = 819019317248.000
Gradient do_[0] = 110600784969728.000
Backward Time Step 2:
Gradient di[0] = 2661863129088.000, df[0] = 1911810031616.000, dc_hat[0] = 1362022760448.000
Gradient do_[0] = 120665227132928.000
Backward Time Step 1:
Gradient di[0] = 3343979380736.000, df[0] = 2306069233664.000, dc_hat[0] = 1862940884992.000
Gradient do_[0] = 106499980394496.000
Backward Time Step 0:
Gradient di[0] = 4051569737728.000, df[0] = 2882668855296.000, dc_hat[0] = 3801739165696.000
Gradient do_[0] = 62636909658112.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1928301803208704.000, df[0] = -1513785785319424.000, dc_hat[0] = -872168338489344.000
Gradient do_[0] = -112778634237837312.000
Backward Time Step 3:
Gradient di[0] = -3030657504641024.000, df[0] = -2313417025126400.000, dc_hat[0] = -1253706590846976.000
Gradient do_[0] = -154359406631321600.000
Backward Time Step 2:
Gradient di[0] = -3974145587019776.000, df[0] = -2994389995487232.000, dc_hat[0] = -2192979968131072.000
Gradient do_[0] = -176787846110117888.000
Backward Time Step 1:
Gradient di[0] = -5020667518910464.000, df[0] = -3592370775916544.000, dc_hat[0] = -3043694340997120.000
Gradient do_[0] = -158818476037767168.000
Backward Time Step 0:
Gradient di[0] = -5868732433825792.000, df[0] = -4289458463571968.000, dc_hat[0] = -5933092518756352.000
Gradient do_[0] = -91047182051835904.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1289409789952.000, df[0] = 945330716672.000, dc_hat[0] = 560213327872.000
Gradient do_[0] = 79987356991488.000
Backward Time Step 3:
Gradient di[0] = 2024969732096.000, df[0] = 1461125382144.000, dc_hat[0] = 793680347136.000
Gradient do_[0] = 107179977736192.000
Backward Time Step 2:
Gradient di[0] = 2579527368704.000, df[0] = 1852674539520.000, dc_hat[0] = 1319853096960.000
Gradient do_[0] = 116931457712128.000
Backward Time Step 1:
Gradient di[0] = 3240470249472.000, df[0] = 2234683490304.000, dc_hat[0] = 1805197508608.000
Gradient do_[0] = 103202158542848.000
Backward Time Step 0:
Gradient di[0] = 3926034481152.000, df[0] = 2793350889472.000, dc_hat[0] = 3683944497152.000
Gradient do_[0] = 60696133894144.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1954169720143872.000, df[0] = -1534093598654464.000, dc_hat[0] = -883859642122240.000
Gradient do_[0] = -114290771963674624.000
Backward Time Step 3:
Gradient di[0] = -3071274205052928.000, df[0] = -2344423199342592.000, dc_hat[0] = -1270483269976064.000
Gradient do_[0] = -156426694649970688.000
Backward Time Step 2:
Gradient di[0] = -4027370163929088.000, df[0] = -3034491299823616.000, dc_hat[0] = -2222282382508032.000
Gradient do_[0] = -179153582816231424.000
Backward Time Step 1:
Gradient di[0] = -5087804300197888.000, df[0] = -3640401395187712.000, dc_hat[0] = -3084269165477888.000
Gradient do_[0] = -160940310141075456.000
Backward Time Step 0:
Gradient di[0] = -5947001770344448.000, df[0] = -4346665548906496.000, dc_hat[0] = -6012220311863296.000
Gradient do_[0] = -92261446615826432.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1247468847104.000, df[0] = 914582339584.000, dc_hat[0] = 541985472512.000
Gradient do_[0] = 77385084960768.000
Backward Time Step 3:
Gradient di[0] = 1959076823040.000, df[0] = 1413581242368.000, dc_hat[0] = 767837929472.000
Gradient do_[0] = 103691340218368.000
Backward Time Step 2:
Gradient di[0] = 2495561072640.000, df[0] = 1792367788032.000, dc_hat[0] = 1276851781632.000
Gradient do_[0] = 113123885318144.000
Backward Time Step 1:
Gradient di[0] = 3134914822144.000, df[0] = 2161886756864.000, dc_hat[0] = 1746320228352.000
Gradient do_[0] = 99839232704512.000
Backward Time Step 0:
Gradient di[0] = 3798026158080.000, df[0] = 2702273937408.000, dc_hat[0] = 3563829854208.000
Gradient do_[0] = 58717143826432.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -1981037961805824.000, df[0] = -1555186854133760.000, dc_hat[0] = -896003058171904.000
Gradient do_[0] = -115861381374279680.000
Backward Time Step 3:
Gradient di[0] = -3113456689479680.000, df[0] = -2376625253515264.000, dc_hat[0] = -1287907281207296.000
Gradient do_[0] = -158573680081764352.000
Backward Time Step 2:
Gradient di[0] = -4082645856157696.000, df[0] = -3076137718644736.000, dc_hat[0] = -2252715178590208.000
Gradient do_[0] = -181610372829020160.000
Backward Time Step 1:
Gradient di[0] = -5157547589763072.000, df[0] = -3690296835571712.000, dc_hat[0] = -3126422122004480.000
Gradient do_[0] = -163144659156074496.000
Backward Time Step 0:
Gradient di[0] = -6028315701805056.000, df[0] = -4406097964171264.000, dc_hat[0] = -6094425985908736.000
Gradient do_[0] = -93522947230138368.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1204307361792.000, df[0] = 882938675200.000, dc_hat[0] = 523227561984.000
Gradient do_[0] = 74707055411200.000
Backward Time Step 3:
Gradient di[0] = 1891267510272.000, df[0] = 1364654948352.000, dc_hat[0] = 741246763008.000
Gradient do_[0] = 100101326372864.000
Backward Time Step 2:
Gradient di[0] = 2409154215936.000, df[0] = 1730307948544.000, dc_hat[0] = 1232604364800.000
Gradient do_[0] = 109205784625152.000
Backward Time Step 1:
Gradient di[0] = 3026296766464.000, df[0] = 2086978322432.000, dc_hat[0] = 1685742682112.000
Gradient do_[0] = 96378856407040.000
Backward Time Step 0:
Gradient di[0] = 3666310332416.000, df[0] = 2608558768128.000, dc_hat[0] = 3440235511808.000
Gradient do_[0] = 56680830205952.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.773, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2008600645992448.000, df[0] = -1576825570459648.000, dc_hat[0] = -908460275269632.000
Gradient do_[0] = -117472603995635712.000
Backward Time Step 3:
Gradient di[0] = -3156739490840576.000, df[0] = -2409666705358848.000, dc_hat[0] = -1305785619447808.000
Gradient do_[0] = -160776637527359488.000
Backward Time Step 2:
Gradient di[0] = -4139358214946816.000, df[0] = -3118867274530816.000, dc_hat[0] = -2283940396138496.000
Gradient do_[0] = -184131123494780928.000
Backward Time Step 1:
Gradient di[0] = -5229089396883456.000, df[0] = -3741478887096320.000, dc_hat[0] = -3169661436821504.000
Gradient do_[0] = -165405821998465024.000
Backward Time Step 0:
Gradient di[0] = -6111727724789760.000, df[0] = -4467063682760704.000, dc_hat[0] = -6178752300056576.000
Gradient do_[0] = -94816986516684800.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1159640252416.000, df[0] = 850191450112.000, dc_hat[0] = 503816257536.000
Gradient do_[0] = 71935669043200.000
Backward Time Step 3:
Gradient di[0] = 1821096148992.000, df[0] = 1314023800832.000, dc_hat[0] = 713731014656.000
Gradient do_[0] = 96386389377024.000
Backward Time Step 2:
Gradient di[0] = 2319741878272.000, df[0] = 1666089484288.000, dc_hat[0] = 1186823012352.000
Gradient do_[0] = 105151536824320.000
Backward Time Step 1:
Gradient di[0] = 2913908817920.000, df[0] = 2009470599168.000, dc_hat[0] = 1623071784960.000
Gradient do_[0] = 92798531403776.000
Backward Time Step 0:
Gradient di[0] = 3530045521920.000, df[0] = 2511606906880.000, dc_hat[0] = 3312373465088.000
Gradient do_[0] = 54574182301696.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2036784087171072.000, df[0] = -1598951362920448.000, dc_hat[0] = -921197940310016.000
Gradient do_[0] = -119120119090642944.000
Backward Time Step 3:
Gradient di[0] = -3200984633311232.000, df[0] = -2443443401916416.000, dc_hat[0] = -1324062450122752.000
Gradient do_[0] = -163028695039082496.000
Backward Time Step 2:
Gradient di[0] = -4197335978475520.000, df[0] = -3162550313156608.000, dc_hat[0] = -2315863277436928.000
Gradient do_[0] = -186708172591857664.000
Backward Time Step 1:
Gradient di[0] = -5302240206127104.000, df[0] = -3793812258291712.000, dc_hat[0] = -3213878393569280.000
Gradient do_[0] = -167717905973116928.000
Backward Time Step 0:
Gradient di[0] = -6197006984806400.000, df[0] = -4529394664079360.000, dc_hat[0] = -6264967326072832.000
Gradient do_[0] = -96140025422413824.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1113643679744.000, df[0] = 816469245952.000, dc_hat[0] = 483827286016.000
Gradient do_[0] = 69081889767424.000
Backward Time Step 3:
Gradient di[0] = 1748838121472.000, df[0] = 1261886898176.000, dc_hat[0] = 685397966848.000
Gradient do_[0] = 92561142185984.000
Backward Time Step 2:
Gradient di[0] = 2227672449024.000, df[0] = 1599962873856.000, dc_hat[0] = 1139684671488.000
Gradient do_[0] = 100977021550592.000
Backward Time Step 1:
Gradient di[0] = 2798190067712.000, df[0] = 1929665970176.000, dc_hat[0] = 1558551592960.000
Gradient do_[0] = 89112233115648.000
Backward Time Step 0:
Gradient di[0] = 3389752082432.000, df[0] = 2411789287424.000, dc_hat[0] = 3180731039744.000
Gradient do_[0] = 52405261565952.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2065406017667072.000, df[0] = -1621421155418112.000, dc_hat[0] = -934133979152384.000
Gradient do_[0] = -120793283730341888.000
Backward Time Step 3:
Gradient di[0] = -3245925023612928.000, df[0] = -2477751063805952.000, dc_hat[0] = -1342626372517888.000
Gradient do_[0] = -165316108721586176.000
Backward Time Step 2:
Gradient di[0] = -4256224275070976.000, df[0] = -3206919741243392.000, dc_hat[0] = -2348287596167168.000
Gradient do_[0] = -189325680280862720.000
Backward Time Step 1:
Gradient di[0] = -5376524886736896.000, df[0] = -3846956841435136.000, dc_hat[0] = -3258776639504384.000
Gradient do_[0] = -170065792795148288.000
Backward Time Step 0:
Gradient di[0] = -6283612315975680.000, df[0] = -4592694428958720.000, dc_hat[0] = -6352522381885440.000
Gradient do_[0] = -97483602861752320.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1065880387584.000, df[0] = 781451919360.000, dc_hat[0] = 463071903744.000
Gradient do_[0] = 66118588825600.000
Backward Time Step 3:
Gradient di[0] = 1673808838656.000, df[0] = 1207750492160.000, dc_hat[0] = 655980494848.000
Gradient do_[0] = 88589195018240.000
Backward Time Step 2:
Gradient di[0] = 2132075872256.000, df[0] = 1531303034880.000, dc_hat[0] = 1090745532416.000
Gradient do_[0] = 96642619408384.000
Backward Time Step 1:
Gradient di[0] = 2678041083904.000, df[0] = 1846806970368.000, dc_hat[0] = 1491569868800.000
Gradient do_[0] = 85284955881472.000
Backward Time Step 0:
Gradient di[0] = 3244099371008.000, df[0] = 2308157997056.000, dc_hat[0] = 3044059643904.000
Gradient do_[0] = 50153486548992.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2094492341501952.000, df[0] = -1644255751700480.000, dc_hat[0] = -947279733194752.000
Gradient do_[0] = -122493566793547776.000
Backward Time Step 3:
Gradient di[0] = -3291595289919488.000, df[0] = -2512615460831232.000, dc_hat[0] = -1361490674188288.000
Gradient do_[0] = -167640596561788928.000
Backward Time Step 2:
Gradient di[0] = -4316071691550720.000, df[0] = -3252010723835904.000, dc_hat[0] = -2381237779955712.000
Gradient do_[0] = -191985639426621440.000
Backward Time Step 1:
Gradient di[0] = -5452016453156864.000, df[0] = -3900964981440512.000, dc_hat[0] = -3304406908928000.000
Gradient do_[0] = -172451801746898944.000
Backward Time Step 0:
Gradient di[0] = -6371629617643520.000, df[0] = -4657026059730944.000, dc_hat[0] = -6441504440582144.000
Gradient do_[0] = -98849101814169600.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1016350638080.000, df[0] = 745139404800.000, dc_hat[0] = 441549029376.000
Gradient do_[0] = 63045657165824.000
Backward Time Step 3:
Gradient di[0] = 1596007383040.000, df[0] = 1151613403136.000, dc_hat[0] = 625477550080.000
Gradient do_[0] = 84470589816832.000
Backward Time Step 2:
Gradient di[0] = 2032950312960.000, df[0] = 1460108656640.000, dc_hat[0] = 1040003694592.000
Gradient do_[0] = 92148397506560.000
Backward Time Step 1:
Gradient di[0] = 2553472090112.000, df[0] = 1760899760128.000, dc_hat[0] = 1422132641792.000
Gradient do_[0] = 81316984913920.000
Backward Time Step 0:
Gradient di[0] = 3093105737728.000, df[0] = 2200726929408.000, dc_hat[0] = 2902376579072.000
Gradient do_[0] = 47819134074880.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2123878776176640.000, df[0] = -1667326302748672.000, dc_hat[0] = -960560845815808.000
Gradient do_[0] = -124211390503190528.000
Backward Time Step 3:
Gradient di[0] = -3337727802081280.000, df[0] = -2547832850481152.000, dc_hat[0] = -1380546638774272.000
Gradient do_[0] = -169988689542250496.000
Backward Time Step 2:
Gradient di[0] = -4376524966854656.000, df[0] = -3297558583574528.000, dc_hat[0] = -2414521629016064.000
Gradient do_[0] = -194672639686475776.000
Backward Time Step 1:
Gradient di[0] = -5528278429335552.000, df[0] = -3955524756307968.000, dc_hat[0] = -3350501034819584.000
Gradient do_[0] = -174862154573283328.000
Backward Time Step 0:
Gradient di[0] = -6460539198767104.000, df[0] = -4722010525532160.000, dc_hat[0] = -6531389516152832.000
Gradient do_[0] = -100228439151214592.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 965164990464.000, df[0] = 707612704768.000, dc_hat[0] = 419307192320.000
Gradient do_[0] = 59870153801728.000
Backward Time Step 3:
Gradient di[0] = 1515607687168.000, df[0] = 1093601329152.000, dc_hat[0] = 593957486592.000
Gradient do_[0] = 80214595993600.000
Backward Time Step 2:
Gradient di[0] = 1930513481728.000, df[0] = 1386536108032.000, dc_hat[0] = 987571683328.000
Gradient do_[0] = 87504220848128.000
Backward Time Step 1:
Gradient di[0] = 2424749686784.000, df[0] = 1672128757760.000, dc_hat[0] = 1350388416512.000
Gradient do_[0] = 77216868204544.000
Backward Time Step 0:
Gradient di[0] = 2937090998272.000, df[0] = 2089723232256.000, dc_hat[0] = 2755982000128.000
Gradient do_[0] = 45407161810944.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2153623974838272.000, df[0] = -1690678174154752.000, dc_hat[0] = -974004428996608.000
Gradient do_[0] = -125950199423041536.000
Backward Time Step 3:
Gradient di[0] = -3384426981490688.000, df[0] = -2583482689650688.000, dc_hat[0] = -1399834397376512.000
Gradient do_[0] = -172365507263987712.000
Backward Time Step 2:
Gradient di[0] = -4437715097485312.000, df[0] = -3343661567836160.000, dc_hat[0] = -2448211084050432.000
Gradient do_[0] = -197392436316602368.000
Backward Time Step 1:
Gradient di[0] = -5605465434095616.000, df[0] = -4010745956139008.000, dc_hat[0] = -3397154311766016.000
Gradient do_[0] = -177301833436364800.000
Backward Time Step 0:
Gradient di[0] = -6550519837360128.000, df[0] = -4787777212252160.000, dc_hat[0] = -6622356923482112.000
Gradient do_[0] = -101624398011695104.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 912043999232.000, df[0] = 668667412480.000, dc_hat[0] = 396225314816.000
Gradient do_[0] = 56574588485632.000
Backward Time Step 3:
Gradient di[0] = 1432170266624.000, df[0] = 1033397665792.000, dc_hat[0] = 561248862208.000
Gradient do_[0] = 75797943549952.000
Backward Time Step 2:
Gradient di[0] = 1824214876160.000, df[0] = 1310189944832.000, dc_hat[0] = 933166841856.000
Gradient do_[0] = 82685133324288.000
Backward Time Step 1:
Gradient di[0] = 2291180765184.000, df[0] = 1580016074752.000, dc_hat[0] = 1275951710208.000
Gradient do_[0] = 72962526937088.000
Backward Time Step 0:
Gradient di[0] = 2775216553984.000, df[0] = 1974550528000.000, dc_hat[0] = 2604089212928.000
Gradient do_[0] = 42904596774912.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2183712502448128.000, df[0] = -1714299957411840.000, dc_hat[0] = -987602563891200.000
Gradient do_[0] = -127709005710622720.000
Backward Time Step 3:
Gradient di[0] = -3431671621746688.000, df[0] = -2619548872212480.000, dc_hat[0] = -1419348849721344.000
Gradient do_[0] = -174770104834195456.000
Backward Time Step 2:
Gradient di[0] = -4499618729558016.000, df[0] = -3390302228316160.000, dc_hat[0] = -2482294333898752.000
Gradient do_[0] = -200143758006681600.000
Backward Time Step 1:
Gradient di[0] = -5683546865795072.000, df[0] = -4066606569226240.000, dc_hat[0] = -3444347680849920.000
Gradient do_[0] = -179769687284908032.000
Backward Time Step 0:
Gradient di[0] = -6641546837360640.000, df[0] = -4854308940021760.000, dc_hat[0] = -6714382503378944.000
Gradient do_[0] = -103036574668685312.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 856914788352.000, df[0] = 628249460736.000, dc_hat[0] = 372271251456.000
Gradient do_[0] = 53154511060992.000
Backward Time Step 3:
Gradient di[0] = 1345582399488.000, df[0] = 970920361984.000, dc_hat[0] = 527306293248.000
Gradient do_[0] = 71214601076736.000
Backward Time Step 2:
Gradient di[0] = 1713902583808.000, df[0] = 1230961115136.000, dc_hat[0] = 876712558592.000
Gradient do_[0] = 77684222722048.000
Backward Time Step 1:
Gradient di[0] = 2152580644864.000, df[0] = 1484433653760.000, dc_hat[0] = 1198718976000.000
Gradient do_[0] = 68548055531520.000
Backward Time Step 0:
Gradient di[0] = 2607253553152.000, df[0] = 1855045763072.000, dc_hat[0] = 2446483259392.000
Gradient do_[0] = 40307903168512.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2214115904847872.000, df[0] = -1738168432852992.000, dc_hat[0] = -1001343640666112.000
Gradient do_[0] = -129486245997838336.000
Backward Time Step 3:
Gradient di[0] = -3479399445823488.000, df[0] = -2655984421961728.000, dc_hat[0] = -1439063689134080.000
Gradient do_[0] = -177199286797205504.000
Backward Time Step 2:
Gradient di[0] = -4562149158420480.000, df[0] = -3437415335198720.000, dc_hat[0] = -2516720375824384.000
Gradient do_[0] = -202922996984184832.000
Backward Time Step 1:
Gradient di[0] = -5762416960864256.000, df[0] = -4123031970512896.000, dc_hat[0] = -3492016717561856.000
Gradient do_[0] = -182262486303506432.000
Backward Time Step 0:
Gradient di[0] = -6733486517911552.000, df[0] = -4921507461464064.000, dc_hat[0] = -6807329890631680.000
Gradient do_[0] = -104462933307686912.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 799988908032.000, df[0] = 586514563072.000, dc_hat[0] = 347538325504.000
Gradient do_[0] = 49623137779712.000
Backward Time Step 3:
Gradient di[0] = 1256177139712.000, df[0] = 906410196992.000, dc_hat[0] = 492263243776.000
Gradient do_[0] = 66482419531776.000
Backward Time Step 2:
Gradient di[0] = 1600004030464.000, df[0] = 1149156982784.000, dc_hat[0] = 818435391488.000
Gradient do_[0] = 72521135161344.000
Backward Time Step 1:
Gradient di[0] = 2009479118848.000, df[0] = 1385748758528.000, dc_hat[0] = 1119005179904.000
Gradient do_[0] = 63990663217152.000
Backward Time Step 0:
Gradient di[0] = 2433866530816.000, df[0] = 1731681976320.000, dc_hat[0] = 2283787517952.000
Gradient do_[0] = 37627357036544.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2244764657254400.000, df[0] = -1762230316040192.000, dc_hat[0] = -1015196923461632.000
Gradient do_[0] = -131277968914776064.000
Backward Time Step 3:
Gradient di[0] = -3527515964440576.000, df[0] = -2692717129760768.000, dc_hat[0] = -1458944958529536.000
Gradient do_[0] = -179648637926637568.000
Backward Time Step 2:
Gradient di[0] = -4625194983358464.000, df[0] = -3484917941927936.000, dc_hat[0] = -2551454581653504.000
Gradient do_[0] = -205725892641554432.000
Backward Time Step 1:
Gradient di[0] = -5841943649058816.000, df[0] = -4179929281331200.000, dc_hat[0] = -3540136188968960.000
Gradient do_[0] = -184776811698192384.000
Backward Time Step 0:
Gradient di[0] = -6826221304283136.000, df[0] = -4989287414104064.000, dc_hat[0] = -6901081510510592.000
Gradient do_[0] = -105901609912893440.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 741205475328.000, df[0] = 543417794560.000, dc_hat[0] = 321998225408.000
Gradient do_[0] = 45976555356160.000
Backward Time Step 3:
Gradient di[0] = 1163854348288.000, df[0] = 839794622464.000, dc_hat[0] = 456078131200.000
Gradient do_[0] = 61595866628096.000
Backward Time Step 2:
Gradient di[0] = 1482393255936.000, df[0] = 1064686845952.000, dc_hat[0] = 758261219328.000
Gradient do_[0] = 67189864398848.000
Backward Time Step 1:
Gradient di[0] = 1861723095040.000, df[0] = 1283854172160.000, dc_hat[0] = 1036702908416.000
Gradient do_[0] = 59285115502592.000
Backward Time Step 0:
Gradient di[0] = 2254852849664.000, df[0] = 1604314988544.000, dc_hat[0] = 2115812589568.000
Gradient do_[0] = 34859823136768.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2275753114730496.000, df[0] = -1786558621417472.000, dc_hat[0] = -1029203885555712.000
Gradient do_[0] = -133089586120228864.000
Backward Time Step 3:
Gradient di[0] = -3576158348115968.000, df[0] = -2729850611695616.000, dc_hat[0] = -1479042855337984.000
Gradient do_[0] = -182124789651996672.000
Backward Time Step 2:
Gradient di[0] = -4688921560612864.000, df[0] = -3532932455071744.000, dc_hat[0] = -2586563523379200.000
Gradient do_[0] = -208559076408295424.000
Backward Time Step 1:
Gradient di[0] = -5922321814519808.000, df[0] = -4237436477505536.000, dc_hat[0] = -3588772398628864.000
Gradient do_[0] = -187318006408282112.000
Backward Time Step 0:
Gradient di[0] = -6919959502389248.000, df[0] = -5057801269280768.000, dc_hat[0] = -6995848353284096.000
Gradient do_[0] = -107355860069515264.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 680337276928.000, df[0] = 498792366080.000, dc_hat[0] = 295552909312.000
Gradient do_[0] = 42200679317504.000
Backward Time Step 3:
Gradient di[0] = 1068263211008.000, df[0] = 770820407296.000, dc_hat[0] = 418612805632.000
Gradient do_[0] = 56536411930624.000
Backward Time Step 2:
Gradient di[0] = 1360621862912.000, df[0] = 977228070912.000, dc_hat[0] = 695961583616.000
Gradient do_[0] = 61670063865856.000
Backward Time Step 1:
Gradient di[0] = 1708750012416.000, df[0] = 1178362052608.000, dc_hat[0] = 951499358208.000
Gradient do_[0] = 54413469155328.000
Backward Time Step 0:
Gradient di[0] = 2069532114944.000, df[0] = 1472460095488.000, dc_hat[0] = 1941919236096.000
Gradient do_[0] = 31994775863296.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2307031214063616.000, df[0] = -1811114962714624.000, dc_hat[0] = -1043341978370048.000
Gradient do_[0] = -134918177036435456.000
Backward Time Step 3:
Gradient di[0] = -3625261098598400.000, df[0] = -2767336012513280.000, dc_hat[0] = -1499331072884736.000
Gradient do_[0] = -184624305999446016.000
Backward Time Step 2:
Gradient di[0] = -4753250507030528.000, df[0] = -3581402234748928.000, dc_hat[0] = -2622005056634880.000
Gradient do_[0] = -211418923332009984.000
Backward Time Step 1:
Gradient di[0] = -6003457504837632.000, df[0] = -4295484571123712.000, dc_hat[0] = -3637864143257600.000
Gradient do_[0] = -189883270115098624.000
Backward Time Step 0:
Gradient di[0] = -7014567968243712.000, df[0] = -5126950242746368.000, dc_hat[0] = -7091494053740544.000
Gradient do_[0] = -108823613603315712.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 617450438656.000, df[0] = 452686970880.000, dc_hat[0] = 268231507968.000
Gradient do_[0] = 38299670413312.000
Backward Time Step 3:
Gradient di[0] = 969504980992.000, df[0] = 699561017344.000, dc_hat[0] = 379908063232.000
Gradient do_[0] = 51309398982656.000
Backward Time Step 2:
Gradient di[0] = 1234818564096.000, df[0] = 886873653248.000, dc_hat[0] = 631601233920.000
Gradient do_[0] = 55967634948096.000
Backward Time Step 1:
Gradient di[0] = 1550718468096.000, df[0] = 1069381910528.000, dc_hat[0] = 863483002880.000
Gradient do_[0] = 49380807671808.000
Backward Time Step 0:
Gradient di[0] = 1878092021760.000, df[0] = 1336251645952.000, dc_hat[0] = 1762283880448.000
Gradient do_[0] = 29035134451712.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2338555468709888.000, df[0] = -1835864443322368.000, dc_hat[0] = -1057591807442944.000
Gradient do_[0] = -136761078783672320.000
Backward Time Step 3:
Gradient di[0] = -3674752812056576.000, df[0] = -2805118302945280.000, dc_hat[0] = -1519779546398720.000
Gradient do_[0] = -187143562016587776.000
Backward Time Step 2:
Gradient di[0] = -4818093238910976.000, df[0] = -3630258293047296.000, dc_hat[0] = -2657727104942080.000
Gradient do_[0] = -214301705381085184.000
Backward Time Step 1:
Gradient di[0] = -6085241198346240.000, df[0] = -4353997058080768.000, dc_hat[0] = -3687347535216640.000
Gradient do_[0] = -192468874787028992.000
Backward Time Step 0:
Gradient di[0] = -7109933422084096.000, df[0] = -5196653266993152.000, dc_hat[0] = -7187905868988416.000
Gradient do_[0] = -110303100987768832.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 552736718848.000, df[0] = 405242118144.000, dc_hat[0] = 240116711424.000
Gradient do_[0] = 34285354483712.000
Backward Time Step 3:
Gradient di[0] = 867879747584.000, df[0] = 626232524800.000, dc_hat[0] = 340080623616.000
Gradient do_[0] = 45930778722304.000
Backward Time Step 2:
Gradient di[0] = 1105369366528.000, df[0] = 793900613632.000, dc_hat[0] = 565378678784.000
Gradient do_[0] = 50100063698944.000
Backward Time Step 1:
Gradient di[0] = 1388119457792.000, df[0] = 957252042752.000, dc_hat[0] = 772927127552.000
Gradient do_[0] = 44202759028736.000
Backward Time Step 0:
Gradient di[0] = 1681123704832.000, df[0] = 1196109856768.000, dc_hat[0] = 1577460957184.000
Gradient do_[0] = 25990021513216.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2370433521287168.000, df[0] = -1860891083538432.000, dc_hat[0] = -1071999476563968.000
Gradient do_[0] = -138624656503472128.000
Backward Time Step 3:
Gradient di[0] = -3724794281328640.000, df[0] = -2843320694865920.000, dc_hat[0] = -1540454445219840.000
Gradient do_[0] = -189690821220499456.000
Backward Time Step 2:
Gradient di[0] = -4883651082846208.000, df[0] = -3679653369741312.000, dc_hat[0] = -2693844021805056.000
Gradient do_[0] = -217216081209589760.000
Backward Time Step 1:
Gradient di[0] = -6167918781923328.000, df[0] = -4413149226729472.000, dc_hat[0] = -3737371824619520.000
Gradient do_[0] = -195082757523636224.000
Backward Time Step 0:
Gradient di[0] = -7206339331751936.000, df[0] = -5267116500451328.000, dc_hat[0] = -7285368877481984.000
Gradient do_[0] = -111798746039189504.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 485604655104.000, df[0] = 356024156160.000, dc_hat[0] = 210951880704.000
Gradient do_[0] = 30121096380416.000
Backward Time Step 3:
Gradient di[0] = 762461421568.000, df[0] = 550167052288.000, dc_hat[0] = 298768367616.000
Gradient do_[0] = 40351444238336.000
Backward Time Step 2:
Gradient di[0] = 971091607552.000, df[0] = 697459474432.000, dc_hat[0] = 496688824320.000
Gradient do_[0] = 44013704970240.000
Backward Time Step 1:
Gradient di[0] = 1219462823936.000, df[0] = 840945106944.000, dc_hat[0] = 679002112000.000
Gradient do_[0] = 38831902425088.000
Backward Time Step 0:
Gradient di[0] = 1476834492416.000, df[0] = 1050759266304.000, dc_hat[0] = 1385768943616.000
Gradient do_[0] = 22831729475584.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2402609537220608.000, df[0] = -1886151799472128.000, dc_hat[0] = -1086543108243456.000
Gradient do_[0] = -140505637430755328.000
Backward Time Step 3:
Gradient di[0] = -3775295580536832.000, df[0] = -2881873663492096.000, dc_hat[0] = -1561320335867904.000
Gradient do_[0] = -192261548125716480.000
Backward Time Step 2:
Gradient di[0] = -4949806464106496.000, df[0] = -3729498881130496.000, dc_hat[0] = -2730289772101632.000
Gradient do_[0] = -220157257634021376.000
Backward Time Step 1:
Gradient di[0] = -6251357648453632.000, df[0] = -4472845245612032.000, dc_hat[0] = -3787855138652160.000
Gradient do_[0] = -197720760796577792.000
Backward Time Step 0:
Gradient di[0] = -7303629467811840.000, df[0] = -5338225052745728.000, dc_hat[0] = -7383725776044032.000
Gradient do_[0] = -113308092036284416.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 416843628544.000, df[0] = 305611800576.000, dc_hat[0] = 181079916544.000
Gradient do_[0] = 25855852019712.000
Backward Time Step 3:
Gradient di[0] = 654489157632.000, df[0] = 472258510848.000, dc_hat[0] = 256456228864.000
Gradient do_[0] = 34637048971264.000
Backward Time Step 2:
Gradient di[0] = 833563721728.000, df[0] = 598684139520.000, dc_hat[0] = 426339270656.000
Gradient do_[0] = 37780130365440.000
Backward Time Step 1:
Gradient di[0] = 1046735421440.000, df[0] = 721831133184.000, dc_hat[0] = 582814531584.000
Gradient do_[0] = 33331462799360.000
Backward Time Step 0:
Gradient di[0] = 1267622346752.000, df[0] = 901906104320.000, dc_hat[0] = 1189457428480.000
Gradient do_[0] = 19597327597568.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2435089958961152.000, df[0] = -1911651959832576.000, dc_hat[0] = -1101223642005504.000
Gradient do_[0] = -142404511191793664.000
Backward Time Step 3:
Gradient di[0] = -3826281674178560.000, df[0] = -2920797073047552.000, dc_hat[0] = -1582385002971136.000
Gradient do_[0] = -194856807884128256.000
Backward Time Step 2:
Gradient di[0] = -5016609848557568.000, df[0] = -3779832408178688.000, dc_hat[0] = -2767092273119232.000
Gradient do_[0] = -223126918281560064.000
Backward Time Step 1:
Gradient di[0] = -6335596989513728.000, df[0] = -4533115179499520.000, dc_hat[0] = -3838823783989248.000
Gradient do_[0] = -200383984117481472.000
Backward Time Step 0:
Gradient di[0] = -7401856443613184.000, df[0] = -5410019726065664.000, dc_hat[0] = -7483029714894848.000
Gradient do_[0] = -114831972202708992.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 345775865856.000, df[0] = 253508190208.000, dc_hat[0] = 150206349312.000
Gradient do_[0] = 21447565115392.000
Backward Time Step 3:
Gradient di[0] = 542896357376.000, df[0] = 391737212928.000, dc_hat[0] = 212726611968.000
Gradient do_[0] = 28731106131968.000
Backward Time Step 2:
Gradient di[0] = 691429507072.000, df[0] = 496600252416.000, dc_hat[0] = 353636286464.000
Gradient do_[0] = 31337870262272.000
Backward Time Step 1:
Gradient di[0] = 868231544832.000, df[0] = 598733946880.000, dc_hat[0] = 483414900736.000
Gradient do_[0] = 27647155372032.000
Backward Time Step 0:
Gradient di[0] = 1051426291712.000, df[0] = 748083740672.000, dc_hat[0] = 986592444416.000
Gradient do_[0] = 16254953848832.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2467807946080256.000, df[0] = -1937338682834944.000, dc_hat[0] = -1116011952603136.000
Gradient do_[0] = -144317068718637056.000
Backward Time Step 3:
Gradient di[0] = -3877639014055936.000, df[0] = -2960003950444544.000, dc_hat[0] = -1603603752026112.000
Gradient do_[0] = -197471120117465088.000
Backward Time Step 2:
Gradient di[0] = -5083886753153024.000, df[0] = -3830523491254272.000, dc_hat[0] = -2804153545916416.000
Gradient do_[0] = -226117847607148544.000
Backward Time Step 1:
Gradient di[0] = -6420443531575296.000, df[0] = -4593818636648448.000, dc_hat[0] = -3890156696240128.000
Gradient do_[0] = -203066448891871232.000
Backward Time Step 0:
Gradient di[0] = -7500771151052800.000, df[0] = -5482315836817408.000, dc_hat[0] = -7583028364705792.000
Gradient do_[0] = -116366529657831424.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 272426106880.000, df[0] = 199731363840.000, dc_hat[0] = 118341902336.000
Gradient do_[0] = 16897786511360.000
Backward Time Step 3:
Gradient di[0] = 427725520896.000, df[0] = 308633862144.000, dc_hat[0] = 167596097536.000
Gradient do_[0] = 22635893227520.000
Backward Time Step 2:
Gradient di[0] = 544740704256.000, df[0] = 391245135872.000, dc_hat[0] = 278606610432.000
Gradient do_[0] = 24689273470976.000
Backward Time Step 1:
Gradient di[0] = 684016533504.000, df[0] = 471698636800.000, dc_hat[0] = 380839886848.000
Gradient do_[0] = 21781062615040.000
Backward Time Step 0:
Gradient di[0] = 828323790848.000, df[0] = 589347618816.000, dc_hat[0] = 777247129600.000
Gradient do_[0] = 12805809897472.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2500882147049472.000, df[0] = -1963304847147008.000, dc_hat[0] = -1130960854712320.000
Gradient do_[0] = -146250594275819520.000
Backward Time Step 3:
Gradient di[0] = -3929556578729984.000, df[0] = -2999638445522944.000, dc_hat[0] = -1625052281831424.000
Gradient do_[0] = -200113830674563072.000
Backward Time Step 2:
Gradient di[0] = -5151901318381568.000, df[0] = -3881768893546496.000, dc_hat[0] = -2841618277203968.000
Gradient do_[0] = -229141401504317440.000
Backward Time Step 1:
Gradient di[0] = -6506213491605504.000, df[0] = -4655182445019136.000, dc_hat[0] = -3942048759545856.000
Gradient do_[0] = -205778067904266240.000
Backward Time Step 0:
Gradient di[0] = -7600766579638272.000, df[0] = -5555402758422528.000, dc_hat[0] = -7684121694306304.000
Gradient do_[0] = -117917863255212032.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 196988370944.000, df[0] = 144423747584.000, dc_hat[0] = 85571158016.000
Gradient do_[0] = 12218533937152.000
Backward Time Step 3:
Gradient di[0] = 309279195136.000, df[0] = 223166857216.000, dc_hat[0] = 121183559680.000
Gradient do_[0] = 16367420964864.000
Backward Time Step 2:
Gradient di[0] = 393885843456.000, df[0] = 282897842176.000, dc_hat[0] = 201448620032.000
Gradient do_[0] = 17851953971200.000
Backward Time Step 1:
Gradient di[0] = 494579646464.000, df[0] = 341062418432.000, dc_hat[0] = 275361431552.000
Gradient do_[0] = 15748754833408.000
Backward Time Step 0:
Gradient di[0] = 598906961920.000, df[0] = 426118873088.000, dc_hat[0] = 561976705024.000
Gradient do_[0] = 9259047714816.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2534257264164864.000, df[0] = -1989507100442624.000, dc_hat[0] = -1146045585162240.000
Gradient do_[0] = -148201660479438848.000
Backward Time Step 3:
Gradient di[0] = -3981941221097472.000, df[0] = -3039629422886912.000, dc_hat[0] = -1646694756253696.000
Gradient do_[0] = -202780300990742528.000
Backward Time Step 2:
Gradient di[0] = -5220527379578880.000, df[0] = -3933476004823040.000, dc_hat[0] = -2879423653085184.000
Gradient do_[0] = -232192185494142976.000
Backward Time Step 1:
Gradient di[0] = -6592749029556224.000, df[0] = -4717095472332800.000, dc_hat[0] = -3994403068706816.000
Gradient do_[0] = -208513876172472320.000
Backward Time Step 0:
Gradient di[0] = -7701650529583104.000, df[0] = -5629139293831168.000, dc_hat[0] = -7786112135200768.000
Gradient do_[0] = -119482975107678208.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 119240450048.000, df[0] = 87422246912.000, dc_hat[0] = 51797237760.000
Gradient do_[0] = 7396046929920.000
Backward Time Step 3:
Gradient di[0] = 187208941568.000, df[0] = 135084703744.000, dc_hat[0] = 73352298496.000
Gradient do_[0] = 9907250135040.000
Backward Time Step 2:
Gradient di[0] = 238418493440.000, df[0] = 171237687296.000, dc_hat[0] = 121934430208.000
Gradient do_[0] = 10805681586176.000
Backward Time Step 1:
Gradient di[0] = 299361140736.000, df[0] = 206439432192.000, dc_hat[0] = 166668484608.000
Gradient do_[0] = 9532413575168.000
Backward Time Step 0:
Gradient di[0] = 362500554752.000, df[0] = 257917059072.000, dc_hat[0] = 340147798016.000
Gradient do_[0] = 5604226105344.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2567915043815424.000, df[0] = -2015931618295808.000, dc_hat[0] = -1161258359324672.000
Gradient do_[0] = -150169270897082368.000
Backward Time Step 3:
Gradient di[0] = -4034771734757376.000, df[0] = -3079961044844544.000, dc_hat[0] = -1668520303656960.000
Gradient do_[0] = -205469397194637312.000
Backward Time Step 2:
Gradient di[0] = -5289731113877504.000, df[0] = -3985617981538304.000, dc_hat[0] = -2917544440627200.000
Gradient do_[0] = -235268704928006144.000
Backward Time Step 1:
Gradient di[0] = -6680019006914560.000, df[0] = -4779531411914752.000, dc_hat[0] = -4047199222628352.000
Gradient do_[0] = -211272877264076800.000
Backward Time Step 0:
Gradient di[0] = -7803387030536192.000, df[0] = -5703498599497728.000, dc_hat[0] = -7888964253908992.000
Gradient do_[0] = -121061306869481472.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 39789289472.000, df[0] = 29171914752.000, dc_hat[0] = 17284067328.000
Gradient do_[0] = 2467970678784.000
Backward Time Step 3:
Gradient di[0] = 62468841472.000, df[0] = 45075816448.000, dc_hat[0] = 24476250112.000
Gradient do_[0] = 3305880158208.000
Backward Time Step 2:
Gradient di[0] = 79555682304.000, df[0] = 57138749440.000, dc_hat[0] = 40686473216.000
Gradient do_[0] = 3605623472128.000
Backward Time Step 1:
Gradient di[0] = 99888693248.000, df[0] = 68883185664.000, dc_hat[0] = 55611645952.000
Gradient do_[0] = 3180689620992.000
Backward Time Step 0:
Gradient di[0] = 120953806848.000, df[0] = 86057943040.000, dc_hat[0] = 113495474176.000
Gradient do_[0] = 1869935017984.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2601873471176704.000, df[0] = -2042592493568000.000, dc_hat[0] = -1176606424956928.000
Gradient do_[0] = -152154370421555200.000
Backward Time Step 3:
Gradient di[0] = -4088070399852544.000, df[0] = -3120649954394112.000, dc_hat[0] = -1690539124588544.000
Gradient do_[0] = -208182287517351936.000
Backward Time Step 2:
Gradient di[0] = -5359545270403072.000, df[0] = -4038220593496064.000, dc_hat[0] = -2956001846231040.000
Gradient do_[0] = -238372162396749824.000
Backward Time Step 1:
Gradient di[0] = -6768051340967936.000, df[0] = -4842514422956032.000, dc_hat[0] = -4100458696146944.000
Gradient do_[0] = -214056050431623168.000
Backward Time Step 0:
Gradient di[0] = -7906012052848640.000, df[0] = -5778506982096896.000, dc_hat[0] = -7992714557652992.000
Gradient do_[0] = -122653425476304896.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -42455273472.000, df[0] = -31126528000.000, dc_hat[0] = -18441992192.000
Gradient do_[0] = -2633315385344.000
Backward Time Step 3:
Gradient di[0] = -66653335552.000, df[0] = -48095293440.000, dc_hat[0] = -26115430400.000
Gradient do_[0] = -3527304282112.000
Backward Time Step 2:
Gradient di[0] = -84883742720.000, df[0] = -60965490688.000, dc_hat[0] = -43410608128.000
Gradient do_[0] = -3847075921920.000
Backward Time Step 1:
Gradient di[0] = -106575855616.000, df[0] = -73494577152.000, dc_hat[0] = -59333414912.000
Gradient do_[0] = -3393605599232.000
Backward Time Step 0:
Gradient di[0] = -129048444928.000, df[0] = -91817230336.000, dc_hat[0] = -121090965504.000
Gradient do_[0] = -1995077320704.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.865
c_state[0] = 0.725, h_state[0] = 0.092
Time Step 2:
i_gate[0] = 0.376, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.863
c_state[0] = 0.877, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.344, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885
c_state[0] = 0.981, h_state[0] = 0.119
Time Step 4:
i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.870
c_state[0] = 1.059, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 17143145955328.000, df[0] = 11760413179904.000, dc_hat[0] = 7172251451392.000
Gradient do_[0] = 1138026478567424.000
Backward Time Step 3:
Gradient di[0] = 26953801269248.000, df[0] = 18363577270272.000, dc_hat[0] = 10042543702016.000
Gradient do_[0] = 1491999429492736.000
Backward Time Step 2:
Gradient di[0] = 33088583237632.000, df[0] = 22609125179392.000, dc_hat[0] = 15708831350784.000
Gradient do_[0] = 1532085030354944.000
Backward Time Step 1:
Gradient di[0] = 41263004909568.000, df[0] = 27413561999360.000, dc_hat[0] = 21091753918464.000
Gradient do_[0] = 1323585306099712.000
Backward Time Step 0:
Gradient di[0] = 51407944155136.000, df[0] = 35672979144704.000, dc_hat[0] = 44819246219264.000
Gradient do_[0] = 792907736940544.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1366039330816.000, df[0] = 1001532489728.000, dc_hat[0] = 593359601664.000
Gradient do_[0] = 84726870179840.000
Backward Time Step 3:
Gradient di[0] = 2144686571520.000, df[0] = 1547564613632.000, dc_hat[0] = 840263860224.000
Gradient do_[0] = 113494435299328.000
Backward Time Step 2:
Gradient di[0] = 2731131797504.000, df[0] = 1961570598912.000, dc_hat[0] = 1396693401600.000
Gradient do_[0] = 123778398945280.000
Backward Time Step 1:
Gradient di[0] = 3429249056768.000, df[0] = 2364806004736.000, dc_hat[0] = 1909147959296.000
Gradient do_[0] = 109194711662592.000
Backward Time Step 0:
Gradient di[0] = 4153747177472.000, df[0] = 2955367415808.000, dc_hat[0] = 3897615974400.000
Gradient do_[0] = 64216555847680.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2018455851106304.000, df[0] = -1584591005548544.000, dc_hat[0] = -912734237491200.000
Gradient do_[0] = -118033028508286976.000
Backward Time Step 3:
Gradient di[0] = -3171415930961920.000, df[0] = -2420940189204480.000, dc_hat[0] = -1311394444083200.000
Gradient do_[0] = -161498037414264832.000
Backward Time Step 2:
Gradient di[0] = -4157474923872256.000, df[0] = -3132513459372032.000, dc_hat[0] = -2292916877787136.000
Gradient do_[0] = -184905969954717696.000
Backward Time Step 1:
Gradient di[0] = -5250202919239680.000, df[0] = -3756496240246784.000, dc_hat[0] = -3180800971374592.000
Gradient do_[0] = -166049465797443584.000
Backward Time Step 0:
Gradient di[0] = -6134809952780288.000, df[0] = -4483934851170304.000, dc_hat[0] = -6202088467988480.000
Gradient do_[0] = -95175083709956096.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1325226000384.000, df[0] = 971608293376.000, dc_hat[0] = 575621300224.000
Gradient do_[0] = 82194458476544.000
Backward Time Step 3:
Gradient di[0] = 2080582664192.000, df[0] = 1501308649472.000, dc_hat[0] = 815117565952.000
Gradient do_[0] = 110100085735424.000
Backward Time Step 2:
Gradient di[0] = 2649488883712.000, df[0] = 1902929379328.000, dc_hat[0] = 1354843422720.000
Gradient do_[0] = 120074945953792.000
Backward Time Step 1:
Gradient di[0] = 3326656643072.000, df[0] = 2294048096256.000, dc_hat[0] = 1851828076544.000
Gradient do_[0] = 105924605771776.000
Backward Time Step 0:
Gradient di[0] = 4029252108288.000, df[0] = 2866789744640.000, dc_hat[0] = 3780797792256.000
Gradient do_[0] = 62291869433856.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2043501147586560.000, df[0] = -1604252292087808.000, dc_hat[0] = -924043490361344.000
Gradient do_[0] = -119496169247211520.000
Backward Time Step 3:
Gradient di[0] = -3210728303493120.000, df[0] = -2450951004749824.000, dc_hat[0] = -1327599758344192.000
Gradient do_[0] = -163497018273038336.000
Backward Time Step 2:
Gradient di[0] = -4208988493185024.000, df[0] = -3171321441681408.000, dc_hat[0] = -2321166756741120.000
Gradient do_[0] = -187192421564547072.000
Backward Time Step 1:
Gradient di[0] = -5315161078366208.000, df[0] = -3802955975229440.000, dc_hat[0] = -3219832828854272.000
Gradient do_[0] = -168099230349524992.000
Backward Time Step 0:
Gradient di[0] = -6210387418546176.000, df[0] = -4539174304612352.000, dc_hat[0] = -6278494862442496.000
Gradient do_[0] = -96347592601894912.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1283910402048.000, df[0] = 941318144000.000, dc_hat[0] = 557671383040.000
Gradient do_[0] = 79631470297088.000
Backward Time Step 3:
Gradient di[0] = 2015687999488.000, df[0] = 1454484094976.000, dc_hat[0] = 789683372032.000
Gradient do_[0] = 106665353412608.000
Backward Time Step 2:
Gradient di[0] = 2566811025408.000, df[0] = 1843548651520.000, dc_hat[0] = 1312543997952.000
Gradient do_[0] = 116327209500672.000
Backward Time Step 1:
Gradient di[0] = 3222769762304.000, df[0] = 2222406238208.000, dc_hat[0] = 1793963065344.000
Gradient do_[0] = 102616155553792.000
Backward Time Step 0:
Gradient di[0] = 3903339102208.000, df[0] = 2777203081216.000, dc_hat[0] = 3662648705024.000
Gradient do_[0] = 60345267781632.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2069838692352000.000, df[0] = -1624929606828032.000, dc_hat[0] = -935946052698112.000
Gradient do_[0] = -121035777583874048.000
Backward Time Step 3:
Gradient di[0] = -3252072195555328.000, df[0] = -2482513645666304.000, dc_hat[0] = -1344676950966272.000
Gradient do_[0] = -165601243010433024.000
Backward Time Step 2:
Gradient di[0] = -4263140783030272.000, df[0] = -3212123094122496.000, dc_hat[0] = -2350992083386368.000
Gradient do_[0] = -189599544575524864.000
Backward Time Step 1:
Gradient di[0] = -5383434952245248.000, df[0] = -3851803175157760.000, dc_hat[0] = -3261131892195328.000
Gradient do_[0] = -170257640394326016.000
Backward Time Step 0:
Gradient di[0] = -6289980678733824.000, df[0] = -4597349099765760.000, dc_hat[0] = -6358960537862144.000
Gradient do_[0] = -97582412879364096.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1241858572288.000, df[0] = 910487977984.000, dc_hat[0] = 539401453568.000
Gradient do_[0] = 77022906810368.000
Backward Time Step 3:
Gradient di[0] = 1949639376896.000, df[0] = 1406826315776.000, dc_hat[0] = 763796979712.000
Gradient do_[0] = 103169560412160.000
Backward Time Step 2:
Gradient di[0] = 2482672762880.000, df[0] = 1783118692352.000, dc_hat[0] = 1269497593856.000
Gradient do_[0] = 112513295319040.000
Backward Time Step 1:
Gradient di[0] = 3117049970688.000, df[0] = 2149500715008.000, dc_hat[0] = 1735079755776.000
Gradient do_[0] = 99249412898816.000
Backward Time Step 0:
Gradient di[0] = 3775215435776.000, df[0] = 2686044078080.000, dc_hat[0] = 3542425534464.000
Gradient do_[0] = 58364486746112.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2097083012087808.000, df[0] = -1646319080833024.000, dc_hat[0] = -948258717302784.000
Gradient do_[0] = -122628248378015744.000
Backward Time Step 3:
Gradient di[0] = -3294829131857920.000, df[0] = -2515155933986816.000, dc_hat[0] = -1362339869753344.000
Gradient do_[0] = -167777502939316224.000
Backward Time Step 2:
Gradient di[0] = -4319148498747392.000, df[0] = -3254322758418432.000, dc_hat[0] = -2381839880683520.000
Gradient do_[0] = -192089302757277696.000
Backward Time Step 1:
Gradient di[0] = -5454051730784256.000, df[0] = -3902324607025152.000, dc_hat[0] = -3303847221002240.000
Gradient do_[0] = -172490095675310080.000
Backward Time Step 0:
Gradient di[0] = -6372329160441856.000, df[0] = -4657537160839168.000, dc_hat[0] = -6442211499573248.000
Gradient do_[0] = -98859942311624704.000
Epoch 500, Train Loss=0.011462, Weight Norm=12.689054
Sample Predictions at Epoch 500:
-------------------------------------------------------------
| Day | Date | Predicted Close | Actual Close | Error |
-------------------------------------------------------------
| 192 | 2024-10-11 | 57.17 | 63.87 | 6.70 |
| 193 | 2024-10-14 | 56.58 | 66.55 | 9.97 |
| 194 | 2024-10-15 | 56.78 | 66.00 | 9.22 |
| 195 | 2024-10-16 | 57.72 | 67.20 | 9.48 |
| 196 | 2024-10-17 | 57.27 | 66.76 | 9.49 |
-------------------------------------------------------------
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1199029616640.000, df[0] = 879087976448.000, dc_hat[0] = 520794767360.000
Gradient do_[0] = 74366159159296.000
Backward Time Step 3:
Gradient di[0] = 1882376503296.000, df[0] = 1358292451328.000, dc_hat[0] = 737436434432.000
Gradient do_[0] = 99609535840256.000
Backward Time Step 2:
Gradient di[0] = 2396986540032.000, df[0] = 1721577111552.000, dc_hat[0] = 1225661874176.000
Gradient do_[0] = 108629294317568.000
Backward Time Step 1:
Gradient di[0] = 3009398964224.000, df[0] = 2075263238144.000, dc_hat[0] = 1675124277248.000
Gradient do_[0] = 95821181747200.000
Backward Time Step 0:
Gradient di[0] = 3644745318400.000, df[0] = 2593215217664.000, dc_hat[0] = 3420000354304.000
Gradient do_[0] = 56347433369600.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2125064052932608.000, df[0] = -1668287435898880.000, dc_hat[0] = -960904980070400.000
Gradient do_[0] = -124263943723024384.000
Backward Time Step 3:
Gradient di[0] = -3338746246201344.000, df[0] = -2548683254005760.000, dc_hat[0] = -1380480335216640.000
Gradient do_[0] = -170012844438323200.000
Backward Time Step 2:
Gradient di[0] = -4376667237646336.000, df[0] = -3297661125918720.000, dc_hat[0] = -2413521170071552.000
Gradient do_[0] = -194646062428848128.000
Backward Time Step 1:
Gradient di[0] = -5526585675350016.000, df[0] = -3954218549379072.000, dc_hat[0] = -3347724875333632.000
Gradient do_[0] = -174783161534775296.000
Backward Time Step 0:
Gradient di[0] = -6456899213983744.000, df[0] = -4719349793292288.000, dc_hat[0] = -6527709802921984.000
Gradient do_[0] = -100171960331272192.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1154357264384.000, df[0] = 846336360448.000, dc_hat[0] = 501387296768.000
Gradient do_[0] = 71595125112832.000
Backward Time Step 3:
Gradient di[0] = 1812217856000.000, df[0] = 1307668643840.000, dc_hat[0] = 709941854208.000
Gradient do_[0] = 95896284954624.000
Backward Time Step 2:
Gradient di[0] = 2307620077568.000, df[0] = 1657392594944.000, dc_hat[0] = 1179945664512.000
Gradient do_[0] = 104578594897920.000
Backward Time Step 1:
Gradient di[0] = 2897129242624.000, df[0] = 1997841104896.000, dc_hat[0] = 1612600311808.000
Gradient do_[0] = 92245931851776.000
Backward Time Step 0:
Gradient di[0] = 3508695990272.000, df[0] = 2496416972800.000, dc_hat[0] = 3292340420608.000
Gradient do_[0] = 54244124131328.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2153544383725568.000, df[0] = -1690646901424128.000, dc_hat[0] = -973775789096960.000
Gradient do_[0] = -125928733176496128.000
Backward Time Step 3:
Gradient di[0] = -3383442360238080.000, df[0] = -2582805426995200.000, dc_hat[0] = -1398942654791680.000
Gradient do_[0] = -172287802715668480.000
Backward Time Step 2:
Gradient di[0] = -4435213815906304.000, df[0] = -3341773929709568.000, dc_hat[0] = -2445768052965376.000
Gradient do_[0] = -197248709531009024.000
Backward Time Step 1:
Gradient di[0] = -5600409720717312.000, df[0] = -4007035909701632.000, dc_hat[0] = -3392381797793792.000
Gradient do_[0] = -177116943684206592.000
Backward Time Step 0:
Gradient di[0] = -6542970358595584.000, df[0] = -4782259253018624.000, dc_hat[0] = -6614724766597120.000
Gradient do_[0] = -101507274253533184.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1108524531712.000, df[0] = 812734021632.000, dc_hat[0] = 481476542464.000
Gradient do_[0] = 68752137781248.000
Backward Time Step 3:
Gradient di[0] = 1740239536128.000, df[0] = 1255731888128.000, dc_hat[0] = 681734897664.000
Gradient do_[0] = 92086850289664.000
Backward Time Step 2:
Gradient di[0] = 2215935475712.000, df[0] = 1591542546432.000, dc_hat[0] = 1133045481472.000
Gradient do_[0] = 100422861717504.000
Backward Time Step 1:
Gradient di[0] = 2781950771200.000, df[0] = 1918413832192.000, dc_hat[0] = 1548460490752.000
Gradient do_[0] = 88578122055680.000
Backward Time Step 0:
Gradient di[0] = 3369127903232.000, df[0] = 2397114990592.000, dc_hat[0] = 3161378258944.000
Gradient do_[0] = 52086410575872.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2182527359909888.000, df[0] = -1713401235505152.000, dc_hat[0] = -986874097172480.000
Gradient do_[0] = -127622917386141696.000
Backward Time Step 3:
Gradient di[0] = -3428929821999104.000, df[0] = -2617531848196096.000, dc_hat[0] = -1417732734058496.000
Gradient do_[0] = -174603047786250240.000
Backward Time Step 2:
Gradient di[0] = -4494789575704576.000, df[0] = -3386661706661888.000, dc_hat[0] = -2478581334671360.000
Gradient do_[0] = -199896969185853440.000
Backward Time Step 1:
Gradient di[0] = -5675528698724352.000, df[0] = -4060779640782848.000, dc_hat[0] = -3437822551785472.000
Gradient do_[0] = -179491734181380096.000
Backward Time Step 0:
Gradient di[0] = -6630567290339328.000, df[0] = -4846283793629184.000, dc_hat[0] = -6703282160402432.000
Gradient do_[0] = -102866244855660544.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1060792631296.000, df[0] = 777739108352.000, dc_hat[0] = 460741181440.000
Gradient do_[0] = 65791403753472.000
Backward Time Step 3:
Gradient di[0] = 1665281294336.000, df[0] = 1201644634112.000, dc_hat[0] = 652361269248.000
Gradient do_[0] = 88119844012032.000
Backward Time Step 2:
Gradient di[0] = 2120462106624.000, df[0] = 1522971443200.000, dc_hat[0] = 1084210872320.000
Gradient do_[0] = 96095489228800.000
Backward Time Step 1:
Gradient di[0] = 2662025920512.000, df[0] = 1835712643072.000, dc_hat[0] = 1481680748544.000
Gradient do_[0] = 84759208263680.000
Backward Time Step 0:
Gradient di[0] = 3223819911168.000, df[0] = 2293729067008.000, dc_hat[0] = 3025030086656.000
Gradient do_[0] = 49839962324992.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2211894467231744.000, df[0] = -1736457559474176.000, dc_hat[0] = -1000145881661440.000
Gradient do_[0] = -129339590044549120.000
Backward Time Step 3:
Gradient di[0] = -3475027169116160.000, df[0] = -2652723736477696.000, dc_hat[0] = -1436774471565312.000
Gradient do_[0] = -176949233801232384.000
Backward Time Step 2:
Gradient di[0] = -4555171447177216.000, df[0] = -3432156953051136.000, dc_hat[0] = -2511836729573376.000
Gradient do_[0] = -202581014508208128.000
Backward Time Step 1:
Gradient di[0] = -5751650014724096.000, df[0] = -4115240362967040.000, dc_hat[0] = -3483869433036800.000
Gradient do_[0] = -181898204357328896.000
Backward Time Step 0:
Gradient di[0] = -6719309904609280.000, df[0] = -4911145852862464.000, dc_hat[0] = -6792998121635840.000
Gradient do_[0] = -104242988032458752.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1011394215936.000, df[0] = 741522407424.000, dc_hat[0] = 439281647616.000
Gradient do_[0] = 62727305297920.000
Backward Time Step 3:
Gradient di[0] = 1587710918656.000, df[0] = 1145672695808.000, dc_hat[0] = 621966000128.000
Gradient do_[0] = 84014568308736.000
Backward Time Step 2:
Gradient di[0] = 2021659115520.000, df[0] = 1452009062400.000, dc_hat[0] = 1033674817536.000
Gradient do_[0] = 91617289568256.000
Backward Time Step 1:
Gradient di[0] = 2537927475200.000, df[0] = 1750133899264.000, dc_hat[0] = 1412580900864.000
Gradient do_[0] = 80807444086784.000
Backward Time Step 0:
Gradient di[0] = 3073464598528.000, df[0] = 2186752294912.000, dc_hat[0] = 2883946545152.000
Gradient do_[0] = 47515487436800.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2241602756018176.000, df[0] = -1759781647810560.000, dc_hat[0] = -1013571882319872.000
Gradient do_[0] = -131076114041798656.000
Backward Time Step 3:
Gradient di[0] = -3521646623195136.000, df[0] = -2688314519846912.000, dc_hat[0] = -1456031091654656.000
Gradient do_[0] = -179322100153057280.000
Backward Time Step 2:
Gradient di[0] = -4616227628515328.000, df[0] = -3478160884629504.000, dc_hat[0] = -2545465518194688.000
Gradient do_[0] = -205295210500980736.000
Backward Time Step 1:
Gradient di[0] = -5828639450988544.000, df[0] = -4170322513231872.000, dc_hat[0] = -3530439763427328.000
Gradient do_[0] = -184332162323972096.000
Backward Time Step 0:
Gradient di[0] = -6809075794837504.000, df[0] = -4976755773276160.000, dc_hat[0] = -6883748096245760.000
Gradient do_[0] = -105635622588252160.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 960371359744.000, df[0] = 704114720768.000, dc_hat[0] = 417117863936.000
Gradient do_[0] = 59562539352064.000
Backward Time Step 3:
Gradient di[0] = 1507593682944.000, df[0] = 1087862472704.000, dc_hat[0] = 590572814336.000
Gradient do_[0] = 79774571560960.000
Backward Time Step 2:
Gradient di[0] = 1919617990656.000, df[0] = 1378720940032.000, dc_hat[0] = 981485617152.000
Gradient do_[0] = 86992448651264.000
Backward Time Step 1:
Gradient di[0] = 2409766060032.000, df[0] = 1661753884672.000, dc_hat[0] = 1341223337984.000
Gradient do_[0] = 76726428237824.000
Backward Time Step 0:
Gradient di[0] = 2918193561600.000, df[0] = 2076277866496.000, dc_hat[0] = 2738249793536.000
Gradient do_[0] = 45115011760128.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2271610081902592.000, df[0] = -1783340214517760.000, dc_hat[0] = -1027132906012672.000
Gradient do_[0] = -132830092786139136.000
Backward Time Step 3:
Gradient di[0] = -3568739328983040.000, df[0] = -2724266885775360.000, dc_hat[0] = -1475482193231872.000
Gradient do_[0] = -181718966782132224.000
Backward Time Step 2:
Gradient di[0] = -4677915170045952.000, df[0] = -3524639678529536.000, dc_hat[0] = -2579438709506048.000
Gradient do_[0] = -208037186342223872.000
Backward Time Step 1:
Gradient di[0] = -5906413255655424.000, df[0] = -4225964619857920.000, dc_hat[0] = -3577483882397696.000
Gradient do_[0] = -186790859302240256.000
Backward Time Step 0:
Gradient di[0] = -6899746849423360.000, df[0] = -5043027655524352.000, dc_hat[0] = -6975413435760640.000
Gradient do_[0] = -107042284507234304.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 907439964160.000, df[0] = 665307578368.000, dc_hat[0] = 394124820480.000
Gradient do_[0] = 56279372398592.000
Backward Time Step 3:
Gradient di[0] = 1424479092736.000, df[0] = 1027889233920.000, dc_hat[0] = 558006992896.000
Gradient do_[0] = 75376063676416.000
Backward Time Step 2:
Gradient di[0] = 1813766209536.000, df[0] = 1302695641088.000, dc_hat[0] = 927348621312.000
Gradient do_[0] = 82194928238592.000
Backward Time Step 1:
Gradient di[0] = 2276832837632.000, df[0] = 1570082783232.000, dc_hat[0] = 1267211436032.000
Gradient do_[0] = 72493461143552.000
Backward Time Step 0:
Gradient di[0] = 2757146443776.000, df[0] = 1961693937664.000, dc_hat[0] = 2587133476864.000
Gradient do_[0] = 42625239351296.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2301942483124224.000, df[0] = -1807154465996800.000, dc_hat[0] = -1040840898117632.000
Gradient do_[0] = -134603166955077632.000
Backward Time Step 3:
Gradient di[0] = -3616345014927360.000, df[0] = -2760610093727744.000, dc_hat[0] = -1495146164125696.000
Gradient do_[0] = -184141860913020928.000
Backward Time Step 2:
Gradient di[0] = -4740257157218304.000, df[0] = -3571611856797696.000, dc_hat[0] = -2613773751812096.000
Gradient do_[0] = -210808402320818176.000
Backward Time Step 1:
Gradient di[0] = -5985010620301312.000, df[0] = -4282196747616256.000, dc_hat[0] = -3625027559751680.000
Gradient do_[0] = -189275566602452992.000
Backward Time Step 0:
Gradient di[0] = -6991380513554432.000, df[0] = -5110002838667264.000, dc_hat[0] = -7068052659109888.000
Gradient do_[0] = -108463884322471936.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 852902412288.000, df[0] = 625322885120.000, dc_hat[0] = 370434932736.000
Gradient do_[0] = 52896678805504.000
Backward Time Step 3:
Gradient di[0] = 1338848706560.000, df[0] = 966100647936.000, dc_hat[0] = 524456460288.000
Gradient do_[0] = 70844537634816.000
Backward Time Step 2:
Gradient di[0] = 1704711684096.000, df[0] = 1224370159616.000, dc_hat[0] = 871576764416.000
Gradient do_[0] = 77252343627776.000
Backward Time Step 1:
Gradient di[0] = 2139881603072.000, df[0] = 1475641475072.000, dc_hat[0] = 1190966853632.000
Gradient do_[0] = 68132643274752.000
Backward Time Step 0:
Gradient di[0] = 2591250710528.000, df[0] = 1843659931648.000, dc_hat[0] = 2431467126784.000
Gradient do_[0] = 40060502147072.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2332552715042816.000, df[0] = -1831186821283840.000, dc_hat[0] = -1054674048253952.000
Gradient do_[0] = -136392398790983680.000
Backward Time Step 3:
Gradient di[0] = -3664381271343104.000, df[0] = -2797282403549184.000, dc_hat[0] = -1514987436638208.000
Gradient do_[0] = -186586848355680256.000
Backward Time Step 2:
Gradient di[0] = -4803176280621056.000, df[0] = -3619019168940032.000, dc_hat[0] = -2648425816391680.000
Gradient do_[0] = -213605216304496640.000
Backward Time Step 1:
Gradient di[0] = -6064333297549312.000, df[0] = -4338946687369216.000, dc_hat[0] = -3673005297238016.000
Gradient do_[0] = -191783174668288000.000
Backward Time Step 0:
Gradient di[0] = -7083855454404608.000, df[0] = -5177592202133504.000, dc_hat[0] = -7161541212241920.000
Gradient do_[0] = -109898532248354816.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 796560523264.000, df[0] = 584015085568.000, dc_hat[0] = 345961463808.000
Gradient do_[0] = 49402131513344.000
Backward Time Step 3:
Gradient di[0] = 1250385723392.000, df[0] = 902267928576.000, dc_hat[0] = 489797451776.000
Gradient do_[0] = 66163123945472.000
Backward Time Step 2:
Gradient di[0] = 1592053858304.000, df[0] = 1143456661504.000, dc_hat[0] = 813964460032.000
Gradient do_[0] = 72146567036928.000
Backward Time Step 1:
Gradient di[0] = 1998418345984.000, df[0] = 1378088779776.000, dc_hat[0] = 1112214077440.000
Gradient do_[0] = 63628220825600.000
Backward Time Step 0:
Gradient di[0] = 2419897925632.000, df[0] = 1721743441920.000, dc_hat[0] = 2270680580096.000
Gradient do_[0] = 37411404906496.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2363466547462144.000, df[0] = -1855457413038080.000, dc_hat[0] = -1068644234690560.000
Gradient do_[0] = -138199394611625984.000
Backward Time Step 3:
Gradient di[0] = -3712889974161408.000, df[0] = -2834315490623488.000, dc_hat[0] = -1535023056420864.000
Gradient do_[0] = -189055750176243712.000
Backward Time Step 2:
Gradient di[0] = -4866707973734400.000, df[0] = -3666888458502144.000, dc_hat[0] = -2683415035904000.000
Gradient do_[0] = -216429226021093376.000
Backward Time Step 1:
Gradient di[0] = -6144431216394240.000, df[0] = -4396252825387008.000, dc_hat[0] = -3721455212691456.000
Gradient do_[0] = -194315349947056128.000
Backward Time Step 0:
Gradient di[0] = -7177230727774208.000, df[0] = -5245840843079680.000, dc_hat[0] = -7255940835311616.000
Gradient do_[0] = -111347155997818880.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 737958363136.000, df[0] = 541050404864.000, dc_hat[0] = 320507609088.000
Gradient do_[0] = 45767465107456.000
Backward Time Step 3:
Gradient di[0] = 1158379995136.000, df[0] = 835878649856.000, dc_hat[0] = 453753274368.000
Gradient do_[0] = 61294468136960.000
Backward Time Step 2:
Gradient di[0] = 1474885189632.000, df[0] = 1059303456768.000, dc_hat[0] = 754054856704.000
Gradient do_[0] = 66836687224832.000
Backward Time Step 1:
Gradient di[0] = 1851295137792.000, df[0] = 1276634071040.000, dc_hat[0] = 1030331891712.000
Gradient do_[0] = 58943875317760.000
Backward Time Step 0:
Gradient di[0] = 2241710522368.000, df[0] = 1594964049920.000, dc_hat[0] = 2103480418304.000
Gradient do_[0] = 34656640565248.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2394675927318528.000, df[0] = -1879960872550400.000, dc_hat[0] = -1082751188992000.000
Gradient do_[0] = -140023905308901376.000
Backward Time Step 3:
Gradient di[0] = -3761867633721344.000, df[0] = -2871707207467008.000, dc_hat[0] = -1555261479190528.000
Gradient do_[0] = -191548892792225792.000
Backward Time Step 2:
Gradient di[0] = -4930841499140096.000, df[0] = -3715211135549440.000, dc_hat[0] = -2718762079879168.000
Gradient do_[0] = -219280878147207168.000
Backward Time Step 1:
Gradient di[0] = -6225288807579648.000, df[0] = -4454104961122304.000, dc_hat[0] = -3770424282316800.000
Gradient do_[0] = -196872384496533504.000
Backward Time Step 0:
Gradient di[0] = -7271528345370624.000, df[0] = -5314762720149504.000, dc_hat[0] = -7351273003155456.000
Gradient do_[0] = -112810073398444032.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 677679595520.000, df[0] = 496856137728.000, dc_hat[0] = 294325583872.000
Gradient do_[0] = 42028872237056.000
Backward Time Step 3:
Gradient di[0] = 1063744503808.000, df[0] = 767591383040.000, dc_hat[0] = 416679919616.000
Gradient do_[0] = 56286691459072.000
Backward Time Step 2:
Gradient di[0] = 1354373267456.000, df[0] = 972749144064.000, dc_hat[0] = 692437057536.000
Gradient do_[0] = 61375317540864.000
Backward Time Step 1:
Gradient di[0] = 1699986276352.000, df[0] = 1172293025792.000, dc_hat[0] = 946120032256.000
Gradient do_[0] = 54126289354752.000
Backward Time Step 0:
Gradient di[0] = 2058458234880.000, df[0] = 1464581226496.000, dc_hat[0] = 1931528110080.000
Gradient do_[0] = 31823579054080.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2426130120310784.000, df[0] = -1904655860760576.000, dc_hat[0] = -1096968268939264.000
Gradient do_[0] = -141862744017076224.000
Backward Time Step 3:
Gradient di[0] = -3811232645644288.000, df[0] = -2909394203312128.000, dc_hat[0] = -1575658413096960.000
Gradient do_[0] = -194061878157115392.000
Backward Time Step 2:
Gradient di[0] = -4995493641846784.000, df[0] = -3763925728362496.000, dc_hat[0] = -2754396618227712.000
Gradient do_[0] = -222155448218812416.000
Backward Time Step 1:
Gradient di[0] = -6306787422633984.000, df[0] = -4512415584616448.000, dc_hat[0] = -3819777751515136.000
Gradient do_[0] = -199449605392302080.000
Backward Time Step 0:
Gradient di[0] = -7366558791761920.000, df[0] = -5384220931260416.000, dc_hat[0] = -7447345515986944.000
Gradient do_[0] = -114284381052338176.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 615523418112.000, df[0] = 451285581824.000, dc_hat[0] = 267328716800.000
Gradient do_[0] = 38173866459136.000
Backward Time Step 3:
Gradient di[0] = 966163496960.000, df[0] = 697178587136.000, dc_hat[0] = 378452836352.000
Gradient do_[0] = 51123087998976.000
Backward Time Step 2:
Gradient di[0] = 1230113603584.000, df[0] = 883503005696.000, dc_hat[0] = 628904230912.000
Gradient do_[0] = 55744158236672.000
Backward Time Step 1:
Gradient di[0] = 1543978745856.000, df[0] = 1064711684096.000, dc_hat[0] = 859293417472.000
Gradient do_[0] = 49159117733888.000
Backward Time Step 0:
Gradient di[0] = 1869526466560.000, df[0] = 1330157453312.000, dc_hat[0] = 1754246414336.000
Gradient do_[0] = 28902709788672.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2457847380049920.000, df[0] = -1929557678489600.000, dc_hat[0] = -1111303393378304.000
Gradient do_[0] = -143716829859151872.000
Backward Time Step 3:
Gradient di[0] = -3860997089525760.000, df[0] = -2947386410270720.000, dc_hat[0] = -1596220166373376.000
Gradient do_[0] = -196595084228034560.000
Backward Time Step 2:
Gradient di[0] = -5060663328112640.000, df[0] = -3813029552586752.000, dc_hat[0] = -2790314087546880.000
Gradient do_[0] = -225053039315124224.000
Backward Time Step 1:
Gradient di[0] = -6388934040879104.000, df[0] = -4571190601449472.000, dc_hat[0] = -3869526894575616.000
Gradient do_[0] = -202047390591483904.000
Backward Time Step 0:
Gradient di[0] = -7462361795395584.000, df[0] = -5454242856828928.000, dc_hat[0] = -7544199175995392.000
Gradient do_[0] = -115770663075053568.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 551187513344.000, df[0] = 404116635648.000, dc_hat[0] = 239385608192.000
Gradient do_[0] = 34183736983552.000
Backward Time Step 3:
Gradient di[0] = 865164853248.000, df[0] = 624299474944.000, dc_hat[0] = 338888359936.000
Gradient do_[0] = 45778726813696.000
Backward Time Step 2:
Gradient di[0] = 1101507461120.000, df[0] = 791134928896.000, dc_hat[0] = 563150061568.000
Gradient do_[0] = 49916034416640.000
Backward Time Step 1:
Gradient di[0] = 1382524518400.000, df[0] = 953374539776.000, dc_hat[0] = 769435893760.000
Gradient do_[0] = 44018503254016.000
Backward Time Step 0:
Gradient di[0] = 1674003873792.000, df[0] = 1191044055040.000, dc_hat[0] = 1570780217344.000
Gradient do_[0] = 25879948296192.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2489882735804416.000, df[0] = -1954709141192704.000, dc_hat[0] = -1125781795241984.000
Gradient do_[0] = -145589598808965120.000
Backward Time Step 3:
Gradient di[0] = -3911269144854528.000, df[0] = -2985766506463232.000, dc_hat[0] = -1616991165087744.000
Gradient do_[0] = -199154060102729728.000
Backward Time Step 2:
Gradient di[0] = -5126492291858432.000, df[0] = -3862630519275520.000, dc_hat[0] = -2826595018473472.000
Gradient do_[0] = -227979939268263936.000
Backward Time Step 1:
Gradient di[0] = -6471913345908736.000, df[0] = -4630560471252992.000, dc_hat[0] = -3919778548809728.000
Gradient do_[0] = -204671546889863168.000
Backward Time Step 0:
Gradient di[0] = -7559134924767232.000, df[0] = -5524974525743104.000, dc_hat[0] = -7642033162289152.000
Gradient do_[0] = -117271994663174144.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 485126307840.000, df[0] = 355682648064.000, dc_hat[0] = 210693144576.000
Gradient do_[0] = 30086579355648.000
Backward Time Step 3:
Gradient di[0] = 761461080064.000, df[0] = 549468143616.000, dc_hat[0] = 298264788992.000
Gradient do_[0] = 40291239198720.000
Backward Time Step 2:
Gradient di[0] = 969459630080.000, df[0] = 696294768640.000, dc_hat[0] = 495636938752.000
Gradient do_[0] = 43932029288448.000
Backward Time Step 1:
Gradient di[0] = 1216759070720.000, df[0] = 839064354816.000, dc_hat[0] = 677179424768.000
Gradient do_[0] = 38740655341568.000
Backward Time Step 0:
Gradient di[0] = 1473267630080.000, df[0] = 1048221450240.000, dc_hat[0] = 1382421889024.000
Gradient do_[0] = 22776584863744.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2522201290964992.000, df[0] = -1980083405324288.000, dc_hat[0] = -1140389515886592.000
Gradient do_[0] = -147478886202998784.000
Backward Time Step 3:
Gradient di[0] = -3961979286847488.000, df[0] = -3024480536363008.000, dc_hat[0] = -1637944565694464.000
Gradient do_[0] = -201735421346971648.000
Backward Time Step 2:
Gradient di[0] = -5192896781221888.000, df[0] = -3912666082967552.000, dc_hat[0] = -2863194045415424.000
Gradient do_[0] = -230932523125833728.000
Backward Time Step 1:
Gradient di[0] = -6555629774700544.000, df[0] = -4690458085163008.000, dc_hat[0] = -3970478221819904.000
Gradient do_[0] = -207318930371379200.000
Backward Time Step 0:
Gradient di[0] = -7656749867728896.000, df[0] = -5596321448722432.000, dc_hat[0] = -7740718625849344.000
Gradient do_[0] = -118786391541809152.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 416951894016.000, df[0] = 305699258368.000, dc_hat[0] = 181083717632.000
Gradient do_[0] = 25858444099584.000
Backward Time Step 3:
Gradient di[0] = 654443806720.000, df[0] = 472245567488.000, dc_hat[0] = 256343769088.000
Gradient do_[0] = 34628494688256.000
Backward Time Step 2:
Gradient di[0] = 833197703168.000, df[0] = 598427697152.000, dc_hat[0] = 425969909760.000
Gradient do_[0] = 37757065887744.000
Backward Time Step 1:
Gradient di[0] = 1045711945728.000, df[0] = 721111941120.000, dc_hat[0] = 581983010816.000
Gradient do_[0] = 33294636810240.000
Backward Time Step 0:
Gradient di[0] = 1266143854592.000, df[0] = 900854120448.000, dc_hat[0] = 1188070031360.000
Gradient do_[0] = 19574470737920.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2554768417357824.000, df[0] = -2005652419379200.000, dc_hat[0] = -1155108368809984.000
Gradient do_[0] = -149382673406623744.000
Backward Time Step 3:
Gradient di[0] = -4013080002428928.000, df[0] = -3063492798054400.000, dc_hat[0] = -1659057819615232.000
Gradient do_[0] = -204336642519990272.000
Backward Time Step 2:
Gradient di[0] = -5259808076726272.000, df[0] = -3963081751265280.000, dc_hat[0] = -2900070366183424.000
Gradient do_[0] = -233907509532819456.000
Backward Time Step 1:
Gradient di[0] = -6639972731846656.000, df[0] = -4750804523155456.000, dc_hat[0] = -4021555583516672.000
Gradient do_[0] = -209986173781671936.000
Backward Time Step 0:
Gradient di[0] = -7755100860710912.000, df[0] = -5668205779484672.000, dc_hat[0] = -7840148192493568.000
Gradient do_[0] = -120312204443516928.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 346640384000.000, df[0] = 254148771840.000, dc_hat[0] = 150546284544.000
Gradient do_[0] = 21497789808640.000
Backward Time Step 3:
Gradient di[0] = 544075317248.000, df[0] = 392604385280.000, dc_hat[0] = 213110964224.000
Gradient do_[0] = 28788463239168.000
Backward Time Step 2:
Gradient di[0] = 692673642496.000, df[0] = 497499439104.000, dc_hat[0] = 354125086720.000
Gradient do_[0] = 31389011410944.000
Backward Time Step 1:
Gradient di[0] = 869324488704.000, df[0] = 599476928512.000, dc_hat[0] = 483815424000.000
Gradient do_[0] = 27678591680512.000
Backward Time Step 0:
Gradient di[0] = 1052558295040.000, df[0] = 748889178112.000, dc_hat[0] = 987654717440.000
Gradient do_[0] = 16272456679424.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2587629480574976.000, df[0] = -2031452690579456.000, dc_hat[0] = -1169961171025920.000
Gradient do_[0] = -151303692019040256.000
Backward Time Step 3:
Gradient di[0] = -4064643232301056.000, df[0] = -3102859126112256.000, dc_hat[0] = -1680362065362944.000
Gradient do_[0] = -206961451653398528.000
Backward Time Step 2:
Gradient di[0] = -5327324962619392.000, df[0] = -4013954833580032.000, dc_hat[0] = -2937280889094144.000
Gradient do_[0] = -236909485514293248.000
Backward Time Step 1:
Gradient di[0] = -6725060328947712.000, df[0] = -4811683000221696.000, dc_hat[0] = -4073084722085888.000
Gradient do_[0] = -212677005152354304.000
Backward Time Step 0:
Gradient di[0] = -7854317289603072.000, df[0] = -5740723081052160.000, dc_hat[0] = -7940452321853440.000
Gradient do_[0] = -121851443412992000.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 274768936960.000, df[0] = 201454534656.000, dc_hat[0] = 119331733504.000
Gradient do_[0] = 17040438984704.000
Backward Time Step 3:
Gradient di[0] = 431261941760.000, df[0] = 311198744576.000, dc_hat[0] = 168921235456.000
Gradient do_[0] = 22819117203456.000
Backward Time Step 2:
Gradient di[0] = 549040553984.000, df[0] = 394338009088.000, dc_hat[0] = 280691769344.000
Gradient do_[0] = 24880099622912.000
Backward Time Step 1:
Gradient di[0] = 689044193280.000, df[0] = 475157626880.000, dc_hat[0] = 383481446400.000
Gradient do_[0] = 21938600673280.000
Backward Time Step 0:
Gradient di[0] = 834265088000.000, df[0] = 593574821888.000, dc_hat[0] = 782822080512.000
Gradient do_[0] = 12897663057920.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2620828772466688.000, df[0] = -2057518041792512.000, dc_hat[0] = -1184965639274496.000
Gradient do_[0] = -153244364401803264.000
Backward Time Step 3:
Gradient di[0] = -4116729911312384.000, df[0] = -3142623812386816.000, dc_hat[0] = -1701881864781824.000
Gradient do_[0] = -209612889584041984.000
Backward Time Step 2:
Gradient di[0] = -5395536022601728.000, df[0] = -4065350828163072.000, dc_hat[0] = -2974873395658752.000
Gradient do_[0] = -239942247821344768.000
Backward Time Step 1:
Gradient di[0] = -6811036984279040.000, df[0] = -4873197669318656.000, dc_hat[0] = -4125150463131648.000
Gradient do_[0] = -215395891249414144.000
Backward Time Step 0:
Gradient di[0] = -7954566121259008.000, df[0] = -5813995223121920.000, dc_hat[0] = -8041801202008064.000
Gradient do_[0] = -123406694020546560.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 200138260480.000, df[0] = 146737102848.000, dc_hat[0] = 86919176192.000
Gradient do_[0] = 12411996209152.000
Backward Time Step 3:
Gradient di[0] = 314121420800.000, df[0] = 226670460928.000, dc_hat[0] = 123037310976.000
Gradient do_[0] = 16620859686912.000
Backward Time Step 2:
Gradient di[0] = 399903358976.000, df[0] = 287223218176.000, dc_hat[0] = 204445548544.000
Gradient do_[0] = 18121800810496.000
Backward Time Step 1:
Gradient di[0] = 501864398848.000, df[0] = 346080346112.000, dc_hat[0] = 279307780096.000
Gradient do_[0] = 15978943479808.000
Backward Time Step 0:
Gradient di[0] = 607627182080.000, df[0] = 432323264512.000, dc_hat[0] = 570159267840.000
Gradient do_[0] = 9393861033984.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2654188320325632.000, df[0] = -2083709691887616.000, dc_hat[0] = -1200043121967104.000
Gradient do_[0] = -155194537252225024.000
Backward Time Step 3:
Gradient di[0] = -4169071604006912.000, df[0] = -3182583651237888.000, dc_hat[0] = -1723507964641280.000
Gradient do_[0] = -212277212416573440.000
Backward Time Step 2:
Gradient di[0] = -5464070278873088.000, df[0] = -4116989756833792.000, dc_hat[0] = -3012643874930688.000
Gradient do_[0] = -242989441218510848.000
Backward Time Step 1:
Gradient di[0] = -6897421124632576.000, df[0] = -4935004396191744.000, dc_hat[0] = -4177464238538752.000
Gradient do_[0] = -218127576349016064.000
Backward Time Step 0:
Gradient di[0] = -8055286325575680.000, df[0] = -5887610962575360.000, dc_hat[0] = -8143625212919808.000
Gradient do_[0] = -124969246072504320.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 123744026624.000, df[0] = 90726604800.000, dc_hat[0] = 53741150208.000
Gradient do_[0] = 7674214219776.000
Backward Time Step 3:
Gradient di[0] = 194215936000.000, df[0] = 140146737152.000, dc_hat[0] = 76071247872.000
Gradient do_[0] = 10276350984192.000
Backward Time Step 2:
Gradient di[0] = 247249698816.000, df[0] = 177582653440.000, dc_hat[0] = 126402412544.000
Gradient do_[0] = 11204199186432.000
Backward Time Step 1:
Gradient di[0] = 310281863168.000, df[0] = 213967028224.000, dc_hat[0] = 172684066816.000
Gradient do_[0] = 9879112646656.000
Backward Time Step 0:
Gradient di[0] = 375664803840.000, df[0] = 267283349504.000, dc_hat[0] = 352500318208.000
Gradient do_[0] = 5807744221184.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2687958842867712.000, df[0] = -2110224672489472.000, dc_hat[0] = -1215305556688896.000
Gradient do_[0] = -157168590120812544.000
Backward Time Step 3:
Gradient di[0] = -4222054320570368.000, df[0] = -3223033653231616.000, dc_hat[0] = -1745398070771712.000
Gradient do_[0] = -214974211360292864.000
Backward Time Step 2:
Gradient di[0] = -5533445274992640.000, df[0] = -4169262998487040.000, dc_hat[0] = -3050877942235136.000
Gradient do_[0] = -246073983651282944.000
Backward Time Step 1:
Gradient di[0] = -6984863974424576.000, df[0] = -4997567574179840.000, dc_hat[0] = -4230418500943872.000
Gradient do_[0] = -220892865272741888.000
Backward Time Step 0:
Gradient di[0] = -8157238648635392.000, df[0] = -5962128645160960.000, dc_hat[0] = -8246695838089216.000
Gradient do_[0] = -126550945088667648.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 45274275840.000, df[0] = 33194215424.000, dc_hat[0] = 19662186496.000
Gradient do_[0] = 2807757012992.000
Backward Time Step 3:
Gradient di[0] = 71056785408.000, df[0] = 51274842112.000, dc_hat[0] = 27831568384.000
Gradient do_[0] = 3759739240448.000
Backward Time Step 2:
Gradient di[0] = 90458726400.000, df[0] = 64970395648.000, dc_hat[0] = 46245285888.000
Gradient do_[0] = 4099152281600.000
Backward Time Step 1:
Gradient di[0] = 113516748800.000, df[0] = 78279933952.000, dc_hat[0] = 63176474624.000
Gradient do_[0] = 3614275272704.000
Backward Time Step 0:
Gradient di[0] = 137435185152.000, df[0] = 97784332288.000, dc_hat[0] = 128960552960.000
Gradient do_[0] = 2124735447040.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2721959951466496.000, df[0] = -2136919907500032.000, dc_hat[0] = -1230672949673984.000
Gradient do_[0] = -159156197906186240.000
Backward Time Step 3:
Gradient di[0] = -4275404256837632.000, df[0] = -3263763096535040.000, dc_hat[0] = -1767438903410688.000
Gradient do_[0] = -217689730202992640.000
Backward Time Step 2:
Gradient di[0] = -5603300233707520.000, df[0] = -4221897285828608.000, dc_hat[0] = -3089375344721920.000
Gradient do_[0] = -249179760402366464.000
Backward Time Step 1:
Gradient di[0] = -7072899529703424.000, df[0] = -5060555417059328.000, dc_hat[0] = -4283731124682752.000
Gradient do_[0] = -223676914613616640.000
Backward Time Step 0:
Gradient di[0] = -8259883535171584.000, df[0] = -6037152060145664.000, dc_hat[0] = -8350467079798784.000
Gradient do_[0] = -128143372933136384.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -35373416448.000, df[0] = -25935124480.000, dc_hat[0] = -15362240512.000
Gradient do_[0] = -2193729650688.000
Backward Time Step 3:
Gradient di[0] = -55516766208.000, df[0] = -40061165568.000, dc_hat[0] = -21744658432.000
Gradient do_[0] = -2937477136384.000
Backward Time Step 2:
Gradient di[0] = -70674472960.000, df[0] = -50760744960.000, dc_hat[0] = -36130746368.000
Gradient do_[0] = -3202617180160.000
Backward Time Step 1:
Gradient di[0] = -88687484928.000, df[0] = -61157933056.000, dc_hat[0] = -49357950976.000
Gradient do_[0] = -2823732068352.000
Backward Time Step 0:
Gradient di[0] = -107372666880.000, df[0] = -76395036672.000, dc_hat[0] = -100751769600.000
Gradient do_[0] = -1659971698688.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.866
c_state[0] = 0.725, h_state[0] = 0.092
Time Step 2:
i_gate[0] = 0.376, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.863
c_state[0] = 0.877, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.344, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885
c_state[0] = 0.981, h_state[0] = 0.119
Time Step 4:
i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.870
c_state[0] = 1.059, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 18135912546304.000, df[0] = 12441803030528.000, dc_hat[0] = 7585777319936.000
Gradient do_[0] = 1203732901527552.000
Backward Time Step 3:
Gradient di[0] = 28505244958720.000, df[0] = 19421506568192.000, dc_hat[0] = 10616921128960.000
Gradient do_[0] = 1577605643894784.000
Backward Time Step 2:
Gradient di[0] = 34977192345600.000, df[0] = 23900119367680.000, dc_hat[0] = 16599175135232.000
Gradient do_[0] = 1619275819253760.000
Backward Time Step 1:
Gradient di[0] = 43587513352192.000, df[0] = 28957470621696.000, dc_hat[0] = 22271919915008.000
Gradient do_[0] = 1398003667566592.000
Backward Time Step 0:
Gradient di[0] = 54296229969920.000, df[0] = 37677218922496.000, dc_hat[0] = 47337355345920.000
Gradient do_[0] = 837456211476480.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.690, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.973, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1369055821824.000, df[0] = 1003772379136.000, dc_hat[0] = 594535514112.000
Gradient do_[0] = 84901244174336.000
Backward Time Step 3:
Gradient di[0] = 2148711268352.000, df[0] = 1550535753728.000, dc_hat[0] = 841555574784.000
Gradient do_[0] = 113688983896064.000
Backward Time Step 2:
Gradient di[0] = 2735230156800.000, df[0] = 1964540297216.000, dc_hat[0] = 1398283173888.000
Gradient do_[0] = 123945894281216.000
Backward Time Step 1:
Gradient di[0] = 3432536080384.000, df[0] = 2367039995904.000, dc_hat[0] = 1910336651264.000
Gradient do_[0] = 109288949284864.000
Backward Time Step 0:
Gradient di[0] = 4157123067904.000, df[0] = 2957769179136.000, dc_hat[0] = 3900783722496.000
Gradient do_[0] = 64268745572352.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2111750862274560.000, df[0] = -1657874287689728.000, dc_hat[0] = -954738816319488.000
Gradient do_[0] = -123472827926970368.000
Backward Time Step 3:
Gradient di[0] = -3316958413979648.000, df[0] = -2532127497256960.000, dc_hat[0] = -1371143613186048.000
Gradient do_[0] = -168884728328355840.000
Backward Time Step 2:
Gradient di[0] = -4346831173582848.000, df[0] = -3275201131315200.000, dc_hat[0] = -2396550412107776.000
Gradient do_[0] = -193302321780752384.000
Backward Time Step 1:
Gradient di[0] = -5487022047232000.000, df[0] = -3925883576385536.000, dc_hat[0] = -3323234434940928.000
Gradient do_[0] = -173524220720971776.000
Backward Time Step 0:
Gradient di[0] = -6409828519903232.000, df[0] = -4684946031509504.000, dc_hat[0] = -6480122639024128.000
Gradient do_[0] = -99441712811737088.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1329353064448.000, df[0] = 974662270976.000, dc_hat[0] = 577283883008.000
Gradient do_[0] = 82438072041472.000
Backward Time Step 3:
Gradient di[0] = 2086373163008.000, df[0] = 1505551974400.000, dc_hat[0] = 817110253568.000
Gradient do_[0] = 110388628684800.000
Backward Time Step 2:
Gradient di[0] = 2655855837184.000, df[0] = 1907527778304.000, dc_hat[0] = 1357611663360.000
Gradient do_[0] = 120345914769408.000
Backward Time Step 1:
Gradient di[0] = 3332848222208.000, df[0] = 2298285916160.000, dc_hat[0] = 1854656741376.000
Gradient do_[0] = 106111730450432.000
Backward Time Step 0:
Gradient di[0] = 4036179787776.000, df[0] = 2871718838272.000, dc_hat[0] = 3787297914880.000
Gradient do_[0] = 62398970986496.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2137065667952640.000, df[0] = -1677747504021504.000, dc_hat[0] = -966166918987776.000
Gradient do_[0] = -124951567987113984.000
Backward Time Step 3:
Gradient di[0] = -3356683103371264.000, df[0] = -2562454261334016.000, dc_hat[0] = -1387512270422016.000
Gradient do_[0] = -170904273490542592.000
Backward Time Step 2:
Gradient di[0] = -4398863897067520.000, df[0] = -3314400224083968.000, dc_hat[0] = -2425072484614144.000
Gradient do_[0] = -195611485177643008.000
Backward Time Step 1:
Gradient di[0] = -5552604587229184.000, df[0] = -3972788914225152.000, dc_hat[0] = -3362625727496192.000
Gradient do_[0] = -175593467244707840.000
Backward Time Step 0:
Gradient di[0] = -6486109286563840.000, df[0] = -4740700075720704.000, dc_hat[0] = -6557240387436544.000
Gradient do_[0] = -100625139510542336.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1289449111552.000, df[0] = 945404116992.000, dc_hat[0] = 559945220096.000
Gradient do_[0] = 79962459602944.000
Backward Time Step 3:
Gradient di[0] = 2023719436288.000, df[0] = 1460340916224.000, dc_hat[0] = 792543428608.000
Gradient do_[0] = 107071697584128.000
Backward Time Step 2:
Gradient di[0] = 2576081485824.000, df[0] = 1850228473856.000, dc_hat[0] = 1316741316608.000
Gradient do_[0] = 116728059133952.000
Backward Time Step 1:
Gradient di[0] = 3232657309696.000, df[0] = 2229186330624.000, dc_hat[0] = 1798712066048.000
Gradient do_[0] = 102918766198784.000
Backward Time Step 0:
Gradient di[0] = 3914639867904.000, df[0] = 2785243824128.000, dc_hat[0] = 3673252691968.000
Gradient do_[0] = 60519977320448.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2163623195574272.000, df[0] = -1698596617453568.000, dc_hat[0] = -978157662371840.000
Gradient do_[0] = -126502746965671936.000
Backward Time Step 3:
Gradient di[0] = -3398355291996160.000, df[0] = -2594268694708224.000, dc_hat[0] = -1404687307767808.000
Gradient do_[0] = -173023066757005312.000
Backward Time Step 2:
Gradient di[0] = -4453450247045120.000, df[0] = -3355524267507712.000, dc_hat[0] = -2455002769522688.000
Gradient do_[0] = -198034018531278848.000
Backward Time Step 1:
Gradient di[0] = -5621405131472896.000, df[0] = -4021996354535424.000, dc_hat[0] = -3403962103365632.000
Gradient do_[0] = -177764298334928896.000
Backward Time Step 0:
Gradient di[0] = -6566135264706560.000, df[0] = -4799190550970368.000, dc_hat[0] = -6638143612649472.000
Gradient do_[0] = -101866659936993280.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1248551108608.000, df[0] = 915419430912.000, dc_hat[0] = 542182178816.000
Gradient do_[0] = 77425954258944.000
Backward Time Step 3:
Gradient di[0] = 1959504510976.000, df[0] = 1414004736000.000, dc_hat[0] = 767389138944.000
Gradient do_[0] = 103673824804864.000
Backward Time Step 2:
Gradient di[0] = 2494307762176.000, df[0] = 1791496945664.000, dc_hat[0] = 1274935508992.000
Gradient do_[0] = 113022332829696.000
Backward Time Step 1:
Gradient di[0] = 3129966854144.000, df[0] = 2158372192256.000, dc_hat[0] = 1741570834432.000
Gradient do_[0] = 99649306230784.000
Backward Time Step 0:
Gradient di[0] = 3790228160512.000, df[0] = 2696725659648.000, dc_hat[0] = 3556512366592.000
Gradient do_[0] = 58596586946560.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2190970460307456.000, df[0] = -1720068098490368.000, dc_hat[0] = -990516699201536.000
Gradient do_[0] = -128101316613373952.000
Backward Time Step 3:
Gradient di[0] = -3441261746847744.000, df[0] = -2627025604968448.000, dc_hat[0] = -1422411564056576.000
Gradient do_[0] = -175206903008198656.000
Backward Time Step 2:
Gradient di[0] = -4509624392744960.000, df[0] = -3397850901774336.000, dc_hat[0] = -2485956330389504.000
Gradient do_[0] = -200531456114556928.000
Backward Time Step 1:
Gradient di[0] = -5692195319316480.000, df[0] = -4072645561679872.000, dc_hat[0] = -3446825071673344.000
Gradient do_[0] = -180002886829211648.000
Backward Time Step 0:
Gradient di[0] = -6648679167426560.000, df[0] = -4859521956577280.000, dc_hat[0] = -6721592142856192.000
Gradient do_[0] = -103147230206099456.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1206771253248.000, df[0] = 884788166656.000, dc_hat[0] = 524036177920.000
Gradient do_[0] = 74834771968000.000
Backward Time Step 3:
Gradient di[0] = 1893906907136.000, df[0] = 1366670704640.000, dc_hat[0] = 741693456384.000
Gradient do_[0] = 100202719477760.000
Backward Time Step 2:
Gradient di[0] = 2410766925824.000, df[0] = 1731496116224.000, dc_hat[0] = 1232226746368.000
Gradient do_[0] = 109236587593728.000
Backward Time Step 1:
Gradient di[0] = 3025061281792.000, df[0] = 2086031327232.000, dc_hat[0] = 1683197001728.000
Gradient do_[0] = 96309407121408.000
Backward Time Step 0:
Gradient di[0] = 3663137603584.000, df[0] = 2606301446144.000, dc_hat[0] = 3437258604544.000
Gradient do_[0] = 56631773626368.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2218989082116096.000, df[0] = -1742065981456384.000, dc_hat[0] = -1003178464116736.000
Gradient do_[0] = -129739090722553856.000
Backward Time Step 3:
Gradient di[0] = -3485218321203200.000, df[0] = -2660584868806656.000, dc_hat[0] = -1440570148913152.000
Gradient do_[0] = -177444288911638528.000
Backward Time Step 2:
Gradient di[0] = -4567170512060416.000, df[0] = -3441210475675648.000, dc_hat[0] = -2517663389581312.000
Gradient do_[0] = -203089813513961472.000
Backward Time Step 1:
Gradient di[0] = -5764719600205824.000, df[0] = -4124535209066496.000, dc_hat[0] = -3490737085743104.000
Gradient do_[0] = -182296296286060544.000
Backward Time Step 0:
Gradient di[0] = -6733257810903040.000, df[0] = -4921340494610432.000, dc_hat[0] = -6807099036139520.000
Gradient do_[0] = -104459377074765824.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1163511070720.000, df[0] = 853071364096.000, dc_hat[0] = 505247629312.000
Gradient do_[0] = 72151818305536.000
Backward Time Step 3:
Gradient di[0] = 1825985658880.000, df[0] = 1317659869184.000, dc_hat[0] = 715087806464.000
Gradient do_[0] = 96608754597888.000
Backward Time Step 2:
Gradient di[0] = 2324278280192.000, df[0] = 1669377818624.000, dc_hat[0] = 1188011442176.000
Gradient do_[0] = 105317346050048.000
Backward Time Step 1:
Gradient di[0] = 2916461051904.000, df[0] = 2011141898240.000, dc_hat[0] = 1622767173632.000
Gradient do_[0] = 92851857784832.000
Backward Time Step 0:
Gradient di[0] = 3531576967168.000, df[0] = 2512696639488.000, dc_hat[0] = 3313810538496.000
Gradient do_[0] = 54597859147776.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2247540950958080.000, df[0] = -1764483294822400.000, dc_hat[0] = -1016081418289152.000
Gradient do_[0] = -131408046294302720.000
Backward Time Step 3:
Gradient di[0] = -3530011072004096.000, df[0] = -2694782472159232.000, dc_hat[0] = -1459073673330688.000
Gradient do_[0] = -179724263710785536.000
Backward Time Step 2:
Gradient di[0] = -4625817216745472.000, df[0] = -3485400320442368.000, dc_hat[0] = -2549979260387328.000
Gradient do_[0] = -205697202260017152.000
Backward Time Step 1:
Gradient di[0] = -5838628471177216.000, df[0] = -4177415114850304.000, dc_hat[0] = -3535486081564672.000
Gradient do_[0] = -184633376970375168.000
Backward Time Step 0:
Gradient di[0] = -6819443309019136.000, df[0] = -4984333706199040.000, dc_hat[0] = -6894229427060736.000
Gradient do_[0] = -105796460523552768.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1118650236928.000, df[0] = 820180942848.000, dc_hat[0] = 485764169728.000
Gradient do_[0] = 69369627410432.000
Backward Time Step 3:
Gradient di[0] = 1755556216832.000, df[0] = 1266838798336.000, dc_hat[0] = 687500623872.000
Gradient do_[0] = 92882098716672.000
Backward Time Step 2:
Gradient di[0] = 2234597244928.000, df[0] = 1604967202816.000, dc_hat[0] = 1142165209088.000
Gradient do_[0] = 101253409406976.000
Backward Time Step 1:
Gradient di[0] = 2803861815296.000, df[0] = 1933495631872.000, dc_hat[0] = 1560113577984.000
Gradient do_[0] = 89266977767424.000
Backward Time Step 0:
Gradient di[0] = 3395176890368.000, df[0] = 2415648833536.000, dc_hat[0] = 3185821089792.000
Gradient do_[0] = 52489126674432.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2276485675089920.000, df[0] = -1787208906309632.000, dc_hat[0] = -1029162479386624.000
Gradient do_[0] = -133100005710888960.000
Backward Time Step 3:
Gradient di[0] = -3575421224353792.000, df[0] = -2729450911301632.000, dc_hat[0] = -1477832479866880.000
Gradient do_[0] = -182035626130931712.000
Backward Time Step 2:
Gradient di[0] = -4685270301540352.000, df[0] = -3530196560904192.000, dc_hat[0] = -2582738049695744.000
Gradient do_[0] = -208340393853452288.000
Backward Time Step 1:
Gradient di[0] = -5913555249397760.000, df[0] = -4231023286026240.000, dc_hat[0] = -3580851673628672.000
Gradient do_[0] = -187002704269148160.000
Backward Time Step 0:
Gradient di[0] = -6906815828721664.000, df[0] = -5048193964310528.000, dc_hat[0] = -6982560261341184.000
Gradient do_[0] = -107151952202170368.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1072428023808.000, df[0] = 786292408320.000, dc_hat[0] = 465689772032.000
Gradient do_[0] = 66503072284672.000
Backward Time Step 3:
Gradient di[0] = 1682990956544.000, df[0] = 1214476320768.000, dc_hat[0] = 659077529600.000
Gradient do_[0] = 89042507005952.000
Backward Time Step 2:
Gradient di[0] = 2142200528896.000, df[0] = 1538605318144.000, dc_hat[0] = 1094931972096.000
Gradient do_[0] = 97066478993408.000
Backward Time Step 1:
Gradient di[0] = 2687863095296.000, df[0] = 1853504618496.000, dc_hat[0] = 1495568220160.000
Gradient do_[0] = 85573876318208.000
Backward Time Step 0:
Gradient di[0] = 3254669017088.000, df[0] = 2315678384128.000, dc_hat[0] = 3053977337856.000
Gradient do_[0] = 50316884049920.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2305829160091648.000, df[0] = -1810247379320832.000, dc_hat[0] = -1042422385606656.000
Gradient do_[0] = -134815235260284928.000
Backward Time Step 3:
Gradient di[0] = -3621461931589632.000, df[0] = -2764601192087552.000, dc_hat[0] = -1496850997706752.000
Gradient do_[0] = -184378977467498496.000
Backward Time Step 2:
Gradient di[0] = -4745546946314240.000, df[0] = -3575613961011200.000, dc_hat[0] = -2615950494924800.000
Gradient do_[0] = -211020281647464448.000
Backward Time Step 1:
Gradient di[0] = -5989520335962112.000, df[0] = -4285374218108928.000, dc_hat[0] = -3626846209966080.000
Gradient do_[0] = -189404896657670144.000
Backward Time Step 0:
Gradient di[0] = -6995392013008896.000, df[0] = -5112934690717696.000, dc_hat[0] = -7072107645108224.000
Gradient do_[0] = -108526118398590976.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1024842137600.000, df[0] = 751403728896.000, dc_hat[0] = 445023289344.000
Gradient do_[0] = 63551909658624.000
Backward Time Step 3:
Gradient di[0] = 1608289484800.000, df[0] = 1160572174336.000, dc_hat[0] = 629818458112.000
Gradient do_[0] = 85089920745472.000
Backward Time Step 2:
Gradient di[0] = 2047086821376.000, df[0] = 1470292426752.000, dc_hat[0] = 1046310354944.000
Gradient do_[0] = 92756445757440.000
Backward Time Step 1:
Gradient di[0] = 2568454930432.000, df[0] = 1771162828800.000, dc_hat[0] = 1429125595136.000
Gradient do_[0] = 81772251447296.000
Backward Time Step 0:
Gradient di[0] = 3110030802944.000, df[0] = 2212769038336.000, dc_hat[0] = 2918258049024.000
Gradient do_[0] = 48080795729920.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2335488862060544.000, df[0] = -1833534423564288.000, dc_hat[0] = -1055826240339968.000
Gradient do_[0] = -136548907399249920.000
Backward Time Step 3:
Gradient di[0] = -3667985285775360.000, df[0] = -2800120034754560.000, dc_hat[0] = -1516068694654976.000
Gradient do_[0] = -186747016276082688.000
Backward Time Step 2:
Gradient di[0] = -4806454414409728.000, df[0] = -3621507834052608.000, dc_hat[0] = -2649513785294848.000
Gradient do_[0] = -213728206987984896.000
Backward Time Step 1:
Gradient di[0] = -6066275696508928.000, df[0] = -4340290743697408.000, dc_hat[0] = -3673320172027904.000
Gradient do_[0] = -191832068575985664.000
Backward Time Step 0:
Gradient di[0] = -7084895910232064.000, df[0] = -5178352948215808.000, dc_hat[0] = -7162593479229440.000
Gradient do_[0] = -109914664145518592.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 975017017344.000, df[0] = 714873372672.000, dc_hat[0] = 423385137152.000
Gradient do_[0] = 60461978484736.000
Backward Time Step 3:
Gradient di[0] = 1530075283456.000, df[0] = 1104132964352.000, dc_hat[0] = 599183523840.000
Gradient do_[0] = 80951468097536.000
Backward Time Step 2:
Gradient di[0] = 1947503034368.000, df[0] = 1398768795648.000, dc_hat[0] = 995404546048.000
Gradient do_[0] = 88243894747136.000
Backward Time Step 1:
Gradient di[0] = 2443452088320.000, df[0] = 1684962541568.000, dc_hat[0] = 1359569747968.000
Gradient do_[0] = 77792460931072.000
Backward Time Step 0:
Gradient di[0] = 2958628487168.000, df[0] = 2105047252992.000, dc_hat[0] = 2776191467520.000
Gradient do_[0] = 45740130828288.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2365378344779776.000, df[0] = -1857001722216448.000, dc_hat[0] = -1069333644050432.000
Gradient do_[0] = -138295979836178432.000
Backward Time Step 3:
Gradient di[0] = -3714878812454912.000, df[0] = -2835921539956736.000, dc_hat[0] = -1535439936684032.000
Gradient do_[0] = -189133763962208256.000
Backward Time Step 2:
Gradient di[0] = -4867848287551488.000, df[0] = -3667765705572352.000, dc_hat[0] = -2683339068669952.000
Gradient do_[0] = -216457538445508608.000
Backward Time Step 1:
Gradient di[0] = -6143639868669952.000, df[0] = -4395643208466432.000, dc_hat[0] = -3720162964406272.000
Gradient do_[0] = -194278361688702976.000
Backward Time Step 0:
Gradient di[0] = -7175097739640832.000, df[0] = -5244281769951232.000, dc_hat[0] = -7253784224858112.000
Gradient do_[0] = -111314058979835904.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 923929739264.000, df[0] = 677417451520.000, dc_hat[0] = 401198710784.000
Gradient do_[0] = 57293727072256.000
Backward Time Step 3:
Gradient di[0] = 1449882812416.000, df[0] = 1046266118144.000, dc_hat[0] = 567775330304.000
Gradient do_[0] = 76708417896448.000
Backward Time Step 2:
Gradient di[0] = 1845408169984.000, df[0] = 1325441089536.000, dc_hat[0] = 943215869952.000
Gradient do_[0] = 83617585823744.000
Backward Time Step 1:
Gradient di[0] = 2315297226752.000, df[0] = 1596589211648.000, dc_hat[0] = 1288261468160.000
Gradient do_[0] = 73712342663168.000
Backward Time Step 0:
Gradient di[0] = 2803409879040.000, df[0] = 1994609917952.000, dc_hat[0] = 2630543736832.000
Gradient do_[0] = 43340460457984.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2395648737411072.000, df[0] = -1880768192184320.000, dc_hat[0] = -1083013249105920.000
Gradient do_[0] = -140065343153373184.000
Backward Time Step 3:
Gradient di[0] = -3762363970879488.000, df[0] = -2872174822031360.000, dc_hat[0] = -1555055454978048.000
Gradient do_[0] = -191550679498620928.000
Backward Time Step 2:
Gradient di[0] = -4930008275484672.000, df[0] = -3714603397677056.000, dc_hat[0] = -2717591701291008.000
Gradient do_[0] = -219221057842708480.000
Backward Time Step 1:
Gradient di[0] = -6221973092827136.000, df[0] = -4451688505147392.000, dc_hat[0] = -3767590677643264.000
Gradient do_[0] = -196755423947128832.000
Backward Time Step 0:
Gradient di[0] = -7266433440415744.000, df[0] = -5311038983503872.000, dc_hat[0] = -7346121726754816.000
Gradient do_[0] = -112731037410263040.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 871019184128.000, df[0] = 638624727040.000, dc_hat[0] = 378221068288.000
Gradient do_[0] = 54012472721408.000
Backward Time Step 3:
Gradient di[0] = 1366833364992.000, df[0] = 986337574912.000, dc_hat[0] = 535248994304.000
Gradient do_[0] = 72314263699456.000
Backward Time Step 2:
Gradient di[0] = 1739676712960.000, df[0] = 1249501773824.000, dc_hat[0] = 889169444864.000
Gradient do_[0] = 78826541416448.000
Backward Time Step 1:
Gradient di[0] = 2182589579264.000, df[0] = 1505075920896.000, dc_hat[0] = 1214419697664.000
Gradient do_[0] = 69487294414848.000
Backward Time Step 0:
Gradient di[0] = 2642684149760.000, df[0] = 1880254578688.000, dc_hat[0] = 2479728885760.000
Gradient do_[0] = 40855658299392.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2426150789840896.000, df[0] = -1904716661391360.000, dc_hat[0] = -1096796872900608.000
Gradient do_[0] = -141848269977288704.000
Backward Time Step 3:
Gradient di[0] = -3810214201524224.000, df[0] = -2908706471673856.000, dc_hat[0] = -1574820760256512.000
Gradient do_[0] = -193986183653490688.000
Backward Time Step 2:
Gradient di[0] = -4992653057851392.000, df[0] = -3761805625131008.000, dc_hat[0] = -2752109548142592.000
Gradient do_[0] = -222006137975734272.000
Backward Time Step 1:
Gradient di[0] = -6300908149276672.000, df[0] = -4508164640735232.000, dc_hat[0] = -3815382926229504.000
Gradient do_[0] = -199251555860348928.000
Backward Time Step 0:
Gradient di[0] = -7358474589569024.000, df[0] = -5378311593132032.000, dc_hat[0] = -7439171656351744.000
Gradient do_[0] = -114158959417360384.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 816155852800.000, df[0] = 598399844352.000, dc_hat[0] = 354395881472.000
Gradient do_[0] = 50610166562816.000
Backward Time Step 3:
Gradient di[0] = 1280720502784.000, df[0] = 924197781504.000, dc_hat[0] = 501522628608.000
Gradient do_[0] = 67758033207296.000
Backward Time Step 2:
Gradient di[0] = 1630049271808.000, df[0] = 1170763939840.000, dc_hat[0] = 833132167168.000
Gradient do_[0] = 73858992308224.000
Backward Time Step 1:
Gradient di[0] = 2045000286208.000, df[0] = 1410197094400.000, dc_hat[0] = 1137861591040.000
Gradient do_[0] = 65106826231808.000
Backward Time Step 0:
Gradient di[0] = 2476052578304.000, df[0] = 1761697202176.000, dc_hat[0] = 2323372310528.000
Gradient do_[0] = 38279546142720.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2456929330790400.000, df[0] = -1928882160664576.000, dc_hat[0] = -1110705520508928.000
Gradient do_[0] = -143647371648040960.000
Backward Time Step 3:
Gradient di[0] = -3858498760736768.000, df[0] = -2945570175975424.000, dc_hat[0] = -1594765380419584.000
Gradient do_[0] = -196443643681177600.000
Backward Time Step 2:
Gradient di[0] = -5055857796579328.000, df[0] = -3809429296250880.000, dc_hat[0] = -2786934753591296.000
Gradient do_[0] = -224816163278815232.000
Backward Time Step 1:
Gradient di[0] = -6380554559684608.000, df[0] = -4565149729947648.000, dc_hat[0] = -3863605745287168.000
Gradient do_[0] = -201770073143115776.000
Backward Time Step 0:
Gradient di[0] = -7451329635024896.000, df[0] = -5446179592601600.000, dc_hat[0] = -7533045682798592.000
Gradient do_[0] = -115599508628307968.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 759645470720.000, df[0] = 556967526400.000, dc_hat[0] = 329855500288.000
Gradient do_[0] = 47105766850560.000
Backward Time Step 3:
Gradient di[0] = 1192024211456.000, df[0] = 860193816576.000, dc_hat[0] = 466785894400.000
Gradient do_[0] = 63065215205376.000
Backward Time Step 2:
Gradient di[0] = 1517140705280.000, df[0] = 1089669300224.000, dc_hat[0] = 775418675200.000
Gradient do_[0] = 68742780289024.000
Backward Time Step 1:
Gradient di[0] = 1903304638464.000, df[0] = 1312485933056.000, dc_hat[0] = 1059018702848.000
Gradient do_[0] = 60595634176000.000
Backward Time Step 0:
Gradient di[0] = 2304457310208.000, df[0] = 1639608090624.000, dc_hat[0] = 2162358353920.000
Gradient do_[0] = 35626703388672.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2487958053584896.000, df[0] = -1953244557344768.000, dc_hat[0] = -1124727917641728.000
Gradient do_[0] = -145461007488122880.000
Backward Time Step 3:
Gradient di[0] = -3907168256393216.000, df[0] = -2982727548665856.000, dc_hat[0] = -1614869451243520.000
Gradient do_[0] = -198920894918164480.000
Backward Time Step 2:
Gradient di[0] = -5119574710157312.000, df[0] = -3857438977556480.000, dc_hat[0] = -2822041547833344.000
Gradient do_[0] = -227648659850788864.000
Backward Time Step 1:
Gradient di[0] = -6460836625252352.000, df[0] = -4622589011951616.000, dc_hat[0] = -3912213500788736.000
Gradient do_[0] = -204308742412435456.000
Backward Time Step 0:
Gradient di[0] = -7544928783564800.000, df[0] = -5514591442305024.000, dc_hat[0] = -7627671328522240.000
Gradient do_[0] = -117051602711347200.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 701234544640.000, df[0] = 514141683712.000, dc_hat[0] = 304490381312.000
Gradient do_[0] = 43483494612992.000
Backward Time Step 3:
Gradient di[0] = 1100351799296.000, df[0] = 794042040320.000, dc_hat[0] = 430884421632.000
Gradient do_[0] = 58214951419904.000
Backward Time Step 2:
Gradient di[0] = 1400444944384.000, df[0] = 1005854392320.000, dc_hat[0] = 715770101760.000
Gradient do_[0] = 63454996070400.000
Backward Time Step 1:
Gradient di[0] = 1756859596800.000, df[0] = 1211499937792.000, dc_hat[0] = 977534124032.000
Gradient do_[0] = 55933237460992.000
Backward Time Step 0:
Gradient di[0] = 2127110864896.000, df[0] = 1513427042304.000, dc_hat[0] = 1995947507712.000
Gradient do_[0] = 32884939292672.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2519256285577216.000, df[0] = -1977817944293376.000, dc_hat[0] = -1138871110729728.000
Gradient do_[0] = -147290371498442752.000
Backward Time Step 3:
Gradient di[0] = -3956264027553792.000, df[0] = -3020210801999872.000, dc_hat[0] = -1635148944637952.000
Gradient do_[0] = -201419775610454016.000
Backward Time Step 2:
Gradient di[0] = -5183838158323712.000, df[0] = -3905860975722496.000, dc_hat[0] = -2857451137269760.000
Gradient do_[0] = -230505792355172352.000
Backward Time Step 1:
Gradient di[0] = -6541811254296576.000, df[0] = -4680524899549184.000, dc_hat[0] = -3961240284037120.000
Gradient do_[0] = -206869298835095552.000
Backward Time Step 0:
Gradient di[0] = -7639330017247232.000, df[0] = -5583589555044352.000, dc_hat[0] = -7723107649323008.000
Gradient do_[0] = -118516143609610240.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 641061093376.000, df[0] = 470023274496.000, dc_hat[0] = 278360129536.000
Gradient do_[0] = 39752011087872.000
Backward Time Step 3:
Gradient di[0] = 1005914750976.000, df[0] = 725894955008.000, dc_hat[0] = 393900392448.000
Gradient do_[0] = 53218461614080.000
Backward Time Step 2:
Gradient di[0] = 1280234749952.000, df[0] = 919515561984.000, dc_hat[0] = 654326235136.000
Gradient do_[0] = 58008038014976.000
Backward Time Step 1:
Gradient di[0] = 1606017351680.000, df[0] = 1107481591808.000, dc_hat[0] = 893602955264.000
Gradient do_[0] = 51130855849984.000
Backward Time Step 0:
Gradient di[0] = 1944451416064.000, df[0] = 1383465877504.000, dc_hat[0] = 1824551206912.000
Gradient do_[0] = 30061042335744.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2550819731800064.000, df[0] = -2002600039809024.000, dc_hat[0] = -1153134697119744.000
Gradient do_[0] = -149135334829981696.000
Backward Time Step 3:
Gradient di[0] = -4005779900203008.000, df[0] = -3058015104139264.000, dc_hat[0] = -1655601444683776.000
Gradient do_[0] = -203939959340531712.000
Backward Time Step 2:
Gradient di[0] = -5248657804754944.000, df[0] = -3954701464764416.000, dc_hat[0] = -2893162985029632.000
Gradient do_[0] = -233387474892619776.000
Backward Time Step 1:
Gradient di[0] = -6623483815526400.000, df[0] = -4738959003353088.000, dc_hat[0] = -4010691195305984.000
Gradient do_[0] = -209451965749395456.000
Backward Time Step 0:
Gradient di[0] = -7734550515941376.000, df[0] = -5653185741979648.000, dc_hat[0] = -7819372361940992.000
Gradient do_[0] = -119993380431200256.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 578910289920.000, df[0] = 424455077888.000, dc_hat[0] = 251371618304.000
Gradient do_[0] = 35897919668224.000
Backward Time Step 3:
Gradient di[0] = 908378374144.000, df[0] = 655511060480.000, dc_hat[0] = 355703586816.000
Gradient do_[0] = 48058045825024.000
Backward Time Step 2:
Gradient di[0] = 1156081254400.000, df[0] = 830344200192.000, dc_hat[0] = 590867988480.000
Gradient do_[0] = 52382448746496.000
Backward Time Step 1:
Gradient di[0] = 1450233167872.000, df[0] = 1000055635968.000, dc_hat[0] = 806922027008.000
Gradient do_[0] = 46171112341504.000
Backward Time Step 0:
Gradient di[0] = 1755812724736.000, df[0] = 1249250639872.000, dc_hat[0] = 1647544631296.000
Gradient do_[0] = 27144707112960.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2582669867089920.000, df[0] = -2027607084236800.000, dc_hat[0] = -1167527132528640.000
Gradient do_[0] = -150996996994367488.000
Backward Time Step 3:
Gradient di[0] = -4055739496660992.000, df[0] = -3096156829646848.000, dc_hat[0] = -1676237688799232.000
Gradient do_[0] = -206482700238848000.000
Backward Time Step 2:
Gradient di[0] = -5314055661158400.000, df[0] = -4003977624551424.000, dc_hat[0] = -2929195613159424.000
Gradient do_[0] = -236294875694235648.000
Backward Time Step 1:
Gradient di[0] = -6705884910583808.000, df[0] = -4797916019425280.000, dc_hat[0] = -4060581535416320.000
Gradient do_[0] = -212057619328663552.000
Backward Time Step 0:
Gradient di[0] = -7830605848903680.000, df[0] = -5723392888012800.000, dc_hat[0] = -7916481572503552.000
Gradient do_[0] = -121483588054024192.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 515061710848.000, df[0] = 377641959424.000, dc_hat[0] = 223646220288.000
Gradient do_[0] = 31938565898240.000
Backward Time Step 3:
Gradient di[0] = 808179990528.000, df[0] = 583206043648.000, dc_hat[0] = 316465053696.000
Gradient do_[0] = 42756848222208.000
Backward Time Step 2:
Gradient di[0] = 1028544987136.000, df[0] = 738742829056.000, dc_hat[0] = 525681197056.000
Gradient do_[0] = 46603607998464.000
Backward Time Step 1:
Gradient di[0] = 1290216013824.000, df[0] = 889710379008.000, dc_hat[0] = 717885931520.000
Gradient do_[0] = 41076626817024.000
Backward Time Step 0:
Gradient di[0] = 1562054623232.000, df[0] = 1111392911360.000, dc_hat[0] = 1465734135808.000
Gradient do_[0] = 24149223276544.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2614748977823744.000, df[0] = -2052793577766912.000, dc_hat[0] = -1182023184023552.000
Gradient do_[0] = -152872042276847616.000
Backward Time Step 3:
Gradient di[0] = -4106052622614528.000, df[0] = -3134569674964992.000, dc_hat[0] = -1697019022278656.000
Gradient do_[0] = -209043514359545856.000
Backward Time Step 2:
Gradient di[0] = -5379921132126208.000, df[0] = -4053607045398528.000, dc_hat[0] = -2965487281504256.000
Gradient do_[0] = -239222943878479872.000
Backward Time Step 1:
Gradient di[0] = -6788862605000704.000, df[0] = -4857283204874240.000, dc_hat[0] = -4110820036313088.000
Gradient do_[0] = -214681432029659136.000
Backward Time Step 0:
Gradient di[0] = -7927340860440576.000, df[0] = -5794096102768640.000, dc_hat[0] = -8014276904091648.000
Gradient do_[0] = -122984318346723328.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 449361575936.000, df[0] = 329471131648.000, dc_hat[0] = 195117236224.000
Gradient do_[0] = 27864445485056.000
Backward Time Step 3:
Gradient di[0] = 705080328192.000, df[0] = 508807184384.000, dc_hat[0] = 276091404288.000
Gradient do_[0] = 37302181036032.000
Backward Time Step 2:
Gradient di[0] = 897320091648.000, df[0] = 644492296192.000, dc_hat[0] = 458610245632.000
Gradient do_[0] = 40657628430336.000
Backward Time Step 1:
Gradient di[0] = 1125577261056.000, df[0] = 776178302976.000, dc_hat[0] = 626278858752.000
Gradient do_[0] = 35835013496832.000
Backward Time Step 0:
Gradient di[0] = 1362705645568.000, df[0] = 969557213184.000, dc_hat[0] = 1278677483520.000
Gradient do_[0] = 21067305320448.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2647110751092736.000, df[0] = -2078202470072320.000, dc_hat[0] = -1196646608142336.000
Gradient do_[0] = -154763614593482752.000
Backward Time Step 3:
Gradient di[0] = -4156812693602304.000, df[0] = -3173323701747712.000, dc_hat[0] = -1717984770916352.000
Gradient do_[0] = -211626902828351488.000
Backward Time Step 2:
Gradient di[0] = -5446360518098944.000, df[0] = -4103668647329792.000, dc_hat[0] = -3002093556203520.000
Gradient do_[0] = -242176713147023360.000
Backward Time Step 1:
Gradient di[0] = -6872569906987008.000, df[0] = -4917174376333312.000, dc_hat[0] = -4161502261018624.000
Gradient do_[0] = -217328437554053120.000
Backward Time Step 0:
Gradient di[0] = -8024916074954752.000, df[0] = -5865414034718720.000, dc_hat[0] = -8112922102333440.000
Gradient do_[0] = -124498088160133120.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 381364109312.000, df[0] = 279615897600.000, dc_hat[0] = 165591007232.000
Gradient do_[0] = 23647896993792.000
Backward Time Step 3:
Gradient di[0] = 598378610688.000, df[0] = 431808741376.000, dc_hat[0] = 234307829760.000
Gradient do_[0] = 31657006465024.000
Backward Time Step 2:
Gradient di[0] = 761515540480.000, df[0] = 546952151040.000, dc_hat[0] = 389199659008.000
Gradient do_[0] = 34504219557888.000
Backward Time Step 1:
Gradient di[0] = 955204173824.000, df[0] = 658692112384.000, dc_hat[0] = 531481329664.000
Gradient do_[0] = 30410822123520.000
Backward Time Step 0:
Gradient di[0] = 1156423090176.000, df[0] = 822788358144.000, dc_hat[0] = 1085114941440.000
Gradient do_[0] = 17878195634176.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2679727806480384.000, df[0] = -2103812017881088.000, dc_hat[0] = -1211385325289472.000
Gradient do_[0] = -156670047496962048.000
Backward Time Step 3:
Gradient di[0] = -4207971391242240.000, df[0] = -3212381865902080.000, dc_hat[0] = -1739114265182208.000
Gradient do_[0] = -214230683801878528.000
Backward Time Step 2:
Gradient di[0] = -5513320131985408.000, df[0] = -4154121896591360.000, dc_hat[0] = -3038985714663424.000
Gradient do_[0] = -245153572159750144.000
Backward Time Step 1:
Gradient di[0] = -6956931654615040.000, df[0] = -4977532625485824.000, dc_hat[0] = -4212581501763584.000
Gradient do_[0] = -219996127640944640.000
Backward Time Step 0:
Gradient di[0] = -8123260088614912.000, df[0] = -5937293533642752.000, dc_hat[0] = -8212344689655808.000
Gradient do_[0] = -126023797982625792.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 312032002048.000, df[0] = 228781899776.000, dc_hat[0] = 135485702144.000
Gradient do_[0] = 19348632633344.000
Backward Time Step 3:
Gradient di[0] = 489585541120.000, df[0] = 353300709376.000, dc_hat[0] = 191705858048.000
Gradient do_[0] = 25901259554816.000
Backward Time Step 2:
Gradient di[0] = 623053111296.000, df[0] = 447502974976.000, dc_hat[0] = 318431461376.000
Gradient do_[0] = 28230421577728.000
Backward Time Step 1:
Gradient di[0] = 781504544768.000, df[0] = 538911866880.000, dc_hat[0] = 434833162240.000
Gradient do_[0] = 24880745545728.000
Backward Time Step 0:
Gradient di[0] = 946117541888.000, df[0] = 673157218304.000, dc_hat[0] = 887777329152.000
Gradient do_[0] = 14626891759616.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2712580279762944.000, df[0] = -2129606249283584.000, dc_hat[0] = -1226231013965824.000
Gradient do_[0] = -158590327375003648.000
Backward Time Step 3:
Gradient di[0] = -4259504824778752.000, df[0] = -3251726450688000.000, dc_hat[0] = -1760401331060736.000
Gradient do_[0] = -216853517250330624.000
Backward Time Step 2:
Gradient di[0] = -5580774203981824.000, df[0] = -4204947465830400.000, dc_hat[0] = -3076150066675712.000
Gradient do_[0] = -248152266786209792.000
Backward Time Step 1:
Gradient di[0] = -7041906508824576.000, df[0] = -5038330035044352.000, dc_hat[0] = -4264029304389632.000
Gradient do_[0] = -222683179440406528.000
Backward Time Step 0:
Gradient di[0] = -8222306329427968.000, df[0] = -6009686818029568.000, dc_hat[0] = -8312478094065664.000
Gradient do_[0] = -127560399842181120.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 240040394752.000, df[0] = 175997861888.000, dc_hat[0] = 104226013184.000
Gradient do_[0] = 14884472356864.000
Backward Time Step 3:
Gradient di[0] = 376623005696.000, df[0] = 271783739392.000, dc_hat[0] = 147472171008.000
Gradient do_[0] = 19924946780160.000
Backward Time Step 2:
Gradient di[0] = 479288557568.000, df[0] = 344245436416.000, dc_hat[0] = 244954300416.000
Gradient do_[0] = 21716405321728.000
Backward Time Step 1:
Gradient di[0] = 601163825152.000, df[0] = 414551998464.000, dc_hat[0] = 334490271744.000
Gradient do_[0] = 19139236200448.000
Backward Time Step 0:
Gradient di[0] = 727779835904.000, df[0] = 517811142656.000, dc_hat[0] = 682902945792.000
Gradient do_[0] = 11251410272256.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2745722663337984.000, df[0] = -2155627442864128.000, dc_hat[0] = -1241206759620608.000
Gradient do_[0] = -160527460704714752.000
Backward Time Step 3:
Gradient di[0] = -4311484934914048.000, df[0] = -3291411680067584.000, dc_hat[0] = -1781869725089792.000
Gradient do_[0] = -219499114025451520.000
Backward Time Step 2:
Gradient di[0] = -5648812928401408.000, df[0] = -4256214611394560.000, dc_hat[0] = -3113638151847936.000
Gradient do_[0] = -251177023274221568.000
Backward Time Step 1:
Gradient di[0] = -7127614728699904.000, df[0] = -5099652504354816.000, dc_hat[0] = -4315921099259904.000
Gradient do_[0] = -225393406883397632.000
Backward Time Step 0:
Gradient di[0] = -8322211563700224.000, df[0] = -6082707704512512.000, dc_hat[0] = -8413478544998400.000
Gradient do_[0] = -129110324690288640.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 166459424768.000, df[0] = 122048176128.000, dc_hat[0] = 72275935232.000
Gradient do_[0] = 10321738596352.000
Backward Time Step 3:
Gradient di[0] = 261171298304.000, df[0] = 188470050816.000, dc_hat[0] = 102262194176.000
Gradient do_[0] = 13816839536640.000
Backward Time Step 2:
Gradient di[0] = 332362678272.000, df[0] = 238716796928.000, dc_hat[0] = 169853616128.000
Gradient do_[0] = 15058904023040.000
Backward Time Step 1:
Gradient di[0] = 416867418112.000, df[0] = 287463407616.000, dc_hat[0] = 231926415360.000
Gradient do_[0] = 13271459430400.000
Backward Time Step 0:
Gradient di[0] = 504644534272.000, df[0] = 359051689984.000, dc_hat[0] = 473526796288.000
Gradient do_[0] = 7801758810112.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2779217771102208.000, df[0] = -2181923917004800.000, dc_hat[0] = -1256330144776192.000
Gradient do_[0] = -162484058826211328.000
Backward Time Step 3:
Gradient di[0] = -4364029296377856.000, df[0] = -3331527211483136.000, dc_hat[0] = -1803528708292608.000
Gradient do_[0] = -222170858561470464.000
Backward Time Step 2:
Gradient di[0] = -5717611862032384.000, df[0] = -4308045471416320.000, dc_hat[0] = -3151379103219712.000
Gradient do_[0] = -254231037079453696.000
Backward Time Step 1:
Gradient di[0] = -7214292537442304.000, df[0] = -5161649820401664.000, dc_hat[0] = -4368059049443328.000
Gradient do_[0] = -228129318230818816.000
Backward Time Step 0:
Gradient di[0] = -8423038068457472.000, df[0] = -6156401827119104.000, dc_hat[0] = -8515410466963456.000
Gradient do_[0] = -130674534599622656.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 90576175104.000, df[0] = 66410582016.000, dc_hat[0] = 39327514624.000
Gradient do_[0] = 5616381722624.000
Backward Time Step 3:
Gradient di[0] = 142110113792.000, df[0] = 102551617536.000, dc_hat[0] = 55643009024.000
Gradient do_[0] = 7518076010496.000
Backward Time Step 2:
Gradient di[0] = 180844593152.000, df[0] = 129890222080.000, dc_hat[0] = 92419874816.000
Gradient do_[0] = 8193802502144.000
Backward Time Step 1:
Gradient di[0] = 226819244032.000, df[0] = 156409987072.000, dc_hat[0] = 126191927296.000
Gradient do_[0] = 7221053227008.000
Backward Time Step 0:
Gradient di[0] = 274574966784.000, df[0] = 195358507008.000, dc_hat[0] = 257643937792.000
Gradient do_[0] = 4244904083456.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2812910715797504.000, df[0] = -2208378365411328.000, dc_hat[0] = -1271554327445504.000
Gradient do_[0] = -164453335691165696.000
Backward Time Step 3:
Gradient di[0] = -4416872158068736.000, df[0] = -3371871449907200.000, dc_hat[0] = -1825353316171776.000
Gradient do_[0] = -224860212463403008.000
Backward Time Step 2:
Gradient di[0] = -5786784457818112.000, df[0] = -4360167315472384.000, dc_hat[0] = -3189489690214400.000
Gradient do_[0] = -257306182123782144.000
Backward Time Step 1:
Gradient di[0] = -7301432055169024.000, df[0] = -5223996102541312.000, dc_hat[0] = -4420814938046464.000
Gradient do_[0] = -230884711549894656.000
Backward Time Step 0:
Gradient di[0] = -8524582906494976.000, df[0] = -6230620472606720.000, dc_hat[0] = -8618068238401536.000
Gradient do_[0] = -132249894244057088.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 12755316736.000, df[0] = 9352227840.000, dc_hat[0] = 5538235904.000
Gradient do_[0] = 790919184384.000
Backward Time Step 3:
Gradient di[0] = 20012228608.000, df[0] = 14441545728.000, dc_hat[0] = 7835696640.000
Gradient do_[0] = 1058705702912.000
Backward Time Step 2:
Gradient di[0] = 25466525696.000, df[0] = 18291146752.000, dc_hat[0] = 13014478848.000
Gradient do_[0] = 1153847001088.000
Backward Time Step 1:
Gradient di[0] = 31939893248.000, df[0] = 22025105408.000, dc_hat[0] = 17769887744.000
Gradient do_[0] = 1016842813440.000
Backward Time Step 0:
Gradient di[0] = 38664093696.000, df[0] = 27509280768.000, dc_hat[0] = 36279959552.000
Gradient do_[0] = 597743370240.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2846825388179456.000, df[0] = -2235006625775616.000, dc_hat[0] = -1286879978717184.000
Gradient do_[0] = -166435600537223168.000
Backward Time Step 3:
Gradient di[0] = -4470060496191488.000, df[0] = -3412479560384512.000, dc_hat[0] = -1847320597495808.000
Gradient do_[0] = -227567192911118336.000
Backward Time Step 2:
Gradient di[0] = -5856394603397120.000, df[0] = -4412618261397504.000, dc_hat[0] = -3227840795377664.000
Gradient do_[0] = -260400706060550144.000
Backward Time Step 1:
Gradient di[0] = -7389119181225984.000, df[0] = -5286733763575808.000, dc_hat[0] = -4473900733825024.000
Gradient do_[0] = -233657490896584704.000
Backward Time Step 0:
Gradient di[0] = -8626802054397952.000, df[0] = -6305333039333376.000, dc_hat[0] = -8721408909639680.000
Gradient do_[0] = -133835716428824576.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = -66722713600.000, df[0] = -48921309184.000, dc_hat[0] = -28970186752.000
Gradient do_[0] = -4137259892736.000
Backward Time Step 3:
Gradient di[0] = -104681857024.000, df[0] = -75542315008.000, dc_hat[0] = -40987357184.000
Gradient do_[0] = -5537956102144.000
Backward Time Step 2:
Gradient di[0] = -133210767360.000, df[0] = -95677734912.000, dc_hat[0] = -68075941888.000
Gradient do_[0] = -6035548405760.000
Backward Time Step 1:
Gradient di[0] = -167067713536.000, df[0] = -115206520832.000, dc_hat[0] = -92948643840.000
Gradient do_[0] = -5318789562368.000
Backward Time Step 0:
Gradient di[0] = -202236706816.000, df[0] = -143890268160.000, dc_hat[0] = -189766254592.000
Gradient do_[0] = -3126561341440.000
Time Step 0:
i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829
c_state[0] = 0.469, h_state[0] = 0.075
Time Step 1:
i_gate[0] = 0.433, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.866
c_state[0] = 0.725, h_state[0] = 0.092
Time Step 2:
i_gate[0] = 0.376, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.863
c_state[0] = 0.877, h_state[0] = 0.105
Time Step 3:
i_gate[0] = 0.344, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885
c_state[0] = 0.980, h_state[0] = 0.119
Time Step 4:
i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.870
c_state[0] = 1.059, h_state[0] = 0.126
Backward Time Step 4:
Gradient di[0] = 19156218937344.000, df[0] = 13142134358016.000, dc_hat[0] = 8010554408960.000
Gradient do_[0] = 1271238378913792.000
Backward Time Step 3:
Gradient di[0] = 30098356961280.000, df[0] = 20507969716224.000, dc_hat[0] = 11206314164224.000
Gradient do_[0] = 1665477721980928.000
Backward Time Step 2:
Gradient di[0] = 36914476351488.000, df[0] = 25224460369920.000, dc_hat[0] = 17511709933568.000
Gradient do_[0] = 1708683079712768.000
Backward Time Step 1:
Gradient di[0] = 45967902507008.000, df[0] = 30538465280000.000, dc_hat[0] = 23479652974592.000
Gradient do_[0] = 1474196118962176.000
Backward Time Step 0:
Gradient di[0] = 57253348507648.000, df[0] = 39729219239936.000, dc_hat[0] = 49915472379904.000
Gradient do_[0] = 883066348240896.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.824, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1345754759168.000, df[0] = 986717749248.000, dc_hat[0] = 584281620480.000
Gradient do_[0] = 83443371212800.000
Backward Time Step 3:
Gradient di[0] = 2111409487872.000, df[0] = 1523686309888.000, dc_hat[0] = 826660618240.000
Gradient do_[0] = 111696756604928.000
Backward Time Step 2:
Gradient di[0] = 2686690263040.000, df[0] = 1929704767488.000, dc_hat[0] = 1372963471360.000
Gradient do_[0] = 121728097320960.000
Backward Time Step 1:
Gradient di[0] = 3369711697920.000, df[0] = 2323685048320.000, dc_hat[0] = 1874749554688.000
Gradient do_[0] = 107278585823232.000
Backward Time Step 0:
Gradient di[0] = 4080442802176.000, df[0] = 2903211507712.000, dc_hat[0] = 3828831485952.000
Gradient do_[0] = 63083267489792.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2209343525093376.000, df[0] = -1734537440657408.000, dc_hat[0] = -998669016891392.000
Gradient do_[0] = -129162302384504832.000
Backward Time Step 3:
Gradient di[0] = -3469109241053184.000, df[0] = -2648370518687744.000, dc_hat[0] = -1433582371340288.000
Gradient do_[0] = -176605361539645440.000
Backward Time Step 2:
Gradient di[0] = -4544658440978432.000, df[0] = -3424274882756608.000, dc_hat[0] = -2504781440483328.000
Gradient do_[0] = -202073091675783168.000
Backward Time Step 1:
Gradient di[0] = -5734208152535040.000, df[0] = -4102685368254464.000, dc_hat[0] = -3471894527344640.000
Gradient do_[0] = -181326131893370880.000
Backward Time Step 0:
Gradient di[0] = -6696747904532480.000, df[0] = -4894655325929472.000, dc_hat[0] = -6770188087197696.000
Gradient do_[0] = -103892965377703936.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1307062960128.000, df[0] = 958347804672.000, dc_hat[0] = 567473537024.000
Gradient do_[0] = 81043340132352.000
Backward Time Step 3:
Gradient di[0] = 2050681602048.000, df[0] = 1479862910976.000, dc_hat[0] = 802855518208.000
Gradient do_[0] = 108482183299072.000
Backward Time Step 2:
Gradient di[0] = 2609398415360.000, df[0] = 1874187124736.000, dc_hat[0] = 1333374091264.000
Gradient do_[0] = 118223177515008.000
Backward Time Step 1:
Gradient di[0] = 3272694300672.000, df[0] = 2256774365184.000, dc_hat[0] = 1820585230336.000
Gradient do_[0] = 104186863681536.000
Backward Time Step 0:
Gradient di[0] = 3962756661248.000, df[0] = 2819478519808.000, dc_hat[0] = 3718402539520.000
Gradient do_[0] = 61263858106368.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2234885695602688.000, df[0] = -1754589435002880.000, dc_hat[0] = -1010197178875904.000
Gradient do_[0] = -130654030425751552.000
Backward Time Step 3:
Gradient di[0] = -3509172159119360.000, df[0] = -2678956591415296.000, dc_hat[0] = -1450085246304256.000
Gradient do_[0] = -178641880412585984.000
Backward Time Step 2:
Gradient di[0] = -4597113413435392.000, df[0] = -3463792876847104.000, dc_hat[0] = -2533525509111808.000
Gradient do_[0] = -204400568813223936.000
Backward Time Step 1:
Gradient di[0] = -5800289445609472.000, df[0] = -4149946383073280.000, dc_hat[0] = -3511574119579648.000
Gradient do_[0] = -183410840299372544.000
Backward Time Step 0:
Gradient di[0] = -6773575205781504.000, df[0] = -4950808802099200.000, dc_hat[0] = -6847858812649472.000
Gradient do_[0] = -105084861752016896.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1267899432960.000, df[0] = 929631961088.000, dc_hat[0] = 550460981248.000
Gradient do_[0] = 78614049587200.000
Backward Time Step 3:
Gradient di[0] = 1989213552640.000, df[0] = 1435505000448.000, dc_hat[0] = 778763304960.000
Gradient do_[0] = 105228678463488.000
Backward Time Step 2:
Gradient di[0] = 2531163111424.000, df[0] = 1817992626176.000, dc_hat[0] = 1293310885888.000
Gradient do_[0] = 114675626803200.000
Backward Time Step 1:
Gradient di[0] = 3174490963968.000, df[0] = 2189045923840.000, dc_hat[0] = 1765774458880.000
Gradient do_[0] = 101057594130432.000
Backward Time Step 0:
Gradient di[0] = 3843652583424.000, df[0] = 2734736801792.000, dc_hat[0] = 3606642688000.000
Gradient do_[0] = 59422516707328.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2261528988352512.000, df[0] = -1775506328387584.000, dc_hat[0] = -1022223087304704.000
Gradient do_[0] = -132210062717353984.000
Backward Time Step 3:
Gradient di[0] = -3550965680570368.000, df[0] = -2710864708763648.000, dc_hat[0] = -1467303770193920.000
Gradient do_[0] = -180766394575486976.000
Backward Time Step 2:
Gradient di[0] = -4651836665495552.000, df[0] = -3505019999485952.000, dc_hat[0] = -2563520218529792.000
Gradient do_[0] = -206828994861989888.000
Backward Time Step 1:
Gradient di[0] = -5869227965677568.000, df[0] = -4199252339195904.000, dc_hat[0] = -3552980288667648.000
Gradient do_[0] = -185586000716627968.000
Backward Time Step 0:
Gradient di[0] = -6853733791039488.000, df[0] = -5009395914113024.000, dc_hat[0] = -6928896255590400.000
Gradient do_[0] = -106328435172835328.000
Epoch 600, Train Loss=0.011401, Weight Norm=12.822992
Sample Predictions at Epoch 600:
-------------------------------------------------------------
| Day | Date | Predicted Close | Actual Close | Error |
-------------------------------------------------------------
| 192 | 2024-10-11 | 57.25 | 63.87 | 6.62 |
| 193 | 2024-10-14 | 56.65 | 66.55 | 9.90 |
| 194 | 2024-10-15 | 56.84 | 66.00 | 9.16 |
| 195 | 2024-10-16 | 57.79 | 67.20 | 9.41 |
| 196 | 2024-10-17 | 57.33 | 66.76 | 9.43 |
-------------------------------------------------------------
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1228033490944.000, df[0] = 900403232768.000, dc_hat[0] = 533149712384.000
Gradient do_[0] = 76141893255168.000
Backward Time Step 3:
Gradient di[0] = 1926640041984.000, df[0] = 1390351613952.000, dc_hat[0] = 754259197952.000
Gradient do_[0] = 101918072373248.000
Backward Time Step 2:
Gradient di[0] = 2451505676288.000, df[0] = 1760780222464.000, dc_hat[0] = 1252601495552.000
Gradient do_[0] = 111066419822592.000
Backward Time Step 1:
Gradient di[0] = 3074514485248.000, df[0] = 2120105197568.000, dc_hat[0] = 1710162051072.000
Gradient do_[0] = 97874905923584.000
Backward Time Step 0:
Gradient di[0] = 3722542055424.000, df[0] = 2648567185408.000, dc_hat[0] = 3493000380416.000
Gradient do_[0] = 57550166818816.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2288907525816320.000, df[0] = -1797002908139520.000, dc_hat[0] = -1034593935294464.000
Gradient do_[0] = -133810230092890112.000
Backward Time Step 3:
Gradient di[0] = -3593905152983040.000, df[0] = -2743648999440384.000, dc_hat[0] = -1485035676893184.000
Gradient do_[0] = -182951605216215040.000
Backward Time Step 2:
Gradient di[0] = -4708038460047360.000, df[0] = -3547367303282688.000, dc_hat[0] = -2594475658444800.000
Gradient do_[0] = -209327205539381248.000
Backward Time Step 1:
Gradient di[0] = -5940022985359360.000, df[0] = -4249903962259456.000, dc_hat[0] = -3595833324863488.000
Gradient do_[0] = -187824434592088064.000
Backward Time Step 0:
Gradient di[0] = -6936257829535744.000, df[0] = -5069713361076224.000, dc_hat[0] = -7012324921573376.000
Gradient do_[0] = -107608713384165376.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1227753783296.000, df[0] = 900198236160.000, dc_hat[0] = 533028274176.000
Gradient do_[0] = 76124562391040.000
Backward Time Step 3:
Gradient di[0] = 1926198853632.000, df[0] = 1390033502208.000, dc_hat[0] = 754086903808.000
Gradient do_[0] = 101894752043008.000
Backward Time Step 2:
Gradient di[0] = 2450942853120.000, df[0] = 1760375865344.000, dc_hat[0] = 1252313661440.000
Gradient do_[0] = 111040901677056.000
Backward Time Step 1:
Gradient di[0] = 3073809055744.000, df[0] = 2119618265088.000, dc_hat[0] = 1709766868992.000
Gradient do_[0] = 97852424454144.000
Backward Time Step 0:
Gradient di[0] = 3721687203840.000, df[0] = 2647959011328.000, dc_hat[0] = 3492198219776.000
Gradient do_[0] = 57536950566912.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2288789414215680.000, df[0] = -1796910432124928.000, dc_hat[0] = -1034540717965312.000
Gradient do_[0] = -133803340965347328.000
Backward Time Step 3:
Gradient di[0] = -3593715905986560.000, df[0] = -2743503775858688.000, dc_hat[0] = -1484956354215936.000
Gradient do_[0] = -182942070388817920.000
Backward Time Step 2:
Gradient di[0] = -4707790962556928.000, df[0] = -3547180472205312.000, dc_hat[0] = -2594337951055872.000
Gradient do_[0] = -209316210423103488.000
Backward Time Step 1:
Gradient di[0] = -5939710526488576.000, df[0] = -4249679550218240.000, dc_hat[0] = -3595638709157888.000
Gradient do_[0] = -187814504627699712.000
Backward Time Step 0:
Gradient di[0] = -6935891683573760.000, df[0] = -5069445462491136.000, dc_hat[0] = -7011955017515008.000
Gradient do_[0] = -107603035437400064.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1227730714624.000, df[0] = 900181262336.000, dc_hat[0] = 533018214400.000
Gradient do_[0] = 76123127939072.000
Backward Time Step 3:
Gradient di[0] = 1926162153472.000, df[0] = 1390006894592.000, dc_hat[0] = 754072027136.000
Gradient do_[0] = 101892797497344.000
Backward Time Step 2:
Gradient di[0] = 2450899861504.000, df[0] = 1760345325568.000, dc_hat[0] = 1252291248128.000
Gradient do_[0] = 111038980685824.000
Backward Time Step 1:
Gradient di[0] = 3073755578368.000, df[0] = 2119581564928.000, dc_hat[0] = 1709735411712.000
Gradient do_[0] = 97850696400896.000
Backward Time Step 0:
Gradient di[0] = 3721621667840.000, df[0] = 2647912349696.000, dc_hat[0] = 3492136353792.000
Gradient do_[0] = 57535931351040.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2288931953442816.000, df[0] = -1797022503927808.000, dc_hat[0] = -1034605008257024.000
Gradient do_[0] = -133811698971705344.000
Backward Time Step 3:
Gradient di[0] = -3593941660205056.000, df[0] = -2743676379856896.000, dc_hat[0] = -1485049635536896.000
Gradient do_[0] = -182953443462217728.000
Backward Time Step 2:
Gradient di[0] = -4708083020333056.000, df[0] = -3547400589279232.000, dc_hat[0] = -2594497133281280.000
Gradient do_[0] = -209329112504860672.000
Backward Time Step 1:
Gradient di[0] = -5940081504288768.000, df[0] = -4249944764448768.000, dc_hat[0] = -3595857752489984.000
Gradient do_[0] = -187826186938744832.000
Backward Time Step 0:
Gradient di[0] = -6936325475270656.000, df[0] = -5069762753200128.000, dc_hat[0] = -7012393641050112.000
Gradient do_[0] = -107609761356185600.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1227910152192.000, df[0] = 900312989696.000, dc_hat[0] = 533096136704.000
Gradient do_[0] = 76134217678848.000
Backward Time Step 3:
Gradient di[0] = 1926444744704.000, df[0] = 1390210842624.000, dc_hat[0] = 754182586368.000
Gradient do_[0] = 101907687276544.000
Backward Time Step 2:
Gradient di[0] = 2451258736640.000, df[0] = 1760602882048.000, dc_hat[0] = 1252473176064.000
Gradient do_[0] = 111055128756224.000
Backward Time Step 1:
Gradient di[0] = 3074205155328.000, df[0] = 2119891156992.000, dc_hat[0] = 1709982482432.000
Gradient do_[0] = 97864915091456.000
Backward Time Step 0:
Gradient di[0] = 3722165354496.000, df[0] = 2648299274240.000, dc_hat[0] = 3492646748160.000
Gradient do_[0] = 57544345124864.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2289242264829952.000, df[0] = -1797265706450944.000, dc_hat[0] = -1034744527585280.000
Gradient do_[0] = -133829797963890688.000
Backward Time Step 3:
Gradient di[0] = -3594429675864064.000, df[0] = -2744049505140736.000, dc_hat[0] = -1485251364782080.000
Gradient do_[0] = -182978251193319424.000
Backward Time Step 2:
Gradient di[0] = -4708725654814720.000, df[0] = -3547885115277312.000, dc_hat[0] = -2594850662776832.000
Gradient do_[0] = -209357716987052032.000
Backward Time Step 1:
Gradient di[0] = -5940892179365888.000, df[0] = -4250524853469184.000, dc_hat[0] = -3596344962842624.000
Gradient do_[0] = -187851716224352256.000
Backward Time Step 0:
Gradient di[0] = -6937267146850304.000, df[0] = -5070451021709312.000, dc_hat[0] = -7013345513177088.000
Gradient do_[0] = -107624364244992000.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1228104663040.000, df[0] = 900455530496.000, dc_hat[0] = 533179990016.000
Gradient do_[0] = 76146314051584.000
Backward Time Step 3:
Gradient di[0] = 1926751322112.000, df[0] = 1390431698944.000, dc_hat[0] = 754302124032.000
Gradient do_[0] = 101923969564672.000
Backward Time Step 2:
Gradient di[0] = 2451648020480.000, df[0] = 1760882196480.000, dc_hat[0] = 1252670963712.000
Gradient do_[0] = 111072803553280.000
Backward Time Step 1:
Gradient di[0] = 3074691956736.000, df[0] = 2120226439168.000, dc_hat[0] = 1710250655744.000
Gradient do_[0] = 97880392073216.000
Backward Time Step 0:
Gradient di[0] = 3722751508480.000, df[0] = 2648716083200.000, dc_hat[0] = 3493196464128.000
Gradient do_[0] = 57553400627200.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2289676593397760.000, df[0] = -1797606485262336.000, dc_hat[0] = -1034940217032704.000
Gradient do_[0] = -133855164040740864.000
Backward Time Step 3:
Gradient di[0] = -3595113917841408.000, df[0] = -2744571612102656.000, dc_hat[0] = -1485532416704512.000
Gradient do_[0] = -183013040428417024.000
Backward Time Step 2:
Gradient di[0] = -4709619008012288.000, df[0] = -3548558082965504.000, dc_hat[0] = -2595341094354944.000
Gradient do_[0] = -209397333765390336.000
Backward Time Step 1:
Gradient di[0] = -5942014239571968.000, df[0] = -4251328012353536.000, dc_hat[0] = -3597020078014464.000
Gradient do_[0] = -187887141114609664.000
Backward Time Step 0:
Gradient di[0] = -6938573890650112.000, df[0] = -5071406115061760.000, dc_hat[0] = -7014666752491520.000
Gradient do_[0] = -107644645080563712.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1228482281472.000, df[0] = 900732420096.000, dc_hat[0] = 533343895552.000
Gradient do_[0] = 76169684713472.000
Backward Time Step 3:
Gradient di[0] = 1927341670400.000, df[0] = 1390857814016.000, dc_hat[0] = 754533269504.000
Gradient do_[0] = 101955108077568.000
Backward Time Step 2:
Gradient di[0] = 2452399849472.000, df[0] = 1761422082048.000, dc_hat[0] = 1253054218240.000
Gradient do_[0] = 111106810970112.000
Backward Time Step 1:
Gradient di[0] = 3075633840128.000, df[0] = 2120875900928.000, dc_hat[0] = 1710772846592.000
Gradient do_[0] = 97910347792384.000
Backward Time Step 0:
Gradient di[0] = 3723890262016.000, df[0] = 2649526632448.000, dc_hat[0] = 3494264963072.000
Gradient do_[0] = 57571008315392.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2290193063215104.000, df[0] = -1798012091236352.000, dc_hat[0] = -1035173688770560.000
Gradient do_[0] = -133885288941355008.000
Backward Time Step 3:
Gradient di[0] = -3595923519176704.000, df[0] = -2745189282086912.000, dc_hat[0] = -1485866618847232.000
Gradient do_[0] = -183054203394981888.000
Backward Time Step 2:
Gradient di[0] = -4710680401805312.000, df[0] = -3549357483753472.000, dc_hat[0] = -2595924673036288.000
Gradient do_[0] = -209444440966692864.000
Backward Time Step 1:
Gradient di[0] = -5943355879981056.000, df[0] = -4252287669108736.000, dc_hat[0] = -3597827531866112.000
Gradient do_[0] = -187929541031755776.000
Backward Time Step 0:
Gradient di[0] = -6940145311809536.000, df[0] = -5072555018813440.000, dc_hat[0] = -7016255353520128.000
Gradient do_[0] = -107669023314935808.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1228668272640.000, df[0] = 900868538368.000, dc_hat[0] = 533424635904.000
Gradient do_[0] = 76181277769728.000
Backward Time Step 3:
Gradient di[0] = 1927639072768.000, df[0] = 1391072641024.000, dc_hat[0] = 754649726976.000
Gradient do_[0] = 101970861883392.000
Backward Time Step 2:
Gradient di[0] = 2452781793280.000, df[0] = 1761696022528.000, dc_hat[0] = 1253248335872.000
Gradient do_[0] = 111124083113984.000
Backward Time Step 1:
Gradient di[0] = 3076113825792.000, df[0] = 2121207119872.000, dc_hat[0] = 1711037480960.000
Gradient do_[0] = 97925581504512.000
Backward Time Step 0:
Gradient di[0] = 3724465668096.000, df[0] = 2649935839232.000, dc_hat[0] = 3494804717568.000
Gradient do_[0] = 57579896045568.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2290729128820736.000, df[0] = -1798432998031360.000, dc_hat[0] = -1035415884660736.000
Gradient do_[0] = -133916616432812032.000
Backward Time Step 3:
Gradient di[0] = -3596766138073088.000, df[0] = -2745832721874944.000, dc_hat[0] = -1486214376980480.000
Gradient do_[0] = -183097153067941888.000
Backward Time Step 2:
Gradient di[0] = -4711788503367680.000, df[0] = -3550191244279808.000, dc_hat[0] = -2596531068731392.000
Gradient do_[0] = -209493626932166656.000
Backward Time Step 1:
Gradient di[0] = -5944751744352256.000, df[0] = -4253285980569600.000, dc_hat[0] = -3598667466407936.000
Gradient do_[0] = -187973624576081920.000
Backward Time Step 0:
Gradient di[0] = -6941765588221952.000, df[0] = -5073739356045312.000, dc_hat[0] = -7017893346672640.000
Gradient do_[0] = -107694166053486592.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1228893978624.000, df[0] = 901034147840.000, dc_hat[0] = 533522644992.000
Gradient do_[0] = 76195211247616.000
Backward Time Step 3:
Gradient di[0] = 1927991525376.000, df[0] = 1391326658560.000, dc_hat[0] = 754786435072.000
Gradient do_[0] = 101989434261504.000
Backward Time Step 2:
Gradient di[0] = 2453227175936.000, df[0] = 1762016231424.000, dc_hat[0] = 1253475745792.000
Gradient do_[0] = 111144249327616.000
Backward Time Step 1:
Gradient di[0] = 3076675076096.000, df[0] = 2121593782272.000, dc_hat[0] = 1711346941952.000
Gradient do_[0] = 97943440850944.000
Backward Time Step 0:
Gradient di[0] = 3725141213184.000, df[0] = 2650416611328.000, dc_hat[0] = 3495438843904.000
Gradient do_[0] = 57590348251136.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2291301164777472.000, df[0] = -1798881956331520.000, dc_hat[0] = -1035673918242816.000
Gradient do_[0] = -133950031278374912.000
Backward Time Step 3:
Gradient di[0] = -3597663517802496.000, df[0] = -2746517769158656.000, dc_hat[0] = -1486583609950208.000
Gradient do_[0] = -183142645361541120.000
Backward Time Step 2:
Gradient di[0] = -4712958345084928.000, df[0] = -3551073323188224.000, dc_hat[0] = -2597174508519424.000
Gradient do_[0] = -209545596036448256.000
Backward Time Step 1:
Gradient di[0] = -5946235655553024.000, df[0] = -4254347105927168.000, dc_hat[0] = -3599562161782784.000
Gradient do_[0] = -188020405359869952.000
Backward Time Step 0:
Gradient di[0] = -6943495923171328.000, df[0] = -5075003150172160.000, dc_hat[0] = -7019642472103936.000
Gradient do_[0] = -107721001009152000.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1229508182016.000, df[0] = 901484445696.000, dc_hat[0] = 533789147136.000
Gradient do_[0] = 76233329082368.000
Backward Time Step 3:
Gradient di[0] = 1928955166720.000, df[0] = 1392022126592.000, dc_hat[0] = 755163660288.000
Gradient do_[0] = 102040453775360.000
Backward Time Step 2:
Gradient di[0] = 2454451388416.000, df[0] = 1762895069184.000, dc_hat[0] = 1254099910656.000
Gradient do_[0] = 111199706415104.000
Backward Time Step 1:
Gradient di[0] = 3078206783488.000, df[0] = 2122649960448.000, dc_hat[0] = 1712197468160.000
Gradient do_[0] = 97992170274816.000
Backward Time Step 0:
Gradient di[0] = 3726992736256.000, df[0] = 2651733884928.000, dc_hat[0] = 3497176334336.000
Gradient do_[0] = 57618970181632.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2291905412988928.000, df[0] = -1799356550217728.000, dc_hat[0] = -1035947521081344.000
Gradient do_[0] = -133985327319613440.000
Backward Time Step 3:
Gradient di[0] = -3598612437139456.000, df[0] = -2747242276454400.000, dc_hat[0] = -1486975659933696.000
Gradient do_[0] = -183190955153686528.000
Backward Time Step 2:
Gradient di[0] = -4714207106826240.000, df[0] = -3552013921026048.000, dc_hat[0] = -2597861166415872.000
Gradient do_[0] = -209601018294435840.000
Backward Time Step 1:
Gradient di[0] = -5947802781745152.000, df[0] = -4255468092391424.000, dc_hat[0] = -3600505443975168.000
Gradient do_[0] = -188069934922727424.000
Backward Time Step 0:
Gradient di[0] = -6945327189852160.000, df[0] = -5076342106226688.000, dc_hat[0] = -7021494139879424.000
Gradient do_[0] = -107749407922847744.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1229749616640.000, df[0] = 901661392896.000, dc_hat[0] = 533893971968.000
Gradient do_[0] = 76248260804608.000
Backward Time Step 3:
Gradient di[0] = 1929330819072.000, df[0] = 1392293445632.000, dc_hat[0] = 755311247360.000
Gradient do_[0] = 102060317999104.000
Backward Time Step 2:
Gradient di[0] = 2454930325504.000, df[0] = 1763239657472.000, dc_hat[0] = 1254344491008.000
Gradient do_[0] = 111221407744000.000
Backward Time Step 1:
Gradient di[0] = 3078813646848.000, df[0] = 2123068342272.000, dc_hat[0] = 1712532226048.000
Gradient do_[0] = 98011447296000.000
Backward Time Step 0:
Gradient di[0] = 3727732244480.000, df[0] = 2652260270080.000, dc_hat[0] = 3497870229504.000
Gradient do_[0] = 57630408048640.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2292506439974912.000, df[0] = -1799828459749376.000, dc_hat[0] = -1036218640891904.000
Gradient do_[0] = -134020460152094720.000
Backward Time Step 3:
Gradient di[0] = -3599553034977280.000, df[0] = -2747960072863744.000, dc_hat[0] = -1487363146514432.000
Gradient do_[0] = -183238801089363968.000
Backward Time Step 2:
Gradient di[0] = -4715432783118336.000, df[0] = -3552938144301056.000, dc_hat[0] = -2598535744716800.000
Gradient do_[0] = -209655530019356672.000
Backward Time Step 1:
Gradient di[0] = -5949357559906304.000, df[0] = -4256580488921088.000, dc_hat[0] = -3601442015281152.000
Gradient do_[0] = -188118966269378560.000
Backward Time Step 0:
Gradient di[0] = -6947141276663808.000, df[0] = -5077668177379328.000, dc_hat[0] = -7023328090914816.000
Gradient do_[0] = -107777548548571136.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1230168784896.000, df[0] = 901968494592.000, dc_hat[0] = 534075572224.000
Gradient do_[0] = 76274273878016.000
Backward Time Step 3:
Gradient di[0] = 1929991553024.000, df[0] = 1392770154496.000, dc_hat[0] = 755569393664.000
Gradient do_[0] = 102095273328640.000
Backward Time Step 2:
Gradient di[0] = 2455771283456.000, df[0] = 1763842981888.000, dc_hat[0] = 1254772178944.000
Gradient do_[0] = 111259391361024.000
Backward Time Step 1:
Gradient di[0] = 3079866679296.000, df[0] = 2123794481152.000, dc_hat[0] = 1713115889664.000
Gradient do_[0] = 98044909453312.000
Backward Time Step 0:
Gradient di[0] = 3729005740032.000, df[0] = 2653166239744.000, dc_hat[0] = 3499065344000.000
Gradient do_[0] = 57650096111616.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2293113640976384.000, df[0] = -1800305201119232.000, dc_hat[0] = -1036493116145664.000
Gradient do_[0] = -134055867862482944.000
Backward Time Step 3:
Gradient di[0] = -3600509202071552.000, df[0] = -2748690217304064.000, dc_hat[0] = -1487758283505664.000
Gradient do_[0] = -183287420119154688.000
Backward Time Step 2:
Gradient di[0] = -4716690671665152.000, df[0] = -3553885184589824.000, dc_hat[0] = -2599225892274176.000
Gradient do_[0] = -209711347414335488.000
Backward Time Step 1:
Gradient di[0] = -5950939718483968.000, df[0] = -4257712212803584.000, dc_hat[0] = -3602394961149952.000
Gradient do_[0] = -188168925328965632.000
Backward Time Step 0:
Gradient di[0] = -6948979522666496.000, df[0] = -5079011428401152.000, dc_hat[0] = -7025186201141248.000
Gradient do_[0] = -107806084311285760.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1230543519744.000, df[0] = 902243352576.000, dc_hat[0] = 534238298112.000
Gradient do_[0] = 76297476767744.000
Backward Time Step 3:
Gradient di[0] = 1930574954496.000, df[0] = 1393191157760.000, dc_hat[0] = 755797262336.000
Gradient do_[0] = 102126126628864.000
Backward Time Step 2:
Gradient di[0] = 2456516034560.000, df[0] = 1764378279936.000, dc_hat[0] = 1255152680960.000
Gradient do_[0] = 111293147119616.000
Backward Time Step 1:
Gradient di[0] = 3080803319808.000, df[0] = 2124440535040.000, dc_hat[0] = 1713634410496.000
Gradient do_[0] = 98074714177536.000
Backward Time Step 0:
Gradient di[0] = 3730137415680.000, df[0] = 2653971283968.000, dc_hat[0] = 3500127027200.000
Gradient do_[0] = 57667586359296.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2293729700347904.000, df[0] = -1800788787593216.000, dc_hat[0] = -1036771013951488.000
Gradient do_[0] = -134091954177703936.000
Backward Time Step 3:
Gradient di[0] = -3601476106584064.000, df[0] = -2749428414808064.000, dc_hat[0] = -1488157715464192.000
Gradient do_[0] = -183336657624236032.000
Backward Time Step 2:
Gradient di[0] = -4717956613275648.000, df[0] = -3554838935764992.000, dc_hat[0] = -2599923556024320.000
Gradient do_[0] = -209767594306043904.000
Backward Time Step 1:
Gradient di[0] = -5952534761963520.000, df[0] = -4258853063491584.000, dc_hat[0] = -3603357033824256.000
Gradient do_[0] = -188219313885282304.000
Backward Time Step 0:
Gradient di[0] = -6950843538472960.000, df[0] = -5080374006775808.000, dc_hat[0] = -7027071154913280.000
Gradient do_[0] = -107834989441187840.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1230931361792.000, df[0] = 902527778816.000, dc_hat[0] = 534406692864.000
Gradient do_[0] = 76321526906880.000
Backward Time Step 3:
Gradient di[0] = 1931184439296.000, df[0] = 1393630773248.000, dc_hat[0] = 756035878912.000
Gradient do_[0] = 102158372438016.000
Backward Time Step 2:
Gradient di[0] = 2457291718656.000, df[0] = 1764934811648.000, dc_hat[0] = 1255548125184.000
Gradient do_[0] = 111328270221312.000
Backward Time Step 1:
Gradient di[0] = 3081773514752.000, df[0] = 2125109264384.000, dc_hat[0] = 1714172329984.000
Gradient do_[0] = 98105575866368.000
Backward Time Step 0:
Gradient di[0] = 3731310510080.000, df[0] = 2654805950464.000, dc_hat[0] = 3501228032000.000
Gradient do_[0] = 57685722529792.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2294364013330432.000, df[0] = -1801286869581824.000, dc_hat[0] = -1037057635909632.000
Gradient do_[0] = -134129036925337600.000
Backward Time Step 3:
Gradient di[0] = -3602470123077632.000, df[0] = -2750187281842176.000, dc_hat[0] = -1488567079534592.000
Gradient do_[0] = -183387200799375360.000
Backward Time Step 2:
Gradient di[0] = -4719255840882688.000, df[0] = -3555817651437568.000, dc_hat[0] = -2600636252160000.000
Gradient do_[0] = -209825353026240512.000
Backward Time Step 1:
Gradient di[0] = -5954182955663360.000, df[0] = -4260032032014336.000, dc_hat[0] = -3604351050317824.000
Gradient do_[0] = -188271403248648192.000
Backward Time Step 0:
Gradient di[0] = -6952763925725184.000, df[0] = -5081777387339776.000, dc_hat[0] = -7029013017001984.000
Gradient do_[0] = -107864787924287488.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1231357345792.000, df[0] = 902840123392.000, dc_hat[0] = 534591635456.000
Gradient do_[0] = 76347917467648.000
Backward Time Step 3:
Gradient di[0] = 1931854741504.000, df[0] = 1394114428928.000, dc_hat[0] = 756297629696.000
Gradient do_[0] = 102193730420736.000
Backward Time Step 2:
Gradient di[0] = 2458142638080.000, df[0] = 1765546393600.000, dc_hat[0] = 1255982235648.000
Gradient do_[0] = 111366782320640.000
Backward Time Step 1:
Gradient di[0] = 3082835984384.000, df[0] = 2125841825792.000, dc_hat[0] = 1714760843264.000
Gradient do_[0] = 98139323236352.000
Backward Time Step 0:
Gradient di[0] = 3732597899264.000, df[0] = 2655721881600.000, dc_hat[0] = 3502435729408.000
Gradient do_[0] = 57705624502272.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2294973630251008.000, df[0] = -1801765355782144.000, dc_hat[0] = -1037332849360896.000
Gradient do_[0] = -134164582074679296.000
Backward Time Step 3:
Gradient di[0] = -3603432732622848.000, df[0] = -2750922526556160.000, dc_hat[0] = -1488964766662656.000
Gradient do_[0] = -183436129066811392.000
Backward Time Step 2:
Gradient di[0] = -4720521245622272.000, df[0] = -3556771134177280.000, dc_hat[0] = -2601332305297408.000
Gradient do_[0] = -209881531198472192.000
Backward Time Step 1:
Gradient di[0] = -5955778536013824.000, df[0] = -4261173419573248.000, dc_hat[0] = -3605312049250304.000
Gradient do_[0] = -188321843344572416.000
Backward Time Step 0:
Gradient di[0] = -6954614519758848.000, df[0] = -5083129765167104.000, dc_hat[0] = -7030882401517568.000
Gradient do_[0] = -107893495485693952.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1231771140096.000, df[0] = 903143358464.000, dc_hat[0] = 534771236864.000
Gradient do_[0] = 76373603385344.000
Backward Time Step 3:
Gradient di[0] = 1932502368256.000, df[0] = 1394581962752.000, dc_hat[0] = 756551057408.000
Gradient do_[0] = 102227989495808.000
Backward Time Step 2:
Gradient di[0] = 2458967605248.000, df[0] = 1766138839040.000, dc_hat[0] = 1256402714624.000
Gradient do_[0] = 111404161957888.000
Backward Time Step 1:
Gradient di[0] = 3083879317504.000, df[0] = 2126560886784.000, dc_hat[0] = 1715338608640.000
Gradient do_[0] = 98172500180992.000
Backward Time Step 0:
Gradient di[0] = 3733856714752.000, df[0] = 2656617365504.000, dc_hat[0] = 3503616950272.000
Gradient do_[0] = 57725090267136.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2295601500782592.000, df[0] = -1802258069061632.000, dc_hat[0] = -1037616585637888.000
Gradient do_[0] = -134201338404798464.000
Backward Time Step 3:
Gradient di[0] = -3604414937956352.000, df[0] = -2751671729913856.000, dc_hat[0] = -1489369835765760.000
Gradient do_[0] = -183486191205613568.000
Backward Time Step 2:
Gradient di[0] = -4721809735811072.000, df[0] = -3557742602092544.000, dc_hat[0] = -2602041511772160.000
Gradient do_[0] = -209938808882331648.000
Backward Time Step 1:
Gradient di[0] = -5957400959909888.000, df[0] = -4262333866049536.000, dc_hat[0] = -3606289422745600.000
Gradient do_[0] = -188373108074217472.000
Backward Time Step 0:
Gradient di[0] = -6956504842240000.000, df[0] = -5084511670894592.000, dc_hat[0] = -7032793661964288.000
Gradient do_[0] = -107922812932456448.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1232090169344.000, df[0] = 903377321984.000, dc_hat[0] = 534909124608.000
Gradient do_[0] = 76393325002752.000
Backward Time Step 3:
Gradient di[0] = 1933002801152.000, df[0] = 1394943066112.000, dc_hat[0] = 756746420224.000
Gradient do_[0] = 102254472331264.000
Backward Time Step 2:
Gradient di[0] = 2459605663744.000, df[0] = 1766596935680.000, dc_hat[0] = 1256727379968.000
Gradient do_[0] = 111432968437760.000
Backward Time Step 1:
Gradient di[0] = 3084675186688.000, df[0] = 2127109816320.000, dc_hat[0] = 1715779534848.000
Gradient do_[0] = 98197833777152.000
Backward Time Step 0:
Gradient di[0] = 3734817996800.000, df[0] = 2657301561344.000, dc_hat[0] = 3504518987776.000
Gradient do_[0] = 57739954880512.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2296209507090432.000, df[0] = -1802735481520128.000, dc_hat[0] = -1037890926673920.000
Gradient do_[0] = -134236814834663424.000
Backward Time Step 3:
Gradient di[0] = -3605369226002432.000, df[0] = -2752400532176896.000, dc_hat[0] = -1489763227926528.000
Gradient do_[0] = -183534707156189184.000
Backward Time Step 2:
Gradient di[0] = -4723056886939648.000, df[0] = -3558681320882176.000, dc_hat[0] = -2602725753749504.000
Gradient do_[0] = -209994076521496576.000
Backward Time Step 1:
Gradient di[0] = -5958976139165696.000, df[0] = -4263460221222912.000, dc_hat[0] = -3607237805211648.000
Gradient do_[0] = -188422757896159232.000
Backward Time Step 0:
Gradient di[0] = -6958339867017216.000, df[0] = -5085852774432768.000, dc_hat[0] = -7034649087836160.000
Gradient do_[0] = -107951288565628928.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1232470016000.000, df[0] = 903655784448.000, dc_hat[0] = 535074013184.000
Gradient do_[0] = 76416880214016.000
Backward Time Step 3:
Gradient di[0] = 1933600620544.000, df[0] = 1395374424064.000, dc_hat[0] = 756980056064.000
Gradient do_[0] = 102286097383424.000
Backward Time Step 2:
Gradient di[0] = 2460367978496.000, df[0] = 1767144423424.000, dc_hat[0] = 1257116663808.000
Gradient do_[0] = 111467504336896.000
Backward Time Step 1:
Gradient di[0] = 3085634633728.000, df[0] = 2127771336704.000, dc_hat[0] = 1716310638592.000
Gradient do_[0] = 98228334755840.000
Backward Time Step 0:
Gradient di[0] = 3735974051840.000, df[0] = 2658124169216.000, dc_hat[0] = 3505603739648.000
Gradient do_[0] = 57757818421248.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2296810265640960.000, df[0] = -1803207122616320.000, dc_hat[0] = -1038162382028800.000
Gradient do_[0] = -134271930487275520.000
Backward Time Step 3:
Gradient di[0] = -3606312776630272.000, df[0] = -2753120744505344.000, dc_hat[0] = -1490152861990912.000
Gradient do_[0] = -183582742070427648.000
Backward Time Step 2:
Gradient di[0] = -4724294374391808.000, df[0] = -3559613597220864.000, dc_hat[0] = -2603406774501376.000
Gradient do_[0] = -210049103642492928.000
Backward Time Step 1:
Gradient di[0] = -5960536286035968.000, df[0] = -4264576375848960.000, dc_hat[0] = -3608177060872192.000
Gradient do_[0] = -188471961041502208.000
Backward Time Step 0:
Gradient di[0] = -6960165228118016.000, df[0] = -5087186898649088.000, dc_hat[0] = -7036494313160704.000
Gradient do_[0] = -107979600990044160.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1232821026816.000, df[0] = 903913209856.000, dc_hat[0] = 535226417152.000
Gradient do_[0] = 76438631874560.000
Backward Time Step 3:
Gradient di[0] = 1934147584000.000, df[0] = 1395769344000.000, dc_hat[0] = 757194227712.000
Gradient do_[0] = 102315038081024.000
Backward Time Step 2:
Gradient di[0] = 2461065281536.000, df[0] = 1767645118464.000, dc_hat[0] = 1257472524288.000
Gradient do_[0] = 111499053891584.000
Backward Time Step 1:
Gradient di[0] = 3086506786816.000, df[0] = 2128372563968.000, dc_hat[0] = 1716793507840.000
Gradient do_[0] = 98256025550848.000
Backward Time Step 0:
Gradient di[0] = 3737026035712.000, df[0] = 2658872590336.000, dc_hat[0] = 3506590973952.000
Gradient do_[0] = 57774088126464.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2297430083108864.000, df[0] = -1803694198751232.000, dc_hat[0] = -1038442695753728.000
Gradient do_[0] = -134308197191122944.000
Backward Time Step 3:
Gradient di[0] = -3607287197335552.000, df[0] = -2753864579153920.000, dc_hat[0] = -1490555112521728.000
Gradient do_[0] = -183632340352761856.000
Backward Time Step 2:
Gradient di[0] = -4725575885258752.000, df[0] = -3560578891120640.000, dc_hat[0] = -2604110343831552.000
Gradient do_[0] = -210106054908837888.000
Backward Time Step 1:
Gradient di[0] = -5962159783673856.000, df[0] = -4265737896067072.000, dc_hat[0] = -3609154702802944.000
Gradient do_[0] = -188523260130885632.000
Backward Time Step 0:
Gradient di[0] = -6962061993050112.000, df[0] = -5088573636214784.000, dc_hat[0] = -7038412016058368.000
Gradient do_[0] = -108009038695890944.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1233176887296.000, df[0] = 904174239744.000, dc_hat[0] = 535380754432.000
Gradient do_[0] = 76460685524992.000
Backward Time Step 3:
Gradient di[0] = 1934705688576.000, df[0] = 1396171866112.000, dc_hat[0] = 757412528128.000
Gradient do_[0] = 102344465317888.000
Backward Time Step 2:
Gradient di[0] = 2461770711040.000, df[0] = 1768151973888.000, dc_hat[0] = 1257832185856.000
Gradient do_[0] = 111530947379200.000
Backward Time Step 1:
Gradient di[0] = 3087393095680.000, df[0] = 2128983359488.000, dc_hat[0] = 1717283717120.000
Gradient do_[0] = 98284219662336.000
Backward Time Step 0:
Gradient di[0] = 3738102923264.000, df[0] = 2659638837248.000, dc_hat[0] = 3507601539072.000
Gradient do_[0] = 57790731124736.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2298058758946816.000, df[0] = -1804187180466176.000, dc_hat[0] = -1038726096486400.000
Gradient do_[0] = -134344936341372928.000
Backward Time Step 3:
Gradient di[0] = -3608271550152704.000, df[0] = -2754616735301632.000, dc_hat[0] = -1490961926455296.000
Gradient do_[0] = -183682471211040768.000
Backward Time Step 2:
Gradient di[0] = -4726862227963904.000, df[0] = -3561548479987712.000, dc_hat[0] = -2604818208129024.000
Gradient do_[0] = -210163195153743872.000
Backward Time Step 1:
Gradient di[0] = -5963780596957184.000, df[0] = -4266897268801536.000, dc_hat[0] = -3610132613169152.000
Gradient do_[0] = -188574473320923136.000
Backward Time Step 0:
Gradient di[0] = -6963946409951232.000, df[0] = -5089950710104064.000, dc_hat[0] = -7040317370925056.000
Gradient do_[0] = -108038261653372928.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1233502470144.000, df[0] = 904412790784.000, dc_hat[0] = 535522115584.000
Gradient do_[0] = 76480876904448.000
Backward Time Step 3:
Gradient di[0] = 1935217917952.000, df[0] = 1396541489152.000, dc_hat[0] = 757612609536.000
Gradient do_[0] = 102371585687552.000
Backward Time Step 2:
Gradient di[0] = 2462428430336.000, df[0] = 1768624095232.000, dc_hat[0] = 1258167205888.000
Gradient do_[0] = 111560743714816.000
Backward Time Step 1:
Gradient di[0] = 3088215965696.000, df[0] = 2129550770176.000, dc_hat[0] = 1717739585536.000
Gradient do_[0] = 98310400507904.000
Backward Time Step 0:
Gradient di[0] = 3739094089728.000, df[0] = 2660344004608.000, dc_hat[0] = 3508531625984.000
Gradient do_[0] = 57806061305856.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2298673476141056.000, df[0] = -1804669961633792.000, dc_hat[0] = -1039004061401088.000
Gradient do_[0] = -134380876627705856.000
Backward Time Step 3:
Gradient di[0] = -3609239796842496.000, df[0] = -2755354932805632.000, dc_hat[0] = -1491360016236544.000
Gradient do_[0] = -183731674356383744.000
Backward Time Step 2:
Gradient di[0] = -4728128706445312.000, df[0] = -3562502768033792.000, dc_hat[0] = -2605515066572800.000
Gradient do_[0] = -210219510764929024.000
Backward Time Step 1:
Gradient di[0] = -5965380472274944.000, df[0] = -4268041877585920.000, dc_hat[0] = -3611097370198016.000
Gradient do_[0] = -188625016496062464.000
Backward Time Step 0:
Gradient di[0] = -6965812036370432.000, df[0] = -5091314362220544.000, dc_hat[0] = -7042203398438912.000
Gradient do_[0] = -108067218322882560.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1233884938240.000, df[0] = 904693153792.000, dc_hat[0] = 535688183808.000
Gradient do_[0] = 76504599887872.000
Backward Time Step 3:
Gradient di[0] = 1935820062720.000, df[0] = 1396975992832.000, dc_hat[0] = 757848080384.000
Gradient do_[0] = 102403412066304.000
Backward Time Step 2:
Gradient di[0] = 2463194152960.000, df[0] = 1769173942272.000, dc_hat[0] = 1258557145088.000
Gradient do_[0] = 111595405443072.000
Backward Time Step 1:
Gradient di[0] = 3089176985600.000, df[0] = 2130213470208.000, dc_hat[0] = 1718271606784.000
Gradient do_[0] = 98340935041024.000
Backward Time Step 0:
Gradient di[0] = 3740254076928.000, df[0] = 2661169233920.000, dc_hat[0] = 3509619785728.000
Gradient do_[0] = 57823991955456.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2299288998641664.000, df[0] = -1805153011236864.000, dc_hat[0] = -1039282093424640.000
Gradient do_[0] = -134416765374431232.000
Backward Time Step 3:
Gradient di[0] = -3610204017000448.000, df[0] = -2756090982825984.000, dc_hat[0] = -1491757971800064.000
Gradient do_[0] = -183780722882904064.000
Backward Time Step 2:
Gradient di[0] = -4729393037443072.000, df[0] = -3563455177031680.000, dc_hat[0] = -2606210314403840.000
Gradient do_[0] = -210275671757291520.000
Backward Time Step 1:
Gradient di[0] = -5966971757658112.000, df[0] = -4269179775483904.000, dc_hat[0] = -3612055147905024.000
Gradient do_[0] = -188675198893948928.000
Backward Time Step 0:
Gradient di[0] = -6967661019791360.000, df[0] = -5092666203176960.000, dc_hat[0] = -7044072782954496.000
Gradient do_[0] = -108095900114485248.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1234323636224.000, df[0] = 905014804480.000, dc_hat[0] = 535878303744.000
Gradient do_[0] = 76531778977792.000
Backward Time Step 3:
Gradient di[0] = 1936507797504.000, df[0] = 1397472493568.000, dc_hat[0] = 758117367808.000
Gradient do_[0] = 102439759904768.000
Backward Time Step 2:
Gradient di[0] = 2464064995328.000, df[0] = 1769799286784.000, dc_hat[0] = 1259001872384.000
Gradient do_[0] = 111634823512064.000
Backward Time Step 1:
Gradient di[0] = 3090268028928.000, df[0] = 2130966085632.000, dc_hat[0] = 1718876635136.000
Gradient do_[0] = 98375655489536.000
Backward Time Step 0:
Gradient di[0] = 3741580263424.000, df[0] = 2662112690176.000, dc_hat[0] = 3510864445440.000
Gradient do_[0] = 57844493713408.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2299909084545024.000, df[0] = -1805639684718592.000, dc_hat[0] = -1039562138714112.000
Gradient do_[0] = -134452980538671104.000
Backward Time Step 3:
Gradient di[0] = -3611177900834816.000, df[0] = -2756835085910016.000, dc_hat[0] = -1492160490766336.000
Gradient do_[0] = -183830321165238272.000
Backward Time Step 2:
Gradient di[0] = -4730667568988160.000, df[0] = -3564415102222336.000, dc_hat[0] = -2606910125637632.000
Gradient do_[0] = -210332330965860352.000
Backward Time Step 1:
Gradient di[0] = -5968580759781376.000, df[0] = -4270331095154688.000, dc_hat[0] = -3613025273643008.000
Gradient do_[0] = -188726068486602752.000
Backward Time Step 0:
Gradient di[0] = -6969543826079744.000, df[0] = -5094042203324416.000, dc_hat[0] = -7045976527208448.000
Gradient do_[0] = -108125114482032640.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1234727469056.000, df[0] = 905311092736.000, dc_hat[0] = 536053678080.000
Gradient do_[0] = 76556785418240.000
Backward Time Step 3:
Gradient di[0] = 1937141661696.000, df[0] = 1397929803776.000, dc_hat[0] = 758364962816.000
Gradient do_[0] = 102473280782336.000
Backward Time Step 2:
Gradient di[0] = 2464872923136.000, df[0] = 1770379804672.000, dc_hat[0] = 1259413569536.000
Gradient do_[0] = 111671423008768.000
Backward Time Step 1:
Gradient di[0] = 3091283574784.000, df[0] = 2131666010112.000, dc_hat[0] = 1719438671872.000
Gradient do_[0] = 98407951630336.000
Backward Time Step 0:
Gradient di[0] = 3742804738048.000, df[0] = 2662984056832.000, dc_hat[0] = 3512013422592.000
Gradient do_[0] = 57863422607360.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2300514138062848.000, df[0] = -1806114815475712.000, dc_hat[0] = -1039835003355136.000
Gradient do_[0] = -134488336709451776.000
Backward Time Step 3:
Gradient di[0] = -3612123598946304.000, df[0] = -2757556640415744.000, dc_hat[0] = -1492550124830720.000
Gradient do_[0] = -183878459158691840.000
Backward Time Step 2:
Gradient di[0] = -4731910425149440.000, df[0] = -3565351941963776.000, dc_hat[0] = -2607593830744064.000
Gradient do_[0] = -210387547065417728.000
Backward Time Step 1:
Gradient di[0] = -5970153254682624.000, df[0] = -4271455571279872.000, dc_hat[0] = -3613971777060864.000
Gradient do_[0] = -188775718308544512.000
Backward Time Step 0:
Gradient di[0] = -6971375629631488.000, df[0] = -5095381159378944.000, dc_hat[0] = -7047828194983936.000
Gradient do_[0] = -108153529985662976.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1235178618880.000, df[0] = 905641656320.000, dc_hat[0] = 536249532416.000
Gradient do_[0] = 76584786591744.000
Backward Time Step 3:
Gradient di[0] = 1937847091200.000, df[0] = 1398438887424.000, dc_hat[0] = 758641065984.000
Gradient do_[0] = 102510601699328.000
Backward Time Step 2:
Gradient di[0] = 2465772863488.000, df[0] = 1771025989632.000, dc_hat[0] = 1259873239040.000
Gradient do_[0] = 111712132923392.000
Backward Time Step 1:
Gradient di[0] = 3092416036864.000, df[0] = 2132446674944.000, dc_hat[0] = 1720066637824.000
Gradient do_[0] = 98443921981440.000
Backward Time Step 0:
Gradient di[0] = 3744170770432.000, df[0] = 2663956086784.000, dc_hat[0] = 3513295044608.000
Gradient do_[0] = 57884536733696.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2301129660563456.000, df[0] = -1806597865078784.000, dc_hat[0] = -1040113236705280.000
Gradient do_[0] = -134524371485065216.000
Backward Time Step 3:
Gradient di[0] = -3613096945909760.000, df[0] = -2758299938193408.000, dc_hat[0] = -1492952375361536.000
Gradient do_[0] = -183927902822203392.000
Backward Time Step 2:
Gradient di[0] = -4733180661727232.000, df[0] = -3566308914364416.000, dc_hat[0] = -2608292568236032.000
Gradient do_[0] = -210443931396079616.000
Backward Time Step 1:
Gradient di[0] = -5971750982516736.000, df[0] = -4272598837886976.000, dc_hat[0] = -3614935460347904.000
Gradient do_[0] = -188826141224599552.000
Backward Time Step 0:
Gradient di[0] = -6973239108567040.000, df[0] = -5096743200882688.000, dc_hat[0] = -7049712075014144.000
Gradient do_[0] = -108182443705499648.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1235507216384.000, df[0] = 905882501120.000, dc_hat[0] = 536391745536.000
Gradient do_[0] = 76605145743360.000
Backward Time Step 3:
Gradient di[0] = 1938366136320.000, df[0] = 1398813229056.000, dc_hat[0] = 758843834368.000
Gradient do_[0] = 102538015670272.000
Backward Time Step 2:
Gradient di[0] = 2466430058496.000, df[0] = 1771498110976.000, dc_hat[0] = 1260207865856.000
Gradient do_[0] = 111741920870400.000
Backward Time Step 1:
Gradient di[0] = 3093234450432.000, df[0] = 2133010677760.000, dc_hat[0] = 1720519622656.000
Gradient do_[0] = 98469935054848.000
Backward Time Step 0:
Gradient di[0] = 3745161936896.000, df[0] = 2664660992000.000, dc_hat[0] = 3514224869376.000
Gradient do_[0] = 57899862720512.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2301746525241344.000, df[0] = -1807082391076864.000, dc_hat[0] = -1040392141144064.000
Gradient do_[0] = -134560423440547840.000
Backward Time Step 3:
Gradient di[0] = -3614064924164096.000, df[0] = -2759038672568320.000, dc_hat[0] = -1493351538884608.000
Gradient do_[0] = -183977140327284736.000
Backward Time Step 2:
Gradient di[0] = -4734449287692288.000, df[0] = -3567264544587776.000, dc_hat[0] = -2608990231986176.000
Gradient do_[0] = -210500178287788032.000
Backward Time Step 1:
Gradient di[0] = -5973355689672704.000, df[0] = -4273746399461376.000, dc_hat[0] = -3615901827989504.000
Gradient do_[0] = -188876804658823168.000
Backward Time Step 0:
Gradient di[0] = -6975110640566272.000, df[0] = -5098111147966464.000, dc_hat[0] = -7051604008108032.000
Gradient do_[0] = -108211477684420608.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1235863470080.000, df[0] = 906143858688.000, dc_hat[0] = 536546803712.000
Gradient do_[0] = 76627216171008.000
Backward Time Step 3:
Gradient di[0] = 1938923585536.000, df[0] = 1399215751168.000, dc_hat[0] = 759062069248.000
Gradient do_[0] = 102567484850176.000
Backward Time Step 2:
Gradient di[0] = 2467139944448.000, df[0] = 1772007981056.000, dc_hat[0] = 1260570017792.000
Gradient do_[0] = 111774074404864.000
Backward Time Step 1:
Gradient di[0] = 3094128885760.000, df[0] = 2133627502592.000, dc_hat[0] = 1721014681600.000
Gradient do_[0] = 98498405990400.000
Backward Time Step 0:
Gradient di[0] = 3746241183744.000, df[0] = 2665429073920.000, dc_hat[0] = 3515237793792.000
Gradient do_[0] = 57916547661824.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2302346746920960.000, df[0] = -1807553629519872.000, dc_hat[0] = -1040662858301440.000
Gradient do_[0] = -134595513323356160.000
Backward Time Step 3:
Gradient di[0] = -3615010622275584.000, df[0] = -2759761300815872.000, dc_hat[0] = -1493741307166720.000
Gradient do_[0] = -184025278320738304.000
Backward Time Step 2:
Gradient di[0] = -4735692143853568.000, df[0] = -3568200847458304.000, dc_hat[0] = -2609673131786240.000
Gradient do_[0] = -210555394387345408.000
Backward Time Step 1:
Gradient di[0] = -5974922815864832.000, df[0] = -4274867385925632.000, dc_hat[0] = -3616845378617344.000
Gradient do_[0] = -188926299861942272.000
Backward Time Step 0:
Gradient di[0] = -6976934927925248.000, df[0] = -5099444735311872.000, dc_hat[0] = -7053448696561664.000
Gradient do_[0] = -108239781518901248.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1236194164736.000, df[0] = 906386210816.000, dc_hat[0] = 536689868800.000
Gradient do_[0] = 76647667597312.000
Backward Time Step 3:
Gradient di[0] = 1939442630656.000, df[0] = 1399590354944.000, dc_hat[0] = 759264837632.000
Gradient do_[0] = 102594923986944.000
Backward Time Step 2:
Gradient di[0] = 2467802906624.000, df[0] = 1772484296704.000, dc_hat[0] = 1260907528192.000
Gradient do_[0] = 111804038512640.000
Backward Time Step 1:
Gradient di[0] = 3094956474368.000, df[0] = 2134198190080.000, dc_hat[0] = 1721473040384.000
Gradient do_[0] = 98524687499264.000
Backward Time Step 0:
Gradient di[0] = 3747237593088.000, df[0] = 2666137911296.000, dc_hat[0] = 3516172599296.000
Gradient do_[0] = 57931953340416.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2302951800438784.000, df[0] = -1808028491841536.000, dc_hat[0] = -1040936326922240.000
Gradient do_[0] = -134630860904202240.000
Backward Time Step 3:
Gradient di[0] = -3615960883789824.000, df[0] = -2760486344982528.000, dc_hat[0] = -1494133894021120.000
Gradient do_[0] = -184073588112883712.000
Backward Time Step 2:
Gradient di[0] = -4736934463143936.000, df[0] = -3569137418764288.000, dc_hat[0] = -2610356568457216.000
Gradient do_[0] = -210610662026510336.000
Backward Time Step 1:
Gradient di[0] = -5976491015798784.000, df[0] = -4275988640825344.000, dc_hat[0] = -3617790002987008.000
Gradient do_[0] = -188975846604668928.000
Backward Time Step 0:
Gradient di[0] = -6978766194606080.000, df[0] = -5100782617624576.000, dc_hat[0] = -7055299827466240.000
Gradient do_[0] = -108268179842662400.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1236537835520.000, df[0] = 906638262272.000, dc_hat[0] = 536839061504.000
Gradient do_[0] = 76669016604672.000
Backward Time Step 3:
Gradient di[0] = 1939982778368.000, df[0] = 1399980032000.000, dc_hat[0] = 759475863552.000
Gradient do_[0] = 102623495585792.000
Backward Time Step 2:
Gradient di[0] = 2468489723904.000, df[0] = 1772977258496.000, dc_hat[0] = 1261257883648.000
Gradient do_[0] = 111835185414144.000
Backward Time Step 1:
Gradient di[0] = 3095817355264.000, df[0] = 2134791553024.000, dc_hat[0] = 1721949880320.000
Gradient do_[0] = 98552067915776.000
Backward Time Step 0:
Gradient di[0] = 3748280664064.000, df[0] = 2666880303104.000, dc_hat[0] = 3517151707136.000
Gradient do_[0] = 57948080439296.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2303552290553856.000, df[0] = -1808499998720000.000, dc_hat[0] = -1041207782277120.000
Gradient do_[0] = -134665916427272192.000
Backward Time Step 3:
Gradient di[0] = -3616897991966720.000, df[0] = -2761201725472768.000, dc_hat[0] = -1494520575295488.000
Gradient do_[0] = -184121227890130944.000
Backward Time Step 2:
Gradient di[0] = -4738157455081472.000, df[0] = -3570058152378368.000, dc_hat[0] = -2611027925532672.000
Gradient do_[0] = -210664984772870144.000
Backward Time Step 1:
Gradient di[0] = -5978039351508992.000, df[0] = -4277096205516800.000, dc_hat[0] = -3618723889938432.000
Gradient do_[0] = -189024688972759040.000
Backward Time Step 0:
Gradient di[0] = -6980575449579520.000, df[0] = -5102105467551744.000, dc_hat[0] = -7057128946663424.000
Gradient do_[0] = -108296251748909056.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1236853194752.000, df[0] = 906869473280.000, dc_hat[0] = 536975998976.000
Gradient do_[0] = 76688536895488.000
Backward Time Step 3:
Gradient di[0] = 1940473774080.000, df[0] = 1400334319616.000, dc_hat[0] = 759667687424.000
Gradient do_[0] = 102649483493376.000
Backward Time Step 2:
Gradient di[0] = 2469115199488.000, df[0] = 1773426573312.000, dc_hat[0] = 1261576257536.000
Gradient do_[0] = 111863471800320.000
Backward Time Step 1:
Gradient di[0] = 3096606932992.000, df[0] = 2135336157184.000, dc_hat[0] = 1722386350080.000
Gradient do_[0] = 98577183408128.000
Backward Time Step 0:
Gradient di[0] = 3749231984640.000, df[0] = 2667557158912.000, dc_hat[0] = 3518044569600.000
Gradient do_[0] = 57962789863424.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2304171571150848.000, df[0] = -1808985732677632.000, dc_hat[0] = -1041486619607040.000
Gradient do_[0] = -134702105821708288.000
Backward Time Step 3:
Gradient di[0] = -3617872412672000.000, df[0] = -2761946096992256.000, dc_hat[0] = -1494923094261760.000
Gradient do_[0] = -184170843352334336.000
Backward Time Step 2:
Gradient di[0] = -4739438965948416.000, df[0] = -3571023983149056.000, dc_hat[0] = -2611733910781952.000
Gradient do_[0] = -210721918859345920.000
Backward Time Step 1:
Gradient di[0] = -5979657480437760.000, df[0] = -4278254504509440.000, dc_hat[0] = -3619698579079168.000
Gradient do_[0] = -189075833443319808.000
Backward Time Step 0:
Gradient di[0] = -6982452350287872.000, df[0] = -5103476635860992.000, dc_hat[0] = -7059025711595520.000
Gradient do_[0] = -108325371627175936.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1237343141888.000, df[0] = 907228545024.000, dc_hat[0] = 537188401152.000
Gradient do_[0] = 76718895267840.000
Backward Time Step 3:
Gradient di[0] = 1941244215296.000, df[0] = 1400890589184.000, dc_hat[0] = 759969349632.000
Gradient do_[0] = 102690159853568.000
Backward Time Step 2:
Gradient di[0] = 2470092734464.000, df[0] = 1774128594944.000, dc_hat[0] = 1262075117568.000
Gradient do_[0] = 111907738484736.000
Backward Time Step 1:
Gradient di[0] = 3097832456192.000, df[0] = 2136181309440.000, dc_hat[0] = 1723066220544.000
Gradient do_[0] = 98616156880896.000
Backward Time Step 0:
Gradient di[0] = 3750717816832.000, df[0] = 2668614123520.000, dc_hat[0] = 3519438389248.000
Gradient do_[0] = 57985757872128.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2304789241135104.000, df[0] = -1809471332417536.000, dc_hat[0] = -1041766463569920.000
Gradient do_[0] = -134738235086602240.000
Backward Time Step 3:
Gradient di[0] = -3618840927797248.000, df[0] = -2762685368238080.000, dc_hat[0] = -1495322392002560.000
Gradient do_[0] = -184220132397023232.000
Backward Time Step 2:
Gradient di[0] = -4740709202526208.000, df[0] = -3571980955549696.000, dc_hat[0] = -2612430769225728.000
Gradient do_[0] = -210778354729615360.000
Backward Time Step 1:
Gradient di[0] = -5981256282013696.000, df[0] = -4279397234245632.000, dc_hat[0] = -3620660383318016.000
Gradient do_[0] = -189126325078851584.000
Backward Time Step 0:
Gradient di[0] = -6984322271674368.000, df[0] = -5104844046073856.000, dc_hat[0] = -7060917107818496.000
Gradient do_[0] = -108354379836293120.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1237684977664.000, df[0] = 907479285760.000, dc_hat[0] = 537336807424.000
Gradient do_[0] = 76740084891648.000
Backward Time Step 3:
Gradient di[0] = 1941780168704.000, df[0] = 1401277120512.000, dc_hat[0] = 760178606080.000
Gradient do_[0] = 102718488182784.000
Backward Time Step 2:
Gradient di[0] = 2470774308864.000, df[0] = 1774617886720.000, dc_hat[0] = 1262422589440.000
Gradient do_[0] = 111938566619136.000
Backward Time Step 1:
Gradient di[0] = 3098685210624.000, df[0] = 2136768905216.000, dc_hat[0] = 1723537817600.000
Gradient do_[0] = 98643235307520.000
Backward Time Step 0:
Gradient di[0] = 3751748567040.000, df[0] = 2669347602432.000, dc_hat[0] = 3520405700608.000
Gradient do_[0] = 58001696227328.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2305395368394752.000, df[0] = -1809947134263296.000, dc_hat[0] = -1042039999299584.000
Gradient do_[0] = -134773617027186688.000
Backward Time Step 3:
Gradient di[0] = -3619793873666048.000, df[0] = -2763412828323840.000, dc_hat[0] = -1495715381510144.000
Gradient do_[0] = -184268528088514560.000
Backward Time Step 2:
Gradient di[0] = -4741955279912960.000, df[0] = -3572919137468416.000, dc_hat[0] = -2613114205896704.000
Gradient do_[0] = -210833588009041920.000
Backward Time Step 1:
Gradient di[0] = -5982834145624064.000, df[0] = -4280527079079936.000, dc_hat[0] = -3621612523880448.000
Gradient do_[0] = -189176129519616000.000
Backward Time Step 0:
Gradient di[0] = -6986158907064320.000, df[0] = -5106186223353856.000, dc_hat[0] = -7062773607432192.000
Gradient do_[0] = -108382881239269376.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1238110175232.000, df[0] = 907791040512.000, dc_hat[0] = 537521160192.000
Gradient do_[0] = 76766458675200.000
Backward Time Step 3:
Gradient di[0] = 1942449160192.000, df[0] = 1401759727616.000, dc_hat[0] = 760439963648.000
Gradient do_[0] = 102753871331328.000
Backward Time Step 2:
Gradient di[0] = 2471627849728.000, df[0] = 1775231041536.000, dc_hat[0] = 1262858010624.000
Gradient do_[0] = 111977246490624.000
Backward Time Step 1:
Gradient di[0] = 3099758952448.000, df[0] = 2137509068800.000, dc_hat[0] = 1724132491264.000
Gradient do_[0] = 98677402107904.000
Backward Time Step 0:
Gradient di[0] = 3753044606976.000, df[0] = 2670269562880.000, dc_hat[0] = 3521621786624.000
Gradient do_[0] = 58021728223232.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2305996395380736.000, df[0] = -1810418641141760.000, dc_hat[0] = -1042311521763328.000
Gradient do_[0] = -134808792809340928.000
Backward Time Step 3:
Gradient di[0] = -3620738766471168.000, df[0] = -2764133845958656.000, dc_hat[0] = -1496104612921344.000
Gradient do_[0] = -184316648902098944.000
Backward Time Step 2:
Gradient di[0] = -4743193304236032.000, df[0] = -3573851950678016.000, dc_hat[0] = -2613794689777664.000
Gradient do_[0] = -210888632309907456.000
Backward Time Step 1:
Gradient di[0] = -5984389997527040.000, df[0] = -4281639744045056.000, dc_hat[0] = -3622550168928256.000
Gradient do_[0] = -189225298305220608.000
Backward Time Step 0:
Gradient di[0] = -6987972457005056.000, df[0] = -5107511757635584.000, dc_hat[0] = -7064607021596672.000
Gradient do_[0] = -108411021864992768.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1238540615680.000, df[0] = 908106661888.000, dc_hat[0] = 537708003328.000
Gradient do_[0] = 76793142837248.000
Backward Time Step 3:
Gradient di[0] = 1943121821696.000, df[0] = 1402245349376.000, dc_hat[0] = 760703352832.000
Gradient do_[0] = 102789472583680.000
Backward Time Step 2:
Gradient di[0] = 2472482701312.000, df[0] = 1775844851712.000, dc_hat[0] = 1263293562880.000
Gradient do_[0] = 112015917973504.000
Backward Time Step 1:
Gradient di[0] = 3100828499968.000, df[0] = 2138246742016.000, dc_hat[0] = 1724725067776.000
Gradient do_[0] = 98711426301952.000
Backward Time Step 0:
Gradient di[0] = 3754335141888.000, df[0] = 2671187853312.000, dc_hat[0] = 3522832629760.000
Gradient do_[0] = 58041680527360.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2306596617060352.000, df[0] = -1810890013802496.000, dc_hat[0] = -1042582306029568.000
Gradient do_[0] = -134843822562607104.000
Backward Time Step 3:
Gradient di[0] = -3621678022131712.000, df[0] = -2764851105497088.000, dc_hat[0] = -1496492502155264.000
Gradient do_[0] = -184364477657907200.000
Backward Time Step 2:
Gradient di[0] = -4744425422979072.000, df[0] = -3574780468920320.000, dc_hat[0] = -2614473026174976.000
Gradient do_[0] = -210943401732866048.000
Backward Time Step 1:
Gradient di[0] = -5985945312559104.000, df[0] = -4282752140574720.000, dc_hat[0] = -3623486471798784.000
Gradient do_[0] = -189274432731086848.000
Backward Time Step 0:
Gradient di[0] = -6989785470074880.000, df[0] = -5108836755046400.000, dc_hat[0] = -7066439898890240.000
Gradient do_[0] = -108439136720912384.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1238920069120.000, df[0] = 908384796672.000, dc_hat[0] = 537872924672.000
Gradient do_[0] = 76816630939648.000
Backward Time Step 3:
Gradient di[0] = 1943716495360.000, df[0] = 1402674610176.000, dc_hat[0] = 760935940096.000
Gradient do_[0] = 102820929863680.000
Backward Time Step 2:
Gradient di[0] = 2473241083904.000, df[0] = 1776389849088.000, dc_hat[0] = 1263681142784.000
Gradient do_[0] = 112050244157440.000
Backward Time Step 1:
Gradient di[0] = 3101784539136.000, df[0] = 2138905772032.000, dc_hat[0] = 1725254991872.000
Gradient do_[0] = 98741784674304.000
Backward Time Step 0:
Gradient di[0] = 3755494604800.000, df[0] = 2672012558336.000, dc_hat[0] = 3523920527360.000
Gradient do_[0] = 58059606982656.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2307208918335488.000, df[0] = -1811370647486464.000, dc_hat[0] = -1042858459004928.000
Gradient do_[0] = -134879625409986560.000
Backward Time Step 3:
Gradient di[0] = -3622645195079680.000, df[0] = -2765589303001088.000, dc_hat[0] = -1496891128807424.000
Gradient do_[0] = -184413577724035072.000
Backward Time Step 2:
Gradient di[0] = -4745694048944128.000, df[0] = -3575736636014592.000, dc_hat[0] = -2615172300537856.000
Gradient do_[0] = -210999734523920384.000
Backward Time Step 1:
Gradient di[0] = -5987547335360512.000, df[0] = -4283898628407296.000, dc_hat[0] = -3624452571004928.000
Gradient do_[0] = -189325010265964544.000
Backward Time Step 0:
Gradient di[0] = -6991656465203200.000, df[0] = -5110204165259264.000, dc_hat[0] = -7068331831984128.000
Gradient do_[0] = -108468162109898752.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1239381573632.000, df[0] = 908723159040.000, dc_hat[0] = 538072743936.000
Gradient do_[0] = 76845244481536.000
Backward Time Step 3:
Gradient di[0] = 1944442896384.000, df[0] = 1403198373888.000, dc_hat[0] = 761220104192.000
Gradient do_[0] = 102859324522496.000
Backward Time Step 2:
Gradient di[0] = 2474165665792.000, df[0] = 1777053466624.000, dc_hat[0] = 1264152608768.000
Gradient do_[0] = 112092103311360.000
Backward Time Step 1:
Gradient di[0] = 3102941118464.000, df[0] = 2139703345152.000, dc_hat[0] = 1725896458240.000
Gradient do_[0] = 98778585497600.000
Backward Time Step 0:
Gradient di[0] = 3756894453760.000, df[0] = 2673008705536.000, dc_hat[0] = 3525234130944.000
Gradient do_[0] = 58081249591296.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2307802697564160.000, df[0] = -1811836919873536.000, dc_hat[0] = -1043126558916608.000
Gradient do_[0] = -134914225666523136.000
Backward Time Step 3:
Gradient di[0] = -3623570492096512.000, df[0] = -2766295556685824.000, dc_hat[0] = -1497273246679040.000
Gradient do_[0] = -184460667745468416.000
Backward Time Step 2:
Gradient di[0] = -4746905766592512.000, df[0] = -3576649316564992.000, dc_hat[0] = -2615837752033280.000
Gradient do_[0] = -211053541874204672.000
Backward Time Step 1:
Gradient di[0] = -5989074196234240.000, df[0] = -4284990086971392.000, dc_hat[0] = -3625371694006272.000
Gradient do_[0] = -189373234158764032.000
Backward Time Step 0:
Gradient di[0] = -6993438339760128.000, df[0] = -5111506614091776.000, dc_hat[0] = -7070132497022976.000
Gradient do_[0] = -108495804519415808.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1239675961344.000, df[0] = 908939034624.000, dc_hat[0] = 538200408064.000
Gradient do_[0] = 76863481315328.000
Backward Time Step 3:
Gradient di[0] = 1944903221248.000, df[0] = 1403530772480.000, dc_hat[0] = 761399869440.000
Gradient do_[0] = 102883668262912.000
Backward Time Step 2:
Gradient di[0] = 2474752081920.000, df[0] = 1777474732032.000, dc_hat[0] = 1264450666496.000
Gradient do_[0] = 112118653255680.000
Backward Time Step 1:
Gradient di[0] = 3103680102400.000, df[0] = 2140212953088.000, dc_hat[0] = 1726304616448.000
Gradient do_[0] = 98802090377216.000
Backward Time Step 0:
Gradient di[0] = 3757787054080.000, df[0] = 2673643880448.000, dc_hat[0] = 3526071681024.000
Gradient do_[0] = 58095048851456.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2308419562242048.000, df[0] = -1812321311653888.000, dc_hat[0] = -1043405530464256.000
Gradient do_[0] = -134950294801874944.000
Backward Time Step 3:
Gradient di[0] = -3624539275657216.000, df[0] = -2767035096367104.000, dc_hat[0] = -1497672812855296.000
Gradient do_[0] = -184509991149895680.000
Backward Time Step 2:
Gradient di[0] = -4748174929428480.000, df[0] = -3577605483659264.000, dc_hat[0] = -2616536757960704.000
Gradient do_[0] = -211109977744474112.000
Backward Time Step 1:
Gradient di[0] = -5990679977132032.000, df[0] = -4286138453852160.000, dc_hat[0] = -3626340209131520.000
Gradient do_[0] = -189423897592987648.000
Backward Time Step 0:
Gradient di[0] = -6995306650533888.000, df[0] = -5112871876820992.000, dc_hat[0] = -7072021745762304.000
Gradient do_[0] = -108524786958729216.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1240114659328.000, df[0] = 909260554240.000, dc_hat[0] = 538390855680.000
Gradient do_[0] = 76890677182464.000
Backward Time Step 3:
Gradient di[0] = 1945592528896.000, df[0] = 1404028190720.000, dc_hat[0] = 761669287936.000
Gradient do_[0] = 102920099987456.000
Backward Time Step 2:
Gradient di[0] = 2475629740032.000, df[0] = 1778105188352.000, dc_hat[0] = 1264898146304.000
Gradient do_[0] = 112158356537344.000
Backward Time Step 1:
Gradient di[0] = 3104778485760.000, df[0] = 2140969762816.000, dc_hat[0] = 1726912921600.000
Gradient do_[0] = 98837020540928.000
Backward Time Step 0:
Gradient di[0] = 3759109832704.000, df[0] = 2674584977408.000, dc_hat[0] = 3527312670720.000
Gradient do_[0] = 58115496083456.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2309018978615296.000, df[0] = -1812791744790528.000, dc_hat[0] = -1043676448948224.000
Gradient do_[0] = -134985307375271936.000
Backward Time Step 3:
Gradient di[0] = -3625482020978688.000, df[0] = -2767755040260096.000, dc_hat[0] = -1498061507395584.000
Gradient do_[0] = -184557940164788224.000
Backward Time Step 2:
Gradient di[0] = -4749408658784256.000, df[0] = -3578534270337024.000, dc_hat[0] = -2617211873132544.000
Gradient do_[0] = -211164747167432704.000
Backward Time Step 1:
Gradient di[0] = -5992233144680448.000, df[0] = -4287249508204544.000, dc_hat[0] = -3627274632953856.000
Gradient do_[0] = -189472946119507968.000
Backward Time Step 0:
Gradient di[0] = -6997116979249152.000, df[0] = -5114195263619072.000, dc_hat[0] = -7073851938701312.000
Gradient do_[0] = -108552884634779648.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1240390565888.000, df[0] = 909462798336.000, dc_hat[0] = 538510589952.000
Gradient do_[0] = 76907764776960.000
Backward Time Step 3:
Gradient di[0] = 1946024017920.000, df[0] = 1404339617792.000, dc_hat[0] = 761838239744.000
Gradient do_[0] = 102942917001216.000
Backward Time Step 2:
Gradient di[0] = 2476177096704.000, df[0] = 1778498142208.000, dc_hat[0] = 1265176936448.000
Gradient do_[0] = 112183153262592.000
Backward Time Step 1:
Gradient di[0] = 3105464778752.000, df[0] = 2141442932736.000, dc_hat[0] = 1727292768256.000
Gradient do_[0] = 98858797367296.000
Backward Time Step 0:
Gradient di[0] = 3759943974912.000, df[0] = 2675178471424.000, dc_hat[0] = 3528095694848.000
Gradient do_[0] = 58128397762560.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2309610073489408.000, df[0] = -1813256003911680.000, dc_hat[0] = -1043943676444672.000
Gradient do_[0] = -135019873272070144.000
Backward Time Step 3:
Gradient di[0] = -3626410807656448.000, df[0] = -2768464246734848.000, dc_hat[0] = -1498445235879936.000
Gradient do_[0] = -184605253524520960.000
Backward Time Step 2:
Gradient di[0] = -4750628429496320.000, df[0] = -3579453661773824.000, dc_hat[0] = -2617884035514368.000
Gradient do_[0] = -211218949654708224.000
Backward Time Step 1:
Gradient di[0] = -5993772353585152.000, df[0] = -4288350630445056.000, dc_hat[0] = -3628202614325248.000
Gradient do_[0] = -189521547969429504.000
Backward Time Step 0:
Gradient di[0] = -6998910128095232.000, df[0] = -5115506302386176.000, dc_hat[0] = -7075664951771136.000
Gradient do_[0] = -108580690253053952.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1240768315392.000, df[0] = 909739884544.000, dc_hat[0] = 538674626560.000
Gradient do_[0] = 76931202547712.000
Backward Time Step 3:
Gradient di[0] = 1946618560512.000, df[0] = 1404768616448.000, dc_hat[0] = 762070368256.000
Gradient do_[0] = 102974374281216.000
Backward Time Step 2:
Gradient di[0] = 2476936003584.000, df[0] = 1779043008512.000, dc_hat[0] = 1265564385280.000
Gradient do_[0] = 112217487835136.000
Backward Time Step 1:
Gradient di[0] = 3106417147904.000, df[0] = 2142099603456.000, dc_hat[0] = 1727820333056.000
Gradient do_[0] = 98889113796608.000
Backward Time Step 0:
Gradient di[0] = 3761090330624.000, df[0] = 2675994263552.000, dc_hat[0] = 3529171271680.000
Gradient do_[0] = 58146114502656.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2310208684556288.000, df[0] = -1813726034395136.000, dc_hat[0] = -1044213990948864.000
Gradient do_[0] = -135054868665597952.000
Backward Time Step 3:
Gradient di[0] = -3627349258010624.000, df[0] = -2769180700966912.000, dc_hat[0] = -1498832856678400.000
Gradient do_[0] = -184653013560852480.000
Backward Time Step 2:
Gradient di[0] = -4751857863884800.000, df[0] = -3580380300967936.000, dc_hat[0] = -2618559419121664.000
Gradient do_[0] = -211273598818582528.000
Backward Time Step 1:
Gradient di[0] = -5995332500455424.000, df[0] = -4289466516635648.000, dc_hat[0] = -3629141601550336.000
Gradient do_[0] = -189570837014118400.000
Backward Time Step 0:
Gradient di[0] = -7000732804841472.000, df[0] = -5116838279118848.000, dc_hat[0] = -7077507492741120.000
Gradient do_[0] = -108608976907665408.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1241101238272.000, df[0] = 909983940608.000, dc_hat[0] = 538819133440.000
Gradient do_[0] = 76951855300608.000
Backward Time Step 3:
Gradient di[0] = 1947139440640.000, df[0] = 1405144399872.000, dc_hat[0] = 762274054144.000
Gradient do_[0] = 103001922469888.000
Backward Time Step 2:
Gradient di[0] = 2477597655040.000, df[0] = 1779518406656.000, dc_hat[0] = 1265901895680.000
Gradient do_[0] = 112247460331520.000
Backward Time Step 1:
Gradient di[0] = 3107247357952.000, df[0] = 2142672257024.000, dc_hat[0] = 1728279478272.000
Gradient do_[0] = 98915512745984.000
Backward Time Step 0:
Gradient di[0] = 3762100109312.000, df[0] = 2676712800256.000, dc_hat[0] = 3530118922240.000
Gradient do_[0] = 58161725702144.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2310812664332288.000, df[0] = -1814199957192704.000, dc_hat[0] = -1044485916065792.000
Gradient do_[0] = -135090130347098112.000
Backward Time Step 3:
Gradient di[0] = -3628296566734848.000, df[0] = -2769903597649920.000, dc_hat[0] = -1499223698702336.000
Gradient do_[0] = -184701031295221760.000
Backward Time Step 2:
Gradient di[0] = -4753102330658816.000, df[0] = -3581317946015744.000, dc_hat[0] = -2619242855792640.000
Gradient do_[0] = -211328780558401536.000
Backward Time Step 1:
Gradient di[0] = -5996894257938432.000, df[0] = -4290584013438976.000, dc_hat[0] = -3630083810000896.000
Gradient do_[0] = -189620194778284032.000
Backward Time Step 0:
Gradient di[0] = -7002552260362240.000, df[0] = -5118167571496960.000, dc_hat[0] = -7079345738743808.000
Gradient do_[0] = -108637186252865536.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1241565233152.000, df[0] = 910324072448.000, dc_hat[0] = 539020492800.000
Gradient do_[0] = 76980603060224.000
Backward Time Step 3:
Gradient di[0] = 1947868856320.000, df[0] = 1405670653952.000, dc_hat[0] = 762559070208.000
Gradient do_[0] = 103040417792000.000
Backward Time Step 2:
Gradient di[0] = 2478525120512.000, df[0] = 1780184383488.000, dc_hat[0] = 1266375065600.000
Gradient do_[0] = 112289436925952.000
Backward Time Step 1:
Gradient di[0] = 3108414423040.000, df[0] = 2143476645888.000, dc_hat[0] = 1728926842880.000
Gradient do_[0] = 98952573616128.000
Backward Time Step 0:
Gradient di[0] = 3763505987584.000, df[0] = 2677712879616.000, dc_hat[0] = 3531438030848.000
Gradient do_[0] = 58183460585472.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2311414228189184.000, df[0] = -1814672000942080.000, dc_hat[0] = -1044757639856128.000
Gradient do_[0] = -135125331899056128.000
Backward Time Step 3:
Gradient di[0] = -3629244949200896.000, df[0] = -2770627836510208.000, dc_hat[0] = -1499614674944000.000
Gradient do_[0] = -184749289547759616.000
Backward Time Step 2:
Gradient di[0] = -4754338207498240.000, df[0] = -3582248611741696.000, dc_hat[0] = -2619922802802688.000
Gradient do_[0] = -211383687420313600.000
Backward Time Step 1:
Gradient di[0] = -5998457089163264.000, df[0] = -4291702315548672.000, dc_hat[0] = -3631025750016000.000
Gradient do_[0] = -189669569722318848.000
Backward Time Step 0:
Gradient di[0] = -7004382990172160.000, df[0] = -5119505990680576.000, dc_hat[0] = -7081197406519296.000
Gradient do_[0] = -108665601756495872.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1241903398912.000, df[0] = 910572126208.000, dc_hat[0] = 539167293440.000
Gradient do_[0] = 77001582968832.000
Backward Time Step 3:
Gradient di[0] = 1948397993984.000, df[0] = 1406052859904.000, dc_hat[0] = 762766360576.000
Gradient do_[0] = 103068452519936.000
Backward Time Step 2:
Gradient di[0] = 2479199354880.000, df[0] = 1780668694528.000, dc_hat[0] = 1266718998528.000
Gradient do_[0] = 112319988236288.000
Backward Time Step 1:
Gradient di[0] = 3109257478144.000, df[0] = 2144058081280.000, dc_hat[0] = 1729393065984.000
Gradient do_[0] = 98979391995904.000
Backward Time Step 0:
Gradient di[0] = 3764527038464.000, df[0] = 2678439542784.000, dc_hat[0] = 3532396167168.000
Gradient do_[0] = 58199247945728.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2312009081159680.000, df[0] = -1815139078635520.000, dc_hat[0] = -1045026477965312.000
Gradient do_[0] = -135160061004611584.000
Backward Time Step 3:
Gradient di[0] = -3630175078055936.000, df[0] = -2771337848291328.000, dc_hat[0] = -1499997598121984.000
Gradient do_[0] = -184796654447099904.000
Backward Time Step 2:
Gradient di[0] = -4755556367597568.000, df[0] = -3583166392565760.000, dc_hat[0] = -2620591207088128.000
Gradient do_[0] = -211437838367981568.000
Backward Time Step 1:
Gradient di[0] = -5999996834938880.000, df[0] = -4292803169353728.000, dc_hat[0] = -3631951852339200.000
Gradient do_[0] = -189718154392371200.000
Backward Time Step 0:
Gradient di[0] = -7006169159696384.000, df[0] = -5120811660738560.000, dc_hat[0] = -7083003440267264.000
Gradient do_[0] = -108693312885489664.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1242248773632.000, df[0] = 910825488384.000, dc_hat[0] = 539317272576.000
Gradient do_[0] = 77022990696448.000
Backward Time Step 3:
Gradient di[0] = 1948942073856.000, df[0] = 1406445289472.000, dc_hat[0] = 762978828288.000
Gradient do_[0] = 103097225445376.000
Backward Time Step 2:
Gradient di[0] = 2479894036480.000, df[0] = 1781167816704.000, dc_hat[0] = 1267073286144.000
Gradient do_[0] = 112351395184640.000
Backward Time Step 1:
Gradient di[0] = 3110125436928.000, df[0] = 2144656293888.000, dc_hat[0] = 1729873707008.000
Gradient do_[0] = 99006998904832.000
Backward Time Step 0:
Gradient di[0] = 3765572468736.000, df[0] = 2679183245312.000, dc_hat[0] = 3533376847872.000
Gradient do_[0] = 58215404404736.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2312606618484736.000, df[0] = -1815608303812608.000, dc_hat[0] = -1045296591142912.000
Gradient do_[0] = -135194979088728064.000
Backward Time Step 3:
Gradient di[0] = -3631114065281024.000, df[0] = -2772054839394304.000, dc_hat[0] = -1500385621573632.000
Gradient do_[0] = -184844466023038976.000
Backward Time Step 2:
Gradient di[0] = -4756787949469696.000, df[0] = -3584094105501696.000, dc_hat[0] = -2621269006614528.000
Gradient do_[0] = -211492573431201792.000
Backward Time Step 1:
Gradient di[0] = -6001550539358208.000, df[0] = -4293914760577024.000, dc_hat[0] = -3632887886774272.000
Gradient do_[0] = -189767168559153152.000
Backward Time Step 0:
Gradient di[0] = -7007987004604416.000, df[0] = -5122140416245760.000, dc_hat[0] = -7084841686269952.000
Gradient do_[0] = -108721522230689792.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1242657980416.000, df[0] = 911125512192.000, dc_hat[0] = 539494678528.000
Gradient do_[0] = 77048341069824.000
Backward Time Step 3:
Gradient di[0] = 1949582622720.000, df[0] = 1406907580416.000, dc_hat[0] = 763229569024.000
Gradient do_[0] = 103131107033088.000
Backward Time Step 2:
Gradient di[0] = 2480708255744.000, df[0] = 1781752266752.000, dc_hat[0] = 1267487997952.000
Gradient do_[0] = 112388237950976.000
Backward Time Step 1:
Gradient di[0] = 3111147798528.000, df[0] = 2145361199104.000, dc_hat[0] = 1730440593408.000
Gradient do_[0] = 99039529926656.000
Backward Time Step 0:
Gradient di[0] = 3766813982720.000, df[0] = 2680066408448.000, dc_hat[0] = 3534542077952.000
Gradient do_[0] = 58234601734144.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2313189123424256.000, df[0] = -1816065583611904.000, dc_hat[0] = -1045559523672064.000
Gradient do_[0] = -135229012409581568.000
Backward Time Step 3:
Gradient di[0] = -3632027819573248.000, df[0] = -2772752503144448.000, dc_hat[0] = -1500763176042496.000
Gradient do_[0] = -184890971928920064.000
Backward Time Step 2:
Gradient di[0] = -4757985708474368.000, df[0] = -3584996853940224.000, dc_hat[0] = -2621927747223552.000
Gradient do_[0] = -211545676406849536.000
Backward Time Step 1:
Gradient di[0] = -6003053777911808.000, df[0] = -4294989844578304.000, dc_hat[0] = -3633793856438272.000
Gradient do_[0] = -189814670897446912.000
Backward Time Step 0:
Gradient di[0] = -7009736666906624.000, df[0] = -5123418705887232.000, dc_hat[0] = -7086609602183168.000
Gradient do_[0] = -108748657834065920.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1242952368128.000, df[0] = 911341322240.000, dc_hat[0] = 539622440960.000
Gradient do_[0] = 77066586292224.000
Backward Time Step 3:
Gradient di[0] = 1950046879744.000, df[0] = 1407242469376.000, dc_hat[0] = 763410448384.000
Gradient do_[0] = 103155534659584.000
Backward Time Step 2:
Gradient di[0] = 2481294934016.000, df[0] = 1782173925376.000, dc_hat[0] = 1267786973184.000
Gradient do_[0] = 112414829838336.000
Backward Time Step 1:
Gradient di[0] = 3111888617472.000, df[0] = 2145871855616.000, dc_hat[0] = 1730849931264.000
Gradient do_[0] = 99063043194880.000
Backward Time Step 0:
Gradient di[0] = 3767703437312.000, df[0] = 2680699224064.000, dc_hat[0] = 3535376744448.000
Gradient do_[0] = 58248354856960.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2313785318572032.000, df[0] = -1816533735047168.000, dc_hat[0] = -1045828563107840.000
Gradient do_[0] = -135263818824548352.000
Backward Time Step 3:
Gradient di[0] = -3632964390879232.000, df[0] = -2773467346763776.000, dc_hat[0] = -1501150125752320.000
Gradient do_[0] = -184938663245774848.000
Backward Time Step 2:
Gradient di[0] = -4759216753475584.000, df[0] = -3585924298440704.000, dc_hat[0] = -2622604741443584.000
Gradient do_[0] = -211600428649938944.000
Backward Time Step 1:
Gradient di[0] = -6004617682878464.000, df[0] = -4296108951994368.000, dc_hat[0] = -3634736870195200.000
Gradient do_[0] = -189863994301874176.000
Backward Time Step 0:
Gradient di[0] = -7011557733040128.000, df[0] = -5124750145748992.000, dc_hat[0] = -7088450532540416.000
Gradient do_[0] = -108776910128939008.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1243370094592.000, df[0] = 911647440896.000, dc_hat[0] = 539803516928.000
Gradient do_[0] = 77092498702336.000
Backward Time Step 3:
Gradient di[0] = 1950700797952.000, df[0] = 1407714590720.000, dc_hat[0] = 763666563072.000
Gradient do_[0] = 103190137667584.000
Backward Time Step 2:
Gradient di[0] = 2482130124800.000, df[0] = 1782773579776.000, dc_hat[0] = 1268212826112.000
Gradient do_[0] = 112452603740160.000
Backward Time Step 1:
Gradient di[0] = 3112935358464.000, df[0] = 2146593669120.000, dc_hat[0] = 1731429924864.000
Gradient do_[0] = 99096345968640.000
Backward Time Step 0:
Gradient di[0] = 3768970117120.000, df[0] = 2681600475136.000, dc_hat[0] = 3536565305344.000
Gradient do_[0] = 58267933868032.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2314383392768000.000, df[0] = -1817003362877440.000, dc_hat[0] = -1046099414482944.000
Gradient do_[0] = -135298822808010752.000
Backward Time Step 3:
Gradient di[0] = -3633904988717056.000, df[0] = -2774185143173120.000, dc_hat[0] = -1501536941244416.000
Gradient do_[0] = -184986371742498816.000
Backward Time Step 2:
Gradient di[0] = -4760444577251328.000, df[0] = -3586849327022080.000, dc_hat[0] = -2623280125050880.000
Gradient do_[0] = -211654991914467328.000
Backward Time Step 1:
Gradient di[0] = -6006164944846848.000, df[0] = -4297215711379456.000, dc_hat[0] = -3635668341227520.000
Gradient do_[0] = -189912888209571840.000
Backward Time Step 0:
Gradient di[0] = -7013356787466240.000, df[0] = -5126064942612480.000, dc_hat[0] = -7090269988061184.000
Gradient do_[0] = -108804818826428416.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1243631321088.000, df[0] = 911838937088.000, dc_hat[0] = 539916861440.000
Gradient do_[0] = 77108688715776.000
Backward Time Step 3:
Gradient di[0] = 1951110660096.000, df[0] = 1408010420224.000, dc_hat[0] = 763826995200.000
Gradient do_[0] = 103211830607872.000
Backward Time Step 2:
Gradient di[0] = 2482651791360.000, df[0] = 1783148052480.000, dc_hat[0] = 1268478509056.000
Gradient do_[0] = 112476234448896.000
Backward Time Step 1:
Gradient di[0] = 3113591504896.000, df[0] = 2147045736448.000, dc_hat[0] = 1731792207872.000
Gradient do_[0] = 99117200048128.000
Backward Time Step 0:
Gradient di[0] = 3769764413440.000, df[0] = 2682165657600.000, dc_hat[0] = 3537310580736.000
Gradient do_[0] = 58280218984448.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2314986567237632.000, df[0] = -1817476748804096.000, dc_hat[0] = -1046370869837824.000
Gradient do_[0] = -135334058719707136.000
Backward Time Step 3:
Gradient di[0] = -3634851492134912.000, df[0] = -2774907502985216.000, dc_hat[0] = -1501927917486080.000
Gradient do_[0] = -185034595635298304.000
Backward Time Step 2:
Gradient di[0] = -4761682601574400.000, df[0] = -3587781334925312.000, dc_hat[0] = -2623958461448192.000
Gradient do_[0] = -211709950315986944.000
Backward Time Step 1:
Gradient di[0] = -6007727776071680.000, df[0] = -4298333476618240.000, dc_hat[0] = -3636609207500800.000
Gradient do_[0] = -189962263153606656.000
Backward Time Step 0:
Gradient di[0] = -7015188054147072.000, df[0] = -5127403361796096.000, dc_hat[0] = -7092121118965760.000
Gradient do_[0] = -108833225740124160.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1244059402240.000, df[0] = 912152788992.000, dc_hat[0] = 540102557696.000
Gradient do_[0] = 77135171551232.000
Backward Time Step 3:
Gradient di[0] = 1951781748736.000, df[0] = 1408494469120.000, dc_hat[0] = 764089139200.000
Gradient do_[0] = 103247331196928.000
Backward Time Step 2:
Gradient di[0] = 2483507691520.000, df[0] = 1783762780160.000, dc_hat[0] = 1268915634176.000
Gradient do_[0] = 112515006595072.000
Backward Time Step 1:
Gradient di[0] = 3114662363136.000, df[0] = 2147784458240.000, dc_hat[0] = 1732386095104.000
Gradient do_[0] = 99151232630784.000
Backward Time Step 0:
Gradient di[0] = 3771054424064.000, df[0] = 2683083423744.000, dc_hat[0] = 3538520899584.000
Gradient do_[0] = 58300158705664.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2315564777209856.000, df[0] = -1817930807377920.000, dc_hat[0] = -1046632191754240.000
Gradient do_[0] = -135367834342522880.000
Backward Time Step 3:
Gradient di[0] = -3635759340847104.000, df[0] = -2775601140203520.000, dc_hat[0] = -1502303190253568.000
Gradient do_[0] = -185080792303534080.000
Backward Time Step 2:
Gradient di[0] = -4762878749966336.000, df[0] = -3588682472751104.000, dc_hat[0] = -2624615591444480.000
Gradient do_[0] = -211763139190980608.000
Backward Time Step 1:
Gradient di[0] = -6009234772721664.000, df[0] = -4299411244974080.000, dc_hat[0] = -3637517593083904.000
Gradient do_[0] = -190009868571115520.000
Backward Time Step 0:
Gradient di[0] = -7016943085158400.000, df[0] = -5128686483275776.000, dc_hat[0] = -7093896014200832.000
Gradient do_[0] = -108860455832780800.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1244627861504.000, df[0] = 912569532416.000, dc_hat[0] = 540349333504.000
Gradient do_[0] = 77170420482048.000
Backward Time Step 3:
Gradient di[0] = 1952674086912.000, df[0] = 1409138425856.000, dc_hat[0] = 764438511616.000
Gradient do_[0] = 103294542282752.000
Backward Time Step 2:
Gradient di[0] = 2484643561472.000, df[0] = 1784578572288.000, dc_hat[0] = 1269494710272.000
Gradient do_[0] = 112566445539328.000
Backward Time Step 1:
Gradient di[0] = 3116086329344.000, df[0] = 2148766187520.000, dc_hat[0] = 1733175934976.000
Gradient do_[0] = 99196531113984.000
Backward Time Step 0:
Gradient di[0] = 3772779069440.000, df[0] = 2684310519808.000, dc_hat[0] = 3540139376640.000
Gradient do_[0] = 58326826090496.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2316174125694976.000, df[0] = -1818409427795968.000, dc_hat[0] = -1046907673640960.000
Gradient do_[0] = -135403448211341312.000
Backward Time Step 3:
Gradient di[0] = -3636720608215040.000, df[0] = -2776334774304768.000, dc_hat[0] = -1502698729897984.000
Gradient do_[0] = -185129669031362560.000
Backward Time Step 2:
Gradient di[0] = -4764135564771328.000, df[0] = -3589629781475328.000, dc_hat[0] = -2625306544308224.000
Gradient do_[0] = -211818939406090240.000
Backward Time Step 1:
Gradient di[0] = -6010818541912064.000, df[0] = -4300544311033856.000, dc_hat[0] = -3638472954871808.000
Gradient do_[0] = -190059896350179328.000
Backward Time Step 0:
Gradient di[0] = -7018787773612032.000, df[0] = -5130034566135808.000, dc_hat[0] = -7095760030007296.000
Gradient do_[0] = -108889086084775936.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1244958818304.000, df[0] = 912812212224.000, dc_hat[0] = 540492988416.000
Gradient do_[0] = 77190939017216.000
Backward Time Step 3:
Gradient di[0] = 1953192083456.000, df[0] = 1409512505344.000, dc_hat[0] = 764640821248.000
Gradient do_[0] = 103321922699264.000
Backward Time Step 2:
Gradient di[0] = 2485299445760.000, df[0] = 1785049513984.000, dc_hat[0] = 1269829337088.000
Gradient do_[0] = 112596141211648.000
Backward Time Step 1:
Gradient di[0] = 3116908150784.000, df[0] = 2149332811776.000, dc_hat[0] = 1733631279104.000
Gradient do_[0] = 99222661627904.000
Backward Time Step 0:
Gradient di[0] = 3773774430208.000, df[0] = 2685018570752.000, dc_hat[0] = 3541073133568.000
Gradient do_[0] = 58342206603264.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2316754751586304.000, df[0] = -1818864962764800.000, dc_hat[0] = -1047170002190336.000
Gradient do_[0] = -135437378452979712.000
Backward Time Step 3:
Gradient di[0] = -3637625772572672.000, df[0] = -2777026532474880.000, dc_hat[0] = -1503072928923648.000
Gradient do_[0] = -185175728260644864.000
Backward Time Step 2:
Gradient di[0] = -4765326881325056.000, df[0] = -3590527161204736.000, dc_hat[0] = -2625962600562688.000
Gradient do_[0] = -211871784683700224.000
Backward Time Step 1:
Gradient di[0] = -6012331981012992.000, df[0] = -4301626374356992.000, dc_hat[0] = -3639384293244928.000
Gradient do_[0] = -190107622026772480.000
Backward Time Step 0:
Gradient di[0] = -7020550857687040.000, df[0] = -5131323056324608.000, dc_hat[0] = -7097542978306048.000
Gradient do_[0] = -108916427846582272.000
Epoch 700, Train Loss=0.011329, Weight Norm=12.909179
Sample Predictions at Epoch 700:
-------------------------------------------------------------
| Day | Date | Predicted Close | Actual Close | Error |
-------------------------------------------------------------
| 192 | 2024-10-11 | 57.34 | 63.87 | 6.53 |
| 193 | 2024-10-14 | 56.73 | 66.55 | 9.82 |
| 194 | 2024-10-15 | 56.92 | 66.00 | 9.08 |
| 195 | 2024-10-16 | 57.88 | 67.20 | 9.32 |
| 196 | 2024-10-17 | 57.41 | 66.76 | 9.35 |
-------------------------------------------------------------
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1245248356352.000, df[0] = 913024548864.000, dc_hat[0] = 540618391552.000
Gradient do_[0] = 77208873861120.000
Backward Time Step 3:
Gradient di[0] = 1953647689728.000, df[0] = 1409841102848.000, dc_hat[0] = 764819079168.000
Gradient do_[0] = 103345930895360.000
Backward Time Step 2:
Gradient di[0] = 2485879570432.000, df[0] = 1785466322944.000, dc_hat[0] = 1270125166592.000
Gradient do_[0] = 112622372388864.000
Backward Time Step 1:
Gradient di[0] = 3117638483968.000, df[0] = 2149835997184.000, dc_hat[0] = 1734034718720.000
Gradient do_[0] = 99245881294848.000
Backward Time Step 0:
Gradient di[0] = 3774652350464.000, df[0] = 2685643522048.000, dc_hat[0] = 3541897052160.000
Gradient do_[0] = 58355783565312.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2317350678298624.000, df[0] = -1819332979982336.000, dc_hat[0] = -1047439377170432.000
Gradient do_[0] = -135472219227684864.000
Backward Time Step 3:
Gradient di[0] = -3638562343878656.000, df[0] = -2777740839223296.000, dc_hat[0] = -1503458804891648.000
Gradient do_[0] = -185223419577499648.000
Backward Time Step 2:
Gradient di[0] = -4766555241971712.000, df[0] = -3591452995092480.000, dc_hat[0] = -2626639057911808.000
Gradient do_[0] = -211926433847574528.000
Backward Time Step 1:
Gradient di[0] = -6013879779852288.000, df[0] = -4302734207483904.000, dc_hat[0] = -3640314958970880.000
Gradient do_[0] = -190156515934470144.000
Backward Time Step 0:
Gradient di[0] = -7022352059596800.000, df[0] = -5132640000671744.000, dc_hat[0] = -7099364044439552.000
Gradient do_[0] = -108944379493744640.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1245425958912.000, df[0] = 913154637824.000, dc_hat[0] = 540695494656.000
Gradient do_[0] = 77219888103424.000
Backward Time Step 3:
Gradient di[0] = 1953926086656.000, df[0] = 1410041905152.000, dc_hat[0] = 764927606784.000
Gradient do_[0] = 103360644513792.000
Backward Time Step 2:
Gradient di[0] = 2486234251264.000, df[0] = 1785720733696.000, dc_hat[0] = 1270305128448.000
Gradient do_[0] = 112638386241536.000
Backward Time Step 1:
Gradient di[0] = 3118080458752.000, df[0] = 2150140870656.000, dc_hat[0] = 1734278250496.000
Gradient do_[0] = 99259898658816.000
Backward Time Step 0:
Gradient di[0] = 3775184240640.000, df[0] = 2686021795840.000, dc_hat[0] = 3542396174336.000
Gradient do_[0] = 58364004401152.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2317926740787200.000, df[0] = -1819785025290240.000, dc_hat[0] = -1047699222691840.000
Gradient do_[0] = -135505891771285504.000
Backward Time Step 3:
Gradient di[0] = -3639468045107200.000, df[0] = -2778432060522496.000, dc_hat[0] = -1503833003917312.000
Gradient do_[0] = -185269478806781952.000
Backward Time Step 2:
Gradient di[0] = -4767739579203584.000, df[0] = -3592345006112768.000, dc_hat[0] = -2627290282328064.000
Gradient do_[0] = -211979038607015936.000
Backward Time Step 1:
Gradient di[0] = -6015372280987648.000, df[0] = -4303800701550592.000, dc_hat[0] = -3641214754619392.000
Gradient do_[0] = -190203571596165120.000
Backward Time Step 0:
Gradient di[0] = -7024085078900736.000, df[0] = -5133905942282240.000, dc_hat[0] = -7101115317354496.000
Gradient do_[0] = -108971248809148416.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1245969121280.000, df[0] = 913552769024.000, dc_hat[0] = 540930998272.000
Gradient do_[0] = 77253551587328.000
Backward Time Step 3:
Gradient di[0] = 1954779496448.000, df[0] = 1410657943552.000, dc_hat[0] = 765261709312.000
Gradient do_[0] = 103405800390656.000
Backward Time Step 2:
Gradient di[0] = 2487320576000.000, df[0] = 1786500874240.000, dc_hat[0] = 1270858907648.000
Gradient do_[0] = 112687585427456.000
Backward Time Step 1:
Gradient di[0] = 3119444918272.000, df[0] = 2151081312256.000, dc_hat[0] = 1735034667008.000
Gradient do_[0] = 99303309705216.000
Backward Time Step 0:
Gradient di[0] = 3776837320704.000, df[0] = 2687198035968.000, dc_hat[0] = 3543947018240.000
Gradient do_[0] = 58389560295424.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2318525351854080.000, df[0] = -1820255189991424.000, dc_hat[0] = -1047969671413760.000
Gradient do_[0] = -135540809855401984.000
Backward Time Step 3:
Gradient di[0] = -3640404884848640.000, df[0] = -2779146904141824.000, dc_hat[0] = -1504218343014400.000
Gradient do_[0] = -185317101404160000.000
Backward Time Step 2:
Gradient di[0] = -4768965255495680.000, df[0] = -3593269497823232.000, dc_hat[0] = -2627964323758080.000
Gradient do_[0] = -212033430072852480.000
Backward Time Step 1:
Gradient di[0] = -6016917932343296.000, df[0] = -4304907192500224.000, dc_hat[0] = -3642146494087168.000
Gradient do_[0] = -190252448323993600.000
Backward Time Step 0:
Gradient di[0] = -7025894333874176.000, df[0] = -5135228792209408.000, dc_hat[0] = -7102944436551680.000
Gradient do_[0] = -108999329305329664.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1246337957888.000, df[0] = 913823367168.000, dc_hat[0] = 541091463168.000
Gradient do_[0] = 77276435709952.000
Backward Time Step 3:
Gradient di[0] = 1955357392896.000, df[0] = 1411074883584.000, dc_hat[0] = 765487480832.000
Gradient do_[0] = 103436360089600.000
Backward Time Step 2:
Gradient di[0] = 2488056676352.000, df[0] = 1787029487616.000, dc_hat[0] = 1271234428928.000
Gradient do_[0] = 112720913367040.000
Backward Time Step 1:
Gradient di[0] = 3120369238016.000, df[0] = 2151718977536.000, dc_hat[0] = 1735547158528.000
Gradient do_[0] = 99332745330688.000
Backward Time Step 0:
Gradient di[0] = 3777956937728.000, df[0] = 2687994691584.000, dc_hat[0] = 3544997691392.000
Gradient do_[0] = 58406870188032.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2319100877471744.000, df[0] = -1820707101081600.000, dc_hat[0] = -1048229852479488.000
Gradient do_[0] = -135574533938610176.000
Backward Time Step 3:
Gradient di[0] = -3641313270431744.000, df[0] = -2779840541360128.000, dc_hat[0] = -1504593749999616.000
Gradient do_[0] = -185363383971741696.000
Backward Time Step 2:
Gradient di[0] = -4770150129598464.000, df[0] = -3594160971972608.000, dc_hat[0] = -2628615279738880.000
Gradient do_[0] = -212086155091378176.000
Backward Time Step 1:
Gradient di[0] = -6018415265316864.000, df[0] = -4305978249969664.000, dc_hat[0] = -3643047095042048.000
Gradient do_[0] = -190299744503857152.000
Backward Time Step 0:
Gradient di[0] = -7027645069918208.000, df[0] = -5136508692463616.000, dc_hat[0] = -7104715036819456.000
Gradient do_[0] = -109026490678509568.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1246638374912.000, df[0] = 914043502592.000, dc_hat[0] = 541221191680.000
Gradient do_[0] = 77295058419712.000
Backward Time Step 3:
Gradient di[0] = 1955828727808.000, df[0] = 1411414884352.000, dc_hat[0] = 765671636992.000
Gradient do_[0] = 103461282643968.000
Backward Time Step 2:
Gradient di[0] = 2488653578240.000, df[0] = 1787458486272.000, dc_hat[0] = 1271538778112.000
Gradient do_[0] = 112747949850624.000
Backward Time Step 1:
Gradient di[0] = 3121113989120.000, df[0] = 2152232255488.000, dc_hat[0] = 1735959642112.000
Gradient do_[0] = 99356376039424.000
Backward Time Step 0:
Gradient di[0] = 3778858188800.000, df[0] = 2688635895808.000, dc_hat[0] = 3545843630080.000
Gradient do_[0] = 58420812054528.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2319693851394048.000, df[0] = -1821172031291392.000, dc_hat[0] = -1048497012867072.000
Gradient do_[0] = -135609185734754304.000
Backward Time Step 3:
Gradient di[0] = -3642244741464064.000, df[0] = -2780551895318528.000, dc_hat[0] = -1504978418008064.000
Gradient do_[0] = -185410800410689536.000
Backward Time Step 2:
Gradient di[0] = -4771370974052352.000, df[0] = -3595080094973952.000, dc_hat[0] = -2629283952459776.000
Gradient do_[0] = -212140374758522880.000
Backward Time Step 1:
Gradient di[0] = -6019960916672512.000, df[0] = -4307083667177472.000, dc_hat[0] = -3643979371380736.000
Gradient do_[0] = -190348552512208896.000
Backward Time Step 0:
Gradient di[0] = -7029441439989760.000, df[0] = -5137821341843456.000, dc_hat[0] = -7106530734243840.000
Gradient do_[0] = -109054365016260608.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1247095816192.000, df[0] = 914378981376.000, dc_hat[0] = 541419864064.000
Gradient do_[0] = 77323369971712.000
Backward Time Step 3:
Gradient di[0] = 1956545953792.000, df[0] = 1411932356608.000, dc_hat[0] = 765952000000.000
Gradient do_[0] = 103499215929344.000
Backward Time Step 2:
Gradient di[0] = 2489570820096.000, df[0] = 1788117516288.000, dc_hat[0] = 1272007229440.000
Gradient do_[0] = 112789448294400.000
Backward Time Step 1:
Gradient di[0] = 3122265325568.000, df[0] = 2153026551808.000, dc_hat[0] = 1736597045248.000
Gradient do_[0] = 99392983924736.000
Backward Time Step 0:
Gradient di[0] = 3780242046976.000, df[0] = 2689620508672.000, dc_hat[0] = 3547142029312.000
Gradient do_[0] = 58442198810624.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2320294341509120.000, df[0] = -1821643672387584.000, dc_hat[0] = -1048768468221952.000
Gradient do_[0] = -135644172538347520.000
Backward Time Step 3:
Gradient di[0] = -3643181044334592.000, df[0] = -2781266738937856.000, dc_hat[0] = -1505364830846976.000
Gradient do_[0] = -185458371468460032.000
Backward Time Step 2:
Gradient di[0] = -4772603629666304.000, df[0] = -3596009686958080.000, dc_hat[0] = -2629962288857088.000
Gradient do_[0] = -212195127001612288.000
Backward Time Step 1:
Gradient di[0] = -6021516768575488.000, df[0] = -4308196600578048.000, dc_hat[0] = -3644915674251264.000
Gradient do_[0] = -190397686938075136.000
Backward Time Step 0:
Gradient di[0] = -7031257674285056.000, df[0] = -5139149023608832.000, dc_hat[0] = -7108367369633792.000
Gradient do_[0] = -109082540001722368.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1247439093760.000, df[0] = 914630705152.000, dc_hat[0] = 541568860160.000
Gradient do_[0] = 77344609927168.000
Backward Time Step 3:
Gradient di[0] = 1957084135424.000, df[0] = 1412320854016.000, dc_hat[0] = 766162632704.000
Gradient do_[0] = 103527628144640.000
Backward Time Step 2:
Gradient di[0] = 2490253705216.000, df[0] = 1788607725568.000, dc_hat[0] = 1272355094528.000
Gradient do_[0] = 112820326760448.000
Backward Time Step 1:
Gradient di[0] = 3123122274304.000, df[0] = 2153616900096.000, dc_hat[0] = 1737070608384.000
Gradient do_[0] = 99420230123520.000
Backward Time Step 0:
Gradient di[0] = 3781281710080.000, df[0] = 2690360279040.000, dc_hat[0] = 3548117729280.000
Gradient do_[0] = 58458275577856.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2320867988078592.000, df[0] = -1822093838647296.000, dc_hat[0] = -1049027105783808.000
Gradient do_[0] = -135677699053060096.000
Backward Time Step 3:
Gradient di[0] = -3644087282434048.000, df[0] = -2781958497107968.000, dc_hat[0] = -1505738493001728.000
Gradient do_[0] = -185504447877611520.000
Backward Time Step 2:
Gradient di[0] = -4773786893156352.000, df[0] = -3596900087365632.000, dc_hat[0] = -2630611634225152.000
Gradient do_[0] = -212247645861707776.000
Backward Time Step 1:
Gradient di[0] = -6023001753518080.000, df[0] = -4309258799677440.000, dc_hat[0] = -3645810638061568.000
Gradient do_[0] = -190444519261470720.000
Backward Time Step 0:
Gradient di[0] = -7032982103654400.000, df[0] = -5140409059639296.000, dc_hat[0] = -7110110052614144.000
Gradient do_[0] = -109109280468107264.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1247745015808.000, df[0] = 914854838272.000, dc_hat[0] = 541701144576.000
Gradient do_[0] = 77363576569856.000
Backward Time Step 3:
Gradient di[0] = 1957562417152.000, df[0] = 1412666228736.000, dc_hat[0] = 766349475840.000
Gradient do_[0] = 103552944963584.000
Backward Time Step 2:
Gradient di[0] = 2490863452160.000, df[0] = 1789045506048.000, dc_hat[0] = 1272665735168.000
Gradient do_[0] = 112847958835200.000
Backward Time Step 1:
Gradient di[0] = 3123889045504.000, df[0] = 2154145775616.000, dc_hat[0] = 1737495543808.000
Gradient do_[0] = 99444615806976.000
Backward Time Step 0:
Gradient di[0] = 3782209437696.000, df[0] = 2691020357632.000, dc_hat[0] = 3548988571648.000
Gradient do_[0] = 58472615903232.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2321446466486272.000, df[0] = -1822548165656576.000, dc_hat[0] = -1049288293482496.000
Gradient do_[0] = -135711509035614208.000
Backward Time Step 3:
Gradient di[0] = -3644993520533504.000, df[0] = -2782650792148992.000, dc_hat[0] = -1506112020938752.000
Gradient do_[0] = -185550627365978112.000
Backward Time Step 2:
Gradient di[0] = -4774979820322816.000, df[0] = -3597799883014144.000, dc_hat[0] = -2631269301092352.000
Gradient do_[0] = -212300680117878784.000
Backward Time Step 1:
Gradient di[0] = -6024510360780800.000, df[0] = -4310338715516928.000, dc_hat[0] = -3646720097386496.000
Gradient do_[0] = -190492176218587136.000
Backward Time Step 0:
Gradient di[0] = -7034742503374848.000, df[0] = -5141695402344448.000, dc_hat[0] = -7111889779687424.000
Gradient do_[0] = -109136587870175232.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1248160776192.000, df[0] = 915159711744.000, dc_hat[0] = 541881958400.000
Gradient do_[0] = 77389346373632.000
Backward Time Step 3:
Gradient di[0] = 1958213844992.000, df[0] = 1413135859712.000, dc_hat[0] = 766604476416.000
Gradient do_[0] = 103587396976640.000
Backward Time Step 2:
Gradient di[0] = 2491691040768.000, df[0] = 1789640048640.000, dc_hat[0] = 1273088049152.000
Gradient do_[0] = 112885455912960.000
Backward Time Step 1:
Gradient di[0] = 3124929757184.000, df[0] = 2154863394816.000, dc_hat[0] = 1738072391680.000
Gradient do_[0] = 99477692088320.000
Backward Time Step 0:
Gradient di[0] = 3783464583168.000, df[0] = 2691913220096.000, dc_hat[0] = 3550166122496.000
Gradient do_[0] = 58492022947840.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2322032461086720.000, df[0] = -1823008129810432.000, dc_hat[0] = -1049553440604160.000
Gradient do_[0] = -135745782874636288.000
Backward Time Step 3:
Gradient di[0] = -3645915596324864.000, df[0] = -2783354361479168.000, dc_hat[0] = -1506492930850816.000
Gradient do_[0] = -185597425329635328.000
Backward Time Step 2:
Gradient di[0] = -4776179726811136.000, df[0] = -3598703168323584.000, dc_hat[0] = -2631928847007744.000
Gradient do_[0] = -212353851813003264.000
Backward Time Step 1:
Gradient di[0] = -6026023799881728.000, df[0] = -4311419973533696.000, dc_hat[0] = -3647628751405056.000
Gradient do_[0] = -190539936254918656.000
Backward Time Step 0:
Gradient di[0] = -7036515251126272.000, df[0] = -5142991945596928.000, dc_hat[0] = -7113682391662592.000
Gradient do_[0] = -109164110020608000.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1248509034496.000, df[0] = 915415236608.000, dc_hat[0] = 542033149952.000
Gradient do_[0] = 77410938650624.000
Backward Time Step 3:
Gradient di[0] = 1958763429888.000, df[0] = 1413532745728.000, dc_hat[0] = 766818975744.000
Gradient do_[0] = 103616438337536.000
Backward Time Step 2:
Gradient di[0] = 2492389130240.000, df[0] = 1790141267968.000, dc_hat[0] = 1273443909632.000
Gradient do_[0] = 112917039022080.000
Backward Time Step 1:
Gradient di[0] = 3125802172416.000, df[0] = 2155464753152.000, dc_hat[0] = 1738554605568.000
Gradient do_[0] = 99505449992192.000
Backward Time Step 0:
Gradient di[0] = 3784523382784.000, df[0] = 2692666621952.000, dc_hat[0] = 3551159386112.000
Gradient do_[0] = 58508384927744.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2322611744800768.000, df[0] = -1823462993690624.000, dc_hat[0] = -1049814963847168.000
Gradient do_[0] = -135779644396797952.000
Backward Time Step 3:
Gradient di[0] = -3646827203133440.000, df[0] = -2784050146181120.000, dc_hat[0] = -1506868606271488.000
Gradient do_[0] = -185643845336170496.000
Backward Time Step 2:
Gradient di[0] = -4777378022686720.000, df[0] = -3599606722068480.000, dc_hat[0] = -2632587319181312.000
Gradient do_[0] = -212407160947081216.000
Backward Time Step 1:
Gradient di[0] = -6027541533949952.000, df[0] = -4312506063388672.000, dc_hat[0] = -3648544384745472.000
Gradient do_[0] = -190587868089942016.000
Backward Time Step 0:
Gradient di[0] = -7038277798330368.000, df[0] = -5144279898914816.000, dc_hat[0] = -7115464266219520.000
Gradient do_[0] = -109191451782414336.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1248919814144.000, df[0] = 915716177920.000, dc_hat[0] = 542211211264.000
Gradient do_[0] = 77436389687296.000
Backward Time Step 3:
Gradient di[0] = 1959406338048.000, df[0] = 1413996609536.000, dc_hat[0] = 767070437376.000
Gradient do_[0] = 103650445754368.000
Backward Time Step 2:
Gradient di[0] = 2493207281664.000, df[0] = 1790729125888.000, dc_hat[0] = 1273861373952.000
Gradient do_[0] = 112954099892224.000
Backward Time Step 1:
Gradient di[0] = 3126828204032.000, df[0] = 2156171886592.000, dc_hat[0] = 1739123589120.000
Gradient do_[0] = 99538090065920.000
Backward Time Step 0:
Gradient di[0] = 3785764634624.000, df[0] = 2693549785088.000, dc_hat[0] = 3552324091904.000
Gradient do_[0] = 58527578062848.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2323197470965760.000, df[0] = -1823922823626752.000, dc_hat[0] = -1050079641206784.000
Gradient do_[0] = -135813866696212480.000
Backward Time Step 3:
Gradient di[0] = -3647742568038400.000, df[0] = -2784749152108544.000, dc_hat[0] = -1507246294958080.000
Gradient do_[0] = -185690419961528320.000
Backward Time Step 2:
Gradient di[0] = -4778580613529600.000, df[0] = -3600512423297024.000, dc_hat[0] = -2633248207273984.000
Gradient do_[0] = -212460590340243456.000
Backward Time Step 1:
Gradient di[0] = -6029061952372736.000, df[0] = -4313593495420928.000, dc_hat[0] = -3649461360263168.000
Gradient do_[0] = -190635920184049664.000
Backward Time Step 0:
Gradient di[0] = -7040045177372672.000, df[0] = -5145571610329088.000, dc_hat[0] = -7117250435743744.000
Gradient do_[0] = -109218862263697408.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1249243955200.000, df[0] = 915953876992.000, dc_hat[0] = 542351654912.000
Gradient do_[0] = 77456497180672.000
Backward Time Step 3:
Gradient di[0] = 1959914635264.000, df[0] = 1414363086848.000, dc_hat[0] = 767269339136.000
Gradient do_[0] = 103677331243008.000
Backward Time Step 2:
Gradient di[0] = 2493856612352.000, df[0] = 1791195086848.000, dc_hat[0] = 1274191806464.000
Gradient do_[0] = 112983451631616.000
Backward Time Step 1:
Gradient di[0] = 3127640064000.000, df[0] = 2156731957248.000, dc_hat[0] = 1739573297152.000
Gradient do_[0] = 99563851481088.000
Backward Time Step 0:
Gradient di[0] = 3786744791040.000, df[0] = 2694247088128.000, dc_hat[0] = 3553243955200.000
Gradient do_[0] = 58542727888896.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2323785076178944.000, df[0] = -1824383861522432.000, dc_hat[0] = -1050344654110720.000
Gradient do_[0] = -135848209254711296.000
Backward Time Step 3:
Gradient di[0] = -3648667328184320.000, df[0] = -2785454868922368.000, dc_hat[0] = -1507627607523328.000
Gradient do_[0] = -185737441263484928.000
Backward Time Step 2:
Gradient di[0] = -4779795015532544.000, df[0] = -3601427251331072.000, dc_hat[0] = -2633916343123968.000
Gradient do_[0] = -212514432050266112.000
Backward Time Step 1:
Gradient di[0] = -6030589886988288.000, df[0] = -4314686564597760.000, dc_hat[0] = -3650381825441792.000
Gradient do_[0] = -190684178436587520.000
Backward Time Step 0:
Gradient di[0] = -7041830273155072.000, df[0] = -5146876743516160.000, dc_hat[0] = -7119055395749888.000
Gradient do_[0] = -109246556212822016.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1249571504128.000, df[0] = 916194197504.000, dc_hat[0] = 542494130176.000
Gradient do_[0] = 77476831166464.000
Backward Time Step 3:
Gradient di[0] = 1960431976448.000, df[0] = 1414736510976.000, dc_hat[0] = 767471386624.000
Gradient do_[0] = 103704644550656.000
Backward Time Step 2:
Gradient di[0] = 2494512496640.000, df[0] = 1791666421760.000, dc_hat[0] = 1274526826496.000
Gradient do_[0] = 113013180858368.000
Backward Time Step 1:
Gradient di[0] = 3128469487616.000, df[0] = 2157303431168.000, dc_hat[0] = 1740031393792.000
Gradient do_[0] = 99590216876032.000
Backward Time Step 0:
Gradient di[0] = 3787744083968.000, df[0] = 2694958022656.000, dc_hat[0] = 3554181644288.000
Gradient do_[0] = 58558179704832.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2324363286151168.000, df[0] = -1824837920096256.000, dc_hat[0] = -1050605908918272.000
Gradient do_[0] = -135881967697657856.000
Backward Time Step 3:
Gradient di[0] = -3649575445331968.000, df[0] = -2786148506140672.000, dc_hat[0] = -1508002880290816.000
Gradient do_[0] = -185783689471328256.000
Backward Time Step 2:
Gradient di[0] = -4780974520926208.000, df[0] = -3602316041125888.000, dc_hat[0] = -2634563809443840.000
Gradient do_[0] = -212566882190884864.000
Backward Time Step 1:
Gradient di[0] = -6032079703769088.000, df[0] = -4315752521793536.000, dc_hat[0] = -3651278131429376.000
Gradient do_[0] = -190731131019067392.000
Backward Time Step 0:
Gradient di[0] = -7043569198039040.000, df[0] = -5148147516964864.000, dc_hat[0] = -7120813647986688.000
Gradient do_[0] = -109273537197375488.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1249989623808.000, df[0] = 916500709376.000, dc_hat[0] = 542675566592.000
Gradient do_[0] = 77502743576576.000
Backward Time Step 3:
Gradient di[0] = 1961083666432.000, df[0] = 1415206666240.000, dc_hat[0] = 767726583808.000
Gradient do_[0] = 103739113340928.000
Backward Time Step 2:
Gradient di[0] = 2495344541696.000, df[0] = 1792263585792.000, dc_hat[0] = 1274950713344.000
Gradient do_[0] = 113050778599424.000
Backward Time Step 1:
Gradient di[0] = 3129509675008.000, df[0] = 2158020788224.000, dc_hat[0] = 1740608372736.000
Gradient do_[0] = 99623301545984.000
Backward Time Step 0:
Gradient di[0] = 3789001064448.000, df[0] = 2695852457984.000, dc_hat[0] = 3555361030144.000
Gradient do_[0] = 58577616109568.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2324933979930624.000, df[0] = -1825286207307776.000, dc_hat[0] = -1050863338520576.000
Gradient do_[0] = -135915339593547776.000
Backward Time Step 3:
Gradient di[0] = -3650467993223168.000, df[0] = -2786829526892544.000, dc_hat[0] = -1508370502647808.000
Gradient do_[0] = -185829147405189120.000
Backward Time Step 2:
Gradient di[0] = -4782149731352576.000, df[0] = -3603201878130688.000, dc_hat[0] = -2635210470457344.000
Gradient do_[0] = -212619074633465856.000
Backward Time Step 1:
Gradient di[0] = -6033562004357120.000, df[0] = -4316813110280192.000, dc_hat[0] = -3652172558368768.000
Gradient do_[0] = -190777980522332160.000
Backward Time Step 0:
Gradient di[0] = -7045301680472064.000, df[0] = -5149413458575360.000, dc_hat[0] = -7122564920901632.000
Gradient do_[0] = -109300415102713856.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1250321891328.000, df[0] = 916744306688.000, dc_hat[0] = 542819844096.000
Gradient do_[0] = 77523329220608.000
Backward Time Step 3:
Gradient di[0] = 1961605595136.000, df[0] = 1415583629312.000, dc_hat[0] = 767930793984.000
Gradient do_[0] = 103766669918208.000
Backward Time Step 2:
Gradient di[0] = 2496007241728.000, df[0] = 1792739770368.000, dc_hat[0] = 1275288485888.000
Gradient do_[0] = 113080751095808.000
Backward Time Step 1:
Gradient di[0] = 3130343292928.000, df[0] = 2158595538944.000, dc_hat[0] = 1741070139392.000
Gradient do_[0] = 99649784381440.000
Backward Time Step 0:
Gradient di[0] = 3790010056704.000, df[0] = 2696570470400.000, dc_hat[0] = 3556307894272.000
Gradient do_[0] = 58593218920448.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2325530711949312.000, df[0] = -1825754627178496.000, dc_hat[0] = -1051133585915904.000
Gradient do_[0] = -135950120238710784.000
Backward Time Step 3:
Gradient di[0] = -3651399464255488.000, df[0] = -2787540880850944.000, dc_hat[0] = -1508755573309440.000
Gradient do_[0] = -185876512304529408.000
Backward Time Step 2:
Gradient di[0] = -4783373260161024.000, df[0] = -3604123417051136.000, dc_hat[0] = -2635882901274624.000
Gradient do_[0] = -212673431739564032.000
Backward Time Step 1:
Gradient di[0] = -6035103897616384.000, df[0] = -4317915574697984.000, dc_hat[0] = -3653100002869248.000
Gradient do_[0] = -190826685451468800.000
Backward Time Step 0:
Gradient di[0] = -7047099661156352.000, df[0] = -5150727718567936.000, dc_hat[0] = -7124382765809664.000
Gradient do_[0] = -109328306620334080.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1250656649216.000, df[0] = 916989804544.000, dc_hat[0] = 542965202944.000
Gradient do_[0] = 77544099414016.000
Backward Time Step 3:
Gradient di[0] = 1962131718144.000, df[0] = 1415963082752.000, dc_hat[0] = 768136314880.000
Gradient do_[0] = 103794511708160.000
Backward Time Step 2:
Gradient di[0] = 2496676495360.000, df[0] = 1793220411392.000, dc_hat[0] = 1275629666304.000
Gradient do_[0] = 113111075913728.000
Backward Time Step 1:
Gradient di[0] = 3131184775168.000, df[0] = 2159175401472.000, dc_hat[0] = 1741535576064.000
Gradient do_[0] = 99676544040960.000
Backward Time Step 0:
Gradient di[0] = 3791024553984.000, df[0] = 2697292414976.000, dc_hat[0] = 3557259739136.000
Gradient do_[0] = 58608901423104.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2326087178649600.000, df[0] = -1826191237447680.000, dc_hat[0] = -1051385177047040.000
Gradient do_[0] = -135982650321010688.000
Backward Time Step 3:
Gradient di[0] = -3652279932551168.000, df[0] = -2788213311668224.000, dc_hat[0] = -1509118363828224.000
Gradient do_[0] = -185921265863753728.000
Backward Time Step 2:
Gradient di[0] = -4784525385138176.000, df[0] = -3604991537315840.000, dc_hat[0] = -2636515872079872.000
Gradient do_[0] = -212724644929601536.000
Backward Time Step 1:
Gradient di[0] = -6036554522820608.000, df[0] = -4318952809299968.000, dc_hat[0] = -3653974297149440.000
Gradient do_[0] = -190872521342451712.000
Backward Time Step 0:
Gradient di[0] = -7048777382756352.000, df[0] = -5151953931730944.000, dc_hat[0] = -7126078204149760.000
Gradient do_[0] = -109354334122147840.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1250971877376.000, df[0] = 917220753408.000, dc_hat[0] = 543101321216.000
Gradient do_[0] = 77563628093440.000
Backward Time Step 3:
Gradient di[0] = 1962626383872.000, df[0] = 1416320122880.000, dc_hat[0] = 768329580544.000
Gradient do_[0] = 103820667387904.000
Backward Time Step 2:
Gradient di[0] = 2497304854528.000, df[0] = 1793671561216.000, dc_hat[0] = 1275950399488.000
Gradient do_[0] = 113139504906240.000
Backward Time Step 1:
Gradient di[0] = 3131970420736.000, df[0] = 2159717253120.000, dc_hat[0] = 1741970210816.000
Gradient do_[0] = 99701508538368.000
Backward Time Step 0:
Gradient di[0] = 3791977447424.000, df[0] = 2697970057216.000, dc_hat[0] = 3558153650176.000
Gradient do_[0] = 58623627624448.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2326676394475520.000, df[0] = -1826653751738368.000, dc_hat[0] = -1051650861039616.000
Gradient do_[0] = -136017078778855424.000
Backward Time Step 3:
Gradient di[0] = -3653202276777984.000, df[0] = -2788916880998400.000, dc_hat[0] = -1509498065780736.000
Gradient do_[0] = -185968184086495232.000
Backward Time Step 2:
Gradient di[0] = -4785729049722880.000, df[0] = -3605898043850752.000, dc_hat[0] = -2637176223301632.000
Gradient do_[0] = -212778125862371328.000
Backward Time Step 1:
Gradient di[0] = -6038082457436160.000, df[0] = -4320046146912256.000, dc_hat[0] = -3654894493892608.000
Gradient do_[0] = -190920693695643648.000
Backward Time Step 0:
Gradient di[0] = -7050566236635136.000, df[0] = -5153261212401664.000, dc_hat[0] = -7127887459123200.000
Gradient do_[0] = -109382088200814592.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1251348316160.000, df[0] = 917496659968.000, dc_hat[0] = 543264669696.000
Gradient do_[0] = 77586973589504.000
Backward Time Step 3:
Gradient di[0] = 1963214766080.000, df[0] = 1416744534016.000, dc_hat[0] = 768559480832.000
Gradient do_[0] = 103851763957760.000
Backward Time Step 2:
Gradient di[0] = 2498055110656.000, df[0] = 1794210267136.000, dc_hat[0] = 1276332212224.000
Gradient do_[0] = 113173495545856.000
Backward Time Step 1:
Gradient di[0] = 3132913090560.000, df[0] = 2160366977024.000, dc_hat[0] = 1742491746304.000
Gradient do_[0] = 99731497811968.000
Backward Time Step 0:
Gradient di[0] = 3793119084544.000, df[0] = 2698782441472.000, dc_hat[0] = 3559225032704.000
Gradient do_[0] = 58641281449984.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2327254872883200.000, df[0] = -1827108078747648.000, dc_hat[0] = -1051911511867392.000
Gradient do_[0] = -136050905941278720.000
Backward Time Step 3:
Gradient di[0] = -3654114152022016.000, df[0] = -2789613202571264.000, dc_hat[0] = -1509874814943232.000
Gradient do_[0] = -186014604093030400.000
Backward Time Step 2:
Gradient di[0] = -4786927345598464.000, df[0] = -3606801060724736.000, dc_hat[0] = -2637835769217024.000
Gradient do_[0] = -212831314737364992.000
Backward Time Step 1:
Gradient di[0] = -6039582474764288.000, df[0] = -4321119351865344.000, dc_hat[0] = -3655798316072960.000
Gradient do_[0] = -190968110134591488.000
Backward Time Step 0:
Gradient di[0] = -7052314825195520.000, df[0] = -5154539502043136.000, dc_hat[0] = -7129654301294592.000
Gradient do_[0] = -109409215214256128.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1251858186240.000, df[0] = 917870608384.000, dc_hat[0] = 543486083072.000
Gradient do_[0] = 77618590253056.000
Backward Time Step 3:
Gradient di[0] = 1964016271360.000, df[0] = 1417323085824.000, dc_hat[0] = 768873267200.000
Gradient do_[0] = 103894168371200.000
Backward Time Step 2:
Gradient di[0] = 2499078520832.000, df[0] = 1794945318912.000, dc_hat[0] = 1276854009856.000
Gradient do_[0] = 113219825827840.000
Backward Time Step 1:
Gradient di[0] = 3134193664000.000, df[0] = 2161250271232.000, dc_hat[0] = 1743202418688.000
Gradient do_[0] = 99772232892416.000
Backward Time Step 0:
Gradient di[0] = 3794661277696.000, df[0] = 2699879776256.000, dc_hat[0] = 3560672329728.000
Gradient do_[0] = 58665126068224.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2327817245163520.000, df[0] = -1827549520855040.000, dc_hat[0] = -1052165653135360.000
Gradient do_[0] = -136083745261223936.000
Backward Time Step 3:
Gradient di[0] = -3654994888753152.000, df[0] = -2790285901824000.000, dc_hat[0] = -1510238679203840.000
Gradient do_[0] = -186059443551600640.000
Backward Time Step 2:
Gradient di[0] = -4788080007446528.000, df[0] = -3607668912553984.000, dc_hat[0] = -2638470350635008.000
Gradient do_[0] = -212882545107271680.000
Backward Time Step 1:
Gradient di[0] = -6041049742966784.000, df[0] = -4322168934498304.000, dc_hat[0] = -3656680931852288.000
Gradient do_[0] = -191014461421649920.000
Backward Time Step 0:
Gradient di[0] = -7054018853470208.000, df[0] = -5155785042558976.000, dc_hat[0] = -7131378193793024.000
Gradient do_[0] = -109435655032930304.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1252142874624.000, df[0] = 918079340544.000, dc_hat[0] = 543609651200.000
Gradient do_[0] = 77636181164032.000
Backward Time Step 3:
Gradient di[0] = 1964464013312.000, df[0] = 1417646047232.000, dc_hat[0] = 769048051712.000
Gradient do_[0] = 103917782302720.000
Backward Time Step 2:
Gradient di[0] = 2499643703296.000, df[0] = 1795351248896.000, dc_hat[0] = 1277142237184.000
Gradient do_[0] = 113245385916416.000
Backward Time Step 1:
Gradient di[0] = 3134903549952.000, df[0] = 2161739431936.000, dc_hat[0] = 1743594717184.000
Gradient do_[0] = 99794781470720.000
Backward Time Step 0:
Gradient di[0] = 3795520585728.000, df[0] = 2700491096064.000, dc_hat[0] = 3561478684672.000
Gradient do_[0] = 58678401040384.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2328404313505792.000, df[0] = -1828010290315264.000, dc_hat[0] = -1052430867365888.000
Gradient do_[0] = -136117993330442240.000
Backward Time Step 3:
Gradient di[0] = -3655916964544512.000, df[0] = -2790989739589632.000, dc_hat[0] = -1510618649591808.000
Gradient do_[0] = -186106275874996224.000
Backward Time Step 2:
Gradient di[0] = -4789284745773056.000, df[0] = -3608577029701632.000, dc_hat[0] = -2639131775598592.000
Gradient do_[0] = -212936077579649024.000
Backward Time Step 1:
Gradient di[0] = -6042569087647744.000, df[0] = -4323255024353280.000, dc_hat[0] = -3657597370499072.000
Gradient do_[0] = -191062427616411648.000
Backward Time Step 0:
Gradient di[0] = -7055797506801664.000, df[0] = -5157084807036928.000, dc_hat[0] = -7133176174477312.000
Gradient do_[0] = -109463245902839808.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1252556668928.000, df[0] = 918382706688.000, dc_hat[0] = 543789252608.000
Gradient do_[0] = 77661841915904.000
Backward Time Step 3:
Gradient di[0] = 1965112688640.000, df[0] = 1418114498560.000, dc_hat[0] = 769302462464.000
Gradient do_[0] = 103952091709440.000
Backward Time Step 2:
Gradient di[0] = 2500469456896.000, df[0] = 1795944611840.000, dc_hat[0] = 1277563895808.000
Gradient do_[0] = 113282782330880.000
Backward Time Step 1:
Gradient di[0] = 3135940329472.000, df[0] = 2162454429696.000, dc_hat[0] = 1744169074688.000
Gradient do_[0] = 99827757088768.000
Backward Time Step 0:
Gradient di[0] = 3796777041920.000, df[0] = 2701384744960.000, dc_hat[0] = 3562657284096.000
Gradient do_[0] = 58697824862208.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2328966417350656.000, df[0] = -1828451463987200.000, dc_hat[0] = -1052684471762944.000
Gradient do_[0] = -136150918549733376.000
Backward Time Step 3:
Gradient di[0] = -3656803606855680.000, df[0] = -2791666465374208.000, dc_hat[0] = -1510984527118336.000
Gradient do_[0] = -186151458930950144.000
Backward Time Step 2:
Gradient di[0] = -4790449755652096.000, df[0] = -3609454276771840.000, dc_hat[0] = -2639770383548416.000
Gradient do_[0] = -212987857705369600.000
Backward Time Step 1:
Gradient di[0] = -6044027765915648.000, df[0] = -4324299238277120.000, dc_hat[0] = -3658475422875648.000
Gradient do_[0] = -191108469665824768.000
Backward Time Step 0:
Gradient di[0] = -7057494018883584.000, df[0] = -5158324978843648.000, dc_hat[0] = -7134891477041152.000
Gradient do_[0] = -109489574052364288.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1252880809984.000, df[0] = 918620340224.000, dc_hat[0] = 543929696256.000
Gradient do_[0] = 77681932632064.000
Backward Time Step 3:
Gradient di[0] = 1965623214080.000, df[0] = 1418482679808.000, dc_hat[0] = 769500905472.000
Gradient do_[0] = 103979077861376.000
Backward Time Step 2:
Gradient di[0] = 2501119049728.000, df[0] = 1796411228160.000, dc_hat[0] = 1277895901184.000
Gradient do_[0] = 113312150847488.000
Backward Time Step 1:
Gradient di[0] = 3136752975872.000, df[0] = 2163014631424.000, dc_hat[0] = 1744619175936.000
Gradient do_[0] = 99853577224192.000
Backward Time Step 0:
Gradient di[0] = 3797760606208.000, df[0] = 2702084931584.000, dc_hat[0] = 3563580293120.000
Gradient do_[0] = 58713033408512.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2329550532902912.000, df[0] = -1828910220181504.000, dc_hat[0] = -1052948478033920.000
Gradient do_[0] = -136185020590063616.000
Backward Time Step 3:
Gradient di[0] = -3657721387679744.000, df[0] = -2792367081914368.000, dc_hat[0] = -1511362752675840.000
Gradient do_[0] = -186198033556307968.000
Backward Time Step 2:
Gradient di[0] = -4791652346494976.000, df[0] = -3610360514871296.000, dc_hat[0] = -2640432882253824.000
Gradient do_[0] = -213041166839447552.000
Backward Time Step 1:
Gradient di[0] = -6045546573725696.000, df[0] = -4325385596567552.000, dc_hat[0] = -3659390519345152.000
Gradient do_[0] = -191156418680717312.000
Backward Time Step 0:
Gradient di[0] = -7059261397925888.000, df[0] = -5159616690257920.000, dc_hat[0] = -7136677646565376.000
Gradient do_[0] = -109516984533647360.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1253276254208.000, df[0] = 918910337024.000, dc_hat[0] = 544101400576.000
Gradient do_[0] = 77706427367424.000
Backward Time Step 3:
Gradient di[0] = 1966240956416.000, df[0] = 1418928324608.000, dc_hat[0] = 769742602240.000
Gradient do_[0] = 104011734712320.000
Backward Time Step 2:
Gradient di[0] = 2501905481728.000, df[0] = 1796975755264.000, dc_hat[0] = 1278296457216.000
Gradient do_[0] = 113347752099840.000
Backward Time Step 1:
Gradient di[0] = 3137739948032.000, df[0] = 2163695026176.000, dc_hat[0] = 1745166008320.000
Gradient do_[0] = 99884984172544.000
Backward Time Step 0:
Gradient di[0] = 3798954147840.000, df[0] = 2702934016000.000, dc_hat[0] = 3564700172288.000
Gradient do_[0] = 58731488346112.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2330137869680640.000, df[0] = -1829371392294912.000, dc_hat[0] = -1053213960699904.000
Gradient do_[0] = -136219388918366208.000
Backward Time Step 3:
Gradient di[0] = -3658643731906560.000, df[0] = -2793071188115456.000, dc_hat[0] = -1511743125716992.000
Gradient do_[0] = -186244934599180288.000
Backward Time Step 2:
Gradient di[0] = -4792858158563328.000, df[0] = -3611269168889856.000, dc_hat[0] = -2641096454701056.000
Gradient do_[0] = -213094716491694080.000
Backward Time Step 1:
Gradient di[0] = -6047076118953984.000, df[0] = -4326479739486208.000, dc_hat[0] = -3660312326701056.000
Gradient do_[0] = -191204745652731904.000
Backward Time Step 0:
Gradient di[0] = -7061042735611904.000, df[0] = -5160918602219520.000, dc_hat[0] = -7138478848475136.000
Gradient do_[0] = -109544618353229824.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1253485445120.000, df[0] = 919063429120.000, dc_hat[0] = 544191873024.000
Gradient do_[0] = 77719421321216.000
Backward Time Step 3:
Gradient di[0] = 1966572830720.000, df[0] = 1419167793152.000, dc_hat[0] = 769872625664.000
Gradient do_[0] = 104029283680256.000
Backward Time Step 2:
Gradient di[0] = 2502329106432.000, df[0] = 1797279973376.000, dc_hat[0] = 1278511546368.000
Gradient do_[0] = 113366936846336.000
Backward Time Step 1:
Gradient di[0] = 3138268430336.000, df[0] = 2164059275264.000, dc_hat[0] = 1745457119232.000
Gradient do_[0] = 99901744611328.000
Backward Time Step 0:
Gradient di[0] = 3799582769152.000, df[0] = 2703381233664.000, dc_hat[0] = 3565290258432.000
Gradient do_[0] = 58741206548480.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2330701852573696.000, df[0] = -1829814176579584.000, dc_hat[0] = -1053468370403328.000
Gradient do_[0] = -136252365677264896.000
Backward Time Step 3:
Gradient di[0] = -3659528495169536.000, df[0] = -2793746840158208.000, dc_hat[0] = -1512107795283968.000
Gradient do_[0] = -186289963036311552.000
Backward Time Step 2:
Gradient di[0] = -4794015652249600.000, df[0] = -3612141047250944.000, dc_hat[0] = -2641731572989952.000
Gradient do_[0] = -213146170199900160.000
Backward Time Step 1:
Gradient di[0] = -6048537481576448.000, df[0] = -4327524758716416.000, dc_hat[0] = -3661191452819456.000
Gradient do_[0] = -191250856421621760.000
Backward Time Step 0:
Gradient di[0] = -7062761259401216.000, df[0] = -5162174880153600.000, dc_hat[0] = -7140216699617280.000
Gradient do_[0] = -109571272920268800.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1254010519552.000, df[0] = 919448322048.000, dc_hat[0] = 544419774464.000
Gradient do_[0] = 77751969120256.000
Backward Time Step 3:
Gradient di[0] = 1967392161792.000, df[0] = 1419759190016.000, dc_hat[0] = 770193096704.000
Gradient do_[0] = 104072627617792.000
Backward Time Step 2:
Gradient di[0] = 2503372963840.000, df[0] = 1798029443072.000, dc_hat[0] = 1279043698688.000
Gradient do_[0] = 113414198263808.000
Backward Time Step 1:
Gradient di[0] = 3139584655360.000, df[0] = 2164966948864.000, dc_hat[0] = 1746187321344.000
Gradient do_[0] = 99943595376640.000
Backward Time Step 0:
Gradient di[0] = 3801177653248.000, df[0] = 2704516055040.000, dc_hat[0] = 3566786838528.000
Gradient do_[0] = 58765860667392.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2331275230707712.000, df[0] = -1830264074403840.000, dc_hat[0] = -1053727142182912.000
Gradient do_[0] = -136285892191977472.000
Backward Time Step 3:
Gradient di[0] = -3660426411769856.000, df[0] = -2794431887441920.000, dc_hat[0] = -1512478638866432.000
Gradient do_[0] = -186335678668210176.000
Backward Time Step 2:
Gradient di[0] = -4795198378868736.000, df[0] = -3613031716093952.000, dc_hat[0] = -2642380918358016.000
Gradient do_[0] = -213198723419734016.000
Backward Time Step 1:
Gradient di[0] = -6050021929648128.000, df[0] = -4328586689380352.000, dc_hat[0] = -3662086685065216.000
Gradient do_[0] = -191297740284624896.000
Backward Time Step 0:
Gradient di[0] = -7064478709448704.000, df[0] = -5163430084345856.000, dc_hat[0] = -7141951866404864.000
Gradient do_[0] = -109597918897373184.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1254301761536.000, df[0] = 919661969408.000, dc_hat[0] = 544546193408.000
Gradient do_[0] = 77770004627456.000
Backward Time Step 3:
Gradient di[0] = 1967851569152.000, df[0] = 1420090802176.000, dc_hat[0] = 770372927488.000
Gradient do_[0] = 104096946192384.000
Backward Time Step 2:
Gradient di[0] = 2503957020672.000, df[0] = 1798449004544.000, dc_hat[0] = 1279342018560.000
Gradient do_[0] = 113440672710656.000
Backward Time Step 1:
Gradient di[0] = 3140316823552.000, df[0] = 2165471707136.000, dc_hat[0] = 1746592333824.000
Gradient do_[0] = 99966898929664.000
Backward Time Step 0:
Gradient di[0] = 3802056622080.000, df[0] = 2705141268480.000, dc_hat[0] = 3567611543552.000
Gradient do_[0] = 58779450212352.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2331836260810752.000, df[0] = -1830704711204864.000, dc_hat[0] = -1053981015015424.000
Gradient do_[0] = -136318602662903808.000
Backward Time Step 3:
Gradient di[0] = -3661307685371904.000, df[0] = -2795105123565568.000, dc_hat[0] = -1512842637344768.000
Gradient do_[0] = -186380500946911232.000
Backward Time Step 2:
Gradient di[0] = -4796347282620416.000, df[0] = -3613897420439552.000, dc_hat[0] = -2643013889163264.000
Gradient do_[0] = -213249764811079680.000
Backward Time Step 1:
Gradient di[0] = -6051477923561472.000, df[0] = -4329628218949632.000, dc_hat[0] = -3662963663699968.000
Gradient do_[0] = -191343747974299648.000
Backward Time Step 0:
Gradient di[0] = -7066182200852480.000, df[0] = -5164675624861696.000, dc_hat[0] = -7143674685161472.000
Gradient do_[0] = -109624358716047360.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1254649102336.000, df[0] = 919916511232.000, dc_hat[0] = 544696958976.000
Gradient do_[0] = 77791496241152.000
Backward Time Step 3:
Gradient di[0] = 1968394993664.000, df[0] = 1420483100672.000, dc_hat[0] = 770585133056.000
Gradient do_[0] = 104125593288704.000
Backward Time Step 2:
Gradient di[0] = 2504649080832.000, df[0] = 1798945898496.000, dc_hat[0] = 1279694471168.000
Gradient do_[0] = 113471995772928.000
Backward Time Step 1:
Gradient di[0] = 3141184782336.000, df[0] = 2166069788672.000, dc_hat[0] = 1747072450560.000
Gradient do_[0] = 99994480672768.000
Backward Time Step 0:
Gradient di[0] = 3803107033088.000, df[0] = 2705888641024.000, dc_hat[0] = 3568596942848.000
Gradient do_[0] = 58795686363136.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2332412860170240.000, df[0] = -1831157293383680.000, dc_hat[0] = -1054241531625472.000
Gradient do_[0] = -136352275206504448.000
Backward Time Step 3:
Gradient di[0] = -3662213923471360.000, df[0] = -2795796881735680.000, dc_hat[0] = -1513216836370432.000
Gradient do_[0] = -186426646075539456.000
Backward Time Step 2:
Gradient di[0] = -4797538062303232.000, df[0] = -3614795337039872.000, dc_hat[0] = -2643668871675904.000
Gradient do_[0] = -213302730347773952.000
Backward Time Step 1:
Gradient di[0] = -6052987067695104.000, df[0] = -4330707597918208.000, dc_hat[0] = -3663871512412160.000
Gradient do_[0] = -191391387751546880.000
Backward Time Step 0:
Gradient di[0] = -7067937231863808.000, df[0] = -5165957672599552.000, dc_hat[0] = -7145449043525632.000
Gradient do_[0] = -109651580218769408.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1255059357696.000, df[0] = 920217321472.000, dc_hat[0] = 544874954752.000
Gradient do_[0] = 77816938889216.000
Backward Time Step 3:
Gradient di[0] = 1969039736832.000, df[0] = 1420948144128.000, dc_hat[0] = 770837577728.000
Gradient do_[0] = 104159701368832.000
Backward Time Step 2:
Gradient di[0] = 2505468018688.000, df[0] = 1799534280704.000, dc_hat[0] = 1280112197632.000
Gradient do_[0] = 113509090197504.000
Backward Time Step 1:
Gradient di[0] = 3142209241088.000, df[0] = 2166776397824.000, dc_hat[0] = 1747639861248.000
Gradient do_[0] = 100027078803456.000
Backward Time Step 0:
Gradient di[0] = 3804353789952.000, df[0] = 2706775998464.000, dc_hat[0] = 3569767153664.000
Gradient do_[0] = 58814967578624.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2332970400612352.000, df[0] = -1831595111612416.000, dc_hat[0] = -1054493324083200.000
Gradient do_[0] = -136384865418346496.000
Backward Time Step 3:
Gradient di[0] = -3663093049589760.000, df[0] = -2796468238811136.000, dc_hat[0] = -1513579090018304.000
Gradient do_[0] = -186471313735417856.000
Backward Time Step 2:
Gradient di[0] = -4798686966054912.000, df[0] = -3615660504514560.000, dc_hat[0] = -2644299694997504.000
Gradient do_[0] = -213353685839773696.000
Backward Time Step 1:
Gradient di[0] = -6054430176706560.000, df[0] = -4331739732246528.000, dc_hat[0] = -3664740169547776.000
Gradient do_[0] = -191436897225015296.000
Backward Time Step 0:
Gradient di[0] = -7069621395914752.000, df[0] = -5167188717600768.000, dc_hat[0] = -7147150924316672.000
Gradient do_[0] = -109677702209863680.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1255378911232.000, df[0] = 920451678208.000, dc_hat[0] = 545013465088.000
Gradient do_[0] = 77836752781312.000
Backward Time Step 3:
Gradient di[0] = 1969541349376.000, df[0] = 1421310164992.000, dc_hat[0] = 771033333760.000
Gradient do_[0] = 104186242924544.000
Backward Time Step 2:
Gradient di[0] = 2506109222912.000, df[0] = 1799994736640.000, dc_hat[0] = 1280438960128.000
Gradient do_[0] = 113538056060928.000
Backward Time Step 1:
Gradient di[0] = 3143014547456.000, df[0] = 2167331618816.000, dc_hat[0] = 1748085768192.000
Gradient do_[0] = 100052655669248.000
Backward Time Step 0:
Gradient di[0] = 3805322936320.000, df[0] = 2707465437184.000, dc_hat[0] = 3570676269056.000
Gradient do_[0] = 58829949632512.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2333537067859968.000, df[0] = -1832039506509824.000, dc_hat[0] = -1054748807528448.000
Gradient do_[0] = -136417988206133248.000
Backward Time Step 3:
Gradient di[0] = -3663977812852736.000, df[0] = -2797143353982976.000, dc_hat[0] = -1513943625367552.000
Gradient do_[0] = -186516342172549120.000
Backward Time Step 2:
Gradient di[0] = -4799850365321216.000, df[0] = -3616537214713856.000, dc_hat[0] = -2644938571382784.000
Gradient do_[0] = -213405208267456512.000
Backward Time Step 1:
Gradient di[0] = -6055898518650880.000, df[0] = -4332790120185856.000, dc_hat[0] = -3665625469681664.000
Gradient do_[0] = -191483282871812096.000
Backward Time Step 0:
Gradient di[0] = -7071340456574976.000, df[0] = -5168444995534848.000, dc_hat[0] = -7148888775458816.000
Gradient do_[0] = -109704373956771840.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1255745781760.000, df[0] = 920720637952.000, dc_hat[0] = 545172979712.000
Gradient do_[0] = 77859485908992.000
Backward Time Step 3:
Gradient di[0] = 1970114265088.000, df[0] = 1421723435008.000, dc_hat[0] = 771257139200.000
Gradient do_[0] = 104216525799424.000
Backward Time Step 2:
Gradient di[0] = 2506839556096.000, df[0] = 1800519417856.000, dc_hat[0] = 1280811597824.000
Gradient do_[0] = 113571157508096.000
Backward Time Step 1:
Gradient di[0] = 3143933100032.000, df[0] = 2167964827648.000, dc_hat[0] = 1748594589696.000
Gradient do_[0] = 100081864802304.000
Backward Time Step 0:
Gradient di[0] = 3806432591872.000, df[0] = 2708255014912.000, dc_hat[0] = 3571717767168.000
Gradient do_[0] = 58847104335872.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2334109640687616.000, df[0] = -1832489270116352.000, dc_hat[0] = -1055007646416896.000
Gradient do_[0] = -136451480361107456.000
Backward Time Step 3:
Gradient di[0] = -3664876266323968.000, df[0] = -2797829743443968.000, dc_hat[0] = -1514315811127296.000
Gradient do_[0] = -186562057804447744.000
Backward Time Step 2:
Gradient di[0] = -4801027186360320.000, df[0] = -3617423320154112.000, dc_hat[0] = -2645584427089920.000
Gradient do_[0] = -213457469429514240.000
Backward Time Step 1:
Gradient di[0] = -6057388872302592.000, df[0] = -4333856614252544.000, dc_hat[0] = -3666523923152896.000
Gradient do_[0] = -191530321353637888.000
Backward Time Step 0:
Gradient di[0] = -7073071328395264.000, df[0] = -5169710400274432.000, dc_hat[0] = -7150639511502848.000
Gradient do_[0] = -109731226092306432.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1256068087808.000, df[0] = 920957026304.000, dc_hat[0] = 545312342016.000
Gradient do_[0] = 77879501127680.000
Backward Time Step 3:
Gradient di[0] = 1970622955520.000, df[0] = 1422090305536.000, dc_hat[0] = 771456040960.000
Gradient do_[0] = 104243419676672.000
Backward Time Step 2:
Gradient di[0] = 2507484954624.000, df[0] = 1800982757376.000, dc_hat[0] = 1281140850688.000
Gradient do_[0] = 113600383418368.000
Backward Time Step 1:
Gradient di[0] = 3144738930688.000, df[0] = 2168520572928.000, dc_hat[0] = 1749040758784.000
Gradient do_[0] = 100107450056704.000
Backward Time Step 0:
Gradient di[0] = 3807403835392.000, df[0] = 2708945764352.000, dc_hat[0] = 3572628979712.000
Gradient do_[0] = 58862119944192.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2334678992289792.000, df[0] = -1832936617803776.000, dc_hat[0] = -1055264941801472.000
Gradient do_[0] = -136484731997913088.000
Backward Time Step 3:
Gradient di[0] = -3665771498569728.000, df[0] = -2798512643244032.000, dc_hat[0] = -1514683567702016.000
Gradient do_[0] = -186607653177262080.000
Backward Time Step 2:
Gradient di[0] = -4802201859915776.000, df[0] = -3618307814981632.000, dc_hat[0] = -2646230282797056.000
Gradient do_[0] = -213509661872095232.000
Backward Time Step 1:
Gradient di[0] = -6058870636019712.000, df[0] = -4334916665868288.000, dc_hat[0] = -3667415934173184.000
Gradient do_[0] = -191577153677033472.000
Backward Time Step 0:
Gradient di[0] = -7074793610280960.000, df[0] = -5170969362563072.000, dc_hat[0] = -7152380046999552.000
Gradient do_[0] = -109757940788887552.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1256432467968.000, df[0] = 921224085504.000, dc_hat[0] = 545470513152.000
Gradient do_[0] = 77902083260416.000
Backward Time Step 3:
Gradient di[0] = 1971194953728.000, df[0] = 1422503313408.000, dc_hat[0] = 771679911936.000
Gradient do_[0] = 104273677385728.000
Backward Time Step 2:
Gradient di[0] = 2508211093504.000, df[0] = 1801504161792.000, dc_hat[0] = 1281510342656.000
Gradient do_[0] = 113633266761728.000
Backward Time Step 1:
Gradient di[0] = 3145655123968.000, df[0] = 2169152208896.000, dc_hat[0] = 1749548662784.000
Gradient do_[0] = 100136600469504.000
Backward Time Step 0:
Gradient di[0] = 3808516112384.000, df[0] = 2709737177088.000, dc_hat[0] = 3573672574976.000
Gradient do_[0] = 58879316590592.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2335242438311936.000, df[0] = -1833378462564352.000, dc_hat[0] = -1055519015960576.000
Gradient do_[0] = -136517682987008000.000
Backward Time Step 3:
Gradient di[0] = -3666655456526336.000, df[0] = -2799187489980416.000, dc_hat[0] = -1515048505704448.000
Gradient do_[0] = -186652595715047424.000
Backward Time Step 2:
Gradient di[0] = -4803356669247488.000, df[0] = -3619177814294528.000, dc_hat[0] = -2646865401085952.000
Gradient do_[0] = -213561046860824576.000
Backward Time Step 1:
Gradient di[0] = -6060326629933056.000, df[0] = -4335957927002112.000, dc_hat[0] = -3668291839066112.000
Gradient do_[0] = -191623161366708224.000
Backward Time Step 0:
Gradient di[0] = -7076503544135680.000, df[0] = -5172219198046208.000, dc_hat[0] = -7154108771336192.000
Gradient do_[0] = -109784475096842240.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1256852553728.000, df[0] = 921532039168.000, dc_hat[0] = 545652834304.000
Gradient do_[0] = 77928096333824.000
Backward Time Step 3:
Gradient di[0] = 1971855032320.000, df[0] = 1422979891200.000, dc_hat[0] = 771938451456.000
Gradient do_[0] = 104308557217792.000
Backward Time Step 2:
Gradient di[0] = 2509053886464.000, df[0] = 1802109452288.000, dc_hat[0] = 1281940783104.000
Gradient do_[0] = 113671384596480.000
Backward Time Step 1:
Gradient di[0] = 3146709204992.000, df[0] = 2169879003136.000, dc_hat[0] = 1750132064256.000
Gradient do_[0] = 100170138124288.000
Backward Time Step 0:
Gradient di[0] = 3809785151488.000, df[0] = 2710640263168.000, dc_hat[0] = 3574863757312.000
Gradient do_[0] = 58898933350400.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2335799710318592.000, df[0] = -1833816012357632.000, dc_hat[0] = -1055770741309440.000
Gradient do_[0] = -136550281788784640.000
Backward Time Step 3:
Gradient di[0] = -3667535119515648.000, df[0] = -2799859115491328.000, dc_hat[0] = -1515411162005504.000
Gradient do_[0] = -186697297734664192.000
Backward Time Step 2:
Gradient di[0] = -4804514699804672.000, df[0] = -3620050766397440.000, dc_hat[0] = -2647502129987584.000
Gradient do_[0] = -213612449029423104.000
Backward Time Step 1:
Gradient di[0] = -6061784234459136.000, df[0] = -4337000261877760.000, dc_hat[0] = -3669169623007232.000
Gradient do_[0] = -191669151876513792.000
Backward Time Step 0:
Gradient di[0] = -7078192540024832.000, df[0] = -5173453464272896.000, dc_hat[0] = -7155817094578176.000
Gradient do_[0] = -109810682987282432.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1257187180544.000, df[0] = 921777405952.000, dc_hat[0] = 545798160384.000
Gradient do_[0] = 77948849750016.000
Backward Time Step 3:
Gradient di[0] = 1972377878528.000, df[0] = 1423357247488.000, dc_hat[0] = 772142989312.000
Gradient do_[0] = 104336231235584.000
Backward Time Step 2:
Gradient di[0] = 2509718683648.000, df[0] = 1802586685440.000, dc_hat[0] = 1282279342080.000
Gradient do_[0] = 113701466144768.000
Backward Time Step 1:
Gradient di[0] = 3147544920064.000, df[0] = 2170454933504.000, dc_hat[0] = 1750593961984.000
Gradient do_[0] = 100196713234432.000
Backward Time Step 0:
Gradient di[0] = 3810799124480.000, df[0] = 2711361945600.000, dc_hat[0] = 3575815340032.000
Gradient do_[0] = 58914611658752.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2336370404098048.000, df[0] = -1834264165351424.000, dc_hat[0] = -1056028640673792.000
Gradient do_[0] = -136583645094739968.000
Backward Time Step 3:
Gradient di[0] = -3668431157067776.000, df[0] = -2800543357468672.000, dc_hat[0] = -1515780529192960.000
Gradient do_[0] = -186742858747740160.000
Backward Time Step 2:
Gradient di[0] = -4805684541521920.000, df[0] = -3620932308434944.000, dc_hat[0] = -2648144764469248.000
Gradient do_[0] = -213664435313573888.000
Backward Time Step 1:
Gradient di[0] = -6063268145659904.000, df[0] = -4338061924106240.000, dc_hat[0] = -3670064586817536.000
Gradient do_[0] = -191716035739516928.000
Backward Time Step 0:
Gradient di[0] = -7079923948716032.000, df[0] = -5174718332141568.000, dc_hat[0] = -7157566756880384.000
Gradient do_[0] = -109837543712751616.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1257487597568.000, df[0] = 921997606912.000, dc_hat[0] = 545928183808.000
Gradient do_[0] = 77967447293952.000
Backward Time Step 3:
Gradient di[0] = 1972849344512.000, df[0] = 1423697510400.000, dc_hat[0] = 772327079936.000
Gradient do_[0] = 104361128624128.000
Backward Time Step 2:
Gradient di[0] = 2510317944832.000, df[0] = 1803017125888.000, dc_hat[0] = 1282584346624.000
Gradient do_[0] = 113728544571392.000
Backward Time Step 1:
Gradient di[0] = 3148292030464.000, df[0] = 2170970177536.000, dc_hat[0] = 1751007756288.000
Gradient do_[0] = 100220452995072.000
Backward Time Step 0:
Gradient di[0] = 3811700375552.000, df[0] = 2712002625536.000, dc_hat[0] = 3576660230144.000
Gradient do_[0] = 58928540942336.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2336920428347392.000, df[0] = -1834695943782400.000, dc_hat[0] = -1056277077688320.000
Gradient do_[0] = -136615719910506496.000
Backward Time Step 3:
Gradient di[0] = -3669292029575168.000, df[0] = -2801200487464960.000, dc_hat[0] = -1516136206172160.000
Gradient do_[0] = -186786650234290176.000
Backward Time Step 2:
Gradient di[0] = -4806811970437120.000, df[0] = -3621781369782272.000, dc_hat[0] = -2648765655678976.000
Gradient do_[0] = -213714548991983616.000
Backward Time Step 1:
Gradient di[0] = -6064687095480320.000, df[0] = -4339076610129920.000, dc_hat[0] = -3670917943132160.000
Gradient do_[0] = -191760823658479616.000
Backward Time Step 0:
Gradient di[0] = -7081585564188672.000, df[0] = -5175933271015424.000, dc_hat[0] = -7159247162834944.000
Gradient do_[0] = -109863322106462208.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1257894838272.000, df[0] = 922296320000.000, dc_hat[0] = 546105294848.000
Gradient do_[0] = 77992688615424.000
Backward Time Step 3:
Gradient di[0] = 1973489238016.000, df[0] = 1424159277056.000, dc_hat[0] = 772577099776.000
Gradient do_[0] = 104394959880192.000
Backward Time Step 2:
Gradient di[0] = 2511131901952.000, df[0] = 1803601838080.000, dc_hat[0] = 1283000238080.000
Gradient do_[0] = 113765454446592.000
Backward Time Step 1:
Gradient di[0] = 3149314392064.000, df[0] = 2171674951680.000, dc_hat[0] = 1751574249472.000
Gradient do_[0] = 100252992405504.000
Backward Time Step 0:
Gradient di[0] = 3812939268096.000, df[0] = 2712884215808.000, dc_hat[0] = 3577822838784.000
Gradient do_[0] = 58947700523008.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2337491658997760.000, df[0] = -1835144499429376.000, dc_hat[0] = -1056535245488128.000
Gradient do_[0] = -136649126166134784.000
Backward Time Step 3:
Gradient di[0] = -3670187798691840.000, df[0] = -2801883924135936.000, dc_hat[0] = -1516506244448256.000
Gradient do_[0] = -186832245607104512.000
Backward Time Step 2:
Gradient di[0] = -4807982349025280.000, df[0] = -3622663448690688.000, dc_hat[0] = -2649409095467008.000
Gradient do_[0] = -213766500916396032.000
Backward Time Step 1:
Gradient di[0] = -6066166711713792.000, df[0] = -4340135051132928.000, dc_hat[0] = -3671809954152448.000
Gradient do_[0] = -191807518542921728.000
Backward Time Step 0:
Gradient di[0] = -7083305698590720.000, df[0] = -5177191159562240.000, dc_hat[0] = -7160986087718912.000
Gradient do_[0] = -109890011033239552.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1258289496064.000, df[0] = 922585595904.000, dc_hat[0] = 546276343808.000
Gradient do_[0] = 78017141407744.000
Backward Time Step 3:
Gradient di[0] = 1974108160000.000, df[0] = 1424605708288.000, dc_hat[0] = 772818927616.000
Gradient do_[0] = 104427658674176.000
Backward Time Step 2:
Gradient di[0] = 2511918071808.000, df[0] = 1804166234112.000, dc_hat[0] = 1283400531968.000
Gradient do_[0] = 113801038921728.000
Backward Time Step 1:
Gradient di[0] = 3150301102080.000, df[0] = 2172355346432.000, dc_hat[0] = 1752120950784.000
Gradient do_[0] = 100284332244992.000
Backward Time Step 0:
Gradient di[0] = 3814138052608.000, df[0] = 2713737232384.000, dc_hat[0] = 3578947960832.000
Gradient do_[0] = 58966226763776.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2338064768696320.000, df[0] = -1835594263035904.000, dc_hat[0] = -1056794218594304.000
Gradient do_[0] = -136682601141239808.000
Backward Time Step 3:
Gradient di[0] = -3671087594340352.000, df[0] = -2802571118903296.000, dc_hat[0] = -1516877088030720.000
Gradient do_[0] = -186878064318218240.000
Backward Time Step 2:
Gradient di[0] = -4809172591837184.000, df[0] = -3623560023113728.000, dc_hat[0] = -2650063809544192.000
Gradient do_[0] = -213819363373875200.000
Backward Time Step 1:
Gradient di[0] = -6067669413396480.000, df[0] = -4341209598263296.000, dc_hat[0] = -3672714044768256.000
Gradient do_[0] = -191854969341607936.000
Backward Time Step 0:
Gradient di[0] = -7085051602796544.000, df[0] = -5178466764849152.000, dc_hat[0] = -7162750782406656.000
Gradient do_[0] = -109917086507073536.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1258579951616.000, df[0] = 922798587904.000, dc_hat[0] = 546402336768.000
Gradient do_[0] = 78035168526336.000
Backward Time Step 3:
Gradient di[0] = 1974563241984.000, df[0] = 1424934305792.000, dc_hat[0] = 772997513216.000
Gradient do_[0] = 104451750756352.000
Backward Time Step 2:
Gradient di[0] = 2512497934336.000, df[0] = 1804583043072.000, dc_hat[0] = 1283696754688.000
Gradient do_[0] = 113827303653376.000
Backward Time Step 1:
Gradient di[0] = 3151030910976.000, df[0] = 2172858138624.000, dc_hat[0] = 1752524259328.000
Gradient do_[0] = 100307535134720.000
Backward Time Step 0:
Gradient di[0] = 3815011254272.000, df[0] = 2714358513664.000, dc_hat[0] = 3579767160832.000
Gradient do_[0] = 58979728228352.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2338632509685760.000, df[0] = -1836040268546048.000, dc_hat[0] = -1057050440237056.000
Gradient do_[0] = -136715818418307072.000
Backward Time Step 3:
Gradient di[0] = -3671981752844288.000, df[0] = -2803253481832448.000, dc_hat[0] = -1517246455218176.000
Gradient do_[0] = -186923539431948288.000
Backward Time Step 2:
Gradient di[0] = -4810337064845312.000, df[0] = -3624437001748480.000, dc_hat[0] = -2650701343752192.000
Gradient do_[0] = -213871160679464960.000
Backward Time Step 1:
Gradient di[0] = -6069139365953536.000, df[0] = -4342261328379904.000, dc_hat[0] = -3673599613337600.000
Gradient do_[0] = -191901354988404736.000
Backward Time Step 0:
Gradient di[0] = -7086769052844032.000, df[0] = -5179721969041408.000, dc_hat[0] = -7164487022936064.000
Gradient do_[0] = -109943732484177920.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1258921132032.000, df[0] = 923048804352.000, dc_hat[0] = 546550382592.000
Gradient do_[0] = 78056324595712.000
Backward Time Step 3:
Gradient di[0] = 1975098540032.000, df[0] = 1425320574976.000, dc_hat[0] = 773206573056.000
Gradient do_[0] = 104480062308352.000
Backward Time Step 2:
Gradient di[0] = 2513178984448.000, df[0] = 1805071810560.000, dc_hat[0] = 1284043309056.000
Gradient do_[0] = 113858047901696.000
Backward Time Step 1:
Gradient di[0] = 3151884451840.000, df[0] = 2173446651904.000, dc_hat[0] = 1752997036032.000
Gradient do_[0] = 100334638727168.000
Backward Time Step 0:
Gradient di[0] = 3816043053056.000, df[0] = 2715092516864.000, dc_hat[0] = 3580735520768.000
Gradient do_[0] = 58995674972160.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2339188439515136.000, df[0] = -1836476476162048.000, dc_hat[0] = -1057301091844096.000
Gradient do_[0] = -136748271191195648.000
Backward Time Step 3:
Gradient di[0] = -3672851752157184.000, df[0] = -2803918128021504.000, dc_hat[0] = -1517605756076032.000
Gradient do_[0] = -186967811954835456.000
Backward Time Step 2:
Gradient di[0] = -4811478989275136.000, df[0] = -3625297605820416.000, dc_hat[0] = -2651330556461056.000
Gradient do_[0] = -213921944372772864.000
Backward Time Step 1:
Gradient di[0] = -6070584085577728.000, df[0] = -4343294804885504.000, dc_hat[0] = -3674470417956864.000
Gradient do_[0] = -191947001900826624.000
Backward Time Step 0:
Gradient di[0] = -7088452143153152.000, df[0] = -5180951940300800.000, dc_hat[0] = -7166188903727104.000
Gradient do_[0] = -109969845885337600.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1259247501312.000, df[0] = 923288076288.000, dc_hat[0] = 546692104192.000
Gradient do_[0] = 78076557918208.000
Backward Time Step 3:
Gradient di[0] = 1975609720832.000, df[0] = 1425689149440.000, dc_hat[0] = 773406326784.000
Gradient do_[0] = 104507090403328.000
Backward Time Step 2:
Gradient di[0] = 2513830150144.000, df[0] = 1805539737600.000, dc_hat[0] = 1284375576576.000
Gradient do_[0] = 113887575801856.000
Backward Time Step 1:
Gradient di[0] = 3152698933248.000, df[0] = 2174008295424.000, dc_hat[0] = 1753447137280.000
Gradient do_[0] = 100360551137280.000
Backward Time Step 0:
Gradient di[0] = 3817031335936.000, df[0] = 2715795849216.000, dc_hat[0] = 3581662724096.000
Gradient do_[0] = 59010963210240.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2339768260100096.000, df[0] = -1836932011130880.000, dc_hat[0] = -1057563286175744.000
Gradient do_[0] = -136782175663030272.000
Backward Time Step 3:
Gradient di[0] = -3673763090530304.000, df[0] = -2804613375852544.000, dc_hat[0] = -1517980894625792.000
Gradient do_[0] = -187014146062024704.000
Backward Time Step 2:
Gradient di[0] = -4812675674537984.000, df[0] = -3626199012081664.000, dc_hat[0] = -2651987954892800.000
Gradient do_[0] = -213974995808813056.000
Backward Time Step 1:
Gradient di[0] = -6072087324131328.000, df[0] = -4344370694193152.000, dc_hat[0] = -3675375582314496.000
Gradient do_[0] = -191994487059251200.000
Backward Time Step 0:
Gradient di[0] = -7090205026680832.000, df[0] = -5182233451167744.000, dc_hat[0] = -7167960577736704.000
Gradient do_[0] = -109997041618255872.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1259551457280.000, df[0] = 923510964224.000, dc_hat[0] = 546824028160.000
Gradient do_[0] = 78095356788736.000
Backward Time Step 3:
Gradient di[0] = 1976087085056.000, df[0] = 1426033999872.000, dc_hat[0] = 773592842240.000
Gradient do_[0] = 104532306558976.000
Backward Time Step 2:
Gradient di[0] = 2514437799936.000, df[0] = 1805976076288.000, dc_hat[0] = 1284685168640.000
Gradient do_[0] = 113915098824704.000
Backward Time Step 1:
Gradient di[0] = 3153460723712.000, df[0] = 2174533763072.000, dc_hat[0] = 1753868926976.000
Gradient do_[0] = 100384794214400.000
Backward Time Step 0:
Gradient di[0] = 3817949102080.000, df[0] = 2716448849920.000, dc_hat[0] = 3582523867136.000
Gradient do_[0] = 59025144152064.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2340301372915712.000, df[0] = -1837350367789056.000, dc_hat[0] = -1057803603017728.000
Gradient do_[0] = -136813331355795456.000
Backward Time Step 3:
Gradient di[0] = -3674602219765760.000, df[0] = -2805254131286016.000, dc_hat[0] = -1518326907928576.000
Gradient do_[0] = -187056872396685312.000
Backward Time Step 2:
Gradient di[0] = -4813770891198464.000, df[0] = -3627023914237952.000, dc_hat[0] = -2652589518749696.000
Gradient do_[0] = -214023614838603776.000
Backward Time Step 1:
Gradient di[0] = -6073469229858816.000, df[0] = -4345359073542144.000, dc_hat[0] = -3676208537534464.000
Gradient do_[0] = -192038123926978560.000
Backward Time Step 0:
Gradient di[0] = -7091821008125952.000, df[0] = -5183414567174144.000, dc_hat[0] = -7169594812792832.000
Gradient do_[0] = -110022107047395328.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1260009422848.000, df[0] = 923846443008.000, dc_hat[0] = 547022143488.000
Gradient do_[0] = 78123727060992.000
Backward Time Step 3:
Gradient di[0] = 1976804835328.000, df[0] = 1426551603200.000, dc_hat[0] = 773873664000.000
Gradient do_[0] = 104570248232960.000
Backward Time Step 2:
Gradient di[0] = 2515352420352.000, df[0] = 1806633009152.000, dc_hat[0] = 1285151391744.000
Gradient do_[0] = 113956471439360.000
Backward Time Step 1:
Gradient di[0] = 3154609438720.000, df[0] = 2175325569024.000, dc_hat[0] = 1754505936896.000
Gradient do_[0] = 100421318213632.000
Backward Time Step 0:
Gradient di[0] = 3819336630272.000, df[0] = 2717436084224.000, dc_hat[0] = 3583825674240.000
Gradient do_[0] = 59046598017024.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2340879046017024.000, df[0] = -1837803889491968.000, dc_hat[0] = -1058064656498688.000
Gradient do_[0] = -136847003899396096.000
Backward Time Step 3:
Gradient di[0] = -3675507384123392.000, df[0] = -2805945084149760.000, dc_hat[0] = -1518700033212416.000
Gradient do_[0] = -187102862906490880.000
Backward Time Step 2:
Gradient di[0] = -4814955765301248.000, df[0] = -3627917267435520.000, dc_hat[0] = -2653242085343232.000
Gradient do_[0] = -214076253957783552.000
Backward Time Step 1:
Gradient di[0] = -6074966025961472.000, df[0] = -4346429057269760.000, dc_hat[0] = -3677108601618432.000
Gradient do_[0] = -192085282667888640.000
Backward Time Step 0:
Gradient di[0] = -7093557248655360.000, df[0] = -5184683193139200.000, dc_hat[0] = -7171349306933248.000
Gradient do_[0] = -110049036492341248.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1260288868352.000, df[0] = 924051374080.000, dc_hat[0] = 547143483392.000
Gradient do_[0] = 78141083090944.000
Backward Time Step 3:
Gradient di[0] = 1977243533312.000, df[0] = 1426868142080.000, dc_hat[0] = 774044975104.000
Gradient do_[0] = 104593434345472.000
Backward Time Step 2:
Gradient di[0] = 2515908952064.000, df[0] = 1807032516608.000, dc_hat[0] = 1285435031552.000
Gradient do_[0] = 113981679206400.000
Backward Time Step 1:
Gradient di[0] = 3155307790336.000, df[0] = 2175807389696.000, dc_hat[0] = 1754892206080.000
Gradient do_[0] = 100443489304576.000
Backward Time Step 0:
Gradient di[0] = 3820180471808.000, df[0] = 2718036393984.000, dc_hat[0] = 3584617611264.000
Gradient do_[0] = 59059638108160.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2341424506863616.000, df[0] = -1838232044044288.000, dc_hat[0] = -1058311080247296.000
Gradient do_[0] = -136878881146667008.000
Backward Time Step 3:
Gradient di[0] = -3676364230098944.000, df[0] = -2806599261356032.000, dc_hat[0] = -1519053965361152.000
Gradient do_[0] = -187146465414479872.000
Backward Time Step 2:
Gradient di[0] = -4816079972990976.000, df[0] = -3628764181299200.000, dc_hat[0] = -2653860292198400.000
Gradient do_[0] = -214126230197239808.000
Backward Time Step 1:
Gradient di[0] = -6076386049523712.000, df[0] = -4347445353906176.000, dc_hat[0] = -3677964910723072.000
Gradient do_[0] = -192130190845935616.000
Backward Time Step 0:
Gradient di[0] = -7095217790386176.000, df[0] = -5185897058271232.000, dc_hat[0] = -7173028639145984.000
Gradient do_[0] = -110074806296117248.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1260611698688.000, df[0] = 924288090112.000, dc_hat[0] = 547283599360.000
Gradient do_[0] = 78161056366592.000
Backward Time Step 3:
Gradient di[0] = 1977749471232.000, df[0] = 1427233439744.000, dc_hat[0] = 774242566144.000
Gradient do_[0] = 104620210782208.000
Backward Time Step 2:
Gradient di[0] = 2516553826304.000, df[0] = 1807495462912.000, dc_hat[0] = 1285763891200.000
Gradient do_[0] = 114010821230592.000
Backward Time Step 1:
Gradient di[0] = 3156117815296.000, df[0] = 2176365625344.000, dc_hat[0] = 1755339948032.000
Gradient do_[0] = 100469250719744.000
Backward Time Step 0:
Gradient di[0] = 3821159055360.000, df[0] = 2718732648448.000, dc_hat[0] = 3585535901696.000
Gradient do_[0] = 59074771156992.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2341971578322944.000, df[0] = -1838661540773888.000, dc_hat[0] = -1058558376411136.000
Gradient do_[0] = -136910861473153024.000
Backward Time Step 3:
Gradient di[0] = -3677223491993600.000, df[0] = -2807255049175040.000, dc_hat[0] = -1519408434380800.000
Gradient do_[0] = -187190205361422336.000
Backward Time Step 2:
Gradient di[0] = -4817202570067968.000, df[0] = -3629610021421056.000, dc_hat[0] = -2654476620005376.000
Gradient do_[0] = -214176086177611776.000
Backward Time Step 1:
Gradient di[0] = -6077802851860480.000, df[0] = -4348458160881664.000, dc_hat[0] = -3678817998602240.000
Gradient do_[0] = -192174910045421568.000
Backward Time Step 0:
Gradient di[0] = -7096868131569664.000, df[0] = -5187103407210496.000, dc_hat[0] = -7174696697069568.000
Gradient do_[0] = -110100412891136000.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1260980797440.000, df[0] = 924558819328.000, dc_hat[0] = 547443867648.000
Gradient do_[0] = 78183957266432.000
Backward Time Step 3:
Gradient di[0] = 1978329333760.000, df[0] = 1427651690496.000, dc_hat[0] = 774469255168.000
Gradient do_[0] = 104650871144448.000
Backward Time Step 2:
Gradient di[0] = 2517289926656.000, df[0] = 1808024338432.000, dc_hat[0] = 1286139281408.000
Gradient do_[0] = 114044191113216.000
Backward Time Step 1:
Gradient di[0] = 3157045280768.000, df[0] = 2177004863488.000, dc_hat[0] = 1755853094912.000
Gradient do_[0] = 100498686345216.000
Backward Time Step 0:
Gradient di[0] = 3822278148096.000, df[0] = 2719529041920.000, dc_hat[0] = 3586586050560.000
Gradient do_[0] = 59092076855296.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2342538514006016.000, df[0] = -1839106875195392.000, dc_hat[0] = -1058814396727296.000
Gradient do_[0] = -136944018620678144.000
Backward Time Step 3:
Gradient di[0] = -3678110402740224.000, df[0] = -2807932580265984.000, dc_hat[0] = -1519774043471872.000
Gradient do_[0] = -187235405597245440.000
Backward Time Step 2:
Gradient di[0] = -4818371338043392.000, df[0] = -3630490758152192.000, dc_hat[0] = -2655118180745216.000
Gradient do_[0] = -214228055281893376.000
Backward Time Step 1:
Gradient di[0] = -6079282468093952.000, df[0] = -4349516601884672.000, dc_hat[0] = -3679708399009792.000
Gradient do_[0] = -192221656469471232.000
Backward Time Step 0:
Gradient di[0] = -7098593634680832.000, df[0] = -5188365053853696.000, dc_hat[0] = -7176441527533568.000
Gradient do_[0] = -110127187717259264.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1261256310784.000, df[0] = 924760801280.000, dc_hat[0] = 547563503616.000
Gradient do_[0] = 78201036472320.000
Backward Time Step 3:
Gradient di[0] = 1978764361728.000, df[0] = 1427965607936.000, dc_hat[0] = 774638993408.000
Gradient do_[0] = 104673830764544.000
Backward Time Step 2:
Gradient di[0] = 2517843312640.000, df[0] = 1808421486592.000, dc_hat[0] = 1286420955136.000
Gradient do_[0] = 114069231108096.000
Backward Time Step 1:
Gradient di[0] = 3157735505920.000, df[0] = 2177480654848.000, dc_hat[0] = 1756235300864.000
Gradient do_[0] = 100520647720960.000
Backward Time Step 0:
Gradient di[0] = 3823110979584.000, df[0] = 2720121487360.000, dc_hat[0] = 3587367763968.000
Gradient do_[0] = 59104949174272.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2343093906964480.000, df[0] = -1839542545940480.000, dc_hat[0] = -1059064243027968.000
Gradient do_[0] = -136976462803632128.000
Backward Time Step 3:
Gradient di[0] = -3678980670488576.000, df[0] = -2808596958019584.000, dc_hat[0] = -1520133612765184.000
Gradient do_[0] = -187279609400655872.000
Backward Time Step 2:
Gradient di[0] = -4819508430635008.000, df[0] = -3631346798821376.000, dc_hat[0] = -2655743903793152.000
Gradient do_[0] = -214278461018079232.000
Backward Time Step 1:
Gradient di[0] = -6080719671525376.000, df[0] = -4350544978116608.000, dc_hat[0] = -3680574103355392.000
Gradient do_[0] = -192267045683855360.000
Backward Time Step 0:
Gradient di[0] = -7100268135055360.000, df[0] = -5189588582662144.000, dc_hat[0] = -7178134281519104.000
Gradient do_[0] = -110153155089530880.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1261608894464.000, df[0] = 925019406336.000, dc_hat[0] = 547716628480.000
Gradient do_[0] = 78222922350592.000
Backward Time Step 3:
Gradient di[0] = 1979316436992.000, df[0] = 1428364066816.000, dc_hat[0] = 774855196672.000
Gradient do_[0] = 104703014731776.000
Backward Time Step 2:
Gradient di[0] = 2518545858560.000, df[0] = 1808926113792.000, dc_hat[0] = 1286778519552.000
Gradient do_[0] = 114101032321024.000
Backward Time Step 1:
Gradient di[0] = 3158615523328.000, df[0] = 2178087518208.000, dc_hat[0] = 1756722495488.000
Gradient do_[0] = 100548640505856.000
Backward Time Step 0:
Gradient di[0] = 3824176332800.000, df[0] = 2720879607808.000, dc_hat[0] = 3588367319040.000
Gradient do_[0] = 59121416011776.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2343645810262016.000, df[0] = -1839975666548736.000, dc_hat[0] = -1059313552457728.000
Gradient do_[0] = -137008718008025088.000
Backward Time Step 3:
Gradient di[0] = -3679853891026944.000, df[0] = -2809263751692288.000, dc_hat[0] = -1520494255800320.000
Gradient do_[0] = -187324019362496512.000
Backward Time Step 2:
Gradient di[0] = -4820650891935744.000, df[0] = -3632207939764224.000, dc_hat[0] = -2656371774324736.000
Gradient do_[0] = -214329227531517952.000
Backward Time Step 1:
Gradient di[0] = -6082154727473152.000, df[0] = -4351570669993984.000, dc_hat[0] = -3681437391781888.000
Gradient do_[0] = -192312331819024384.000
Backward Time Step 0:
Gradient di[0] = -7101942098558976.000, df[0] = -5190811574599680.000, dc_hat[0] = -7179826498633728.000
Gradient do_[0] = -110179122461802496.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1261972488192.000, df[0] = 925286006784.000, dc_hat[0] = 547874439168.000
Gradient do_[0] = 78245454151680.000
Backward Time Step 3:
Gradient di[0] = 1979884371968.000, df[0] = 1428773797888.000, dc_hat[0] = 775077101568.000
Gradient do_[0] = 104733054337024.000
Backward Time Step 2:
Gradient di[0] = 2519268851712.000, df[0] = 1809445289984.000, dc_hat[0] = 1287147749376.000
Gradient do_[0] = 114133789835264.000
Backward Time Step 1:
Gradient di[0] = 3159528046592.000, df[0] = 2178716794880.000, dc_hat[0] = 1757228171264.000
Gradient do_[0] = 100577665089536.000
Backward Time Step 0:
Gradient di[0] = 3825282580480.000, df[0] = 2721666564096.000, dc_hat[0] = 3589405671424.000
Gradient do_[0] = 59138524577792.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2344190734237696.000, df[0] = -1840403686883328.000, dc_hat[0] = -1059559976206336.000
Gradient do_[0] = -137040552305623040.000
Backward Time Step 3:
Gradient di[0] = -3680706173599744.000, df[0] = -2809913902366720.000, dc_hat[0] = -1520844966723584.000
Gradient do_[0] = -187367381352316928.000
Backward Time Step 2:
Gradient di[0] = -4821768657174528.000, df[0] = -3633049484918784.000, dc_hat[0] = -2656985417777152.000
Gradient do_[0] = -214378860173590528.000
Backward Time Step 1:
Gradient di[0] = -6083568308584448.000, df[0] = -4352582403227648.000, dc_hat[0] = -3682289942790144.000
Gradient do_[0] = -192356930759426048.000
Backward Time Step 0:
Gradient di[0] = -7103585997291520.000, df[0] = -5192013091700736.000, dc_hat[0] = -7181488114106368.000
Gradient do_[0] = -110204634567540736.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1262233845760.000, df[0] = 925477371904.000, dc_hat[0] = 547987456000.000
Gradient do_[0] = 78261585444864.000
Backward Time Step 3:
Gradient di[0] = 1980296200192.000, df[0] = 1429071069184.000, dc_hat[0] = 775238189056.000
Gradient do_[0] = 104754831163392.000
Backward Time Step 2:
Gradient di[0] = 2519797334016.000, df[0] = 1809825136640.000, dc_hat[0] = 1287416971264.000
Gradient do_[0] = 114157680590848.000
Backward Time Step 1:
Gradient di[0] = 3160186290176.000, df[0] = 2179170566144.000, dc_hat[0] = 1757591764992.000
Gradient do_[0] = 100598586277888.000
Backward Time Step 0:
Gradient di[0] = 3826073468928.000, df[0] = 2722229125120.000, dc_hat[0] = 3590147276800.000
Gradient do_[0] = 59150746779648.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2344753106518016.000, df[0] = -1840845263208448.000, dc_hat[0] = -1059813916147712.000
Gradient do_[0] = -137073408805437440.000
Backward Time Step 3:
Gradient di[0] = -3681587178766336.000, df[0] = -2810586601619456.000, dc_hat[0] = -1521209099419648.000
Gradient do_[0] = -187412203631017984.000
Backward Time Step 2:
Gradient di[0] = -4822922929635328.000, df[0] = -3633920021102592.000, dc_hat[0] = -2657620267630592.000
Gradient do_[0] = -214430124903235584.000
Backward Time Step 1:
Gradient di[0] = -6085026449981440.000, df[0] = -4353625543409664.000, dc_hat[0] = -3683168532037632.000
Gradient do_[0] = -192403007168577536.000
Backward Time Step 0:
Gradient di[0] = -7105290025566208.000, df[0] = -5193259169087488.000, dc_hat[0] = -7183210932862976.000
Gradient do_[0] = -110231074386214912.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1262608842752.000, df[0] = 925752426496.000, dc_hat[0] = 548150280192.000
Gradient do_[0] = 78284847054848.000
Backward Time Step 3:
Gradient di[0] = 1980883009536.000, df[0] = 1429494562816.000, dc_hat[0] = 775467433984.000
Gradient do_[0] = 104785869012992.000
Backward Time Step 2:
Gradient di[0] = 2520542085120.000, df[0] = 1810359648256.000, dc_hat[0] = 1287796293632.000
Gradient do_[0] = 114191386017792.000
Backward Time Step 1:
Gradient di[0] = 3161120047104.000, df[0] = 2179814129664.000, dc_hat[0] = 1758108712960.000
Gradient do_[0] = 100628240007168.000
Backward Time Step 0:
Gradient di[0] = 3827204882432.000, df[0] = 2723034169344.000, dc_hat[0] = 3591208960000.000
Gradient do_[0] = 59168241221632.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2345306083557376.000, df[0] = -1841279323340800.000, dc_hat[0] = -1060063963774976.000
Gradient do_[0] = -137105732729307136.000
Backward Time Step 3:
Gradient di[0] = -3682459325562880.000, df[0] = -2811252321550336.000, dc_hat[0] = -1521567863406592.000
Gradient do_[0] = -187456613592858624.000
Backward Time Step 2:
Gradient di[0] = -4824065927806976.000, df[0] = -3634780356739072.000, dc_hat[0] = -2658248138162176.000
Gradient do_[0] = -214480960136151040.000
Backward Time Step 1:
Gradient di[0] = -6086464190283776.000, df[0] = -4354653919641600.000, dc_hat[0] = -3684033431076864.000
Gradient do_[0] = -192448362023223296.000
Backward Time Step 0:
Gradient di[0] = -7106962915328000.000, df[0] = -5194481624154112.000, dc_hat[0] = -7184902613106688.000
Gradient do_[0] = -110257015988682752.000
Epoch 800, Train Loss=0.011262, Weight Norm=13.014078
Sample Predictions at Epoch 800:
-------------------------------------------------------------
| Day | Date | Predicted Close | Actual Close | Error |
-------------------------------------------------------------
| 192 | 2024-10-11 | 57.42 | 63.87 | 6.45 |
| 193 | 2024-10-14 | 56.80 | 66.55 | 9.75 |
| 194 | 2024-10-15 | 56.99 | 66.00 | 9.01 |
| 195 | 2024-10-16 | 57.95 | 67.20 | 9.25 |
| 196 | 2024-10-17 | 57.48 | 66.76 | 9.28 |
-------------------------------------------------------------
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1263036137472.000, df[0] = 926065623040.000, dc_hat[0] = 548335648768.000
Gradient do_[0] = 78311287947264.000
Backward Time Step 3:
Gradient di[0] = 1981556195328.000, df[0] = 1429980315648.000, dc_hat[0] = 775730954240.000
Gradient do_[0] = 104821445099520.000
Backward Time Step 2:
Gradient di[0] = 2521398509568.000, df[0] = 1810975162368.000, dc_hat[0] = 1288233943040.000
Gradient do_[0] = 114230166552576.000
Backward Time Step 1:
Gradient di[0] = 3162195361792.000, df[0] = 2180555735040.000, dc_hat[0] = 1758705090560.000
Gradient do_[0] = 100662465527808.000
Backward Time Step 0:
Gradient di[0] = 3828499087360.000, df[0] = 2723955081216.000, dc_hat[0] = 3592423211008.000
Gradient do_[0] = 59188248051712.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2345841880727552.000, df[0] = -1841699693264896.000, dc_hat[0] = -1060305287249920.000
Gradient do_[0] = -137137060220764160.000
Backward Time Step 3:
Gradient di[0] = -3683299528540160.000, df[0] = -2811893345419264.000, dc_hat[0] = -1521914547798016.000
Gradient do_[0] = -187499271208042496.000
Backward Time Step 2:
Gradient di[0] = -4825164902563840.000, df[0] = -3635609016991744.000, dc_hat[0] = -2658853191680000.000
Gradient do_[0] = -214529716604895232.000
Backward Time Step 1:
Gradient di[0] = -6087848243494912.000, df[0] = -4355643909603328.000, dc_hat[0] = -3684866654732288.000
Gradient do_[0] = -192492084790296576.000
Backward Time Step 0:
Gradient di[0] = -7108584802353152.000, df[0] = -5195667035127808.000, dc_hat[0] = -7186541680001024.000
Gradient do_[0] = -110282175907102720.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1263385313280.000, df[0] = 926321410048.000, dc_hat[0] = 548487266304.000
Gradient do_[0] = 78332980887552.000
Backward Time Step 3:
Gradient di[0] = 1982102765568.000, df[0] = 1430374711296.000, dc_hat[0] = 775944536064.000
Gradient do_[0] = 104850335465472.000
Backward Time Step 2:
Gradient di[0] = 2522092666880.000, df[0] = 1811473498112.000, dc_hat[0] = 1288587444224.000
Gradient do_[0] = 114261581889536.000
Backward Time Step 1:
Gradient di[0] = 3163065679872.000, df[0] = 2181155782656.000, dc_hat[0] = 1759186518016.000
Gradient do_[0] = 100690097602560.000
Backward Time Step 0:
Gradient di[0] = 3829553168384.000, df[0] = 2724705075200.000, dc_hat[0] = 3593412280320.000
Gradient do_[0] = 59204542922752.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2346419016957952.000, df[0] = -1842152946532352.000, dc_hat[0] = -1060566206513152.000
Gradient do_[0] = -137170741354299392.000
Backward Time Step 3:
Gradient di[0] = -3684208182558720.000, df[0] = -2812587251073024.000, dc_hat[0] = -1522290089000960.000
Gradient do_[0] = -187545536595755008.000
Backward Time Step 2:
Gradient di[0] = -4826354071633920.000, df[0] = -3636504517672960.000, dc_hat[0] = -2659505489838080.000
Gradient do_[0] = -214582579062374400.000
Backward Time Step 1:
Gradient di[0] = -6089352018919424.000, df[0] = -4356719262040064.000, dc_hat[0] = -3685771819089920.000
Gradient do_[0] = -192539552768851968.000
Backward Time Step 0:
Gradient di[0] = -7110333927784448.000, df[0] = -5196945324769280.000, dc_hat[0] = -7188310669656064.000
Gradient do_[0] = -110309320100413440.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1263829385216.000, df[0] = 926646992896.000, dc_hat[0] = 548680007680.000
Gradient do_[0] = 78360478744576.000
Backward Time Step 3:
Gradient di[0] = 1982800068608.000, df[0] = 1430878027776.000, dc_hat[0] = 776217559040.000
Gradient do_[0] = 104887144677376.000
Backward Time Step 2:
Gradient di[0] = 2522979762176.000, df[0] = 1812110639104.000, dc_hat[0] = 1289039773696.000
Gradient do_[0] = 114301738156032.000
Backward Time Step 1:
Gradient di[0] = 3164177432576.000, df[0] = 2181922422784.000, dc_hat[0] = 1759802556416.000
Gradient do_[0] = 100725447196672.000
Backward Time Step 0:
Gradient di[0] = 3830897967104.000, df[0] = 2725661900800.000, dc_hat[0] = 3594674765824.000
Gradient do_[0] = 59225338281984.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2346969041207296.000, df[0] = -1842584993398784.000, dc_hat[0] = -1060814777745408.000
Gradient do_[0] = -137202876299608064.000
Backward Time Step 3:
Gradient di[0] = -3685074692210688.000, df[0] = -2813248676036608.000, dc_hat[0] = -1522646705504256.000
Gradient do_[0] = -187589517060866048.000
Backward Time Step 2:
Gradient di[0] = -4827493311709184.000, df[0] = -3637362437390336.000, dc_hat[0] = -2660132286627840.000
Gradient do_[0] = -214633139417382912.000
Backward Time Step 1:
Gradient di[0] = -6090789222350848.000, df[0] = -4357747101401088.000, dc_hat[0] = -3686637791870976.000
Gradient do_[0] = -192584924803366912.000
Backward Time Step 0:
Gradient di[0] = -7112004133191680.000, df[0] = -5198166169223168.000, dc_hat[0] = -7189999128674304.000
Gradient do_[0] = -110335244523012096.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1264081436672.000, df[0] = 926831869952.000, dc_hat[0] = 548789485568.000
Gradient do_[0] = 78376115109888.000
Backward Time Step 3:
Gradient di[0] = 1983194988544.000, df[0] = 1431163109376.000, dc_hat[0] = 776372158464.000
Gradient do_[0] = 104908040699904.000
Backward Time Step 2:
Gradient di[0] = 2523483602944.000, df[0] = 1812472528896.000, dc_hat[0] = 1289296674816.000
Gradient do_[0] = 114324546781184.000
Backward Time Step 1:
Gradient di[0] = 3164809723904.000, df[0] = 2182358499328.000, dc_hat[0] = 1760152518656.000
Gradient do_[0] = 100745571467264.000
Backward Time Step 0:
Gradient di[0] = 3831655038976.000, df[0] = 2726200344576.000, dc_hat[0] = 3595384651776.000
Gradient do_[0] = 59237044584448.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2347518528585728.000, df[0] = -1843016234958848.000, dc_hat[0] = -1061063013433344.000
Gradient do_[0] = -137235011244916736.000
Backward Time Step 3:
Gradient di[0] = -3685931001315328.000, df[0] = -2813902853242880.000, dc_hat[0] = -1523000637652992.000
Gradient do_[0] = -187633136748724224.000
Backward Time Step 2:
Gradient di[0] = -4828610540077056.000, df[0] = -3638204787851264.000, dc_hat[0] = -2660747809128448.000
Gradient do_[0] = -214682754879586304.000
Backward Time Step 1:
Gradient di[0] = -6092200119107584.000, df[0] = -4358756955586560.000, dc_hat[0] = -3687489000701952.000
Gradient do_[0] = -192629506563899392.000
Backward Time Step 0:
Gradient di[0] = -7113655011246080.000, df[0] = -5199373055033344.000, dc_hat[0] = -7191668260339712.000
Gradient do_[0] = -110360851118030848.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1264424321024.000, df[0] = 927083397120.000, dc_hat[0] = 548937826304.000
Gradient do_[0] = 78397380231168.000
Backward Time Step 3:
Gradient di[0] = 1983734611968.000, df[0] = 1431552393216.000, dc_hat[0] = 776582856704.000
Gradient do_[0] = 104936578744320.000
Backward Time Step 2:
Gradient di[0] = 2524164915200.000, df[0] = 1812961427456.000, dc_hat[0] = 1289643491328.000
Gradient do_[0] = 114355391692800.000
Backward Time Step 1:
Gradient di[0] = 3165661167616.000, df[0] = 2182945177600.000, dc_hat[0] = 1760624115712.000
Gradient do_[0] = 100772649893888.000
Backward Time Step 0:
Gradient di[0] = 3832690245632.000, df[0] = 2726936969216.000, dc_hat[0] = 3596356157440.000
Gradient do_[0] = 59253045854208.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2348046809563136.000, df[0] = -1843431101956096.000, dc_hat[0] = -1061301384118272.000
Gradient do_[0] = -137265892059774976.000
Backward Time Step 3:
Gradient di[0] = -3686765030277120.000, df[0] = -2814539313709056.000, dc_hat[0] = -1523345040343040.000
Gradient do_[0] = -187675605385347072.000
Backward Time Step 2:
Gradient di[0] = -4829707904221184.000, df[0] = -3639031300620288.000, dc_hat[0] = -2661349104549888.000
Gradient do_[0] = -214731494168461312.000
Backward Time Step 1:
Gradient di[0] = -6093580414222336.000, df[0] = -4359744261193728.000, dc_hat[0] = -3688319808438272.000
Gradient do_[0] = -192673109071888384.000
Backward Time Step 0:
Gradient di[0] = -7115261329014784.000, df[0] = -5200546654846976.000, dc_hat[0] = -7193291221106688.000
Gradient do_[0] = -110385761928347648.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1264741908480.000, df[0] = 927316115456.000, dc_hat[0] = 549075681280.000
Gradient do_[0] = 78417076682752.000
Backward Time Step 3:
Gradient di[0] = 1984232685568.000, df[0] = 1431911661568.000, dc_hat[0] = 776777760768.000
Gradient do_[0] = 104962918973440.000
Backward Time Step 2:
Gradient di[0] = 2524800876544.000, df[0] = 1813418475520.000, dc_hat[0] = 1289968418816.000
Gradient do_[0] = 114384206561280.000
Backward Time Step 1:
Gradient di[0] = 3166463066112.000, df[0] = 2183498039296.000, dc_hat[0] = 1761067008000.000
Gradient do_[0] = 100798151262208.000
Backward Time Step 0:
Gradient di[0] = 3833665159168.000, df[0] = 2727630864384.000, dc_hat[0] = 3597271040000.000
Gradient do_[0] = 59268115988480.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2348598712860672.000, df[0] = -1843864088346624.000, dc_hat[0] = -1061550760656896.000
Gradient do_[0] = -137298138674233344.000
Backward Time Step 3:
Gradient di[0] = -3687629392445440.000, df[0] = -2815199128059904.000, dc_hat[0] = -1523701254193152.000
Gradient do_[0] = -187719551490719744.000
Backward Time Step 2:
Gradient di[0] = -4830834796265472.000, df[0] = -3639880630403072.000, dc_hat[0] = -2661968653582336.000
Gradient do_[0] = -214781642206609408.000
Backward Time Step 1:
Gradient di[0] = -6095009564590080.000, df[0] = -4360766463410176.000, dc_hat[0] = -3689179607203840.000
Gradient do_[0] = -192718189048627200.000
Backward Time Step 0:
Gradient di[0] = -7116927239454720.000, df[0] = -5201764814946304.000, dc_hat[0] = -7194976458899456.000
Gradient do_[0] = -110411600451600384.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1265069850624.000, df[0] = 927556632576.000, dc_hat[0] = 549217959936.000
Gradient do_[0] = 78437385502720.000
Backward Time Step 3:
Gradient di[0] = 1984745308160.000, df[0] = 1432281677824.000, dc_hat[0] = 776978038784.000
Gradient do_[0] = 104990047731712.000
Backward Time Step 2:
Gradient di[0] = 2525455974400.000, df[0] = 1813888892928.000, dc_hat[0] = 1290302390272.000
Gradient do_[0] = 114413860290560.000
Backward Time Step 1:
Gradient di[0] = 3167288295424.000, df[0] = 2184066891776.000, dc_hat[0] = 1761523400704.000
Gradient do_[0] = 100824407605248.000
Backward Time Step 0:
Gradient di[0] = 3834658160640.000, df[0] = 2728337342464.000, dc_hat[0] = 3598202699776.000
Gradient do_[0] = 59283467141120.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2349143099965440.000, df[0] = -1844291571810304.000, dc_hat[0] = -1061796647534592.000
Gradient do_[0] = -137329930022158336.000
Backward Time Step 3:
Gradient di[0] = -3688485969985536.000, df[0] = -2815853305266176.000, dc_hat[0] = -1524054246817792.000
Gradient do_[0] = -187763102459101184.000
Backward Time Step 2:
Gradient di[0] = -4831962225180672.000, df[0] = -3640729960185856.000, dc_hat[0] = -2662587665743872.000
Gradient do_[0] = -214831549726588928.000
Backward Time Step 1:
Gradient di[0] = -6096422608830464.000, df[0] = -4361777122902016.000, dc_hat[0] = -3690029742292992.000
Gradient do_[0] = -192762856708505600.000
Backward Time Step 0:
Gradient di[0] = -7118573285670912.000, df[0] = -5202967405789184.000, dc_hat[0] = -7196639684984832.000
Gradient do_[0] = -110437155507011584.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1265375641600.000, df[0] = 927781093376.000, dc_hat[0] = 549350703104.000
Gradient do_[0] = 78456318590976.000
Backward Time Step 3:
Gradient di[0] = 1985223983104.000, df[0] = 1432627052544.000, dc_hat[0] = 777164750848.000
Gradient do_[0] = 105015372939264.000
Backward Time Step 2:
Gradient di[0] = 2526065721344.000, df[0] = 1814326804480.000, dc_hat[0] = 1290612113408.000
Gradient do_[0] = 114441408479232.000
Backward Time Step 1:
Gradient di[0] = 3168048775168.000, df[0] = 2184591048704.000, dc_hat[0] = 1761944403968.000
Gradient do_[0] = 100848541630464.000
Backward Time Step 0:
Gradient di[0] = 3835574616064.000, df[0] = 2728989294592.000, dc_hat[0] = 3599062794240.000
Gradient do_[0] = 59297635500032.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2349681313054720.000, df[0] = -1844713955000320.000, dc_hat[0] = -1062039111860224.000
Gradient do_[0] = -137361274693484544.000
Backward Time Step 3:
Gradient di[0] = -3689328052011008.000, df[0] = -2816495939747840.000, dc_hat[0] = -1524402139168768.000
Gradient do_[0] = -187805914693107712.000
Backward Time Step 2:
Gradient di[0] = -4833064421163008.000, df[0] = -3641560499486720.000, dc_hat[0] = -2663194598309888.000
Gradient do_[0] = -214880546713501696.000
Backward Time Step 1:
Gradient di[0] = -6097822768168960.000, df[0] = -4362778655588352.000, dc_hat[0] = -3690873703366656.000
Gradient do_[0] = -192807008972308480.000
Backward Time Step 0:
Gradient di[0] = -7120201078276096.000, df[0] = -5204157648601088.000, dc_hat[0] = -7198285731201024.000
Gradient do_[0] = -110462392734842880.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1265747755008.000, df[0] = 928053919744.000, dc_hat[0] = 549512249344.000
Gradient do_[0] = 78479395651584.000
Backward Time Step 3:
Gradient di[0] = 1985808826368.000, df[0] = 1433048973312.000, dc_hat[0] = 777393209344.000
Gradient do_[0] = 105046284959744.000
Backward Time Step 2:
Gradient di[0] = 2526809686016.000, df[0] = 1814861185024.000, dc_hat[0] = 1290992222208.000
Gradient do_[0] = 114475097128960.000
Backward Time Step 1:
Gradient di[0] = 3168987512832.000, df[0] = 2185238282240.000, dc_hat[0] = 1762464497664.000
Gradient do_[0] = 100878405074944.000
Backward Time Step 0:
Gradient di[0] = 3836712321024.000, df[0] = 2729798533120.000, dc_hat[0] = 3600130244608.000
Gradient do_[0] = 59315222216704.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2350227310772224.000, df[0] = -1845142914859008.000, dc_hat[0] = -1062286005370880.000
Gradient do_[0] = -137393229250166784.000
Backward Time Step 3:
Gradient di[0] = -3690184092680192.000, df[0] = -2817149580083200.000, dc_hat[0] = -1524754997575680.000
Gradient do_[0] = -187849448481619968.000
Backward Time Step 2:
Gradient di[0] = -4834185407627264.000, df[0] = -3642404192124928.000, dc_hat[0] = -2663809583939584.000
Gradient do_[0] = -214930385514004480.000
Backward Time Step 1:
Gradient di[0] = -6099235812409344.000, df[0] = -4363789315080192.000, dc_hat[0] = -3691725449068544.000
Gradient do_[0] = -192851693812056064.000
Backward Time Step 0:
Gradient di[0] = -7121860009394176.000, df[0] = -5205370439991296.000, dc_hat[0] = -7199962915930112.000
Gradient do_[0] = -110488119588945920.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1266070454272.000, df[0] = 928290439168.000, dc_hat[0] = 549652267008.000
Gradient do_[0] = 78499377315840.000
Backward Time Step 3:
Gradient di[0] = 1986315419648.000, df[0] = 1433414533120.000, dc_hat[0] = 777591390208.000
Gradient do_[0] = 105073069785088.000
Backward Time Step 2:
Gradient di[0] = 2527455346688.000, df[0] = 1815324786688.000, dc_hat[0] = 1291321212928.000
Gradient do_[0] = 114504297873408.000
Backward Time Step 1:
Gradient di[0] = 3169793605632.000, df[0] = 2185794158592.000, dc_hat[0] = 1762910797824.000
Gradient do_[0] = 100903998717952.000
Backward Time Step 0:
Gradient di[0] = 3837681991680.000, df[0] = 2730488758272.000, dc_hat[0] = 3601040146432.000
Gradient do_[0] = 59330216853504.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2350781629988864.000, df[0] = -1845578182950912.000, dc_hat[0] = -1062536522760192.000
Gradient do_[0] = -137425613303578624.000
Backward Time Step 3:
Gradient di[0] = -3691054360428544.000, df[0] = -2817814494707712.000, dc_hat[0] = -1525114566868992.000
Gradient do_[0] = -187893789723983872.000
Backward Time Step 2:
Gradient di[0] = -4835323573960704.000, df[0] = -3643261843406848.000, dc_hat[0] = -2664434501681152.000
Gradient do_[0] = -214980928689143808.000
Backward Time Step 1:
Gradient di[0] = -6100681068904448.000, df[0] = -4364823596892160.000, dc_hat[0] = -3692595985252352.000
Gradient do_[0] = -192897289184870400.000
Backward Time Step 0:
Gradient di[0] = -7123543636574208.000, df[0] = -5206600948121600.000, dc_hat[0] = -7201664796721152.000
Gradient do_[0] = -110514258759909376.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1266371788800.000, df[0] = 928511229952.000, dc_hat[0] = 549782945792.000
Gradient do_[0] = 78518109077504.000
Backward Time Step 3:
Gradient di[0] = 1986789507072.000, df[0] = 1433756762112.000, dc_hat[0] = 777776988160.000
Gradient do_[0] = 105098109779968.000
Backward Time Step 2:
Gradient di[0] = 2528060637184.000, df[0] = 1815759421440.000, dc_hat[0] = 1291629887488.000
Gradient do_[0] = 114531745398784.000
Backward Time Step 1:
Gradient di[0] = 3170551988224.000, df[0] = 2186317135872.000, dc_hat[0] = 1763329966080.000
Gradient do_[0] = 100928124354560.000
Backward Time Step 0:
Gradient di[0] = 3838595039232.000, df[0] = 2731138351104.000, dc_hat[0] = 3601896833024.000
Gradient do_[0] = 59344334880768.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2351324138045440.000, df[0] = -1846003921584128.000, dc_hat[0] = -1062781201678336.000
Gradient do_[0] = -137457318752157696.000
Backward Time Step 3:
Gradient di[0] = -3691906374565888.000, df[0] = -2818464645382144.000, dc_hat[0] = -1525466083098624.000
Gradient do_[0] = -187937134533935104.000
Backward Time Step 2:
Gradient di[0] = -4836443486683136.000, df[0] = -3644105536045056.000, dc_hat[0] = -2665050024181760.000
Gradient do_[0] = -215030698770169856.000
Backward Time Step 1:
Gradient di[0] = -6102083912597504.000, df[0] = -4365826740191232.000, dc_hat[0] = -3693439677890560.000
Gradient do_[0] = -192941561707757568.000
Backward Time Step 0:
Gradient di[0] = -7125167671083008.000, df[0] = -5207787432837120.000, dc_hat[0] = -7203307084840960.000
Gradient do_[0] = -110539444448133120.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1266900271104.000, df[0] = 928898613248.000, dc_hat[0] = 550012387328.000
Gradient do_[0] = 78550841425920.000
Backward Time Step 3:
Gradient di[0] = 1987616309248.000, df[0] = 1434353270784.000, dc_hat[0] = 778100211712.000
Gradient do_[0] = 105141831204864.000
Backward Time Step 2:
Gradient di[0] = 2529110786048.000, df[0] = 1816513740800.000, dc_hat[0] = 1292165709824.000
Gradient do_[0] = 114579233308672.000
Backward Time Step 1:
Gradient di[0] = 3171871621120.000, df[0] = 2187226513408.000, dc_hat[0] = 1764062003200.000
Gradient do_[0] = 100970084171776.000
Backward Time Step 0:
Gradient di[0] = 3840196214784.000, df[0] = 2732277366784.000, dc_hat[0] = 3603399180288.000
Gradient do_[0] = 59369081274368.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2351869598892032.000, df[0] = -1846432344571904.000, dc_hat[0] = -1063027826753536.000
Gradient do_[0] = -137489221769232384.000
Backward Time Step 3:
Gradient di[0] = -3692764294283264.000, df[0] = -2819119359459328.000, dc_hat[0] = -1525819612594176.000
Gradient do_[0] = -187980771401662464.000
Backward Time Step 2:
Gradient di[0] = -4837562862534656.000, df[0] = -3644949228683264.000, dc_hat[0] = -2665666351988736.000
Gradient do_[0] = -215080451671326720.000
Backward Time Step 1:
Gradient di[0] = -6103497493708800.000, df[0] = -4366837668118528.000, dc_hat[0] = -3694290349850624.000
Gradient do_[0] = -192986229367635968.000
Backward Time Step 0:
Gradient di[0] = -7126817475395584.000, df[0] = -5208993781776384.000, dc_hat[0] = -7204975142764544.000
Gradient do_[0] = -110565051043151872.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1267055853568.000, df[0] = 929012711424.000, dc_hat[0] = 550079627264.000
Gradient do_[0] = 78560513490944.000
Backward Time Step 3:
Gradient di[0] = 1987861544960.000, df[0] = 1434530349056.000, dc_hat[0] = 778195763200.000
Gradient do_[0] = 105154816770048.000
Backward Time Step 2:
Gradient di[0] = 2529424048128.000, df[0] = 1816738791424.000, dc_hat[0] = 1292324962304.000
Gradient do_[0] = 114593410056192.000
Backward Time Step 1:
Gradient di[0] = 3172262739968.000, df[0] = 2187496521728.000, dc_hat[0] = 1764277354496.000
Gradient do_[0] = 100982516088832.000
Backward Time Step 0:
Gradient di[0] = 3840664141824.000, df[0] = 2732610289664.000, dc_hat[0] = 3603838533632.000
Gradient do_[0] = 59376320643072.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2352411033206784.000, df[0] = -1846857009463296.000, dc_hat[0] = -1063272237236224.000
Gradient do_[0] = -137520824138596352.000
Backward Time Step 3:
Gradient di[0] = -3693615771549696.000, df[0] = -2819769241698304.000, dc_hat[0] = -1526170323517440.000
Gradient do_[0] = -188024099031744512.000
Backward Time Step 2:
Gradient di[0] = -4838684922740736.000, df[0] = -3645794531934208.000, dc_hat[0] = -2666281874489344.000
Gradient do_[0] = -215130221752352768.000
Backward Time Step 1:
Gradient di[0] = -6104915369787392.000, df[0] = -4367852354142208.000, dc_hat[0] = -3695145048342528.000
Gradient do_[0] = -193031017286598656.000
Backward Time Step 0:
Gradient di[0] = -7128474259030016.000, df[0] = -5210204425682944.000, dc_hat[0] = -7206649106268160.000
Gradient do_[0] = -110590734947581952.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1267581321216.000, df[0] = 929397997568.000, dc_hat[0] = 550307823616.000
Gradient do_[0] = 78593086455808.000
Backward Time Step 3:
Gradient di[0] = 1988685594624.000, df[0] = 1435125022720.000, dc_hat[0] = 778518331392.000
Gradient do_[0] = 105198362034176.000
Backward Time Step 2:
Gradient di[0] = 2530470264832.000, df[0] = 1817490096128.000, dc_hat[0] = 1292858425344.000
Gradient do_[0] = 114640805691392.000
Backward Time Step 1:
Gradient di[0] = 3173576343552.000, df[0] = 2188401967104.000, dc_hat[0] = 1765005459456.000
Gradient do_[0] = 101024282968064.000
Backward Time Step 0:
Gradient di[0] = 3842255093760.000, df[0] = 2733742227456.000, dc_hat[0] = 3605331181568.000
Gradient do_[0] = 59400920236032.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2352940119490560.000, df[0] = -1847272547549184.000, dc_hat[0] = -1063511413227520.000
Gradient do_[0] = -137551756493062144.000
Backward Time Step 3:
Gradient di[0] = -3694446847721472.000, df[0] = -2820403823116288.000, dc_hat[0] = -1526513786683392.000
Gradient do_[0] = -188066258430722048.000
Backward Time Step 2:
Gradient di[0] = -4839769938853888.000, df[0] = -3646611381026816.000, dc_hat[0] = -2666878606508032.000
Gradient do_[0] = -215178325386067968.000
Backward Time Step 1:
Gradient di[0] = -6106274190065664.000, df[0] = -4368824090492928.000, dc_hat[0] = -3695963239612416.000
Gradient do_[0] = -193073881060212736.000
Backward Time Step 0:
Gradient di[0] = -7130059638833152.000, df[0] = -5211362993111040.000, dc_hat[0] = -7208252739682304.000
Gradient do_[0] = -110615345110188032.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1267724189696.000, df[0] = 929502789632.000, dc_hat[0] = 550369820672.000
Gradient do_[0] = 78601894494208.000
Backward Time Step 3:
Gradient di[0] = 1988910383104.000, df[0] = 1435287158784.000, dc_hat[0] = 778606018560.000
Gradient do_[0] = 105210257080320.000
Backward Time Step 2:
Gradient di[0] = 2530757312512.000, df[0] = 1817696403456.000, dc_hat[0] = 1293004570624.000
Gradient do_[0] = 114653808033792.000
Backward Time Step 1:
Gradient di[0] = 3173937577984.000, df[0] = 2188651266048.000, dc_hat[0] = 1765204426752.000
Gradient do_[0] = 101035758583808.000
Backward Time Step 0:
Gradient di[0] = 3842692874240.000, df[0] = 2734053654528.000, dc_hat[0] = 3605742223360.000
Gradient do_[0] = 59407681454080.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2353478064144384.000, df[0] = -1847694930739200.000, dc_hat[0] = -1063754011770880.000
Gradient do_[0] = -137583161293930496.000
Backward Time Step 3:
Gradient di[0] = -3695289198182400.000, df[0] = -2821046726033408.000, dc_hat[0] = -1526860471074816.000
Gradient do_[0] = -188109139384205312.000
Backward Time Step 2:
Gradient di[0] = -4840876966674432.000, df[0] = -3647445946859520.000, dc_hat[0] = -2667487149686784.000
Gradient do_[0] = -215227562891149312.000
Backward Time Step 1:
Gradient di[0] = -6107680254984192.000, df[0] = -4369829381275648.000, dc_hat[0] = -3696808811298816.000
Gradient do_[0] = -193118291022053376.000
Backward Time Step 0:
Gradient di[0] = -7131701926952960.000, df[0] = -5212563973341184.000, dc_hat[0] = -7209912744542208.000
Gradient do_[0] = -110640814266253312.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1268075331584.000, df[0] = 929760083968.000, dc_hat[0] = 550521929728.000
Gradient do_[0] = 78623662931968.000
Backward Time Step 3:
Gradient di[0] = 1989460099072.000, df[0] = 1435684044800.000, dc_hat[0] = 778821173248.000
Gradient do_[0] = 105239315218432.000
Backward Time Step 2:
Gradient di[0] = 2531455401984.000, df[0] = 1818197622784.000, dc_hat[0] = 1293360824320.000
Gradient do_[0] = 114685374365696.000
Backward Time Step 1:
Gradient di[0] = 3174813925376.000, df[0] = 2189255245824.000, dc_hat[0] = 1765689262080.000
Gradient do_[0] = 101063575207936.000
Backward Time Step 0:
Gradient di[0] = 3843754557440.000, df[0] = 2734809153536.000, dc_hat[0] = 3606738370560.000
Gradient do_[0] = 59424097959936.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2354013324443648.000, df[0] = -1848114763792384.000, dc_hat[0] = -1063995872116736.000
Gradient do_[0] = -137614497375322112.000
Backward Time Step 3:
Gradient di[0] = -3696128595853312.000, df[0] = -2821687481466880.000, dc_hat[0] = -1527206484377600.000
Gradient do_[0] = -188151917258473472.000
Backward Time Step 2:
Gradient di[0] = -4841978625785856.000, df[0] = -3648276486160384.000, dc_hat[0] = -2668093008510976.000
Gradient do_[0] = -215276491158585344.000
Backward Time Step 1:
Gradient di[0] = -6109072898129920.000, df[0] = -4370825545252864.000, dc_hat[0] = -3697647940534272.000
Gradient do_[0] = -193162254307295232.000
Backward Time Step 0:
Gradient di[0] = -7133310929076224.000, df[0] = -5213739183767552.000, dc_hat[0] = -7211538926534656.000
Gradient do_[0] = -110665776616177664.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1268458979328.000, df[0] = 930041495552.000, dc_hat[0] = 550688456704.000
Gradient do_[0] = 78647436247040.000
Backward Time Step 3:
Gradient di[0] = 1990063685632.000, df[0] = 1436119465984.000, dc_hat[0] = 779057496064.000
Gradient do_[0] = 105271242260480.000
Backward Time Step 2:
Gradient di[0] = 2532227153920.000, df[0] = 1818751795200.000, dc_hat[0] = 1293754433536.000
Gradient do_[0] = 114720304529408.000
Backward Time Step 1:
Gradient di[0] = 3175777566720.000, df[0] = 2189919649792.000, dc_hat[0] = 1766223642624.000
Gradient do_[0] = 101094235570176.000
Backward Time Step 0:
Gradient di[0] = 3844914544640.000, df[0] = 2735634644992.000, dc_hat[0] = 3607826792448.000
Gradient do_[0] = 59442032803840.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2354551805968384.000, df[0] = -1848537415417856.000, dc_hat[0] = -1064238940422144.000
Gradient do_[0] = -137645936535928832.000
Backward Time Step 3:
Gradient di[0] = -3696975241281536.000, df[0] = -2822333874044928.000, dc_hat[0] = -1527556121559040.000
Gradient do_[0] = -188194970010648576.000
Backward Time Step 2:
Gradient di[0] = -4843089411702784.000, df[0] = -3649112931041280.000, dc_hat[0] = -2668702893867008.000
Gradient do_[0] = -215325848922750976.000
Backward Time Step 1:
Gradient di[0] = -6110471446855680.000, df[0] = -4371826272632832.000, dc_hat[0] = -3698491096301568.000
Gradient do_[0] = -193206458110705664.000
Backward Time Step 0:
Gradient di[0] = -7134946774745088.000, df[0] = -5214934795288576.000, dc_hat[0] = -7213192488943616.000
Gradient do_[0] = -110691151282962432.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1268836073472.000, df[0] = 930317926400.000, dc_hat[0] = 550852165632.000
Gradient do_[0] = 78670806908928.000
Backward Time Step 3:
Gradient di[0] = 1990653509632.000, df[0] = 1436545056768.000, dc_hat[0] = 779287789568.000
Gradient do_[0] = 105302414327808.000
Backward Time Step 2:
Gradient di[0] = 2532975312896.000, df[0] = 1819289059328.000, dc_hat[0] = 1294135066624.000
Gradient do_[0] = 114754152562688.000
Backward Time Step 1:
Gradient di[0] = 3176722071552.000, df[0] = 2190570946560.000, dc_hat[0] = 1766746226688.000
Gradient do_[0] = 101124266786816.000
Backward Time Step 0:
Gradient di[0] = 3846055919616.000, df[0] = 2736446767104.000, dc_hat[0] = 3608897650688.000
Gradient do_[0] = 59459674046464.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2355116057296896.000, df[0] = -1848980602355712.000, dc_hat[0] = -1064493954105344.000
Gradient do_[0] = -137678921884762112.000
Backward Time Step 3:
Gradient di[0] = -3697865641689088.000, df[0] = -2823014357925888.000, dc_hat[0] = -1527924146569216.000
Gradient do_[0] = -188240135886733312.000
Backward Time Step 2:
Gradient di[0] = -4844245294776320.000, df[0] = -3649983735660544.000, dc_hat[0] = -2669338280591360.000
Gradient do_[0] = -215377182371872768.000
Backward Time Step 1:
Gradient di[0] = -6111931735736320.000, df[0] = -4372869949685760.000, dc_hat[0] = -3699369417113600.000
Gradient do_[0] = -193252465800380416.000
Backward Time Step 0:
Gradient di[0] = -7136644897439744.000, df[0] = -5216176577708032.000, dc_hat[0] = -7214909402120192.000
Gradient do_[0] = -110717505202290688.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1269155102720.000, df[0] = 930551889920.000, dc_hat[0] = 550990708736.000
Gradient do_[0] = 78690604023808.000
Backward Time Step 3:
Gradient di[0] = 1991152893952.000, df[0] = 1436905373696.000, dc_hat[0] = 779483283456.000
Gradient do_[0] = 105328830054400.000
Backward Time Step 2:
Gradient di[0] = 2533612060672.000, df[0] = 1819746369536.000, dc_hat[0] = 1294459863040.000
Gradient do_[0] = 114782992596992.000
Backward Time Step 1:
Gradient di[0] = 3177517416448.000, df[0] = 2191119089664.000, dc_hat[0] = 1767186235392.000
Gradient do_[0] = 101149558439936.000
Backward Time Step 0:
Gradient di[0] = 3847016153088.000, df[0] = 2737129914368.000, dc_hat[0] = 3609798639616.000
Gradient do_[0] = 59474517688320.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2355649975418880.000, df[0] = -1849399495884800.000, dc_hat[0] = -1064734740709376.000
Gradient do_[0] = -137710120527200256.000
Backward Time Step 3:
Gradient di[0] = -3698703160311808.000, df[0] = -2823653234311168.000, dc_hat[0] = -1528269220347904.000
Gradient do_[0] = -188282759142178816.000
Backward Time Step 2:
Gradient di[0] = -4845344806404096.000, df[0] = -3650811590606848.000, dc_hat[0] = -2669940381319168.000
Gradient do_[0] = -215426076279570432.000
Backward Time Step 1:
Gradient di[0] = -6113322768269312.000, df[0] = -4373866113662976.000, dc_hat[0] = -3700207472607232.000
Gradient do_[0] = -193296446265491456.000
Backward Time Step 0:
Gradient di[0] = -7138271616303104.000, df[0] = -5217365209907200.000, dc_hat[0] = -7216554374594560.000
Gradient do_[0] = -110742733840187392.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1269461024768.000, df[0] = 930776154112.000, dc_hat[0] = 551123156992.000
Gradient do_[0] = 78709511946240.000
Backward Time Step 3:
Gradient di[0] = 1991631831040.000, df[0] = 1437251403776.000, dc_hat[0] = 779670323200.000
Gradient do_[0] = 105354096541696.000
Backward Time Step 2:
Gradient di[0] = 2534219186176.000, df[0] = 1820182446080.000, dc_hat[0] = 1294768668672.000
Gradient do_[0] = 114810473676800.000
Backward Time Step 1:
Gradient di[0] = 3178280255488.000, df[0] = 2191645081600.000, dc_hat[0] = 1767608025088.000
Gradient do_[0] = 101173835071488.000
Backward Time Step 0:
Gradient di[0] = 3847943094272.000, df[0] = 2737789206528.000, dc_hat[0] = 3610668695552.000
Gradient do_[0] = 59488849625088.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2356188188508160.000, df[0] = -1849822147510272.000, dc_hat[0] = -1064978010341376.000
Gradient do_[0] = -137741568277741568.000
Backward Time Step 3:
Gradient di[0] = -3699546047643648.000, df[0] = -2824296405663744.000, dc_hat[0] = -1528616441610240.000
Gradient do_[0] = -188325622915792896.000
Backward Time Step 2:
Gradient di[0] = -4846445391773696.000, df[0] = -3651641056165888.000, dc_hat[0] = -2670547045449728.000
Gradient do_[0] = -215474918647660544.000
Backward Time Step 1:
Gradient di[0] = -6114708432093184.000, df[0] = -4374856908931072.000, dc_hat[0] = -3701042575310848.000
Gradient do_[0] = -193340220572172288.000
Backward Time Step 0:
Gradient di[0] = -7139892966457344.000, df[0] = -5218550084009984.000, dc_hat[0] = -7218192904617984.000
Gradient do_[0] = -110767893758607360.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1269813215232.000, df[0] = 931034300416.000, dc_hat[0] = 551276052480.000
Gradient do_[0] = 78731355881472.000
Backward Time Step 3:
Gradient di[0] = 1992187183104.000, df[0] = 1437651959808.000, dc_hat[0] = 779887247360.000
Gradient do_[0] = 105383465058304.000
Backward Time Step 2:
Gradient di[0] = 2534931431424.000, df[0] = 1820694020096.000, dc_hat[0] = 1295132524544.000
Gradient do_[0] = 114842677542912.000
Backward Time Step 1:
Gradient di[0] = 3179173380096.000, df[0] = 2192260988928.000, dc_hat[0] = 1768103084032.000
Gradient do_[0] = 101202222120960.000
Backward Time Step 0:
Gradient di[0] = 3849015787520.000, df[0] = 2738552569856.000, dc_hat[0] = 3611675066368.000
Gradient do_[0] = 59505438097408.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2356732038742016.000, df[0] = -1850249228320768.000, dc_hat[0] = -1065223695892480.000
Gradient do_[0] = -137773273726320640.000
Backward Time Step 3:
Gradient di[0] = -3700399403958272.000, df[0] = -2824948166950912.000, dc_hat[0] = -1528968897363968.000
Gradient do_[0] = -188369087984828416.000
Backward Time Step 2:
Gradient di[0] = -4847570673205248.000, df[0] = -3652488506900480.000, dc_hat[0] = -2671165520740352.000
Gradient do_[0] = -215524929246855168.000
Backward Time Step 1:
Gradient di[0] = -6116131140009984.000, df[0] = -4375874547744768.000, dc_hat[0] = -3701898347544576.000
Gradient do_[0] = -193385180289826816.000
Backward Time Step 0:
Gradient di[0] = -7141545455124480.000, df[0] = -5219758043561984.000, dc_hat[0] = -7219863646896128.000
Gradient do_[0] = -110793517533495296.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1270181265408.000, df[0] = 931304177664.000, dc_hat[0] = 551435829248.000
Gradient do_[0] = 78754181283840.000
Backward Time Step 3:
Gradient di[0] = 1992765210624.000, df[0] = 1438069030912.000, dc_hat[0] = 780113412096.000
Gradient do_[0] = 105414049923072.000
Backward Time Step 2:
Gradient di[0] = 2535665434624.000, df[0] = 1821220929536.000, dc_hat[0] = 1295506604032.000
Gradient do_[0] = 114875896430592.000
Backward Time Step 1:
Gradient di[0] = 3180090359808.000, df[0] = 2192893018112.000, dc_hat[0] = 1768610594816.000
Gradient do_[0] = 101231372533760.000
Backward Time Step 0:
Gradient di[0] = 3850124132352.000, df[0] = 2739341099008.000, dc_hat[0] = 3612715253760.000
Gradient do_[0] = 59522576023552.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2357267835912192.000, df[0] = -1850669732462592.000, dc_hat[0] = -1065465287802880.000
Gradient do_[0] = -137804626987581440.000
Backward Time Step 3:
Gradient di[0] = -3701235848839168.000, df[0] = -2825587311771648.000, dc_hat[0] = -1529315447537664.000
Gradient do_[0] = -188411694060404736.000
Backward Time Step 2:
Gradient di[0] = -4848668574220288.000, df[0] = -3653316093411328.000, dc_hat[0] = -2671768695209984.000
Gradient do_[0] = -215573668535730176.000
Backward Time Step 1:
Gradient di[0] = -6117517340704768.000, df[0] = -4376865879883776.000, dc_hat[0] = -3702733987119104.000
Gradient do_[0] = -193428868697161728.000
Backward Time Step 0:
Gradient di[0] = -7143160899698688.000, df[0] = -5220938622697472.000, dc_hat[0] = -7221497345081344.000
Gradient do_[0] = -110818600142503936.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1270388752384.000, df[0] = 931456221184.000, dc_hat[0] = 551525744640.000
Gradient do_[0] = 78767024242688.000
Backward Time Step 3:
Gradient di[0] = 1993092104192.000, df[0] = 1438305222656.000, dc_hat[0] = 780241272832.000
Gradient do_[0] = 105431246569472.000
Backward Time Step 2:
Gradient di[0] = 2536078835712.000, df[0] = 1821517938688.000, dc_hat[0] = 1295717236736.000
Gradient do_[0] = 114894619803648.000
Backward Time Step 1:
Gradient di[0] = 3180609929216.000, df[0] = 2193251368960.000, dc_hat[0] = 1768897511424.000
Gradient do_[0] = 101247822594048.000
Backward Time Step 0:
Gradient di[0] = 3850754588672.000, df[0] = 2739789627392.000, dc_hat[0] = 3613306388480.000
Gradient do_[0] = 59532319391744.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2357781352939520.000, df[0] = -1851073325170688.000, dc_hat[0] = -1065697484472320.000
Gradient do_[0] = -137834665988849664.000
Backward Time Step 3:
Gradient di[0] = -3702043302690816.000, df[0] = -2826203371143168.000, dc_hat[0] = -1529647770632192.000
Gradient do_[0] = -188452736767885312.000
Backward Time Step 2:
Gradient di[0] = -4849724062433280.000, df[0] = -3654111199232000.000, dc_hat[0] = -2672348247359488.000
Gradient do_[0] = -215620672657817600.000
Backward Time Step 1:
Gradient di[0] = -6118854149275648.000, df[0] = -4377822046978048.000, dc_hat[0] = -3703537682874368.000
Gradient do_[0] = -193471096815616000.000
Backward Time Step 0:
Gradient di[0] = -7144715677859840.000, df[0] = -5222075178418176.000, dc_hat[0] = -7223068766240768.000
Gradient do_[0] = -110842729268772864.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1270813294592.000, df[0] = 931767582720.000, dc_hat[0] = 551710031872.000
Gradient do_[0] = 78793339305984.000
Backward Time Step 3:
Gradient di[0] = 1993756770304.000, df[0] = 1438784552960.000, dc_hat[0] = 780500860928.000
Gradient do_[0] = 105466420002816.000
Backward Time Step 2:
Gradient di[0] = 2536922939392.000, df[0] = 1822124277760.000, dc_hat[0] = 1296147939328.000
Gradient do_[0] = 114932855078912.000
Backward Time Step 1:
Gradient di[0] = 3181671088128.000, df[0] = 2193983012864.000, dc_hat[0] = 1769486417920.000
Gradient do_[0] = 101281611907072.000
Backward Time Step 0:
Gradient di[0] = 3852040667136.000, df[0] = 2740704772096.000, dc_hat[0] = 3614513561600.000
Gradient do_[0] = 59552204587008.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2358321445076992.000, df[0] = -1851497184755712.000, dc_hat[0] = -1065941559410688.000
Gradient do_[0] = -137866251178344448.000
Backward Time Step 3:
Gradient di[0] = -3702898001182720.000, df[0] = -2826855669301248.000, dc_hat[0] = -1529999823732736.000
Gradient do_[0] = -188496201836920832.000
Backward Time Step 2:
Gradient di[0] = -4850844512026624.000, df[0] = -3654955697176576.000, dc_hat[0] = -2672965917343744.000
Gradient do_[0] = -215670339659628544.000
Backward Time Step 1:
Gradient di[0] = -6120261824806912.000, df[0] = -4378829485244416.000, dc_hat[0] = -3704385402044416.000
Gradient do_[0] = -193515506777456640.000
Backward Time Step 0:
Gradient di[0] = -7146353134141440.000, df[0] = -5223271863681024.000, dc_hat[0] = -7224724476133376.000
Gradient do_[0] = -110868112525492224.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1271032578048.000, df[0] = 931928408064.000, dc_hat[0] = 551804993536.000
Gradient do_[0] = 78806937239552.000
Backward Time Step 3:
Gradient di[0] = 1994100965376.000, df[0] = 1439033065472.000, dc_hat[0] = 780635602944.000
Gradient do_[0] = 105484606504960.000
Backward Time Step 2:
Gradient di[0] = 2537364652032.000, df[0] = 1822441209856.000, dc_hat[0] = 1296371810304.000
Gradient do_[0] = 114952836743168.000
Backward Time Step 1:
Gradient di[0] = 3182222376960.000, df[0] = 2194362728448.000, dc_hat[0] = 1769789587456.000
Gradient do_[0] = 101299110543360.000
Backward Time Step 0:
Gradient di[0] = 3852699435008.000, df[0] = 2741173223424.000, dc_hat[0] = 3615131435008.000
Gradient do_[0] = 59562379968512.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2358875764293632.000, df[0] = -1851932318629888.000, dc_hat[0] = -1066191875473408.000
Gradient do_[0] = -137898566512279552.000
Backward Time Step 3:
Gradient di[0] = -3703768537366528.000, df[0] = -2827520315490304.000, dc_hat[0] = -1530358721937408.000
Gradient do_[0] = -188540491539677184.000
Backward Time Step 2:
Gradient di[0] = -4851983215230976.000, df[0] = -3655813348458496.000, dc_hat[0] = -2673590835085312.000
Gradient do_[0] = -215720900014637056.000
Backward Time Step 1:
Gradient di[0] = -6121701712592896.000, df[0] = -4379858935218176.000, dc_hat[0] = -3705253253873664.000
Gradient do_[0] = -193560930351579136.000
Backward Time Step 0:
Gradient di[0] = -7148036224450560.000, df[0] = -5224502371811328.000, dc_hat[0] = -7226425820053504.000
Gradient do_[0] = -110894225926651904.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1271612309504.000, df[0] = 932353409024.000, dc_hat[0] = 552056651776.000
Gradient do_[0] = 78842857259008.000
Backward Time Step 3:
Gradient di[0] = 1995011522560.000, df[0] = 1439690260480.000, dc_hat[0] = 780992053248.000
Gradient do_[0] = 105532773892096.000
Backward Time Step 2:
Gradient di[0] = 2538519658496.000, df[0] = 1823270764544.000, dc_hat[0] = 1296961896448.000
Gradient do_[0] = 115005173268480.000
Backward Time Step 1:
Gradient di[0] = 3183674654720.000, df[0] = 2195364642816.000, dc_hat[0] = 1770595549184.000
Gradient do_[0] = 101345331773440.000
Backward Time Step 0:
Gradient di[0] = 3854456848384.000, df[0] = 2742423912448.000, dc_hat[0] = 3616780582912.000
Gradient do_[0] = 59589550669824.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2359413172076544.000, df[0] = -1852354030731264.000, dc_hat[0] = -1066434943778816.000
Gradient do_[0] = -137929954133278720.000
Backward Time Step 3:
Gradient di[0] = -3704607129731072.000, df[0] = -2828161070923776.000, dc_hat[0] = -1530705272111104.000
Gradient do_[0] = -188583114795122688.000
Backward Time Step 2:
Gradient di[0] = -4853082189987840.000, df[0] = -3656640934969344.000, dc_hat[0] = -2674194814861312.000
Gradient do_[0] = -215769725202857984.000
Backward Time Step 1:
Gradient di[0] = -6123085765804032.000, df[0] = -4380848925179904.000, dc_hat[0] = -3706085940658176.000
Gradient do_[0] = -193604670298521600.000
Backward Time Step 0:
Gradient di[0] = -7149643615961088.000, df[0] = -5225677582237696.000, dc_hat[0] = -7228051465175040.000
Gradient do_[0] = -110919179686641664.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1271901978624.000, df[0] = 932565745664.000, dc_hat[0] = 552182349824.000
Gradient do_[0] = 78860808880128.000
Backward Time Step 3:
Gradient di[0] = 1995466866688.000, df[0] = 1440018726912.000, dc_hat[0] = 781170245632.000
Gradient do_[0] = 105556849197056.000
Backward Time Step 2:
Gradient di[0] = 2539099783168.000, df[0] = 1823687704576.000, dc_hat[0] = 1297257463808.000
Gradient do_[0] = 115031396057088.000
Backward Time Step 1:
Gradient di[0] = 3184401580032.000, df[0] = 2195865468928.000, dc_hat[0] = 1770997153792.000
Gradient do_[0] = 101368433999872.000
Backward Time Step 0:
Gradient di[0] = 3855337390080.000, df[0] = 2743050436608.000, dc_hat[0] = 3617607122944.000
Gradient do_[0] = 59603169574912.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2359953264214016.000, df[0] = -1852777756098560.000, dc_hat[0] = -1066678146301952.000
Gradient do_[0] = -137961599452315648.000
Backward Time Step 3:
Gradient di[0] = -3705459949174784.000, df[0] = -2828811490033664.000, dc_hat[0] = -1531056922558464.000
Gradient do_[0] = -188626528324550656.000
Backward Time Step 2:
Gradient di[0] = -4854203713323008.000, df[0] = -3657485969784832.000, dc_hat[0] = -2674810337361920.000
Gradient do_[0] = -215819426564407296.000
Backward Time Step 1:
Gradient di[0] = -6124502031269888.000, df[0] = -4381862805897216.000, dc_hat[0] = -3706939565408256.000
Gradient do_[0] = -193649406677876736.000
Backward Time Step 0:
Gradient di[0] = -7151295030886400.000, df[0] = -5226884468047872.000, dc_hat[0] = -7229720596840448.000
Gradient do_[0] = -110944786281660416.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1272093605888.000, df[0] = 932706189312.000, dc_hat[0] = 552265056256.000
Gradient do_[0] = 78872687149056.000
Backward Time Step 3:
Gradient di[0] = 1995765710848.000, df[0] = 1440234209280.000, dc_hat[0] = 781286244352.000
Gradient do_[0] = 105572644945920.000
Backward Time Step 2:
Gradient di[0] = 2539483824128.000, df[0] = 1823963217920.000, dc_hat[0] = 1297453154304.000
Gradient do_[0] = 115048760475648.000
Backward Time Step 1:
Gradient di[0] = 3184881303552.000, df[0] = 2196196294656.000, dc_hat[0] = 1771262050304.000
Gradient do_[0] = 101383676100608.000
Backward Time Step 0:
Gradient di[0] = 3855915417600.000, df[0] = 2743461740544.000, dc_hat[0] = 3618149236736.000
Gradient do_[0] = 59612099248128.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2360465975934976.000, df[0] = -1853180543500288.000, dc_hat[0] = -1066909470556160.000
Gradient do_[0] = -137991492424695808.000
Backward Time Step 3:
Gradient di[0] = -3706263644930048.000, df[0] = -2829425133486080.000, dc_hat[0] = -1531388842999808.000
Gradient do_[0] = -188667399233339392.000
Backward Time Step 2:
Gradient di[0] = -4855250611601408.000, df[0] = -3658275706896384.000, dc_hat[0] = -2675387473592320.000
Gradient do_[0] = -215865984009895936.000
Backward Time Step 1:
Gradient di[0] = -6125820586229760.000, df[0] = -4382805282783232.000, dc_hat[0] = -3707731986874368.000
Gradient do_[0] = -193691050680778752.000
Backward Time Step 0:
Gradient di[0] = -7152840682242048.000, df[0] = -5228014044446720.000, dc_hat[0] = -7231283428065280.000
Gradient do_[0] = -110968769379041280.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1272442257408.000, df[0] = 932961779712.000, dc_hat[0] = 552416444416.000
Gradient do_[0] = 78894329757696.000
Backward Time Step 3:
Gradient di[0] = 1996312936448.000, df[0] = 1440628998144.000, dc_hat[0] = 781500416000.000
Gradient do_[0] = 105601602420736.000
Backward Time Step 2:
Gradient di[0] = 2540177981440.000, df[0] = 1824461946880.000, dc_hat[0] = 1297807310848.000
Gradient do_[0] = 115080192589824.000
Backward Time Step 1:
Gradient di[0] = 3185754505216.000, df[0] = 2196798308352.000, dc_hat[0] = 1771745705984.000
Gradient do_[0] = 101411417227264.000
Backward Time Step 0:
Gradient di[0] = 3856971595776.000, df[0] = 2744213045248.000, dc_hat[0] = 3619140403200.000
Gradient do_[0] = 59628431867904.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2360949696626688.000, df[0] = -1853560513888256.000, dc_hat[0] = -1067128513888256.000
Gradient do_[0] = -138019753309503488.000
Backward Time Step 3:
Gradient di[0] = -3707026001625088.000, df[0] = -2830007101554688.000, dc_hat[0] = -1531703046701056.000
Gradient do_[0] = -188706122658480128.000
Backward Time Step 2:
Gradient di[0] = -4856253486465024.000, df[0] = -3659030278963200.000, dc_hat[0] = -2675937497841664.000
Gradient do_[0] = -215910514230820864.000
Backward Time Step 1:
Gradient di[0] = -6127089212194816.000, df[0] = -4383712863059968.000, dc_hat[0] = -3708495685746688.000
Gradient do_[0] = -193731062596108288.000
Backward Time Step 0:
Gradient di[0] = -7154312245411840.000, df[0] = -5229089396883456.000, dc_hat[0] = -7232770560491520.000
Gradient do_[0] = -110991592835252224.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1272636768256.000, df[0] = 933104386048.000, dc_hat[0] = 552500854784.000
Gradient do_[0] = 78906367410176.000
Backward Time Step 3:
Gradient di[0] = 1996618072064.000, df[0] = 1440849592320.000, dc_hat[0] = 781619953664.000
Gradient do_[0] = 105617733713920.000
Backward Time Step 2:
Gradient di[0] = 2540570673152.000, df[0] = 1824743751680.000, dc_hat[0] = 1298005884928.000
Gradient do_[0] = 115097968050176.000
Backward Time Step 1:
Gradient di[0] = 3186249957376.000, df[0] = 2197139750912.000, dc_hat[0] = 1772019122176.000
Gradient do_[0] = 101427120701440.000
Backward Time Step 0:
Gradient di[0] = 3857560895488.000, df[0] = 2744632475648.000, dc_hat[0] = 3619693527040.000
Gradient do_[0] = 59637546090496.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2361534617485312.000, df[0] = -1854019404300288.000, dc_hat[0] = -1067392385941504.000
Gradient do_[0] = -138053932659245056.000
Backward Time Step 3:
Gradient di[0] = -3707943245578240.000, df[0] = -2830707449659392.000, dc_hat[0] = -1532082077564928.000
Gradient do_[0] = -188752834722791424.000
Backward Time Step 2:
Gradient di[0] = -4857450708598784.000, df[0] = -3659933027401728.000, dc_hat[0] = -2676596238450688.000
Gradient do_[0] = -215963737465552896.000
Backward Time Step 1:
Gradient di[0] = -6128608020004864.000, df[0] = -4384799758221312.000, dc_hat[0] = -3709411587522560.000
Gradient do_[0] = -193779045970739200.000
Backward Time Step 0:
Gradient di[0] = -7156077476970496.000, df[0] = -5230379497684992.000, dc_hat[0] = -7234555119403008.000
Gradient do_[0] = -111018977546731520.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1273039552512.000, df[0] = 933399494656.000, dc_hat[0] = 552675704832.000
Gradient do_[0] = 78931357073408.000
Backward Time Step 3:
Gradient di[0] = 1997250494464.000, df[0] = 1441305722880.000, dc_hat[0] = 781866696704.000
Gradient do_[0] = 105651170705408.000
Backward Time Step 2:
Gradient di[0] = 2541374930944.000, df[0] = 1825321123840.000, dc_hat[0] = 1298416664576.000
Gradient do_[0] = 115134374608896.000
Backward Time Step 1:
Gradient di[0] = 3187256328192.000, df[0] = 2197833515008.000, dc_hat[0] = 1772576309248.000
Gradient do_[0] = 101459131629568.000
Backward Time Step 0:
Gradient di[0] = 3858779865088.000, df[0] = 2745499648000.000, dc_hat[0] = 3620836737024.000
Gradient do_[0] = 59656382709760.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2362061556285440.000, df[0] = -1854433331773440.000, dc_hat[0] = -1067630018428928.000
Gradient do_[0] = -138084744754626560.000
Backward Time Step 3:
Gradient di[0] = -3708770563653632.000, df[0] = -2831339078287360.000, dc_hat[0] = -1532422051069952.000
Gradient do_[0] = -188794942582161408.000
Backward Time Step 2:
Gradient di[0] = -4858539482808320.000, df[0] = -3660752292413440.000, dc_hat[0] = -2677192433598464.000
Gradient do_[0] = -216012081617436672.000
Backward Time Step 1:
Gradient di[0] = -6129972208992256.000, df[0] = -4385774715797504.000, dc_hat[0] = -3710231657840640.000
Gradient do_[0] = -193822133082652672.000
Backward Time Step 0:
Gradient di[0] = -7157676278546432.000, df[0] = -5231547728789504.000, dc_hat[0] = -7236171100848128.000
Gradient do_[0] = -111043776687898624.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1273401442304.000, df[0] = 933664915456.000, dc_hat[0] = 552832794624.000
Gradient do_[0] = 78953779822592.000
Backward Time Step 3:
Gradient di[0] = 1997816463360.000, df[0] = 1441714012160.000, dc_hat[0] = 782088273920.000
Gradient do_[0] = 105681118035968.000
Backward Time Step 2:
Gradient di[0] = 2542093467648.000, df[0] = 1825837678592.000, dc_hat[0] = 1298783666176.000
Gradient do_[0] = 115166888853504.000
Backward Time Step 1:
Gradient di[0] = 3188158627840.000, df[0] = 2198455451648.000, dc_hat[0] = 1773075824640.000
Gradient do_[0] = 101487854223360.000
Backward Time Step 0:
Gradient di[0] = 3859869335552.000, df[0] = 2746274807808.000, dc_hat[0] = 3621859360768.000
Gradient do_[0] = 59673227034624.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2362579905150976.000, df[0] = -1854840279924736.000, dc_hat[0] = -1067864027037696.000
Gradient do_[0] = -138115058633801728.000
Backward Time Step 3:
Gradient di[0] = -3709584728391680.000, df[0] = -2831960506368000.000, dc_hat[0] = -1532757729607680.000
Gradient do_[0] = -188836277347418112.000
Backward Time Step 2:
Gradient di[0] = -4859602487214080.000, df[0] = -3661553303814144.000, dc_hat[0] = -2677777891328000.000
Gradient do_[0] = -216059291897954304.000
Backward Time Step 1:
Gradient di[0] = -6131314386272256.000, df[0] = -4386735446294528.000, dc_hat[0] = -3711039916998656.000
Gradient do_[0] = -193864481460191232.000
Backward Time Step 0:
Gradient di[0] = -7159238036029440.000, df[0] = -5232689653219328.000, dc_hat[0] = -7237751111942144.000
Gradient do_[0] = -111068017483317248.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1273709068288.000, df[0] = 933890686976.000, dc_hat[0] = 552965767168.000
Gradient do_[0] = 78972863905792.000
Backward Time Step 3:
Gradient di[0] = 1998301036544.000, df[0] = 1442063712256.000, dc_hat[0] = 782277410816.000
Gradient do_[0] = 105706661347328.000
Backward Time Step 2:
Gradient di[0] = 2542712389632.000, df[0] = 1826281488384.000, dc_hat[0] = 1299097583616.000
Gradient do_[0] = 115194839695360.000
Backward Time Step 1:
Gradient di[0] = 3188932476928.000, df[0] = 2198989045760.000, dc_hat[0] = 1773504299008.000
Gradient do_[0] = 101512449622016.000
Backward Time Step 0:
Gradient di[0] = 3860807024640.000, df[0] = 2746942226432.000, dc_hat[0] = 3622739378176.000
Gradient do_[0] = 59687722549248.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2363108991434752.000, df[0] = -1855255683792896.000, dc_hat[0] = -1068103068811264.000
Gradient do_[0] = -138145982398332928.000
Backward Time Step 3:
Gradient di[0] = -3710413925515264.000, df[0] = -2832592671866880.000, dc_hat[0] = -1533099582160896.000
Gradient do_[0] = -188878488286003200.000
Backward Time Step 2:
Gradient di[0] = -4860693408907264.000, df[0] = -3662374984744960.000, dc_hat[0] = -2678376770830336.000
Gradient do_[0] = -216107687589445632.000
Backward Time Step 1:
Gradient di[0] = -6132696828870656.000, df[0] = -4387724094078976.000, dc_hat[0] = -3711871798476800.000
Gradient do_[0] = -193908118327918592.000
Backward Time Step 0:
Gradient di[0] = -7160853480603648.000, df[0] = -5233870232354816.000, dc_hat[0] = -7239383736385536.000
Gradient do_[0] = -111093082912456704.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1274019577856.000, df[0] = 934118031360.000, dc_hat[0] = 553100574720.000
Gradient do_[0] = 78992107372544.000
Backward Time Step 3:
Gradient di[0] = 1998790852608.000, df[0] = 1442417213440.000, dc_hat[0] = 782468710400.000
Gradient do_[0] = 105732565368832.000
Backward Time Step 2:
Gradient di[0] = 2543333408768.000, df[0] = 1826727788544.000, dc_hat[0] = 1299414646784.000
Gradient do_[0] = 115222958309376.000
Backward Time Step 1:
Gradient di[0] = 3189712879616.000, df[0] = 2199527096320.000, dc_hat[0] = 1773936181248.000
Gradient do_[0] = 101537229570048.000
Backward Time Step 0:
Gradient di[0] = 3861752053760.000, df[0] = 2747614625792.000, dc_hat[0] = 3623625949184.000
Gradient do_[0] = 59702339698688.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2363633245880320.000, df[0] = -1855667195346944.000, dc_hat[0] = -1068340097318912.000
Gradient do_[0] = -138176571155415040.000
Backward Time Step 3:
Gradient di[0] = -3711232922091520.000, df[0] = -2833218931785728.000, dc_hat[0] = -1533438079270912.000
Gradient do_[0] = -188920166648643584.000
Backward Time Step 2:
Gradient di[0] = -4861763392634880.000, df[0] = -3663181633290240.000, dc_hat[0] = -2678964912914432.000
Gradient do_[0] = -216155258647216128.000
Backward Time Step 1:
Gradient di[0] = -6134043301117952.000, df[0] = -4388686972059648.000, dc_hat[0] = -3712682741989376.000
Gradient do_[0] = -193950655684018176.000
Backward Time Step 0:
Gradient di[0] = -7162428659859456.000, df[0] = -5235021820461056.000, dc_hat[0] = -7240976632381440.000
Gradient do_[0] = -111117512686436352.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1274457358336.000, df[0] = 934439092224.000, dc_hat[0] = 553290629120.000
Gradient do_[0] = 79019210964992.000
Backward Time Step 3:
Gradient di[0] = 1999475441664.000, df[0] = 1442911092736.000, dc_hat[0] = 782736621568.000
Gradient do_[0] = 105768778989568.000
Backward Time Step 2:
Gradient di[0] = 2544204513280.000, df[0] = 1827353526272.000, dc_hat[0] = 1299859767296.000
Gradient do_[0] = 115262460264448.000
Backward Time Step 1:
Gradient di[0] = 3190806806528.000, df[0] = 2200281284608.000, dc_hat[0] = 1774541864960.000
Gradient do_[0] = 101572033904640.000
Backward Time Step 0:
Gradient di[0] = 3863071948800.000, df[0] = 2748553363456.000, dc_hat[0] = 3624864579584.000
Gradient do_[0] = 59722744987648.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2364167700873216.000, df[0] = -1856086759964672.000, dc_hat[0] = -1068581555011584.000
Gradient do_[0] = -138207744028049408.000
Backward Time Step 3:
Gradient di[0] = -3712073930375168.000, df[0] = -2833860760961024.000, dc_hat[0] = -1533784629444608.000
Gradient do_[0] = -188962996062519296.000
Backward Time Step 2:
Gradient di[0] = -4862867736100864.000, df[0] = -3664013246332928.000, dc_hat[0] = -2679572113915904.000
Gradient do_[0] = -216204204094521344.000
Backward Time Step 1:
Gradient di[0] = -6135434870521856.000, df[0] = -4389683136036864.000, dc_hat[0] = -3713521871224832.000
Gradient do_[0] = -193994601789390848.000
Backward Time Step 0:
Gradient di[0] = -7164047325659136.000, df[0] = -5236205083951104.000, dc_hat[0] = -7242613551792128.000
Gradient do_[0] = -111142629655183360.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1274913751040.000, df[0] = 934773719040.000, dc_hat[0] = 553488744448.000
Gradient do_[0] = 79047505739776.000
Backward Time Step 3:
Gradient di[0] = 2000191881216.000, df[0] = 1443428171776.000, dc_hat[0] = 783016984576.000
Gradient do_[0] = 105806670331904.000
Backward Time Step 2:
Gradient di[0] = 2545114677248.000, df[0] = 1828007182336.000, dc_hat[0] = 1300323631104.000
Gradient do_[0] = 115303656718336.000
Backward Time Step 1:
Gradient di[0] = 3191949492224.000, df[0] = 2201069289472.000, dc_hat[0] = 1775175467008.000
Gradient do_[0] = 101608348188672.000
Backward Time Step 0:
Gradient di[0] = 3864453971968.000, df[0] = 2749536665600.000, dc_hat[0] = 3626161143808.000
Gradient do_[0] = 59744106577920.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2364679875723264.000, df[0] = -1856488742060032.000, dc_hat[0] = -1068812476612608.000
Gradient do_[0] = -138237748669579264.000
Backward Time Step 3:
Gradient di[0] = -3712877089259520.000, df[0] = -2834473599107072.000, dc_hat[0] = -1534115476144128.000
Gradient do_[0] = -189003884151177216.000
Backward Time Step 2:
Gradient di[0] = -4863927519281152.000, df[0] = -3664811841814528.000, dc_hat[0] = -2680155155726336.000
Gradient do_[0] = -216251328475693056.000
Backward Time Step 1:
Gradient di[0] = -6136770068480000.000, df[0] = -4390637424082944.000, dc_hat[0] = -3714324224802816.000
Gradient do_[0] = -194036692468891648.000
Backward Time Step 0:
Gradient di[0] = -7165596198240256.000, df[0] = -5237336807833600.000, dc_hat[0] = -7244178530500608.000
Gradient do_[0] = -111166664292171776.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1275099480064.000, df[0] = 934909837312.000, dc_hat[0] = 553569353728.000
Gradient do_[0] = 79058989744128.000
Backward Time Step 3:
Gradient di[0] = 2000482861056.000, df[0] = 1443638411264.000, dc_hat[0] = 783130755072.000
Gradient do_[0] = 105822038261760.000
Backward Time Step 2:
Gradient di[0] = 2545486659584.000, df[0] = 1828274044928.000, dc_hat[0] = 1300512374784.000
Gradient do_[0] = 115320408768512.000
Backward Time Step 1:
Gradient di[0] = 3192413749248.000, df[0] = 2201389367296.000, dc_hat[0] = 1775431581696.000
Gradient do_[0] = 101623120527360.000
Backward Time Step 0:
Gradient di[0] = 3865018368000.000, df[0] = 2749938532352.000, dc_hat[0] = 3626691198976.000
Gradient do_[0] = 59752834924544.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2365210035748864.000, df[0] = -1856904682799104.000, dc_hat[0] = -1069051451277312.000
Gradient do_[0] = -138268741153587200.000
Backward Time Step 3:
Gradient di[0] = -3713709507608576.000, df[0] = -2835109254266880.000, dc_hat[0] = -1534459341963264.000
Gradient do_[0] = -189046215348846592.000
Backward Time Step 2:
Gradient di[0] = -4865009314168832.000, df[0] = -3665627348729856.000, dc_hat[0] = -2680750008696832.000
Gradient do_[0] = -216299414929539072.000
Backward Time Step 1:
Gradient di[0] = -6138142310531072.000, df[0] = -4391618824110080.000, dc_hat[0] = -3715149932265472.000
Gradient do_[0] = -194080020098973696.000
Backward Time Step 0:
Gradient di[0] = -7167196073558016.000, df[0] = -5238506112679936.000, dc_hat[0] = -7245796659429376.000
Gradient do_[0] = -111191480613208064.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1275472510976.000, df[0] = 935183515648.000, dc_hat[0] = 553731293184.000
Gradient do_[0] = 79082125524992.000
Backward Time Step 3:
Gradient di[0] = 2001068883968.000, df[0] = 1444061118464.000, dc_hat[0] = 783360131072.000
Gradient do_[0] = 105853009002496.000
Backward Time Step 2:
Gradient di[0] = 2546231672832.000, df[0] = 1828809211904.000, dc_hat[0] = 1300892745728.000
Gradient do_[0] = 115354198081536.000
Backward Time Step 1:
Gradient di[0] = 3193348554752.000, df[0] = 2202033717248.000, dc_hat[0] = 1775948791808.000
Gradient do_[0] = 101652832976896.000
Backward Time Step 0:
Gradient di[0] = 3866144800768.000, df[0] = 2750739906560.000, dc_hat[0] = 3627748163584.000
Gradient do_[0] = 59770253869056.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2365740464209920.000, df[0] = -1857321294626816.000, dc_hat[0] = -1069291298357248.000
Gradient do_[0] = -138299673508052992.000
Backward Time Step 3:
Gradient di[0] = -3714541925957632.000, df[0] = -2835744640991232.000, dc_hat[0] = -1534803073564672.000
Gradient do_[0] = -189088598086123520.000
Backward Time Step 2:
Gradient di[0] = -4866099698991104.000, df[0] = -3666447955918848.000, dc_hat[0] = -2681346740715520.000
Gradient do_[0] = -216347827800899584.000
Backward Time Step 1:
Gradient di[0] = -6139512405098496.000, df[0] = -4392599150395392.000, dc_hat[0] = -3715975908163584.000
Gradient do_[0] = -194123330549186560.000
Backward Time Step 0:
Gradient di[0] = -7168798096359424.000, df[0] = -5239677028139008.000, dc_hat[0] = -7247415862099968.000
Gradient do_[0] = -111216331293982720.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1275730984960.000, df[0] = 935372980224.000, dc_hat[0] = 553843490816.000
Gradient do_[0] = 79098147766272.000
Backward Time Step 3:
Gradient di[0] = 2001474945024.000, df[0] = 1444354195456.000, dc_hat[0] = 783518531584.000
Gradient do_[0] = 105874450284544.000
Backward Time Step 2:
Gradient di[0] = 2546746523648.000, df[0] = 1829179228160.000, dc_hat[0] = 1301155676160.000
Gradient do_[0] = 115377493245952.000
Backward Time Step 1:
Gradient di[0] = 3193996050432.000, df[0] = 2202480148480.000, dc_hat[0] = 1776307142656.000
Gradient do_[0] = 101673401843712.000
Backward Time Step 0:
Gradient di[0] = 3866923892736.000, df[0] = 2751294078976.000, dc_hat[0] = 3628478758912.000
Gradient do_[0] = 59782291521536.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2366248612528128.000, df[0] = -1857720458149888.000, dc_hat[0] = -1069521011998720.000
Gradient do_[0] = -138329411861610496.000
Backward Time Step 3:
Gradient di[0] = -3715344816406528.000, df[0] = -2836357747572736.000, dc_hat[0] = -1535134725570560.000
Gradient do_[0] = -189129400275435520.000
Backward Time Step 2:
Gradient di[0] = -4867153576591360.000, df[0] = -3667242793304064.000, dc_hat[0] = -2681926292865024.000
Gradient do_[0] = -216394711663902720.000
Backward Time Step 1:
Gradient di[0] = -6140846529314816.000, df[0] = -4393553438441472.000, dc_hat[0] = -3716778261741568.000
Gradient do_[0] = -194165386868948992.000
Backward Time Step 0:
Gradient di[0] = -7170342673973248.000, df[0] = -5240806067666944.000, dc_hat[0] = -7248977082712064.000
Gradient do_[0] = -111240297211494400.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1276172435456.000, df[0] = 935696596992.000, dc_hat[0] = 554034659328.000
Gradient do_[0] = 79125511405568.000
Backward Time Step 3:
Gradient di[0] = 2002166480896.000, df[0] = 1444853055488.000, dc_hat[0] = 783788736512.000
Gradient do_[0] = 105911016226816.000
Backward Time Step 2:
Gradient di[0] = 2547629686784.000, df[0] = 1829813485568.000, dc_hat[0] = 1301605777408.000
Gradient do_[0] = 115417448185856.000
Backward Time Step 1:
Gradient di[0] = 3195102035968.000, df[0] = 2203242463232.000, dc_hat[0] = 1776919904256.000
Gradient do_[0] = 101708575277056.000
Backward Time Step 0:
Gradient di[0] = 3868260827136.000, df[0] = 2752245399552.000, dc_hat[0] = 3629733380096.000
Gradient do_[0] = 59802965245952.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2366781456908288.000, df[0] = -1858138546372608.000, dc_hat[0] = -1069761395949568.000
Gradient do_[0] = -138360550374506496.000
Backward Time Step 3:
Gradient di[0] = -3716181798158336.000, df[0] = -2836996355522560.000, dc_hat[0] = -1535480202002432.000
Gradient do_[0] = -189171989171142656.000
Backward Time Step 2:
Gradient di[0] = -4868252551348224.000, df[0] = -3668070111379456.000, dc_hat[0] = -2682529735770112.000
Gradient do_[0] = -216443502492385280.000
Backward Time Step 1:
Gradient di[0] = -6142224676945920.000, df[0] = -4394539401871360.000, dc_hat[0] = -3717608801042432.000
Gradient do_[0] = -194208886297722880.000
Backward Time Step 0:
Gradient di[0] = -7171955971063808.000, df[0] = -5241985036189696.000, dc_hat[0] = -7250608096542720.000
Gradient do_[0] = -111265311101026304.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1276449783808.000, df[0] = 935900020736.000, dc_hat[0] = 554155048960.000
Gradient do_[0] = 79142666108928.000
Backward Time Step 3:
Gradient di[0] = 2002597969920.000, df[0] = 1445164482560.000, dc_hat[0] = 783957622784.000
Gradient do_[0] = 105933841629184.000
Backward Time Step 2:
Gradient di[0] = 2548177043456.000, df[0] = 1830206701568.000, dc_hat[0] = 1301884698624.000
Gradient do_[0] = 115442244911104.000
Backward Time Step 1:
Gradient di[0] = 3195791474688.000, df[0] = 2203717730304.000, dc_hat[0] = 1777301454848.000
Gradient do_[0] = 101730519875584.000
Backward Time Step 0:
Gradient di[0] = 3869096542208.000, df[0] = 2752839680000.000, dc_hat[0] = 3630517452800.000
Gradient do_[0] = 59815879507968.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2367296584548352.000, df[0] = -1858543212822528.000, dc_hat[0] = -1069994330816512.000
Gradient do_[0] = -138390658095251456.000
Backward Time Step 3:
Gradient di[0] = -3716986299219968.000, df[0] = -2837611072716800.000, dc_hat[0] = -1535811719790592.000
Gradient do_[0] = -189212963159146496.000
Backward Time Step 2:
Gradient di[0] = -4869303207723008.000, df[0] = -3668861459103744.000, dc_hat[0] = -2683108214177792.000
Gradient do_[0] = -216490231736565760.000
Backward Time Step 1:
Gradient di[0] = -6143572222935040.000, df[0] = -4395502816722944.000, dc_hat[0] = -3718420012990464.000
Gradient do_[0] = -194251440833691648.000
Backward Time Step 0:
Gradient di[0] = -7173525781610496.000, df[0] = -5243132866199552.000, dc_hat[0] = -7252195623829504.000
Gradient do_[0] = -111289672155529216.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1276820979712.000, df[0] = 936172126208.000, dc_hat[0] = 554316267520.000
Gradient do_[0] = 79165676060672.000
Backward Time Step 3:
Gradient di[0] = 2003180978176.000, df[0] = 1445585485824.000, dc_hat[0] = 784185360384.000
Gradient do_[0] = 105964694929408.000
Backward Time Step 2:
Gradient di[0] = 2548919435264.000, df[0] = 1830739247104.000, dc_hat[0] = 1302262054912.000
Gradient do_[0] = 115475824508928.000
Backward Time Step 1:
Gradient di[0] = 3196721823744.000, df[0] = 2204359458816.000, dc_hat[0] = 1777816174592.000
Gradient do_[0] = 101760072941568.000
Backward Time Step 0:
Gradient di[0] = 3870226907136.000, df[0] = 2753643937792.000, dc_hat[0] = 3631578087424.000
Gradient do_[0] = 59833357172736.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2367818154639360.000, df[0] = -1858952442675200.000, dc_hat[0] = -1070229815820288.000
Gradient do_[0] = -138421118003314688.000
Backward Time Step 3:
Gradient di[0] = -3717806101102592.000, df[0] = -2838236527329280.000, dc_hat[0] = -1536149545811968.000
Gradient do_[0] = -189254675881525248.000
Backward Time Step 2:
Gradient di[0] = -4870374802063360.000, df[0] = -3669668912955392.000, dc_hat[0] = -2683696893132800.000
Gradient do_[0] = -216537837154074624.000
Backward Time Step 1:
Gradient di[0] = -6144914937085952.000, df[0] = -4396463547219968.000, dc_hat[0] = -3719229077454848.000
Gradient do_[0] = -194293892290445312.000
Backward Time Step 0:
Gradient di[0] = -7175083780997120.000, df[0] = -5244271569403904.000, dc_hat[0] = -7253770803085312.000
Gradient do_[0] = -111313852821405696.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1276939206656.000, df[0] = 936258895872.000, dc_hat[0] = 554367516672.000
Gradient do_[0] = 79173007704064.000
Backward Time Step 3:
Gradient di[0] = 2003368017920.000, df[0] = 1445720358912.000, dc_hat[0] = 784258564096.000
Gradient do_[0] = 105974568321024.000
Backward Time Step 2:
Gradient di[0] = 2549160083456.000, df[0] = 1830912655360.000, dc_hat[0] = 1302385655808.000
Gradient do_[0] = 115486746476544.000
Backward Time Step 1:
Gradient di[0] = 3197024337920.000, df[0] = 2204567863296.000, dc_hat[0] = 1777982636032.000
Gradient do_[0] = 101769652731904.000
Backward Time Step 0:
Gradient di[0] = 3870585782272.000, df[0] = 2753899528192.000, dc_hat[0] = 3631914942464.000
Gradient do_[0] = 59838902042624.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2368346167181312.000, df[0] = -1859367175454720.000, dc_hat[0] = -1070468522049536.000
Gradient do_[0] = -138451904328892416.000
Backward Time Step 3:
Gradient di[0] = -3718634224484352.000, df[0] = -2838869229699072.000, dc_hat[0] = -1536491398365184.000
Gradient do_[0] = -189296663481810944.000
Backward Time Step 2:
Gradient di[0] = -4871464650014720.000, df[0] = -3670490862321664.000, dc_hat[0] = -2684297114812416.000
Gradient do_[0] = -216586232845565952.000
Backward Time Step 1:
Gradient di[0] = -6146295232200704.000, df[0] = -4397450047520768.000, dc_hat[0] = -3720059348320256.000
Gradient do_[0] = -194337494798434304.000
Backward Time Step 0:
Gradient di[0] = -7176694393733120.000, df[0] = -5245448390443008.000, dc_hat[0] = -7255398595690496.000
Gradient do_[0] = -111338832351199232.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1277371351040.000, df[0] = 936575762432.000, dc_hat[0] = 554555211776.000
Gradient do_[0] = 79199775752192.000
Backward Time Step 3:
Gradient di[0] = 2004045398016.000, df[0] = 1446209126400.000, dc_hat[0] = 784523526144.000
Gradient do_[0] = 106010320568320.000
Backward Time Step 2:
Gradient di[0] = 2550018080768.000, df[0] = 1831528955904.000, dc_hat[0] = 1302823436288.000
Gradient do_[0] = 115525577342976.000
Backward Time Step 1:
Gradient di[0] = 3198100701184.000, df[0] = 2205309992960.000, dc_hat[0] = 1778579013632.000
Gradient do_[0] = 101803895029760.000
Backward Time Step 0:
Gradient di[0] = 3871888637952.000, df[0] = 2754826469376.000, dc_hat[0] = 3633137844224.000
Gradient do_[0] = 59859051479040.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2368848141484032.000, df[0] = -1859761238704128.000, dc_hat[0] = -1070695618445312.000
Gradient do_[0] = -138481247545458688.000
Backward Time Step 3:
Gradient di[0] = -3719420740370432.000, df[0] = -2839469182943232.000, dc_hat[0] = -1536815399960576.000
Gradient do_[0] = -189336658217271296.000
Backward Time Step 2:
Gradient di[0] = -4872492220940288.000, df[0] = -3671263956434944.000, dc_hat[0] = -2684860023963648.000
Gradient do_[0] = -216631793858641920.000
Backward Time Step 1:
Gradient di[0] = -6147584796131328.000, df[0] = -4398372928618496.000, dc_hat[0] = -3720834858352640.000
Gradient do_[0] = -194378193908531200.000
Backward Time Step 0:
Gradient di[0] = -7178208906575872.000, df[0] = -5246555418263552.000, dc_hat[0] = -7256929214660608.000
Gradient do_[0] = -111362317232373760.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1277719740416.000, df[0] = 936830959616.000, dc_hat[0] = 554706337792.000
Gradient do_[0] = 79221376417792.000
Backward Time Step 3:
Gradient di[0] = 2004594720768.000, df[0] = 1446605488128.000, dc_hat[0] = 784738484224.000
Gradient do_[0] = 106039353540608.000
Backward Time Step 2:
Gradient di[0] = 2550718005248.000, df[0] = 1832031617024.000, dc_hat[0] = 1303179689984.000
Gradient do_[0] = 115557244338176.000
Backward Time Step 1:
Gradient di[0] = 3198977310720.000, df[0] = 2205914234880.000, dc_hat[0] = 1779063717888.000
Gradient do_[0] = 101831745208320.000
Backward Time Step 0:
Gradient di[0] = 3872946651136.000, df[0] = 2755579084800.000, dc_hat[0] = 3634130321408.000
Gradient do_[0] = 59875400876032.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2369368906268672.000, df[0] = -1860169931685888.000, dc_hat[0] = -1070930767904768.000
Gradient do_[0] = -138511707453521920.000
Backward Time Step 3:
Gradient di[0] = -3720243226607616.000, df[0] = -2840097053474816.000, dc_hat[0] = -1537154031288320.000
Gradient do_[0] = -189378594277949440.000
Backward Time Step 2:
Gradient di[0] = -4873572942086144.000, df[0] = -3672078121172992.000, dc_hat[0] = -2685454608498688.000
Gradient do_[0] = -216679674154057728.000
Backward Time Step 1:
Gradient di[0] = -6148942005796864.000, df[0] = -4399343859662848.000, dc_hat[0] = -3721652781187072.000
Gradient do_[0] = -194421023322406912.000
Backward Time Step 0:
Gradient di[0] = -7179783548960768.000, df[0] = -5247706469498880.000, dc_hat[0] = -7258521573785600.000
Gradient do_[0] = -111386755596288000.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1278110334976.000, df[0] = 937117351936.000, dc_hat[0] = 554875355136.000
Gradient do_[0] = 79245594329088.000
Backward Time Step 3:
Gradient di[0] = 2005206433792.000, df[0] = 1447047069696.000, dc_hat[0] = 784977494016.000
Gradient do_[0] = 106071716790272.000
Backward Time Step 2:
Gradient di[0] = 2551498407936.000, df[0] = 1832591687680.000, dc_hat[0] = 1303577493504.000
Gradient do_[0] = 115592526823424.000
Backward Time Step 1:
Gradient di[0] = 3199954583552.000, df[0] = 2206588207104.000, dc_hat[0] = 1779605569536.000
Gradient do_[0] = 101862841778176.000
Backward Time Step 0:
Gradient di[0] = 3874129182720.000, df[0] = 2756420567040.000, dc_hat[0] = 3635239714816.000
Gradient do_[0] = 59893688041472.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2369887255134208.000, df[0] = -1860576611401728.000, dc_hat[0] = -1071163904098304.000
Gradient do_[0] = -138541978383024128.000
Backward Time Step 3:
Gradient di[0] = -3721056317603840.000, df[0] = -2840717944684544.000, dc_hat[0] = -1537489978261504.000
Gradient do_[0] = -189419963402944512.000
Backward Time Step 2:
Gradient di[0] = -4874638630846464.000, df[0] = -3672881816928256.000, dc_hat[0] = -2686040871534592.000
Gradient do_[0] = -216727056233267200.000
Backward Time Step 1:
Gradient di[0] = -6150288478044160.000, df[0] = -4400307274514432.000, dc_hat[0] = -3722463456264192.000
Gradient do_[0] = -194463577858375680.000
Backward Time Step 0:
Gradient di[0] = -7181347990798336.000, df[0] = -5248850004541440.000, dc_hat[0] = -7260102658621440.000
Gradient do_[0] = -111411030751444992.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1278472224768.000, df[0] = 937382707200.000, dc_hat[0] = 555032772608.000
Gradient do_[0] = 79268033855488.000
Backward Time Step 3:
Gradient di[0] = 2005773320192.000, df[0] = 1447456145408.000, dc_hat[0] = 785199398912.000
Gradient do_[0] = 106101697675264.000
Backward Time Step 2:
Gradient di[0] = 2552219566080.000, df[0] = 1833109553152.000, dc_hat[0] = 1303945281536.000
Gradient do_[0] = 115625183674368.000
Backward Time Step 1:
Gradient di[0] = 3200861339648.000, df[0] = 2207213420544.000, dc_hat[0] = 1780107182080.000
Gradient do_[0] = 101891665035264.000
Backward Time Step 0:
Gradient di[0] = 3875221012480.000, df[0] = 2757197561856.000, dc_hat[0] = 3636264435712.000
Gradient do_[0] = 59910570115072.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2370400772161536.000, df[0] = -1860980069892096.000, dc_hat[0] = -1071396033658880.000
Gradient do_[0] = -138571983024553984.000
Backward Time Step 3:
Gradient di[0] = -3721864308326400.000, df[0] = -2841334809362432.000, dc_hat[0] = -1537823643533312.000
Gradient do_[0] = -189461006110425088.000
Backward Time Step 2:
Gradient di[0] = -4875693045317632.000, df[0] = -3673676385878016.000, dc_hat[0] = -2686620423684096.000
Gradient do_[0] = -216773957276139520.000
Backward Time Step 1:
Gradient di[0] = -6151625286615040.000, df[0] = -4401263710044160.000, dc_hat[0] = -3723268494196736.000
Gradient do_[0] = -194505702897614848.000
Backward Time Step 0:
Gradient di[0] = -7182915116990464.000, df[0] = -5249995150196736.000, dc_hat[0] = -7261687501553664.000
Gradient do_[0] = -111435340266340352.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1278737776640.000, df[0] = 937577480192.000, dc_hat[0] = 555148378112.000
Gradient do_[0] = 79284509081600.000
Backward Time Step 3:
Gradient di[0] = 2006191046656.000, df[0] = 1447757611008.000, dc_hat[0] = 785362386944.000
Gradient do_[0] = 106123793268736.000
Backward Time Step 2:
Gradient di[0] = 2552752242688.000, df[0] = 1833492283392.000, dc_hat[0] = 1304216469504.000
Gradient do_[0] = 115649326088192.000
Backward Time Step 1:
Gradient di[0] = 3201528233984.000, df[0] = 2207673221120.000, dc_hat[0] = 1780475756544.000
Gradient do_[0] = 101912879824896.000
Backward Time Step 0:
Gradient di[0] = 3876025532416.000, df[0] = 2757769822208.000, dc_hat[0] = 3637019410432.000
Gradient do_[0] = 59923006226432.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2370925026607104.000, df[0] = -1861391715663872.000, dc_hat[0] = -1071632793731072.000
Gradient do_[0] = -138602606141374464.000
Backward Time Step 3:
Gradient di[0] = -3722686794563584.000, df[0] = -2841962411458560.000, dc_hat[0] = -1538162677514240.000
Gradient do_[0] = -189502890631495680.000
Backward Time Step 2:
Gradient di[0] = -4876772692721664.000, df[0] = -3674488940003328.000, dc_hat[0] = -2687212860735488.000
Gradient do_[0] = -216821906291032064.000
Backward Time Step 1:
Gradient di[0] = -6152990012473344.000, df[0] = -4402238936055808.000, dc_hat[0] = -3724089101385728.000
Gradient do_[0] = -194548790009528320.000
Backward Time Step 0:
Gradient di[0] = -7184504254889984.000, df[0] = -5251156938850304.000, dc_hat[0] = -7263293819322368.000
Gradient do_[0] = -111459993378619392.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1279085641728.000, df[0] = 937832415232.000, dc_hat[0] = 555298717696.000
Gradient do_[0] = 79306067804160.000
Backward Time Step 3:
Gradient di[0] = 2006736437248.000, df[0] = 1448151089152.000, dc_hat[0] = 785575378944.000
Gradient do_[0] = 106152633303040.000
Backward Time Step 2:
Gradient di[0] = 2553445351424.000, df[0] = 1833989963776.000, dc_hat[0] = 1304569708544.000
Gradient do_[0] = 115680707870720.000
Backward Time Step 1:
Gradient di[0] = 3202396454912.000, df[0] = 2208271695872.000, dc_hat[0] = 1780956528640.000
Gradient do_[0] = 101940453179392.000
Backward Time Step 0:
Gradient di[0] = 3877079613440.000, df[0] = 2758519816192.000, dc_hat[0] = 3638008217600.000
Gradient do_[0] = 59939301097472.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2371436396150784.000, df[0] = -1861792892452864.000, dc_hat[0] = -1071863312678912.000
Gradient do_[0] = -138632473343950848.000
Backward Time Step 3:
Gradient di[0] = -3723488074399744.000, df[0] = -2842574712733696.000, dc_hat[0] = -1538493121560576.000
Gradient do_[0] = -189543641281200128.000
Backward Time Step 2:
Gradient di[0] = -4877826570321920.000, df[0] = -3675283240517632.000, dc_hat[0] = -2687792144449536.000
Gradient do_[0] = -216868704254689280.000
Backward Time Step 1:
Gradient di[0] = -6154321989206016.000, df[0] = -4403191881924608.000, dc_hat[0] = -3724890918092800.000
Gradient do_[0] = -194590880689029120.000
Backward Time Step 0:
Gradient di[0] = -7186057959309312.000, df[0] = -5252292420829184.000, dc_hat[0] = -7264864166739968.000
Gradient do_[0] = -111484088145149952.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1279297847296.000, df[0] = 937987997696.000, dc_hat[0] = 555391123456.000
Gradient do_[0] = 79319221141504.000
Backward Time Step 3:
Gradient di[0] = 2007070146560.000, df[0] = 1448391868416.000, dc_hat[0] = 785706057728.000
Gradient do_[0] = 106170299711488.000
Backward Time Step 2:
Gradient di[0] = 2553870024704.000, df[0] = 1834295099392.000, dc_hat[0] = 1304786108416.000
Gradient do_[0] = 115699959726080.000
Backward Time Step 1:
Gradient di[0] = 3202932277248.000, df[0] = 2208640532480.000, dc_hat[0] = 1781252227072.000
Gradient do_[0] = 101957498830848.000
Backward Time Step 0:
Gradient di[0] = 3877726322688.000, df[0] = 2758979878912.000, dc_hat[0] = 3638615343104.000
Gradient do_[0] = 59949300318208.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2371929780518912.000, df[0] = -1862180513251328.000, dc_hat[0] = -1072086718087168.000
Gradient do_[0] = -138661318344310784.000
Backward Time Step 3:
Gradient di[0] = -3724261705383936.000, df[0] = -2843165270736896.000, dc_hat[0] = -1538812828188672.000
Gradient do_[0] = -189582914462154752.000
Backward Time Step 2:
Gradient di[0] = -4878834813894656.000, df[0] = -3676043181293568.000, dc_hat[0] = -2688345658359808.000
Gradient do_[0] = -216913492173651968.000
Backward Time Step 1:
Gradient di[0] = -6155593299525632.000, df[0] = -4404101609684992.000, dc_hat[0] = -3725657032884224.000
Gradient do_[0] = -194631012863442944.000
Backward Time Step 0:
Gradient di[0] = -7187540796768256.000, df[0] = -5253376363200512.000, dc_hat[0] = -7266363647197184.000
Gradient do_[0] = -111507100579921920.000
Epoch 900, Train Loss=0.011198, Weight Norm=13.137237
Sample Predictions at Epoch 900:
-------------------------------------------------------------
| Day | Date | Predicted Close | Actual Close | Error |
-------------------------------------------------------------
| 192 | 2024-10-11 | 57.49 | 63.87 | 6.38 |
| 193 | 2024-10-14 | 56.87 | 66.55 | 9.68 |
| 194 | 2024-10-15 | 57.05 | 66.00 | 8.95 |
| 195 | 2024-10-16 | 58.02 | 67.20 | 9.18 |
| 196 | 2024-10-17 | 57.54 | 66.76 | 9.22 |
-------------------------------------------------------------
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1279672713216.000, df[0] = 938262790144.000, dc_hat[0] = 555553587200.000
Gradient do_[0] = 79342432419840.000
Backward Time Step 3:
Gradient di[0] = 2007656431616.000, df[0] = 1448814706688.000, dc_hat[0] = 785934778368.000
Gradient do_[0] = 106201245286400.000
Backward Time Step 2:
Gradient di[0] = 2554616348672.000, df[0] = 1834830790656.000, dc_hat[0] = 1305166086144.000
Gradient do_[0] = 115733681930240.000
Backward Time Step 1:
Gradient di[0] = 3203867607040.000, df[0] = 2209285931008.000, dc_hat[0] = 1781770747904.000
Gradient do_[0] = 101987219668992.000
Backward Time Step 0:
Gradient di[0] = 3878860619776.000, df[0] = 2759786758144.000, dc_hat[0] = 3639679647744.000
Gradient do_[0] = 59966832508928.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2372427728289792.000, df[0] = -1862571086839808.000, dc_hat[0] = -1072311264346112.000
Gradient do_[0] = -138690446812512256.000
Backward Time Step 3:
Gradient di[0] = -3725043389431808.000, df[0] = -2843761465884672.000, dc_hat[0] = -1539135621824512.000
Gradient do_[0] = -189622823298269184.000
Backward Time Step 2:
Gradient di[0] = -4879867753529344.000, df[0] = -3676820033503232.000, dc_hat[0] = -2688911520301056.000
Gradient do_[0] = -216959413963980800.000
Backward Time Step 1:
Gradient di[0] = -6156894137745408.000, df[0] = -4405031738540032.000, dc_hat[0] = -3726439253803008.000
Gradient do_[0] = -194672089930661888.000
Backward Time Step 0:
Gradient di[0] = -7189048330289152.000, df[0] = -5254478022311936.000, dc_hat[0] = -7267888360587264.000
Gradient do_[0] = -111530482381881344.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1280000000000.000, df[0] = 938502848512.000, dc_hat[0] = 555695734784.000
Gradient do_[0] = 79362724462592.000
Backward Time Step 3:
Gradient di[0] = 2008169971712.000, df[0] = 1449185378304.000, dc_hat[0] = 786135711744.000
Gradient do_[0] = 106228390821888.000
Backward Time Step 2:
Gradient di[0] = 2555269873664.000, df[0] = 1835300421632.000, dc_hat[0] = 1305499926528.000
Gradient do_[0] = 115763285327872.000
Backward Time Step 1:
Gradient di[0] = 3204688117760.000, df[0] = 2209851899904.000, dc_hat[0] = 1782225305600.000
Gradient do_[0] = 102013316628480.000
Backward Time Step 0:
Gradient di[0] = 3879848902656.000, df[0] = 2760490090496.000, dc_hat[0] = 3640606851072.000
Gradient do_[0] = 59982116552704.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2372963525459968.000, df[0] = -1862992127852544.000, dc_hat[0] = -1072553393127424.000
Gradient do_[0] = -138721731354296320.000
Backward Time Step 3:
Gradient di[0] = -3725890840166400.000, df[0] = -2844408395333632.000, dc_hat[0] = -1539484051046400.000
Gradient do_[0] = -189665841690705920.000
Backward Time Step 2:
Gradient di[0] = -4880970486382592.000, df[0] = -3677651109675008.000, dc_hat[0] = -2689517110689792.000
Gradient do_[0] = -217008290691809280.000
Backward Time Step 1:
Gradient di[0] = -6158284633407488.000, df[0] = -4406026828775424.000, dc_hat[0] = -3727277040861184.000
Gradient do_[0] = -194715932956819456.000
Backward Time Step 0:
Gradient di[0] = -7190673975410688.000, df[0] = -5255666117640192.000, dc_hat[0] = -7269531185577984.000
Gradient do_[0] = -111555711019778048.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1280346161152.000, df[0] = 938756603904.000, dc_hat[0] = 555845877760.000
Gradient do_[0] = 79384182521856.000
Backward Time Step 3:
Gradient di[0] = 2008714706944.000, df[0] = 1449578463232.000, dc_hat[0] = 786348900352.000
Gradient do_[0] = 106257197301760.000
Backward Time Step 2:
Gradient di[0] = 2555964555264.000, df[0] = 1835799281664.000, dc_hat[0] = 1305853952000.000
Gradient do_[0] = 115794742607872.000
Backward Time Step 1:
Gradient di[0] = 3205558697984.000, df[0] = 2210451947520.000, dc_hat[0] = 1782706339840.000
Gradient do_[0] = 102040982257664.000
Backward Time Step 0:
Gradient di[0] = 3880899051520.000, df[0] = 2761237463040.000, dc_hat[0] = 3641592250368.000
Gradient do_[0] = 59998352703488.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2373470063165440.000, df[0] = -1863389546545152.000, dc_hat[0] = -1072782301462528.000
Gradient do_[0] = -138751323678965760.000
Backward Time Step 3:
Gradient di[0] = -3726679771971584.000, df[0] = -2845010764496896.000, dc_hat[0] = -1539809260601344.000
Gradient do_[0] = -189706008224858112.000
Backward Time Step 2:
Gradient di[0] = -4882005036630016.000, df[0] = -3678430914674688.000, dc_hat[0] = -2690085656985600.000
Gradient do_[0] = -217054298381484032.000
Backward Time Step 1:
Gradient di[0] = -6159588155981824.000, df[0] = -4406959105114112.000, dc_hat[0] = -3728061677699072.000
Gradient do_[0] = -194757130283122688.000
Backward Time Step 0:
Gradient di[0] = -7192191709478912.000, df[0] = -5256775829815296.000, dc_hat[0] = -7271065562644480.000
Gradient do_[0] = -111579256030494720.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1280724041728.000, df[0] = 939033690112.000, dc_hat[0] = 556009848832.000
Gradient do_[0] = 79407611904000.000
Backward Time Step 3:
Gradient di[0] = 2009308594176.000, df[0] = 1450007068672.000, dc_hat[0] = 786580963328.000
Gradient do_[0] = 106288587472896.000
Backward Time Step 2:
Gradient di[0] = 2556717694976.000, df[0] = 1836339822592.000, dc_hat[0] = 1306238255104.000
Gradient do_[0] = 115828850688000.000
Backward Time Step 1:
Gradient di[0] = 3206499270656.000, df[0] = 2211100491776.000, dc_hat[0] = 1783227744256.000
Gradient do_[0] = 102070912811008.000
Backward Time Step 0:
Gradient di[0] = 3882035445760.000, df[0] = 2762045915136.000, dc_hat[0] = 3642658652160.000
Gradient do_[0] = 60015914254336.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2374001296932864.000, df[0] = -1863806829461504.000, dc_hat[0] = -1073022416977920.000
Gradient do_[0] = -138782290393169920.000
Backward Time Step 3:
Gradient di[0] = -3727513532497920.000, df[0] = -2845646956527616.000, dc_hat[0] = -1540153126420480.000
Gradient do_[0] = -189748390962135040.000
Backward Time Step 2:
Gradient di[0] = -4883098642677760.000, df[0] = -3679254743089152.000, dc_hat[0] = -2690687757713408.000
Gradient do_[0] = -217102762792452096.000
Backward Time Step 1:
Gradient di[0] = -6160972209192960.000, df[0] = -4407949095075840.000, dc_hat[0] = -3728894096048128.000
Gradient do_[0] = -194800801510588416.000
Backward Time Step 0:
Gradient di[0] = -7193809838407680.000, df[0] = -5257958019563520.000, dc_hat[0] = -7272701408313344.000
Gradient do_[0] = -111604347229437952.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1281062207488.000, df[0] = 939281547264.000, dc_hat[0] = 556156059648.000
Gradient do_[0] = 79428575035392.000
Backward Time Step 3:
Gradient di[0] = 2009836421120.000, df[0] = 1450387963904.000, dc_hat[0] = 786787598336.000
Gradient do_[0] = 106316513148928.000
Backward Time Step 2:
Gradient di[0] = 2557390094336.000, df[0] = 1836823085056.000, dc_hat[0] = 1306581270528.000
Gradient do_[0] = 115859309723648.000
Backward Time Step 1:
Gradient di[0] = 3207349403648.000, df[0] = 2211686383616.000, dc_hat[0] = 1783697637376.000
Gradient do_[0] = 102097932517376.000
Backward Time Step 0:
Gradient di[0] = 3883062001664.000, df[0] = 2762776248320.000, dc_hat[0] = 3643622031360.000
Gradient do_[0] = 60031793889280.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2374497902526464.000, df[0] = -1864196731961344.000, dc_hat[0] = -1073246963236864.000
Gradient do_[0] = -138811418861371392.000
Backward Time Step 3:
Gradient di[0] = -3728298169335808.000, df[0] = -2846246372900864.000, dc_hat[0] = -1540477262233600.000
Gradient do_[0] = -189788334157987840.000
Backward Time Step 2:
Gradient di[0] = -4884124066119680.000, df[0] = -3680027568766976.000, dc_hat[0] = -2691251203735552.000
Gradient do_[0] = -217148392525004800.000
Backward Time Step 1:
Gradient di[0] = -6162268752445440.000, df[0] = -4408876002705408.000, dc_hat[0] = -3729675243225088.000
Gradient do_[0] = -194841723958984704.000
Backward Time Step 0:
Gradient di[0] = -7195321130024960.000, df[0] = -5259062363029504.000, dc_hat[0] = -7274228806057984.000
Gradient do_[0] = -111627797750874112.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1281330905088.000, df[0] = 939478614016.000, dc_hat[0] = 556273238016.000
Gradient do_[0] = 79445226422272.000
Backward Time Step 3:
Gradient di[0] = 2010260439040.000, df[0] = 1450694148096.000, dc_hat[0] = 786953601024.000
Gradient do_[0] = 106338919120896.000
Backward Time Step 2:
Gradient di[0] = 2557929848832.000, df[0] = 1837210402816.000, dc_hat[0] = 1306855342080.000
Gradient do_[0] = 115883703795712.000
Backward Time Step 1:
Gradient di[0] = 3208023900160.000, df[0] = 2212151427072.000, dc_hat[0] = 1784070668288.000
Gradient do_[0] = 102119382188032.000
Backward Time Step 0:
Gradient di[0] = 3883880939520.000, df[0] = 2763358994432.000, dc_hat[0] = 3644390375424.000
Gradient do_[0] = 60044448104448.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2375032089083904.000, df[0] = -1864616162361344.000, dc_hat[0] = -1073488420929536.000
Gradient do_[0] = -138842600323940352.000
Backward Time Step 3:
Gradient di[0] = -3729133540474880.000, df[0] = -2846883907108864.000, dc_hat[0] = -1540821396488192.000
Gradient do_[0] = -189830751255003136.000
Backward Time Step 2:
Gradient di[0] = -4885216061554688.000, df[0] = -3680849518133248.000, dc_hat[0] = -2691851156979712.000
Gradient do_[0] = -217196856935972864.000
Backward Time Step 1:
Gradient di[0] = -6163651731914752.000, df[0] = -4409864650489856.000, dc_hat[0] = -3730507124703232.000
Gradient do_[0] = -194885378006581248.000
Backward Time Step 0:
Gradient di[0] = -7196930669019136.000, df[0] = -5260239184068608.000, dc_hat[0] = -7275856061792256.000
Gradient do_[0] = -111652768690733056.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1281652424704.000, df[0] = 939714347008.000, dc_hat[0] = 556412174336.000
Gradient do_[0] = 79465115811840.000
Backward Time Step 3:
Gradient di[0] = 2010763100160.000, df[0] = 1451056824320.000, dc_hat[0] = 787150340096.000
Gradient do_[0] = 106365494231040.000
Backward Time Step 2:
Gradient di[0] = 2558569218048.000, df[0] = 1837669810176.000, dc_hat[0] = 1307181580288.000
Gradient do_[0] = 115912636104704.000
Backward Time Step 1:
Gradient di[0] = 3208827633664.000, df[0] = 2212705599488.000, dc_hat[0] = 1784515657728.000
Gradient do_[0] = 102144883556352.000
Backward Time Step 0:
Gradient di[0] = 3884846153728.000, df[0] = 2764045811712.000, dc_hat[0] = 3645296082944.000
Gradient do_[0] = 60059367243776.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2375539968966656.000, df[0] = -1865015191666688.000, dc_hat[0] = -1073717933244416.000
Gradient do_[0] = -138872287137890304.000
Backward Time Step 3:
Gradient di[0] = -3729930793779200.000, df[0] = -2847492987158528.000, dc_hat[0] = -1541150901010432.000
Gradient do_[0] = -189871347285884928.000
Backward Time Step 2:
Gradient di[0] = -4886261886091264.000, df[0] = -3681638181502976.000, dc_hat[0] = -2692425608855552.000
Gradient do_[0] = -217243328482115584.000
Backward Time Step 1:
Gradient di[0] = -6164956328230912.000, df[0] = -4410799074312192.000, dc_hat[0] = -3731293103718400.000
Gradient do_[0] = -194926592512753664.000
Backward Time Step 0:
Gradient di[0] = -7198460214247424.000, df[0] = -5261356949307392.000, dc_hat[0] = -7277402786889728.000
Gradient do_[0] = -111676502680010752.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1281944584192.000, df[0] = 939928453120.000, dc_hat[0] = 556539052032.000
Gradient do_[0] = 79483218427904.000
Backward Time Step 3:
Gradient di[0] = 2011221196800.000, df[0] = 1451387125760.000, dc_hat[0] = 787329056768.000
Gradient do_[0] = 106389712142336.000
Backward Time Step 2:
Gradient di[0] = 2559152750592.000, df[0] = 1838088585216.000, dc_hat[0] = 1307478458368.000
Gradient do_[0] = 115939018276864.000
Backward Time Step 1:
Gradient di[0] = 3209559277568.000, df[0] = 2213209702400.000, dc_hat[0] = 1784920801280.000
Gradient do_[0] = 102168136777728.000
Backward Time Step 0:
Gradient di[0] = 3885738229760.000, df[0] = 2764680462336.000, dc_hat[0] = 3646133370880.000
Gradient do_[0] = 60073162309632.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2376043017011200.000, df[0] = -1865409926004736.000, dc_hat[0] = -1073944157224960.000
Gradient do_[0] = -138901664714194944.000
Backward Time Step 3:
Gradient di[0] = -3730721336197120.000, df[0] = -2848096430063616.000, dc_hat[0] = -1541477452742656.000
Gradient do_[0] = -189911599719383040.000
Backward Time Step 2:
Gradient di[0] = -4887296973209600.000, df[0] = -3682417181196288.000, dc_hat[0] = -2692994423586816.000
Gradient do_[0] = -217289336171790336.000
Backward Time Step 1:
Gradient di[0] = -6166277030674432.000, df[0] = -4411743430246400.000, dc_hat[0] = -3732088477974528.000
Gradient do_[0] = -194968305235132416.000
Backward Time Step 0:
Gradient di[0] = -7199991906959360.000, df[0] = -5262476325158912.000, dc_hat[0] = -7278951659470848.000
Gradient do_[0] = -111700253849157632.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1282231631872.000, df[0] = 940139020288.000, dc_hat[0] = 556663701504.000
Gradient do_[0] = 79501077774336.000
Backward Time Step 3:
Gradient di[0] = 2011671429120.000, df[0] = 1451712184320.000, dc_hat[0] = 787505348608.000
Gradient do_[0] = 106413535789056.000
Backward Time Step 2:
Gradient di[0] = 2559729205248.000, df[0] = 1838502510592.000, dc_hat[0] = 1307772321792.000
Gradient do_[0] = 115965157179392.000
Backward Time Step 1:
Gradient di[0] = 3210280435712.000, df[0] = 2213706989568.000, dc_hat[0] = 1785319522304.000
Gradient do_[0] = 102191079620608.000
Backward Time Step 0:
Gradient di[0] = 3886603829248.000, df[0] = 2765296238592.000, dc_hat[0] = 3646945230848.000
Gradient do_[0] = 60086542139392.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2376558949957632.000, df[0] = -1865814458236928.000, dc_hat[0] = -1074177293418496.000
Gradient do_[0] = -138931781024874496.000
Backward Time Step 3:
Gradient di[0] = -3731528253177856.000, df[0] = -2848711952564224.000, dc_hat[0] = -1541809775837184.000
Gradient do_[0] = -189952573707386880.000
Backward Time Step 2:
Gradient di[0] = -4888359977615360.000, df[0] = -3683217924161536.000, dc_hat[0] = -2693578807574528.000
Gradient do_[0] = -217336512092569600.000
Backward Time Step 1:
Gradient di[0] = -6167615449858048.000, df[0] = -4412701476388864.000, dc_hat[0] = -3732895126519808.000
Gradient do_[0] = -195010584893194240.000
Backward Time Step 0:
Gradient di[0] = -7201550980087808.000, df[0] = -5263616102105088.000, dc_hat[0] = -7280527912468480.000
Gradient do_[0] = -111724468874772480.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1282642542592.000, df[0] = 940440027136.000, dc_hat[0] = 556842090496.000
Gradient do_[0] = 79526503645184.000
Backward Time Step 3:
Gradient di[0] = 2012316565504.000, df[0] = 1452177883136.000, dc_hat[0] = 787757531136.000
Gradient do_[0] = 106447652257792.000
Backward Time Step 2:
Gradient di[0] = 2560546308096.000, df[0] = 1839089975296.000, dc_hat[0] = 1308190179328.000
Gradient do_[0] = 116002125774848.000
Backward Time Step 1:
Gradient di[0] = 3211307778048.000, df[0] = 2214415040512.000, dc_hat[0] = 1785888112640.000
Gradient do_[0] = 102223702917120.000
Backward Time Step 0:
Gradient di[0] = 3887841148928.000, df[0] = 2766176518144.000, dc_hat[0] = 3648106266624.000
Gradient do_[0] = 60105676554240.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2377044549697536.000, df[0] = -1866195905019904.000, dc_hat[0] = -1074396806512640.000
Gradient do_[0] = -138960222298308608.000
Backward Time Step 3:
Gradient di[0] = -3732298931372032.000, df[0] = -2849300899954688.000, dc_hat[0] = -1542128274505728.000
Gradient do_[0] = -189991829708472320.000
Backward Time Step 2:
Gradient di[0] = -4889368758059008.000, df[0] = -3683978401808384.000, dc_hat[0] = -2694133663662080.000
Gradient do_[0] = -217381334371270656.000
Backward Time Step 1:
Gradient di[0] = -6168881391468544.000, df[0] = -4413606103875584.000, dc_hat[0] = -3733655872602112.000
Gradient do_[0] = -195050545268916224.000
Backward Time Step 0:
Gradient di[0] = -7203025227612160.000, df[0] = -5264693065154560.000, dc_hat[0] = -7282017729249280.000
Gradient do_[0] = -111747326690721792.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1282993029120.000, df[0] = 940697124864.000, dc_hat[0] = 556994199552.000
Gradient do_[0] = 79548246917120.000
Backward Time Step 3:
Gradient di[0] = 2012868247552.000, df[0] = 1452575817728.000, dc_hat[0] = 787973079040.000
Gradient do_[0] = 106476811059200.000
Backward Time Step 2:
Gradient di[0] = 2561249902592.000, df[0] = 1839594995712.000, dc_hat[0] = 1308548530176.000
Gradient do_[0] = 116033960542208.000
Backward Time Step 1:
Gradient di[0] = 3212191727616.000, df[0] = 2215024787456.000, dc_hat[0] = 1786377928704.000
Gradient do_[0] = 102251855085568.000
Backward Time Step 0:
Gradient di[0] = 3888913580032.000, df[0] = 2766939619328.000, dc_hat[0] = 3649112899584.000
Gradient do_[0] = 60122252443648.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2377567998836736.000, df[0] = -1866606477049856.000, dc_hat[0] = -1074632425734144.000
Gradient do_[0] = -138990768105717760.000
Backward Time Step 3:
Gradient di[0] = -3733113901416448.000, df[0] = -2849922864906240.000, dc_hat[0] = -1542463684608000.000
Gradient do_[0] = -190033319092551680.000
Backward Time Step 2:
Gradient di[0] = -4890435520561152.000, df[0] = -3684782634434560.000, dc_hat[0] = -2694719926697984.000
Gradient do_[0] = -217428664910872576.000
Backward Time Step 1:
Gradient di[0] = -6170233769295872.000, df[0] = -4414574082129920.000, dc_hat[0] = -3734470574211072.000
Gradient do_[0] = -195093185704230912.000
Backward Time Step 0:
Gradient di[0] = -7204599333126144.000, df[0] = -5265844653260800.000, dc_hat[0] = -7283610088374272.000
Gradient do_[0] = -111771747874766848.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1283232366592.000, df[0] = 940872564736.000, dc_hat[0] = 557097811968.000
Gradient do_[0] = 79563077976064.000
Backward Time Step 3:
Gradient di[0] = 2013240885248.000, df[0] = 1452844777472.000, dc_hat[0] = 788118372352.000
Gradient do_[0] = 106496423624704.000
Backward Time Step 2:
Gradient di[0] = 2561721499648.000, df[0] = 1839933554688.000, dc_hat[0] = 1308788129792.000
Gradient do_[0] = 116055301160960.000
Backward Time Step 1:
Gradient di[0] = 3212783386624.000, df[0] = 2215432421376.000, dc_hat[0] = 1786704429056.000
Gradient do_[0] = 102270662344704.000
Backward Time Step 0:
Gradient di[0] = 3889633165312.000, df[0] = 2767451586560.000, dc_hat[0] = 3649787920384.000
Gradient do_[0] = 60133371543552.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2378066483478528.000, df[0] = -1866997855944704.000, dc_hat[0] = -1074858247061504.000
Gradient do_[0] = -139019845034311680.000
Backward Time Step 3:
Gradient di[0] = -3733896659206144.000, df[0] = -2850520402231296.000, dc_hat[0] = -1542787015114752.000
Gradient do_[0] = -190073124849451008.000
Backward Time Step 2:
Gradient di[0] = -4891459333390336.000, df[0] = -3685553581064192.000, dc_hat[0] = -2695282030542848.000
Gradient do_[0] = -217474174384340992.000
Backward Time Step 1:
Gradient di[0] = -6171525480710144.000, df[0] = -4415498305404928.000, dc_hat[0] = -3735248500162560.000
Gradient do_[0] = -195133970713673728.000
Backward Time Step 0:
Gradient di[0] = -7206103645421568.000, df[0] = -5266943628017664.000, dc_hat[0] = -7285129969926144.000
Gradient do_[0] = -111795086727053312.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1283553755136.000, df[0] = 941108166656.000, dc_hat[0] = 557237272576.000
Gradient do_[0] = 79583009308672.000
Backward Time Step 3:
Gradient di[0] = 2013747347456.000, df[0] = 1453210075136.000, dc_hat[0] = 788316225536.000
Gradient do_[0] = 106523250393088.000
Backward Time Step 2:
Gradient di[0] = 2562365849600.000, df[0] = 1840396238848.000, dc_hat[0] = 1309116727296.000
Gradient do_[0] = 116084476739584.000
Backward Time Step 1:
Gradient di[0] = 3213590528000.000, df[0] = 2215989215232.000, dc_hat[0] = 1787151908864.000
Gradient do_[0] = 102296306319360.000
Backward Time Step 0:
Gradient di[0] = 3890606243840.000, df[0] = 2768143908864.000, dc_hat[0] = 3650700967936.000
Gradient do_[0] = 60148424900608.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2378556915056640.000, df[0] = -1867383195041792.000, dc_hat[0] = -1075079504986112.000
Gradient do_[0] = -139048526825914368.000
Backward Time Step 3:
Gradient di[0] = -3734669753319424.000, df[0] = -2851110691799040.000, dc_hat[0] = -1543106319089664.000
Gradient do_[0] = -190112363670667264.000
Backward Time Step 2:
Gradient di[0] = -4892474019414016.000, df[0] = -3686317548371968.000, dc_hat[0] = -2695838497243136.000
Gradient do_[0] = -217519065382518784.000
Backward Time Step 1:
Gradient di[0] = -6172804307222528.000, df[0] = -4416412596568064.000, dc_hat[0] = -3736017299308544.000
Gradient do_[0] = -195174360586125312.000
Backward Time Step 0:
Gradient di[0] = -7207592388460544.000, df[0] = -5268031865356288.000, dc_hat[0] = -7286634819092480.000
Gradient do_[0] = -111818176471236608.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1283836215296.000, df[0] = 941315391488.000, dc_hat[0] = 557359955968.000
Gradient do_[0] = 79600482779136.000
Backward Time Step 3:
Gradient di[0] = 2014189453312.000, df[0] = 1453529366528.000, dc_hat[0] = 788489306112.000
Gradient do_[0] = 106546612666368.000
Backward Time Step 2:
Gradient di[0] = 2562929459200.000, df[0] = 1840800989184.000, dc_hat[0] = 1309403906048.000
Gradient do_[0] = 116110011662336.000
Backward Time Step 1:
Gradient di[0] = 3214296743936.000, df[0] = 2216476278784.000, dc_hat[0] = 1787542110208.000
Gradient do_[0] = 102318771011584.000
Backward Time Step 0:
Gradient di[0] = 3891458473984.000, df[0] = 2768750247936.000, dc_hat[0] = 3651500244992.000
Gradient do_[0] = 60161590820864.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2379071505825792.000, df[0] = -1867786787749888.000, dc_hat[0] = -1075311433220096.000
Gradient do_[0] = -139078583007051776.000
Backward Time Step 3:
Gradient di[0] = -3735475596558336.000, df[0] = -2851725677428736.000, dc_hat[0] = -1543438239531008.000
Gradient do_[0] = -190153389198278656.000
Backward Time Step 2:
Gradient di[0] = -4893527897014272.000, df[0] = -3687112117321728.000, dc_hat[0] = -2696419123134464.000
Gradient do_[0] = -217565983605260288.000
Backward Time Step 1:
Gradient di[0] = -6174143800147968.000, df[0] = -4417370642710528.000, dc_hat[0] = -3736823679418368.000
Gradient do_[0] = -195216657424056320.000
Backward Time Step 0:
Gradient di[0] = -7209151461588992.000, df[0] = -5269171642302464.000, dc_hat[0] = -7288211072090112.000
Gradient do_[0] = -111842374316982272.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1284133224448.000, df[0] = 941533036544.000, dc_hat[0] = 557488340992.000
Gradient do_[0] = 79618887385088.000
Backward Time Step 3:
Gradient di[0] = 2014655414272.000, df[0] = 1453865566208.000, dc_hat[0] = 788671627264.000
Gradient do_[0] = 106571258396672.000
Backward Time Step 2:
Gradient di[0] = 2563525050368.000, df[0] = 1841228677120.000, dc_hat[0] = 1309707075584.000
Gradient do_[0] = 116136913928192.000
Backward Time Step 1:
Gradient di[0] = 3215045689344.000, df[0] = 2216992178176.000, dc_hat[0] = 1787956166656.000
Gradient do_[0] = 102342544326656.000
Backward Time Step 0:
Gradient di[0] = 3892352909312.000, df[0] = 2769386733568.000, dc_hat[0] = 3652340154368.000
Gradient do_[0] = 60175419441152.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2379567842983936.000, df[0] = -1868176556032000.000, dc_hat[0] = -1075535643934720.000
Gradient do_[0] = -139107599806103552.000
Backward Time Step 3:
Gradient di[0] = -3736253254074368.000, df[0] = -2852319725092864.000, dc_hat[0] = -1543758751465472.000
Gradient do_[0] = -190192988796747776.000
Backward Time Step 2:
Gradient di[0] = -4894552783585280.000, df[0] = -3687883600822272.000, dc_hat[0] = -2696981226979328.000
Gradient do_[0] = -217611493078728704.000
Backward Time Step 1:
Gradient di[0] = -6175428532240384.000, df[0] = -4418289765711872.000, dc_hat[0] = -3737598384144384.000
Gradient do_[0] = -195257253454938112.000
Backward Time Step 0:
Gradient di[0] = -7210653626400768.000, df[0] = -5270269006446592.000, dc_hat[0] = -7289729879900160.000
Gradient do_[0] = -111865661629661184.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1284572971008.000, df[0] = 941855408128.000, dc_hat[0] = 557679247360.000
Gradient do_[0] = 79646133583872.000
Backward Time Step 3:
Gradient di[0] = 2015344721920.000, df[0] = 1454362853376.000, dc_hat[0] = 788940914688.000
Gradient do_[0] = 106607706898432.000
Backward Time Step 2:
Gradient di[0] = 2564400349184.000, df[0] = 1841857298432.000, dc_hat[0] = 1310152982528.000
Gradient do_[0] = 116176558489600.000
Backward Time Step 1:
Gradient di[0] = 3216140926976.000, df[0] = 2217747677184.000, dc_hat[0] = 1788563685376.000
Gradient do_[0] = 102377398992896.000
Backward Time Step 0:
Gradient di[0] = 3893690105856.000, df[0] = 2770338054144.000, dc_hat[0] = 3653594513408.000
Gradient do_[0] = 60196093165568.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2380064180142080.000, df[0] = -1868566190096384.000, dc_hat[0] = -1075759988867072.000
Gradient do_[0] = -139136565065547776.000
Backward Time Step 3:
Gradient di[0] = -3737036817170432.000, df[0] = -2852917799288832.000, dc_hat[0] = -1544082216189952.000
Gradient do_[0] = -190232725834170368.000
Backward Time Step 2:
Gradient di[0] = -4895578207027200.000, df[0] = -3688655889629184.000, dc_hat[0] = -2697545209872384.000
Gradient do_[0] = -217657105631412224.000
Backward Time Step 1:
Gradient di[0] = -6176724538621952.000, df[0] = -4419216404905984.000, dc_hat[0] = -3738376578531328.000
Gradient do_[0] = -195298141543596032.000
Backward Time Step 0:
Gradient di[0] = -7212169749856256.000, df[0] = -5271377108008960.000, dc_hat[0] = -7291262109483008.000
Gradient do_[0] = -111889189460508672.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1284826202112.000, df[0] = 942041137152.000, dc_hat[0] = 557789216768.000
Gradient do_[0] = 79661837058048.000
Backward Time Step 3:
Gradient di[0] = 2015742787584.000, df[0] = 1454650032128.000, dc_hat[0] = 789096497152.000
Gradient do_[0] = 106628728750080.000
Backward Time Step 2:
Gradient di[0] = 2564908122112.000, df[0] = 1842221940736.000, dc_hat[0] = 1310412111872.000
Gradient do_[0] = 116199518109696.000
Backward Time Step 1:
Gradient di[0] = 3216779509760.000, df[0] = 2218187816960.000, dc_hat[0] = 1788916793344.000
Gradient do_[0] = 102397665869824.000
Backward Time Step 0:
Gradient di[0] = 3894460284928.000, df[0] = 2770885935104.000, dc_hat[0] = 3654317244416.000
Gradient do_[0] = 60208004988928.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2380562396348416.000, df[0] = -1868957300555776.000, dc_hat[0] = -1075984937779200.000
Gradient do_[0] = -139165693533749248.000
Backward Time Step 3:
Gradient di[0] = -3737817695911936.000, df[0] = -2853513994436608.000, dc_hat[0] = -1544403936083968.000
Gradient do_[0] = -190272462871592960.000
Backward Time Step 2:
Gradient di[0] = -4896603093598208.000, df[0] = -3689428446871552.000, dc_hat[0] = -2698109461200896.000
Gradient do_[0] = -217702597925011456.000
Backward Time Step 1:
Gradient di[0] = -6178018397519872.000, df[0] = -4420141701922816.000, dc_hat[0] = -3739155846660096.000
Gradient do_[0] = -195338995272515584.000
Backward Time Step 0:
Gradient di[0] = -7213675672764416.000, df[0] = -5272478230249472.000, dc_hat[0] = -7292785212260352.000
Gradient do_[0] = -111912554082598912.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1285111021568.000, df[0] = 942249869312.000, dc_hat[0] = 557912555520.000
Gradient do_[0] = 79679495077888.000
Backward Time Step 3:
Gradient di[0] = 2016188956672.000, df[0] = 1454972076032.000, dc_hat[0] = 789271150592.000
Gradient do_[0] = 106652334292992.000
Backward Time Step 2:
Gradient di[0] = 2565476188160.000, df[0] = 1842630098944.000, dc_hat[0] = 1310702305280.000
Gradient do_[0] = 116225254359040.000
Backward Time Step 1:
Gradient di[0] = 3217493065728.000, df[0] = 2218679599104.000, dc_hat[0] = 1789310402560.000
Gradient do_[0] = 102420340277248.000
Backward Time Step 0:
Gradient di[0] = 3895318282240.000, df[0] = 2771496468480.000, dc_hat[0] = 3655122550784.000
Gradient do_[0] = 60221267378176.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2381066518134784.000, df[0] = -1869353511288832.000, dc_hat[0] = -1076212772372480.000
Gradient do_[0] = -139195191369138176.000
Backward Time Step 3:
Gradient di[0] = -3738609580507136.000, df[0] = -2854119047954432.000, dc_hat[0] = -1544730890469376.000
Gradient do_[0] = -190312784024567808.000
Backward Time Step 2:
Gradient di[0] = -4897633348878336.000, df[0] = -3690204225339392.000, dc_hat[0] = -2698673712529408.000
Gradient do_[0] = -217748347916648448.000
Backward Time Step 1:
Gradient di[0] = -6179317625126912.000, df[0] = -4421071293906944.000, dc_hat[0] = -3739938336014336.000
Gradient do_[0] = -195380037979996160.000
Backward Time Step 0:
Gradient di[0] = -7215195017445376.000, df[0] = -5273588479295488.000, dc_hat[0] = -7294321199939584.000
Gradient do_[0] = -111936124863119360.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1285443551232.000, df[0] = 942493728768.000, dc_hat[0] = 558056603648.000
Gradient do_[0] = 79700080721920.000
Backward Time Step 3:
Gradient di[0] = 2016712851456.000, df[0] = 1455350218752.000, dc_hat[0] = 789475819520.000
Gradient do_[0] = 106680067031040.000
Backward Time Step 2:
Gradient di[0] = 2566145179648.000, df[0] = 1843110739968.000, dc_hat[0] = 1311043616768.000
Gradient do_[0] = 116255461736448.000
Backward Time Step 1:
Gradient di[0] = 3218329305088.000, df[0] = 2219256315904.000, dc_hat[0] = 1789773611008.000
Gradient do_[0] = 102446906998784.000
Backward Time Step 0:
Gradient di[0] = 3896323342336.000, df[0] = 2772211597312.000, dc_hat[0] = 3656065482752.000
Gradient do_[0] = 60236811468800.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2381585940742144.000, df[0] = -1869761130528768.000, dc_hat[0] = -1076446982307840.000
Gradient do_[0] = -139225496658378752.000
Backward Time Step 3:
Gradient di[0] = -3739425892728832.000, df[0] = -2854741818212352.000, dc_hat[0] = -1545067105878016.000
Gradient do_[0] = -190354273408647168.000
Backward Time Step 2:
Gradient di[0] = -4898705480089600.000, df[0] = -3691012484497408.000, dc_hat[0] = -2699263465226240.000
Gradient do_[0] = -217795987693895680.000
Backward Time Step 1:
Gradient di[0] = -6180674834792448.000, df[0] = -4422041956515840.000, dc_hat[0] = -3740754379800576.000
Gradient do_[0] = -195422815854264320.000
Backward Time Step 0:
Gradient di[0] = -7216766975475712.000, df[0] = -5274737919918080.000, dc_hat[0] = -7295911411580928.000
Gradient do_[0] = -111960520277360640.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1285770444800.000, df[0] = 942733459456.000, dc_hat[0] = 558198489088.000
Gradient do_[0] = 79720355987456.000
Backward Time Step 3:
Gradient di[0] = 2017224818688.000, df[0] = 1455719448576.000, dc_hat[0] = 789676097536.000
Gradient do_[0] = 106707128680448.000
Backward Time Step 2:
Gradient di[0] = 2566793199616.000, df[0] = 1843575914496.000, dc_hat[0] = 1311373918208.000
Gradient do_[0] = 116284805087232.000
Backward Time Step 1:
Gradient di[0] = 3219143524352.000, df[0] = 2219817566208.000, dc_hat[0] = 1790224629760.000
Gradient do_[0] = 102472819408896.000
Backward Time Step 0:
Gradient di[0] = 3897312411648.000, df[0] = 2772915453952.000, dc_hat[0] = 3656993472512.000
Gradient do_[0] = 60252095512576.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2382075030142976.000, df[0] = -1870145127448576.000, dc_hat[0] = -1076667434926080.000
Gradient do_[0] = -139254083960700928.000
Backward Time Step 3:
Gradient di[0] = -3740192812826624.000, df[0] = -2855327275941888.000, dc_hat[0] = -1545383859716096.000
Gradient do_[0] = -190393288891564032.000
Backward Time Step 2:
Gradient di[0] = -4899708891824128.000, df[0] = -3691768667176960.000, dc_hat[0] = -2699814831652864.000
Gradient do_[0] = -217840603814166528.000
Backward Time Step 1:
Gradient di[0] = -6181937018306560.000, df[0] = -4422944436518912.000, dc_hat[0] = -3741514320576512.000
Gradient do_[0] = -195462673150771200.000
Backward Time Step 0:
Gradient di[0] = -7218238001774592.000, df[0] = -5275812198612992.000, dc_hat[0] = -7297397470265344.000
Gradient do_[0] = -111983335143636992.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1286065750016.000, df[0] = 942949924864.000, dc_hat[0] = 558326677504.000
Gradient do_[0] = 79738659930112.000
Backward Time Step 3:
Gradient di[0] = 2017687240704.000, df[0] = 1456053420032.000, dc_hat[0] = 789856714752.000
Gradient do_[0] = 106731564695552.000
Backward Time Step 2:
Gradient di[0] = 2567383810048.000, df[0] = 1844000063488.000, dc_hat[0] = 1311674466304.000
Gradient do_[0] = 116311556358144.000
Backward Time Step 1:
Gradient di[0] = 3219884343296.000, df[0] = 2220328222720.000, dc_hat[0] = 1790634360832.000
Gradient do_[0] = 102496374620160.000
Backward Time Step 0:
Gradient di[0] = 3898207633408.000, df[0] = 2773552463872.000, dc_hat[0] = 3657833644032.000
Gradient do_[0] = 60265936715776.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2382580225671168.000, df[0] = -1870541875052544.000, dc_hat[0] = -1076896410370048.000
Gradient do_[0] = -139283633335697408.000
Backward Time Step 3:
Gradient di[0] = -3740983355244544.000, df[0] = -2855930181976064.000, dc_hat[0] = -1545708800835584.000
Gradient do_[0] = -190433558504931328.000
Backward Time Step 2:
Gradient di[0] = -4900747200167936.000, df[0] = -3692551156531200.000, dc_hat[0] = -2700386330738688.000
Gradient do_[0] = -217886611503841280.000
Backward Time Step 1:
Gradient di[0] = -6183244835848192.000, df[0] = -4423880739389440.000, dc_hat[0] = -3742302178639872.000
Gradient do_[0] = -195503990736158720.000
Backward Time Step 0:
Gradient di[0] = -7219769157615616.000, df[0] = -5276932111335424.000, dc_hat[0] = -7298945805975552.000
Gradient do_[0] = -112007094902718464.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1286428426240.000, df[0] = 943215869952.000, dc_hat[0] = 558484160512.000
Gradient do_[0] = 79761141399552.000
Backward Time Step 3:
Gradient di[0] = 2018257141760.000, df[0] = 1456464330752.000, dc_hat[0] = 790079668224.000
Gradient do_[0] = 106761671409664.000
Backward Time Step 2:
Gradient di[0] = 2568108376064.000, df[0] = 1844519895040.000, dc_hat[0] = 1312042123264.000
Gradient do_[0] = 116344347426816.000
Backward Time Step 1:
Gradient di[0] = 3220794245120.000, df[0] = 2220955795456.000, dc_hat[0] = 1791137808384.000
Gradient do_[0] = 102525290151936.000
Backward Time Step 0:
Gradient di[0] = 3899307589632.000, df[0] = 2774334963712.000, dc_hat[0] = 3658865704960.000
Gradient do_[0] = 60282948812800.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2383086494941184.000, df[0] = -1870939025309696.000, dc_hat[0] = -1077124446289920.000
Gradient do_[0] = -139313156940890112.000
Backward Time Step 3:
Gradient di[0] = -3741779803242496.000, df[0] = -2856538993590272.000, dc_hat[0] = -1546037768486912.000
Gradient do_[0] = -190474120176074752.000
Backward Time Step 2:
Gradient di[0] = -4901789266608128.000, df[0] = -3693336330240000.000, dc_hat[0] = -2700960245743616.000
Gradient do_[0] = -217932962790899712.000
Backward Time Step 1:
Gradient di[0] = -6184566075162624.000, df[0] = -4424824558452736.000, dc_hat[0] = -3743096747589632.000
Gradient do_[0] = -195545703458537472.000
Backward Time Step 0:
Gradient di[0] = -7221307292778496.000, df[0] = -5278056319025152.000, dc_hat[0] = -7300501121007616.000
Gradient do_[0] = -112030949151080448.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1286752567296.000, df[0] = 943453503488.000, dc_hat[0] = 558624866304.000
Gradient do_[0] = 79781223727104.000
Backward Time Step 3:
Gradient di[0] = 2018765570048.000, df[0] = 1456831332352.000, dc_hat[0] = 790278307840.000
Gradient do_[0] = 106788531732480.000
Backward Time Step 2:
Gradient di[0] = 2568756396032.000, df[0] = 1844985593856.000, dc_hat[0] = 1312373211136.000
Gradient do_[0] = 116373665611776.000
Backward Time Step 1:
Gradient di[0] = 3221604794368.000, df[0] = 2221514686464.000, dc_hat[0] = 1791586598912.000
Gradient do_[0] = 102551059955712.000
Backward Time Step 0:
Gradient di[0] = 3900284338176.000, df[0] = 2775029907456.000, dc_hat[0] = 3659782422528.000
Gradient do_[0] = 60298039918592.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2383601085710336.000, df[0] = -1871342752235520.000, dc_hat[0] = -1077357045612544.000
Gradient do_[0] = -139343195942158336.000
Backward Time Step 3:
Gradient di[0] = -3742583230562304.000, df[0] = -2857152368607232.000, dc_hat[0] = -1546369152057344.000
Gradient do_[0] = -190514991084863488.000
Backward Time Step 2:
Gradient di[0] = -4902843144208384.000, df[0] = -3694130630754304.000, dc_hat[0] = -2701538992586752.000
Gradient do_[0] = -217979726394818560.000
Backward Time Step 1:
Gradient di[0] = -6185897515024384.000, df[0] = -4425777772756992.000, dc_hat[0] = -3743898564296704.000
Gradient do_[0] = -195587759778299904.000
Backward Time Step 0:
Gradient di[0] = -7222863681552384.000, df[0] = -5279193948487680.000, dc_hat[0] = -7302074152779776.000
Gradient do_[0] = -112055095457218560.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1287035944960.000, df[0] = 943661252608.000, dc_hat[0] = 558747877376.000
Gradient do_[0] = 79798781083648.000
Backward Time Step 3:
Gradient di[0] = 2019209117696.000, df[0] = 1457151541248.000, dc_hat[0] = 790451912704.000
Gradient do_[0] = 106812003057664.000
Backward Time Step 2:
Gradient di[0] = 2569316859904.000, df[0] = 1845388247040.000, dc_hat[0] = 1312659341312.000
Gradient do_[0] = 116399074705408.000
Backward Time Step 1:
Gradient di[0] = 3222309175296.000, df[0] = 2222000439296.000, dc_hat[0] = 1791976407040.000
Gradient do_[0] = 102573482704896.000
Backward Time Step 0:
Gradient di[0] = 3901138403328.000, df[0] = 2775637819392.000, dc_hat[0] = 3660583534592.000
Gradient do_[0] = 60311247781888.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2384090443546624.000, df[0] = -1871726883373056.000, dc_hat[0] = -1077577632448512.000
Gradient do_[0] = -139371877733761024.000
Backward Time Step 3:
Gradient di[0] = -3743357666852864.000, df[0] = -2857743463481344.000, dc_hat[0] = -1546688992903168.000
Gradient do_[0] = -190554384524902400.000
Backward Time Step 2:
Gradient di[0] = -4903862125199360.000, df[0] = -3694897550852096.000, dc_hat[0] = -2702098143641600.000
Gradient do_[0] = -218024960990380032.000
Backward Time Step 1:
Gradient di[0] = -6187181710245888.000, df[0] = -4426696358887424.000, dc_hat[0] = -3744671658409984.000
Gradient do_[0] = -195628269909835776.000
Backward Time Step 0:
Gradient di[0] = -7224345982140416.000, df[0] = -5280276817117184.000, dc_hat[0] = -7303572559495168.000
Gradient do_[0] = -112078090712121344.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1287361658880.000, df[0] = 943900196864.000, dc_hat[0] = 558889304064.000
Gradient do_[0] = 79818980851712.000
Backward Time Step 3:
Gradient di[0] = 2019720822784.000, df[0] = 1457520640000.000, dc_hat[0] = 790651994112.000
Gradient do_[0] = 106839056318464.000
Backward Time Step 2:
Gradient di[0] = 2569971957760.000, df[0] = 1845858271232.000, dc_hat[0] = 1312992526336.000
Gradient do_[0] = 116428711657472.000
Backward Time Step 1:
Gradient di[0] = 3223131258880.000, df[0] = 2222566670336.000, dc_hat[0] = 1792430571520.000
Gradient do_[0] = 102599571275776.000
Backward Time Step 0:
Gradient di[0] = 3902127472640.000, df[0] = 2776341151744.000, dc_hat[0] = 3661511786496.000
Gradient do_[0] = 60326536019968.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2384583559479296.000, df[0] = -1872114101518336.000, dc_hat[0] = -1077800568094720.000
Gradient do_[0] = -139400722734120960.000
Backward Time Step 3:
Gradient di[0] = -3744130760966144.000, df[0] = -2858333484613632.000, dc_hat[0] = -1547007625789440.000
Gradient do_[0] = -190593726425333760.000
Backward Time Step 2:
Gradient di[0] = -4904869295030272.000, df[0] = -3695656686321664.000, dc_hat[0] = -2702650583810048.000
Gradient do_[0] = -218069783269081088.000
Backward Time Step 1:
Gradient di[0] = -6188447114985472.000, df[0] = -4427601523245056.000, dc_hat[0] = -3745432404492288.000
Gradient do_[0] = -195668230285557760.000
Backward Time Step 0:
Gradient di[0] = -7225842778243072.000, df[0] = -5281370960035840.000, dc_hat[0] = -7305085461725184.000
Gradient do_[0] = -112101300715388928.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1287719092224.000, df[0] = 944162209792.000, dc_hat[0] = 559044296704.000
Gradient do_[0] = 79841135165440.000
Backward Time Step 3:
Gradient di[0] = 2020282073088.000, df[0] = 1457925652480.000, dc_hat[0] = 790871474176.000
Gradient do_[0] = 106868735213568.000
Backward Time Step 2:
Gradient di[0] = 2570685251584.000, df[0] = 1846371024896.000, dc_hat[0] = 1313357168640.000
Gradient do_[0] = 116460974243840.000
Backward Time Step 1:
Gradient di[0] = 3224024121344.000, df[0] = 2223182446592.000, dc_hat[0] = 1792925368320.000
Gradient do_[0] = 102627958325248.000
Backward Time Step 0:
Gradient di[0] = 3903205670912.000, df[0] = 2777108185088.000, dc_hat[0] = 3662523138048.000
Gradient do_[0] = 60343208378368.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2385081507250176.000, df[0] = -1872505211977728.000, dc_hat[0] = -1078025315680256.000
Gradient do_[0] = -139429748123107328.000
Backward Time Step 3:
Gradient di[0] = -3744909760659456.000, df[0] = -2858928606019584.000, dc_hat[0] = -1547328674594816.000
Gradient do_[0] = -190633291664064512.000
Backward Time Step 2:
Gradient di[0] = -4905894718472192.000, df[0] = -3696429511999488.000, dc_hat[0] = -2703215103574016.000
Gradient do_[0] = -218115258382811136.000
Backward Time Step 1:
Gradient di[0] = -6189741510754304.000, df[0] = -4428527625568256.000, dc_hat[0] = -3746212209491968.000
Gradient do_[0] = -195709032474869760.000
Backward Time Step 0:
Gradient di[0] = -7227335816249344.000, df[0] = -5282462418599936.000, dc_hat[0] = -7306595142729728.000
Gradient do_[0] = -112124484948852736.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1287902461952.000, df[0] = 944296558592.000, dc_hat[0] = 559123857408.000
Gradient do_[0] = 79852459786240.000
Backward Time Step 3:
Gradient di[0] = 2020570300416.000, df[0] = 1458134056960.000, dc_hat[0] = 790983933952.000
Gradient do_[0] = 106883935371264.000
Backward Time Step 2:
Gradient di[0] = 2571052515328.000, df[0] = 1846634872832.000, dc_hat[0] = 1313544077312.000
Gradient do_[0] = 116477566910464.000
Backward Time Step 1:
Gradient di[0] = 3224486281216.000, df[0] = 2223500951552.000, dc_hat[0] = 1793180303360.000
Gradient do_[0] = 102642638389248.000
Backward Time Step 0:
Gradient di[0] = 3903766659072.000, df[0] = 2777507430400.000, dc_hat[0] = 3663050047488.000
Gradient do_[0] = 60351882199040.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2385566838554624.000, df[0] = -1872886256107520.000, dc_hat[0] = -1078244962992128.000
Gradient do_[0] = -139458129266999296.000
Backward Time Step 3:
Gradient di[0] = -3745676680757248.000, df[0] = -2859513526878208.000, dc_hat[0] = -1547645428432896.000
Gradient do_[0] = -190672324326850560.000
Backward Time Step 2:
Gradient di[0] = -4906898667077632.000, df[0] = -3697185426243584.000, dc_hat[0] = -2703766201565184.000
Gradient do_[0] = -218159891682951168.000
Backward Time Step 1:
Gradient di[0] = -6191002083655680.000, df[0] = -4429428763394048.000, dc_hat[0] = -3746971344961536.000
Gradient do_[0] = -195748872591507456.000
Backward Time Step 0:
Gradient di[0] = -7228812211257344.000, df[0] = -5283541529133056.000, dc_hat[0] = -7308087643865088.000
Gradient do_[0] = -112147385714475008.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1288276672512.000, df[0] = 944571088896.000, dc_hat[0] = 559286321152.000
Gradient do_[0] = 79875679453184.000
Backward Time Step 3:
Gradient di[0] = 2021157502976.000, df[0] = 1458557681664.000, dc_hat[0] = 791213309952.000
Gradient do_[0] = 106914956443648.000
Backward Time Step 2:
Gradient di[0] = 2571798577152.000, df[0] = 1847170826240.000, dc_hat[0] = 1313924710400.000
Gradient do_[0] = 116511406555136.000
Backward Time Step 1:
Gradient di[0] = 3225422921728.000, df[0] = 2224146612224.000, dc_hat[0] = 1793699217408.000
Gradient do_[0] = 102672426336256.000
Backward Time Step 0:
Gradient di[0] = 3904897810432.000, df[0] = 2778312212480.000, dc_hat[0] = 3664111206400.000
Gradient do_[0] = 60369364058112.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2386069349728256.000, df[0] = -1873280453574656.000, dc_hat[0] = -1078471723843584.000
Gradient do_[0] = -139487498253369344.000
Backward Time Step 3:
Gradient di[0] = -3746462928207872.000, df[0] = -2860114016993280.000, dc_hat[0] = -1547969698463744.000
Gradient do_[0] = -190712267522703360.000
Backward Time Step 2:
Gradient di[0] = -4907920869294080.000, df[0] = -3697955567566848.000, dc_hat[0] = -2704327231668224.000
Gradient do_[0] = -218205315257073664.000
Backward Time Step 1:
Gradient di[0] = -6192302385004544.000, df[0] = -4430359697555456.000, dc_hat[0] = -3747754102751232.000
Gradient do_[0] = -195789880939249664.000
Backward Time Step 0:
Gradient di[0] = -7230323502874624.000, df[0] = -5284645872599040.000, dc_hat[0] = -7309615041609728.000
Gradient do_[0] = -112170836235911168.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1288628338688.000, df[0] = 944828907520.000, dc_hat[0] = 559438692352.000
Gradient do_[0] = 79897473056768.000
Backward Time Step 3:
Gradient di[0] = 2021708267520.000, df[0] = 1458954829824.000, dc_hat[0] = 791428268032.000
Gradient do_[0] = 106944090079232.000
Backward Time Step 2:
Gradient di[0] = 2572499288064.000, df[0] = 1847673618432.000, dc_hat[0] = 1314281488384.000
Gradient do_[0] = 116543123881984.000
Backward Time Step 1:
Gradient di[0] = 3226302939136.000, df[0] = 2224753475584.000, dc_hat[0] = 1794185887744.000
Gradient do_[0] = 102700435898368.000
Backward Time Step 0:
Gradient di[0] = 3905963425792.000, df[0] = 2779070332928.000, dc_hat[0] = 3665111023616.000
Gradient do_[0] = 60385839284224.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2386564613144576.000, df[0] = -1873669416550400.000, dc_hat[0] = -1078695062142976.000
Gradient do_[0] = -139516454922878976.000
Backward Time Step 3:
Gradient di[0] = -3747240317288448.000, df[0] = -2860707259351040.000, dc_hat[0] = -1548289941962752.000
Gradient do_[0] = -190751849941303296.000
Backward Time Step 2:
Gradient di[0] = -4908950050832384.000, df[0] = -3698730540728320.000, dc_hat[0] = -2704891482996736.000
Gradient do_[0] = -218250979349364736.000
Backward Time Step 1:
Gradient di[0] = -6193598391386112.000, df[0] = -4431286336749568.000, dc_hat[0] = -3748533370880000.000
Gradient do_[0] = -195830803387645952.000
Backward Time Step 0:
Gradient di[0] = -7231826741428224.000, df[0] = -5285744847355904.000, dc_hat[0] = -7311135460032512.000
Gradient do_[0] = -112194149318393856.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1288790605824.000, df[0] = 944947855360.000, dc_hat[0] = 559509143552.000
Gradient do_[0] = 79907522609152.000
Backward Time Step 3:
Gradient di[0] = 2021962416128.000, df[0] = 1459138330624.000, dc_hat[0] = 791527555072.000
Gradient do_[0] = 106957495074816.000
Backward Time Step 2:
Gradient di[0] = 2572823035904.000, df[0] = 1847906795520.000, dc_hat[0] = 1314446770176.000
Gradient do_[0] = 116557736837120.000
Backward Time Step 1:
Gradient di[0] = 3226707689472.000, df[0] = 2225032134656.000, dc_hat[0] = 1794408579072.000
Gradient do_[0] = 102713245302784.000
Backward Time Step 0:
Gradient di[0] = 3906454159360.000, df[0] = 2779419770880.000, dc_hat[0] = 3665571610624.000
Gradient do_[0] = 60393430974464.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2387050212884480.000, df[0] = -1874050729115648.000, dc_hat[0] = -1078915179216896.000
Gradient do_[0] = -139544861836574720.000
Backward Time Step 3:
Gradient di[0] = -3748007237386240.000, df[0] = -2861292717080576.000, dc_hat[0] = -1548606830018560.000
Gradient do_[0] = -190790831064481792.000
Backward Time Step 2:
Gradient di[0] = -4909950241341440.000, df[0] = -3699484575924224.000, dc_hat[0] = -2705442580987904.000
Gradient do_[0] = -218295354951467008.000
Backward Time Step 1:
Gradient di[0] = -6194857353674752.000, df[0] = -4432186669268992.000, dc_hat[0] = -3749291969478656.000
Gradient do_[0] = -195870609144545280.000
Backward Time Step 0:
Gradient di[0] = -7233302599565312.000, df[0] = -5286823421018112.000, dc_hat[0] = -7312627424296960.000
Gradient do_[0] = -112217050084016128.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1289152495616.000, df[0] = 945212948480.000, dc_hat[0] = 559666233344.000
Gradient do_[0] = 79929953746944.000
Backward Time Step 3:
Gradient di[0] = 2022529302528.000, df[0] = 1459547537408.000, dc_hat[0] = 791749525504.000
Gradient do_[0] = 106987492737024.000
Backward Time Step 2:
Gradient di[0] = 2573543145472.000, df[0] = 1848423350272.000, dc_hat[0] = 1314813247488.000
Gradient do_[0] = 116590343356416.000
Backward Time Step 1:
Gradient di[0] = 3227611561984.000, df[0] = 2225655513088.000, dc_hat[0] = 1794909536256.000
Gradient do_[0] = 102741976285184.000
Backward Time Step 0:
Gradient di[0] = 3907539697664.000, df[0] = 2780192047104.000, dc_hat[0] = 3666590564352.000
Gradient do_[0] = 60410212384768.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2387544402558976.000, df[0] = -1874438752567296.000, dc_hat[0] = -1079137846427648.000
Gradient do_[0] = -139573655297327104.000
Backward Time Step 3:
Gradient di[0] = -3748781673676800.000, df[0] = -2861884080390144.000, dc_hat[0] = -1548924926033920.000
Gradient do_[0] = -190830172964913152.000
Backward Time Step 2:
Gradient di[0] = -4910960632397824.000, df[0] = -3700245590441984.000, dc_hat[0] = -2705996900204544.000
Gradient do_[0] = -218340245949644800.000
Backward Time Step 1:
Gradient di[0] = -6196140475154432.000, df[0] = -4433103913222144.000, dc_hat[0] = -3750062916108288.000
Gradient do_[0] = -195911119276081152.000
Backward Time Step 0:
Gradient di[0] = -7234795637571584.000, df[0] = -5287914342711296.000, dc_hat[0] = -7314137105301504.000
Gradient do_[0] = -112240217137610752.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1289525133312.000, df[0] = 945486102528.000, dc_hat[0] = 559827648512.000
Gradient do_[0] = 79953055973376.000
Backward Time Step 3:
Gradient di[0] = 2023115063296.000, df[0] = 1459970375680.000, dc_hat[0] = 791978835968.000
Gradient do_[0] = 107018488643584.000
Backward Time Step 2:
Gradient di[0] = 2574289993728.000, df[0] = 1848960090112.000, dc_hat[0] = 1315194667008.000
Gradient do_[0] = 116624132669440.000
Backward Time Step 1:
Gradient di[0] = 3228547678208.000, df[0] = 2226300911616.000, dc_hat[0] = 1795428450304.000
Gradient do_[0] = 102771730677760.000
Backward Time Step 0:
Gradient di[0] = 3908676878336.000, df[0] = 2781001023488.000, dc_hat[0] = 3667657228288.000
Gradient do_[0] = 60427790712832.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2388035907878912.000, df[0] = -1874824225882112.000, dc_hat[0] = -1079359439896576.000
Gradient do_[0] = -139602362858733568.000
Backward Time Step 3:
Gradient di[0] = -3749549130645504.000, df[0] = -2862469806555136.000, dc_hat[0] = -1549241948307456.000
Gradient do_[0] = -190869257167306752.000
Backward Time Step 2:
Gradient di[0] = -4911967265357824.000, df[0] = -3701003920605184.000, dc_hat[0] = -2706550145679360.000
Gradient do_[0] = -218384965149130752.000
Backward Time Step 1:
Gradient di[0] = -6197411248603136.000, df[0] = -4434012835676160.000, dc_hat[0] = -3750828494028800.000
Gradient do_[0] = -195951182731018240.000
Backward Time Step 0:
Gradient di[0] = -7236280622514176.000, df[0] = -5289000432566272.000, dc_hat[0] = -7315638196371456.000
Gradient do_[0] = -112263238162317312.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1289776660480.000, df[0] = 945670520832.000, dc_hat[0] = 559936831488.000
Gradient do_[0] = 79968658784256.000
Backward Time Step 3:
Gradient di[0] = 2023510376448.000, df[0] = 1460255457280.000, dc_hat[0] = 792133042176.000
Gradient do_[0] = 107039393054720.000
Backward Time Step 2:
Gradient di[0] = 2574790688768.000, df[0] = 1849319620608.000, dc_hat[0] = 1315449339904.000
Gradient do_[0] = 116646815465472.000
Backward Time Step 1:
Gradient di[0] = 3229178134528.000, df[0] = 2226736070656.000, dc_hat[0] = 1795776839680.000
Gradient do_[0] = 102791771062272.000
Backward Time Step 0:
Gradient di[0] = 3909436047360.000, df[0] = 2781541040128.000, dc_hat[0] = 3668369735680.000
Gradient do_[0] = 60439522181120.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2388525534150656.000, df[0] = -1875208893890560.000, dc_hat[0] = -1079580832038912.000
Gradient do_[0] = -139630993110728704.000
Backward Time Step 3:
Gradient di[0] = -3750322224758784.000, df[0] = -2863060364558336.000, dc_hat[0] = -1549561386500096.000
Gradient do_[0] = -190908616247607296.000
Backward Time Step 2:
Gradient di[0] = -4912984098865152.000, df[0] = -3701769766961152.000, dc_hat[0] = -2707109565169664.000
Gradient do_[0] = -218430148205084672.000
Backward Time Step 1:
Gradient di[0] = -6198691148857344.000, df[0] = -4434928469016576.000, dc_hat[0] = -3751598098481152.000
Gradient do_[0] = -195991641322946560.000
Backward Time Step 0:
Gradient di[0] = -7237769365553152.000, df[0] = -5290088669904896.000, dc_hat[0] = -7317143582408704.000
Gradient do_[0] = -112286353676304384.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1289987031040.000, df[0] = 945824727040.000, dc_hat[0] = 560027992064.000
Gradient do_[0] = 79981694681088.000
Backward Time Step 3:
Gradient di[0] = 2023839760384.000, df[0] = 1460493090816.000, dc_hat[0] = 792262410240.000
Gradient do_[0] = 107056832970752.000
Backward Time Step 2:
Gradient di[0] = 2575212216320.000, df[0] = 1849622134784.000, dc_hat[0] = 1315664035840.000
Gradient do_[0] = 116665891160064.000
Backward Time Step 1:
Gradient di[0] = 3229707403264.000, df[0] = 2227100712960.000, dc_hat[0] = 1796069392384.000
Gradient do_[0] = 102808598609920.000
Backward Time Step 0:
Gradient di[0] = 3910073319424.000, df[0] = 2781994811392.000, dc_hat[0] = 3668967686144.000
Gradient do_[0] = 60449382989824.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2389007912665088.000, df[0] = -1875587522101248.000, dc_hat[0] = -1079798667411456.000
Gradient do_[0] = -139659159506255872.000
Backward Time Step 3:
Gradient di[0] = -3751075723083776.000, df[0] = -2863635621740544.000, dc_hat[0] = -1549872637411328.000
Gradient do_[0] = -190946910176018432.000
Backward Time Step 2:
Gradient di[0] = -4913969793859584.000, df[0] = -3702512259432448.000, dc_hat[0] = -2707649657307136.000
Gradient do_[0] = -218473802252681216.000
Backward Time Step 1:
Gradient di[0] = -6199942594953216.000, df[0] = -4435823969697792.000, dc_hat[0] = -3752352133677056.000
Gradient do_[0] = -196031086302593024.000
Backward Time Step 0:
Gradient di[0] = -7239229654433792.000, df[0] = -5291155969277952.000, dc_hat[0] = -7318619977416704.000
Gradient do_[0] = -112308996743888896.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1290433855488.000, df[0] = 946152407040.000, dc_hat[0] = 560221978624.000
Gradient do_[0] = 80009402253312.000
Backward Time Step 3:
Gradient di[0] = 2024540471296.000, df[0] = 1460998635520.000, dc_hat[0] = 792535760896.000
Gradient do_[0] = 107093885452288.000
Backward Time Step 2:
Gradient di[0] = 2576102719488.000, df[0] = 1850261766144.000, dc_hat[0] = 1316118855680.000
Gradient do_[0] = 116706231975936.000
Backward Time Step 1:
Gradient di[0] = 3230823350272.000, df[0] = 2227870105600.000, dc_hat[0] = 1796687527936.000
Gradient do_[0] = 102844065644544.000
Backward Time Step 0:
Gradient di[0] = 3911423361024.000, df[0] = 2782955044864.000, dc_hat[0] = 3670234365952.000
Gradient do_[0] = 60470249652224.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2389511766016000.000, df[0] = -1875983061745664.000, dc_hat[0] = -1080026300678144.000
Gradient do_[0] = -139688640161775616.000
Backward Time Step 3:
Gradient di[0] = -3751865728630784.000, df[0] = -2864238796210176.000, dc_hat[0] = -1550199189143552.000
Gradient do_[0] = -190987128249778176.000
Backward Time Step 2:
Gradient di[0] = -4915005954719744.000, df[0] = -3703293138173952.000, dc_hat[0] = -2708219277344768.000
Gradient do_[0] = -218519895841701888.000
Backward Time Step 1:
Gradient di[0] = -6201252023107584.000, df[0] = -4436760541003776.000, dc_hat[0] = -3753140528611328.000
Gradient do_[0] = -196072455427588096.000
Backward Time Step 0:
Gradient di[0] = -7240745241018368.000, df[0] = -5292262997098496.000, dc_hat[0] = -7320152206999552.000
Gradient do_[0] = -112332507394867200.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1290705305600.000, df[0] = 946351505408.000, dc_hat[0] = 560340008960.000
Gradient do_[0] = 80026204635136.000
Backward Time Step 3:
Gradient di[0] = 2024965144576.000, df[0] = 1461305081856.000, dc_hat[0] = 792701501440.000
Gradient do_[0] = 107116333367296.000
Backward Time Step 2:
Gradient di[0] = 2576648241152.000, df[0] = 1850653278208.000, dc_hat[0] = 1316396335104.000
Gradient do_[0] = 116730902872064.000
Backward Time Step 1:
Gradient di[0] = 3231511740416.000, df[0] = 2228344324096.000, dc_hat[0] = 1797067505664.000
Gradient do_[0] = 102865951522816.000
Backward Time Step 0:
Gradient di[0] = 3912254357504.000, df[0] = 2783546179584.000, dc_hat[0] = 3671014244352.000
Gradient do_[0] = 60483096805376.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2389998707933184.000, df[0] = -1876365313835008.000, dc_hat[0] = -1080245679554560.000
Gradient do_[0] = -139717072845275136.000
Backward Time Step 3:
Gradient di[0] = -3752631574986752.000, df[0] = -2864823448633344.000, dc_hat[0] = -1550514869239808.000
Gradient do_[0] = -191026057833349120.000
Backward Time Step 2:
Gradient di[0] = -4916008829583360.000, df[0] = -3704048515547136.000, dc_hat[0] = -2708769838465024.000
Gradient do_[0] = -218564443242496000.000
Backward Time Step 1:
Gradient di[0] = -6202509374783488.000, df[0] = -4437659799781376.000, dc_hat[0] = -3753895905984512.000
Gradient do_[0] = -196112123745533952.000
Backward Time Step 0:
Gradient di[0] = -7242219488542720.000, df[0] = -5293340497018880.000, dc_hat[0] = -7321641486909440.000
Gradient do_[0] = -112355365210816512.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1290968498176.000, df[0] = 946544508928.000, dc_hat[0] = 560454238208.000
Gradient do_[0] = 80042503700480.000
Backward Time Step 3:
Gradient di[0] = 2025379069952.000, df[0] = 1461603794944.000, dc_hat[0] = 792863506432.000
Gradient do_[0] = 107138219245568.000
Backward Time Step 2:
Gradient di[0] = 2577173577728.000, df[0] = 1851030634496.000, dc_hat[0] = 1316664639488.000
Gradient do_[0] = 116754642632704.000
Backward Time Step 1:
Gradient di[0] = 3232167362560.000, df[0] = 2228796260352.000, dc_hat[0] = 1797430444032.000
Gradient do_[0] = 102886797213696.000
Backward Time Step 0:
Gradient di[0] = 3913047867392.000, df[0] = 2784111099904.000, dc_hat[0] = 3671758733312.000
Gradient do_[0] = 60495360950272.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2390474912432128.000, df[0] = -1876739110207488.000, dc_hat[0] = -1080460897681408.000
Gradient do_[0] = -139744912823287808.000
Backward Time Step 3:
Gradient di[0] = -3753385341747200.000, df[0] = -2865398437380096.000, dc_hat[0] = -1550825449062400.000
Gradient do_[0] = -191064420481236992.000
Backward Time Step 2:
Gradient di[0] = -4916991840223232.000, df[0] = -3704789128970240.000, dc_hat[0] = -2709309125296128.000
Gradient do_[0] = -218608114469961728.000
Backward Time Step 1:
Gradient di[0] = -6203748472848384.000, df[0] = -4438545636786176.000, dc_hat[0] = -3754642424987648.000
Gradient do_[0] = -196151242307665920.000
Backward Time Step 0:
Gradient di[0] = -7243659376328704.000, df[0] = -5294392764006400.000, dc_hat[0] = -7323098017693696.000
Gradient do_[0] = -112377707630690304.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1291467751424.000, df[0] = 946910527488.000, dc_hat[0] = 560670507008.000
Gradient do_[0] = 80073432498176.000
Backward Time Step 3:
Gradient di[0] = 2026163666944.000, df[0] = 1462169763840.000, dc_hat[0] = 793170214912.000
Gradient do_[0] = 107179617026048.000
Backward Time Step 2:
Gradient di[0] = 2578171297792.000, df[0] = 1851747336192.000, dc_hat[0] = 1317172543488.000
Gradient do_[0] = 116799798509568.000
Backward Time Step 1:
Gradient di[0] = 3233417003008.000, df[0] = 2229658451968.000, dc_hat[0] = 1798123683840.000
Gradient do_[0] = 102926559215616.000
Backward Time Step 0:
Gradient di[0] = 3914560700416.000, df[0] = 2785187463168.000, dc_hat[0] = 3673178243072.000
Gradient do_[0] = 60518760972288.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2390959975301120.000, df[0] = -1877120020119552.000, dc_hat[0] = -1080680142340096.000
Gradient do_[0] = -139773268197376000.000
Backward Time Step 3:
Gradient di[0] = -3754142598168576.000, df[0] = -2865977184223232.000, dc_hat[0] = -1551137505280000.000
Gradient do_[0] = -191102920568078336.000
Backward Time Step 2:
Gradient di[0] = -4917989883248640.000, df[0] = -3705541285117952.000, dc_hat[0] = -2709858612674560.000
Gradient do_[0] = -218652438532456448.000
Backward Time Step 1:
Gradient di[0] = -6205013340717056.000, df[0] = -4439450801143808.000, dc_hat[0] = -3755404513247232.000
Gradient do_[0] = -196191219863257088.000
Backward Time Step 0:
Gradient di[0] = -7245136845078528.000, df[0] = -5295473485152256.000, dc_hat[0] = -7324592129441792.000
Gradient do_[0] = -112400651345985536.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1291683495936.000, df[0] = 947068796928.000, dc_hat[0] = 560764485632.000
Gradient do_[0] = 80086829105152.000
Backward Time Step 3:
Gradient di[0] = 2026502881280.000, df[0] = 1462414868480.000, dc_hat[0] = 793303121920.000
Gradient do_[0] = 107197585424384.000
Backward Time Step 2:
Gradient di[0] = 2578601213952.000, df[0] = 1852055748608.000, dc_hat[0] = 1317390909440.000
Gradient do_[0] = 116819285245952.000
Backward Time Step 1:
Gradient di[0] = 3233954922496.000, df[0] = 2230028861440.000, dc_hat[0] = 1798420430848.000
Gradient do_[0] = 102943588089856.000
Backward Time Step 0:
Gradient di[0] = 3915204001792.000, df[0] = 2785645166592.000, dc_hat[0] = 3673781960704.000
Gradient do_[0] = 60528701472768.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2391460875862016.000, df[0] = -1877512875409408.000, dc_hat[0] = -1080905896558592.000
Gradient do_[0] = -139802516924661760.000
Backward Time Step 3:
Gradient di[0] = -3754930724667392.000, df[0] = -2866578479644672.000, dc_hat[0] = -1551462177964032.000
Gradient do_[0] = -191142984023015424.000
Backward Time Step 2:
Gradient di[0] = -4919017991045120.000, df[0] = -3706315989843968.000, dc_hat[0] = -2710424743051264.000
Gradient do_[0] = -218698068265009152.000
Backward Time Step 1:
Gradient di[0] = -6206310957711360.000, df[0] = -4440379050950656.000, dc_hat[0] = -3756184586682368.000
Gradient do_[0] = -196232125131784192.000
Backward Time Step 0:
Gradient di[0] = -7246648673566720.000, df[0] = -5296578365489152.000, dc_hat[0] = -7326120600928256.000
Gradient do_[0] = -112424101867421696.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1291941576704.000, df[0] = 947257868288.000, dc_hat[0] = 560876224512.000
Gradient do_[0] = 80102817792000.000
Backward Time Step 3:
Gradient di[0] = 2026906583040.000, df[0] = 1462706241536.000, dc_hat[0] = 793461129216.000
Gradient do_[0] = 107218942820352.000
Backward Time Step 2:
Gradient di[0] = 2579115802624.000, df[0] = 1852425502720.000, dc_hat[0] = 1317653577728.000
Gradient do_[0] = 116842538467328.000
Backward Time Step 1:
Gradient di[0] = 3234605563904.000, df[0] = 2230477651968.000, dc_hat[0] = 1798779568128.000
Gradient do_[0] = 102964282785792.000
Backward Time Step 0:
Gradient di[0] = 3915987812352.000, df[0] = 2786202746880.000, dc_hat[0] = 3674517536768.000
Gradient do_[0] = 60540818817024.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2391945401860096.000, df[0] = -1877893919539200.000, dc_hat[0] = -1081125409652736.000
Gradient do_[0] = -139830837939011584.000
Backward Time Step 3:
Gradient di[0] = -3755693081362432.000, df[0] = -2867160716148736.000, dc_hat[0] = -1551776381665280.000
Gradient do_[0] = -191181793347502080.000
Backward Time Step 2:
Gradient di[0] = -4920017644683264.000, df[0] = -3707068145991680.000, dc_hat[0] = -2710973156687872.000
Gradient do_[0] = -218742564126195712.000
Backward Time Step 1:
Gradient di[0] = -6207569920000000.000, df[0] = -4441279383470080.000, dc_hat[0] = -3756942111539200.000
Gradient do_[0] = -196271879349075968.000
Backward Time Step 0:
Gradient di[0] = -7248111109931008.000, df[0] = -5297646738604032.000, dc_hat[0] = -7327598069678080.000
Gradient do_[0] = -112446787884679168.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1292147884032.000, df[0] = 947409190912.000, dc_hat[0] = 560965484544.000
Gradient do_[0] = 80115602030592.000
Backward Time Step 3:
Gradient di[0] = 2027230724096.000, df[0] = 1462939942912.000, dc_hat[0] = 793587548160.000
Gradient do_[0] = 107236072357888.000
Backward Time Step 2:
Gradient di[0] = 2579528941568.000, df[0] = 1852722118656.000, dc_hat[0] = 1317863948288.000
Gradient do_[0] = 116861219897344.000
Backward Time Step 1:
Gradient di[0] = 3235121725440.000, df[0] = 2230833119232.000, dc_hat[0] = 1799064649728.000
Gradient do_[0] = 102980682514432.000
Backward Time Step 0:
Gradient di[0] = 3916615122944.000, df[0] = 2786649178112.000, dc_hat[0] = 3675106050048.000
Gradient do_[0] = 60550511853568.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2392436370309120.000, df[0] = -1878278855983104.000, dc_hat[0] = -1081346600468480.000
Gradient do_[0] = -139859528320548864.000
Backward Time Step 3:
Gradient di[0] = -3756461343637504.000, df[0] = -2867746710749184.000, dc_hat[0] = -1552092867067904.000
Gradient do_[0] = -191220877549895680.000
Backward Time Step 2:
Gradient di[0] = -4921025888256000.000, df[0] = -3707827818332160.000, dc_hat[0] = -2711528281210880.000
Gradient do_[0] = -218787283325681664.000
Backward Time Step 1:
Gradient di[0] = -6208846599028736.000, df[0] = -4442192600891392.000, dc_hat[0] = -3757711179120640.000
Gradient do_[0] = -196312217681920000.000
Backward Time Step 0:
Gradient di[0] = -7249604684808192.000, df[0] = -5298738734039040.000, dc_hat[0] = -7329108287553536.000
Gradient do_[0] = -112469946348339200.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1292659326976.000, df[0] = 947784122368.000, dc_hat[0] = 561187520512.000
Gradient do_[0] = 80147310968832.000
Backward Time Step 3:
Gradient di[0] = 2028032491520.000, df[0] = 1463518494720.000, dc_hat[0] = 793901268992.000
Gradient do_[0] = 107278476771328.000
Backward Time Step 2:
Gradient di[0] = 2580549468160.000, df[0] = 1853455204352.000, dc_hat[0] = 1318384828416.000
Gradient do_[0] = 116907457904640.000
Backward Time Step 1:
Gradient di[0] = 3236397580288.000, df[0] = 2231712874496.000, dc_hat[0] = 1799772045312.000
Gradient do_[0] = 103021274988544.000
Backward Time Step 0:
Gradient di[0] = 3918156791808.000, df[0] = 2787745988608.000, dc_hat[0] = 3676552560640.000
Gradient do_[0] = 60574352277504.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2392916601339904.000, df[0] = -1878656142016512.000, dc_hat[0] = -1081564234514432.000
Gradient do_[0] = -139887608816730112.000
Backward Time Step 3:
Gradient di[0] = -3757213499785216.000, df[0] = -2868321162625024.000, dc_hat[0] = -1552403715325952.000
Gradient do_[0] = -191259188658176000.000
Backward Time Step 2:
Gradient di[0] = -4922017488830464.000, df[0] = -3708574605770752.000, dc_hat[0] = -2712071594573824.000
Gradient do_[0] = -218831298150531072.000
Backward Time Step 1:
Gradient di[0] = -6210088918319104.000, df[0] = -4443081659121664.000, dc_hat[0] = -3758459845607424.000
Gradient do_[0] = -196351387783659520.000
Backward Time Step 0:
Gradient di[0] = -7251047793819648.000, df[0] = -5299793148510208.000, dc_hat[0] = -7330566965821440.000
Gradient do_[0] = -112492331717885952.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1292928286720.000, df[0] = 947981320192.000, dc_hat[0] = 561304305664.000
Gradient do_[0] = 80163987521536.000
Backward Time Step 3:
Gradient di[0] = 2028455723008.000, df[0] = 1463823892480.000, dc_hat[0] = 794066878464.000
Gradient do_[0] = 107300773691392.000
Backward Time Step 2:
Gradient di[0] = 2581087911936.000, df[0] = 1853841866752.000, dc_hat[0] = 1318659686400.000
Gradient do_[0] = 116931801645056.000
Backward Time Step 1:
Gradient di[0] = 3237076008960.000, df[0] = 2232180539392.000, dc_hat[0] = 1800146649088.000
Gradient do_[0] = 103042842099712.000
Backward Time Step 0:
Gradient di[0] = 3918972583936.000, df[0] = 2788326375424.000, dc_hat[0] = 3677318283264.000
Gradient do_[0] = 60586964549632.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2393386631823360.000, df[0] = -1879024703897600.000, dc_hat[0] = -1081775828762624.000
Gradient do_[0] = -139915036477882368.000
Backward Time Step 3:
Gradient di[0] = -3757955992256512.000, df[0] = -2868888098308096.000, dc_hat[0] = -1552710537052160.000
Gradient do_[0] = -191296898471034880.000
Backward Time Step 2:
Gradient di[0] = -4922985467084800.000, df[0] = -3709303676469248.000, dc_hat[0] = -2712604438953984.000
Gradient do_[0] = -218874282183229440.000
Backward Time Step 1:
Gradient di[0] = -6211313520869376.000, df[0] = -4443957295579136.000, dc_hat[0] = -3759195358756864.000
Gradient do_[0] = -196390042489323520.000
Backward Time Step 0:
Gradient di[0] = -7252474796703744.000, df[0] = -5300836288692224.000, dc_hat[0] = -7332010074832896.000
Gradient do_[0] = -112514485159198720.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1293271695360.000, df[0] = 948233175040.000, dc_hat[0] = 561453072384.000
Gradient do_[0] = 80185277808640.000
Backward Time Step 3:
Gradient di[0] = 2028994297856.000, df[0] = 1464212652032.000, dc_hat[0] = 794277380096.000
Gradient do_[0] = 107329311735808.000
Backward Time Step 2:
Gradient di[0] = 2581776826368.000, df[0] = 1854336532480.000, dc_hat[0] = 1319009910784.000
Gradient do_[0] = 116962973712384.000
Backward Time Step 1:
Gradient di[0] = 3237938200576.000, df[0] = 2232775344128.000, dc_hat[0] = 1800624275456.000
Gradient do_[0] = 103070256070656.000
Backward Time Step 0:
Gradient di[0] = 3920019062784.000, df[0] = 2789070864384.000, dc_hat[0] = 3678300012544.000
Gradient do_[0] = 60603146174464.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2393880553062400.000, df[0] = -1879412861566976.000, dc_hat[0] = -1081999099953152.000
Gradient do_[0] = -139943941607784448.000
Backward Time Step 3:
Gradient di[0] = -3758728281063424.000, df[0] = -2869477582569472.000, dc_hat[0] = -1553029572591616.000
Gradient do_[0] = -191336171651989504.000
Backward Time Step 2:
Gradient di[0] = -4923995858141184.000, df[0] = -3710065496293376.000, dc_hat[0] = -2713157952864256.000
Gradient do_[0] = -218919156001538048.000
Backward Time Step 1:
Gradient di[0] = -6212595031736320.000, df[0] = -4444873734225920.000, dc_hat[0] = -3759966573821952.000
Gradient do_[0] = -196430518261121024.000
Backward Time Step 0:
Gradient di[0] = -7253969982193664.000, df[0] = -5301929357869056.000, dc_hat[0] = -7333521366450176.000
Gradient do_[0] = -112537669392662528.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1293596360704.000, df[0] = 948471005184.000, dc_hat[0] = 561593974784.000
Gradient do_[0] = 80205385302016.000
Backward Time Step 3:
Gradient di[0] = 2029503381504.000, df[0] = 1464580046848.000, dc_hat[0] = 794476019712.000
Gradient do_[0] = 107356205613056.000
Backward Time Step 2:
Gradient di[0] = 2582421176320.000, df[0] = 1854799347712.000, dc_hat[0] = 1319338377216.000
Gradient do_[0] = 116992098959360.000
Backward Time Step 1:
Gradient di[0] = 3238748749824.000, df[0] = 2233333972992.000, dc_hat[0] = 1801072934912.000
Gradient do_[0] = 103096025874432.000
Backward Time Step 0:
Gradient di[0] = 3921002627072.000, df[0] = 2789770526720.000, dc_hat[0] = 3679222759424.000
Gradient do_[0] = 60618342137856.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2394359441915904.000, df[0] = -1879788671205376.000, dc_hat[0] = -1082214989168640.000
Gradient do_[0] = -139971850305273856.000
Backward Time Step 3:
Gradient di[0] = -3759481510952960.000, df[0] = -2870052839751680.000, dc_hat[0] = -1553340689285120.000
Gradient do_[0] = -191374448400531456.000
Backward Time Step 2:
Gradient di[0] = -4924985311232000.000, df[0] = -3710810404683776.000, dc_hat[0] = -2713701803098112.000
Gradient do_[0] = -218963016207564800.000
Backward Time Step 1:
Gradient di[0] = -6213836814155776.000, df[0] = -4445762524020736.000, dc_hat[0] = -3760714435002368.000
Gradient do_[0] = -196469774262206464.000
Backward Time Step 0:
Gradient di[0] = -7255417386172416.000, df[0] = -5302986993565696.000, dc_hat[0] = -7334984876556288.000
Gradient do_[0] = -112560132071620608.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1293841727488.000, df[0] = 948651032576.000, dc_hat[0] = 561700536320.000
Gradient do_[0] = 80220593848320.000
Backward Time Step 3:
Gradient di[0] = 2029887946752.000, df[0] = 1464857395200.000, dc_hat[0] = 794626293760.000
Gradient do_[0] = 107376522821632.000
Backward Time Step 2:
Gradient di[0] = 2582909812736.000, df[0] = 1855150227456.000, dc_hat[0] = 1319587807232.000
Gradient do_[0] = 117014236495872.000
Backward Time Step 1:
Gradient di[0] = 3239361380352.000, df[0] = 2233756024832.000, dc_hat[0] = 1801411493888.000
Gradient do_[0] = 103115462279168.000
Backward Time Step 0:
Gradient di[0] = 3921740824576.000, df[0] = 2790295863296.000, dc_hat[0] = 3679915606016.000
Gradient do_[0] = 60629759033344.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2394851752542208.000, df[0] = -1880175218262016.000, dc_hat[0] = -1082437522161664.000
Gradient do_[0] = -140000635176091648.000
Backward Time Step 3:
Gradient di[0] = -3760256484114432.000, df[0] = -2870644471496704.000, dc_hat[0] = -1553659187953664.000
Gradient do_[0] = -191413910560047104.000
Backward Time Step 2:
Gradient di[0] = -4925998923513856.000, df[0] = -3711574908862464.000, dc_hat[0] = -2714258001362944.000
Gradient do_[0] = -219008096184303616.000
Backward Time Step 1:
Gradient di[0] = -6215121009377280.000, df[0] = -4446681110151168.000, dc_hat[0] = -3761486723809280.000
Gradient do_[0] = -196510250034003968.000
Backward Time Step 0:
Gradient di[0] = -7256906666082304.000, df[0] = -5304075767775232.000, dc_hat[0] = -7336490262593536.000
Gradient do_[0] = -112583238995673088.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1294130610176.000, df[0] = 948862779392.000, dc_hat[0] = 561825710080.000
Gradient do_[0] = 80238503526400.000
Backward Time Step 3:
Gradient di[0] = 2030340276224.000, df[0] = 1465183764480.000, dc_hat[0] = 794802913280.000
Gradient do_[0] = 107400430354432.000
Backward Time Step 2:
Gradient di[0] = 2583486791680.000, df[0] = 1855564546048.000, dc_hat[0] = 1319881539584.000
Gradient do_[0] = 117040358621184.000
Backward Time Step 1:
Gradient di[0] = 3240088567808.000, df[0] = 2234257506304.000, dc_hat[0] = 1801813360640.000
Gradient do_[0] = 103138556116992.000
Backward Time Step 0:
Gradient di[0] = 3922616123392.000, df[0] = 2790918717440.000, dc_hat[0] = 3680736903168.000
Gradient do_[0] = 60643289858048.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2395329836089344.000, df[0] = -1880550759464960.000, dc_hat[0] = -1082653679812608.000
Gradient do_[0] = -140028586823254016.000
Backward Time Step 3:
Gradient di[0] = -3761004613730304.000, df[0] = -2871215433711616.000, dc_hat[0] = -1553967888728064.000
Gradient do_[0] = -191451981150158848.000
Backward Time Step 2:
Gradient di[0] = -4926981397282816.000, df[0] = -3712314985414656.000, dc_hat[0] = -2714798898806784.000
Gradient do_[0] = -219051715872161792.000
Backward Time Step 1:
Gradient di[0] = -6216364402409472.000, df[0] = -4447570705252352.000, dc_hat[0] = -3762235658731520.000
Gradient do_[0] = -196549506035089408.000
Backward Time Step 0:
Gradient di[0] = -7258362123124736.000, df[0] = -5305139845922816.000, dc_hat[0] = -7337962362634240.000
Gradient do_[0] = -112605830523650048.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1294445182976.000, df[0] = 949093400576.000, dc_hat[0] = 561962418176.000
Gradient do_[0] = 80258007040000.000
Backward Time Step 3:
Gradient di[0] = 2030836776960.000, df[0] = 1465542115328.000, dc_hat[0] = 794997161984.000
Gradient do_[0] = 107426711863296.000
Backward Time Step 2:
Gradient di[0] = 2584118558720.000, df[0] = 1856018448384.000, dc_hat[0] = 1320204369920.000
Gradient do_[0] = 117069014106112.000
Backward Time Step 1:
Gradient di[0] = 3240877096960.000, df[0] = 2234801192960.000, dc_hat[0] = 1802249568256.000
Gradient do_[0] = 103163654832128.000
Backward Time Step 0:
Gradient di[0] = 3923561152512.000, df[0] = 2791591116800.000, dc_hat[0] = 3681623474176.000
Gradient do_[0] = 60657898618880.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2395798792830976.000, df[0] = -1880918918692864.000, dc_hat[0] = -1082865072734208.000
Gradient do_[0] = -140055980124667904.000
Backward Time Step 3:
Gradient di[0] = -3761737442525184.000, df[0] = -2871774853201920.000, dc_hat[0] = -1554270281269248.000
Gradient do_[0] = -191489244286418944.000
Backward Time Step 2:
Gradient di[0] = -4927938101248000.000, df[0] = -3713036271484928.000, dc_hat[0] = -2715325300736000.000
Gradient do_[0] = -219094236048392192.000
Backward Time Step 1:
Gradient di[0] = -6217575046316032.000, df[0] = -4448436678033408.000, dc_hat[0] = -3762964192559104.000
Gradient do_[0] = -196587696884285440.000
Backward Time Step 0:
Gradient di[0] = -7259777851719680.000, df[0] = -5306174396170240.000, dc_hat[0] = -7339393660485632.000
Gradient do_[0] = -112627777806532608.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1294700511232.000, df[0] = 949280505856.000, dc_hat[0] = 562073108480.000
Gradient do_[0] = 80273827954688.000
Backward Time Step 3:
Gradient di[0] = 2031234711552.000, df[0] = 1465829294080.000, dc_hat[0] = 795152613376.000
Gradient do_[0] = 107447775657984.000
Backward Time Step 2:
Gradient di[0] = 2584626331648.000, df[0] = 1856382828544.000, dc_hat[0] = 1320462057472.000
Gradient do_[0] = 117091906617344.000
Backward Time Step 1:
Gradient di[0] = 3241515941888.000, df[0] = 2235241332736.000, dc_hat[0] = 1802602938368.000
Gradient do_[0] = 103183955263488.000
Backward Time Step 0:
Gradient di[0] = 3924343652352.000, df[0] = 2792147910656.000, dc_hat[0] = 3682358001664.000
Gradient do_[0] = 60669994991616.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2396301304004608.000, df[0] = -1881313116160000.000, dc_hat[0] = -1083091162497024.000
Gradient do_[0] = -140085254621757440.000
Backward Time Step 3:
Gradient di[0] = -3762524226846720.000, df[0] = -2872375611752448.000, dc_hat[0] = -1554594417082368.000
Gradient do_[0] = -191529290561486848.000
Backward Time Step 2:
Gradient di[0] = -4928972651495424.000, df[0] = -3713815002742784.000, dc_hat[0] = -2715893310160896.000
Gradient do_[0] = -219140140658851840.000
Backward Time Step 1:
Gradient di[0] = -6218874810793984.000, df[0] = -4449366270017536.000, dc_hat[0] = -3763745608171520.000
Gradient do_[0] = -196628756771635200.000
Backward Time Step 0:
Gradient di[0] = -7261287532724224.000, df[0] = -5307277665894400.000, dc_hat[0] = -7340919447617536.000
Gradient do_[0] = -112651202558164992.000
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Backward Time Step 4:
Gradient di[0] = 1294983102464.000, df[0] = 949487927296.000, dc_hat[0] = 562195988480.000
Gradient do_[0] = 80291368534016.000
Backward Time Step 3:
Gradient di[0] = 2031680094208.000, df[0] = 1466150682624.000, dc_hat[0] = 795326414848.000
Gradient do_[0] = 107471297314816.000
Backward Time Step 2:
Gradient di[0] = 2585191251968.000, df[0] = 1856789020672.000, dc_hat[0] = 1320750284800.000
Gradient do_[0] = 117117542203392.000
Backward Time Step 1:
Gradient di[0] = 3242225303552.000, df[0] = 2235730493440.000, dc_hat[0] = 1802995499008.000
Gradient do_[0] = 103206512230400.000
Backward Time Step 0:
Gradient di[0] = 3925198241792.000, df[0] = 2792756084736.000, dc_hat[0] = 3683159900160.000
Gradient do_[0] = 60683207049216.000
Time Step 0:
i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795
c_state[0] = 0.431, h_state[0] = 0.064
Time Step 1:
i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837
c_state[0] = 0.654, h_state[0] = 0.077
Time Step 2:
i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837
c_state[0] = 0.772, h_state[0] = 0.086
Time Step 3:
i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868
c_state[0] = 0.843, h_state[0] = 0.094
Time Step 4:
i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853
c_state[0] = 0.890, h_state[0] = 0.099
Backward Time Step 4:
Gradient di[0] = -2396771334488064.000, df[0] = -1881681946476544.000, dc_hat[0] = -1083303494942720.000
Gradient do_[0] = -140112725232582656.000
Backward Time Step 3:
Gradient di[0] = -3763265108705280.000, df[0] = -2872941205258240.000, dc_hat[0] = -1554899762413568.000
Gradient do_[0] = -191566931654868992.000
Backward Time Step 2:
Gradient di[0] = -4929941166620672.000, df[0] = -3714544610312192.000, dc_hat[0] = -2716425080799232.000
Gradient do_[0] = -219183193411026944.000
Backward Time Step 1:
Gradient di[0] = -6220101560827904.000, df[0] = -4450243517087744.000, dc_hat[0] = -3764485684723712.000
Gradient do_[0] = -196667445837037568.000
Backward Time Step 0:
Gradient di[0] = -7262719367446528.000, df[0] = -5308324564172800.000, dc_hat[0] = -7342366851596288.000
Gradient do_[0] = -112673416129019904.000
Epoch 1000, Train Loss=0.011137, Weight Norm=13.278138
Sample Predictions at Epoch 1000:
-------------------------------------------------------------
| Day | Date | Predicted Close | Actual Close | Error |
-------------------------------------------------------------
| 192 | 2024-10-11 | 57.57 | 63.87 | 6.30 |
| 193 | 2024-10-14 | 56.93 | 66.55 | 9.62 |
| 194 | 2024-10-15 | 57.12 | 66.00 | 8.88 |
| 195 | 2024-10-16 | 58.09 | 67.20 | 9.11 |
| 196 | 2024-10-17 | 57.61 | 66.76 | 9.15 |
-------------------------------------------------------------
Time Step 0:
i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812
c_state[0] = 0.450, h_state[0] = 0.070
Time Step 1:
i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852
c_state[0] = 0.689, h_state[0] = 0.084
Time Step 2:
i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850
c_state[0] = 0.823, h_state[0] = 0.095
Time Step 3:
i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877
c_state[0] = 0.910, h_state[0] = 0.106
Time Step 4:
i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862
c_state[0] = 0.972, h_state[0] = 0.112
Validation (Last 30 Days):
-------------------------------------------------------------
| Day | Date | Predicted Close | Actual Close | Error |
-------------------------------------------------------------
| 197 | 2024-10-18 | 58.40 | 61.52 | 3.12 |
| 198 | 2024-10-21 | 58.40 | 60.68 | 2.28 |
| 199 | 2024-10-22 | 56.24 | 61.02 | 4.78 |
| 200 | 2024-10-23 | 55.44 | 58.63 | 3.19 |
| 201 | 2024-10-24 | 56.03 | 59.05 | 3.02 |
| 202 | 2024-10-25 | 55.90 | 59.18 | 3.28 |
| 203 | 2024-10-28 | 56.04 | 57.29 | 1.25 |
| 204 | 2024-10-29 | 56.53 | 55.64 | 0.89 |
| 205 | 2024-10-30 | 56.28 | 59.83 | 3.55 |
| 206 | 2024-10-31 | 55.90 | 60.01 | 4.11 |
| 207 | 2024-11-01 | 57.07 | 60.49 | 3.42 |
| 208 | 2024-11-04 | 57.39 | 59.74 | 2.35 |
| 209 | 2024-11-05 | 57.43 | 58.02 | 0.59 |
| 210 | 2024-11-06 | 56.95 | 56.46 | 0.49 |
| 211 | 2024-11-07 | 56.31 | 56.34 | 0.03 |
| 212 | 2024-11-08 | 55.67 | 56.42 | 0.75 |
| 213 | 2024-11-11 | 55.47 | 59.76 | 4.29 |
| 214 | 2024-11-12 | 55.48 | 61.61 | 6.13 |
| 215 | 2024-11-13 | 56.34 | 60.58 | 4.24 |
| 216 | 2024-11-14 | 56.90 | 61.62 | 4.72 |
| 217 | 2024-11-15 | 56.67 | 59.92 | 3.25 |
| 218 | 2024-11-18 | 56.50 | 57.39 | 0.89 |
| 219 | 2024-11-19 | 55.78 | 61.19 | 5.41 |
| 220 | 2024-11-20 | 54.68 | 62.95 | 8.27 |
| 221 | 2024-11-21 | 55.61 | 64.17 | 8.56 |
| 222 | 2024-11-22 | 56.17 | 63.00 | 6.83 |
| 223 | 2024-11-25 | 56.71 | 65.06 | 8.35 |
| 224 | 2024-11-26 | 57.61 | 63.68 | 6.07 |
| 225 | 2024-11-27 | 57.77 | 63.68 | 5.91 |
-------------------------------------------------------------
Validation Metrics:
Mean Absolute Error (MAE): 3.79
Root Mean Squared Error (RMSE): 4.49
Detailed Predictions for All Data:
--------------------------------------------------------------------
| Day | Date | Predicted Close | Actual Close | Error |
--------------------------------------------------------------------
| 0 | 2024-01-08 | 51.55 | 72.11 | 20.56 |
| 1 | 2024-01-09 | 56.39 | 73.28 | 16.89 |
| 2 | 2024-01-10 | 57.81 | 74.47 | 16.66 |
| 3 | 2024-01-11 | 58.27 | 58.81 | 0.54 |
| 4 | 2024-01-12 | 58.52 | 61.39 | 2.87 |
| 5 | 2024-01-16 | 53.72 | 59.09 | 5.37 |
| 6 | 2024-01-17 | 55.11 | 57.66 | 2.55 |
| 7 | 2024-01-18 | 54.86 | 56.45 | 1.59 |
| 8 | 2024-01-19 | 54.35 | 57.27 | 2.92 |
| 9 | 2024-01-22 | 53.68 | 55.65 | 1.97 |
| 10 | 2024-01-23 | 53.97 | 54.79 | 0.82 |
| 11 | 2024-01-24 | 53.49 | 53.40 | 0.09 |
| 12 | 2024-01-25 | 53.05 | 55.25 | 2.20 |
| 13 | 2024-01-26 | 52.37 | 55.58 | 3.21 |
| 14 | 2024-01-29 | 53.22 | 57.24 | 4.02 |
| 15 | 2024-01-30 | 53.38 | 59.82 | 6.44 |
| 16 | 2024-01-31 | 53.73 | 58.73 | 5.00 |
| 17 | 2024-02-01 | 54.18 | 58.80 | 4.62 |
| 18 | 2024-02-02 | 53.67 | 59.14 | 5.47 |
| 19 | 2024-02-05 | 54.19 | 58.76 | 4.57 |
| 20 | 2024-02-06 | 54.19 | 58.32 | 4.13 |
| 21 | 2024-02-07 | 53.83 | 55.58 | 1.75 |
| 22 | 2024-02-08 | 53.32 | 55.22 | 1.90 |
| 23 | 2024-02-09 | 52.60 | 56.55 | 3.95 |
| 24 | 2024-02-12 | 51.91 | 56.60 | 4.69 |
| 25 | 2024-02-13 | 52.31 | 56.39 | 4.08 |
| 26 | 2024-02-14 | 52.29 | 58.62 | 6.33 |
| 27 | 2024-02-15 | 51.96 | 59.54 | 7.58 |
| 28 | 2024-02-16 | 52.62 | 57.92 | 5.30 |
| 29 | 2024-02-20 | 52.79 | 58.10 | 5.31 |
| 30 | 2024-02-21 | 52.23 | 57.07 | 4.84 |
| 31 | 2024-02-22 | 52.36 | 53.81 | 1.45 |
| 32 | 2024-02-23 | 53.14 | 53.88 | 0.74 |
| 33 | 2024-02-26 | 52.08 | 53.51 | 1.43 |
| 34 | 2024-02-27 | 52.81 | 52.55 | 0.26 |
| 35 | 2024-02-28 | 52.69 | 51.48 | 1.21 |
| 36 | 2024-02-29 | 52.22 | 52.52 | 0.30 |
| 37 | 2024-03-01 | 51.96 | 56.09 | 4.13 |
| 38 | 2024-03-04 | 52.20 | 56.39 | 4.19 |
| 39 | 2024-03-05 | 53.32 | 56.00 | 2.68 |
| 40 | 2024-03-06 | 53.36 | 55.48 | 2.12 |
| 41 | 2024-03-07 | 52.87 | 55.38 | 2.51 |
| 42 | 2024-03-08 | 52.53 | 52.44 | 0.09 |
| 43 | 2024-03-11 | 53.15 | 53.38 | 0.23 |
| 44 | 2024-03-12 | 52.30 | 54.41 | 2.11 |
| 45 | 2024-03-13 | 52.51 | 54.54 | 2.03 |
| 46 | 2024-03-14 | 52.67 | 55.20 | 2.53 |
| 47 | 2024-03-15 | 52.80 | 53.47 | 0.67 |
| 48 | 2024-03-18 | 52.71 | 52.92 | 0.21 |
| 49 | 2024-03-19 | 51.89 | 52.90 | 1.01 |
| 50 | 2024-03-20 | 51.53 | 52.87 | 1.34 |
| 51 | 2024-03-21 | 51.50 | 53.28 | 1.78 |
| 52 | 2024-03-22 | 51.36 | 54.28 | 2.92 |
| 53 | 2024-03-25 | 51.60 | 57.08 | 5.48 |
| 54 | 2024-03-26 | 52.23 | 56.50 | 4.27 |
| 55 | 2024-03-27 | 53.22 | 56.28 | 3.06 |
| 56 | 2024-03-28 | 52.87 | 58.51 | 5.64 |
| 57 | 2024-04-01 | 54.48 | 58.21 | 3.73 |
| 58 | 2024-04-02 | 55.19 | 57.44 | 2.25 |
| 59 | 2024-04-03 | 54.73 | 56.97 | 2.24 |
| 60 | 2024-04-04 | 54.23 | 55.18 | 0.95 |
| 61 | 2024-04-05 | 53.37 | 53.52 | 0.15 |
| 62 | 2024-04-08 | 52.24 | 53.09 | 0.85 |
| 63 | 2024-04-09 | 51.70 | 48.52 | 3.18 |
| 64 | 2024-04-10 | 51.30 | 47.20 | 4.10 |
| 65 | 2024-04-11 | 51.02 | 47.09 | 3.93 |
| 66 | 2024-04-12 | 50.22 | 44.75 | 5.47 |
| 67 | 2024-04-15 | 50.05 | 44.87 | 5.18 |
| 68 | 2024-04-16 | 49.52 | 44.12 | 5.40 |
| 69 | 2024-04-17 | 50.11 | 43.54 | 6.57 |
| 70 | 2024-04-18 | 50.31 | 44.26 | 6.05 |
| 71 | 2024-04-19 | 50.35 | 43.59 | 6.76 |
| 72 | 2024-04-22 | 50.95 | 46.32 | 4.63 |
| 73 | 2024-04-23 | 50.94 | 45.22 | 5.72 |
| 74 | 2024-04-24 | 52.20 | 44.53 | 7.67 |
| 75 | 2024-04-25 | 51.96 | 43.41 | 8.55 |
| 76 | 2024-04-26 | 51.43 | 43.01 | 8.42 |
| 77 | 2024-04-29 | 50.90 | 42.90 | 8.00 |
| 78 | 2024-04-30 | 50.78 | 43.79 | 6.99 |
| 79 | 2024-05-01 | 50.78 | 44.07 | 6.71 |
| 80 | 2024-05-02 | 51.29 | 43.60 | 7.69 |
| 81 | 2024-05-03 | 51.34 | 48.55 | 2.79 |
| 82 | 2024-05-06 | 51.20 | 49.41 | 1.79 |
| 83 | 2024-05-07 | 53.57 | 49.70 | 3.87 |
| 84 | 2024-05-08 | 53.54 | 51.31 | 2.23 |
| 85 | 2024-05-09 | 53.45 | 51.75 | 1.70 |
| 86 | 2024-05-10 | 53.96 | 53.20 | 0.76 |
| 87 | 2024-05-13 | 54.00 | 52.55 | 1.45 |
| 88 | 2024-05-14 | 54.43 | 53.00 | 1.43 |
| 89 | 2024-05-15 | 55.82 | 50.23 | 5.59 |
| 90 | 2024-05-16 | 55.88 | 50.70 | 5.18 |
| 91 | 2024-05-17 | 54.64 | 52.10 | 2.54 |
| 92 | 2024-05-20 | 54.33 | 50.94 | 3.38 |
| 93 | 2024-05-21 | 54.82 | 48.44 | 6.38 |
| 94 | 2024-05-22 | 54.24 | 50.62 | 3.61 |
| 95 | 2024-05-23 | 52.81 | 55.45 | 2.64 |
| 96 | 2024-05-24 | 53.42 | 53.02 | 0.40 |
| 97 | 2024-05-28 | 55.38 | 53.61 | 1.77 |
| 98 | 2024-05-29 | 54.63 | 53.61 | 1.02 |
| 99 | 2024-05-30 | 54.73 | 55.06 | 0.33 |
| 100 | 2024-05-31 | 54.52 | 51.96 | 2.56 |
| 101 | 2024-06-03 | 54.87 | 53.44 | 1.43 |
| 102 | 2024-06-04 | 53.89 | 53.34 | 0.55 |
| 103 | 2024-06-05 | 54.12 | 52.87 | 1.25 |
| 104 | 2024-06-06 | 53.91 | 50.13 | 3.78 |
| 105 | 2024-06-07 | 53.77 | 51.22 | 2.55 |
| 106 | 2024-06-10 | 52.94 | 48.46 | 4.48 |
| 107 | 2024-06-11 | 53.31 | 47.68 | 5.63 |
| 108 | 2024-06-12 | 53.45 | 42.70 | 10.75 |
| 109 | 2024-06-13 | 53.48 | 43.06 | 10.42 |
| 110 | 2024-06-14 | 52.53 | 43.22 | 9.31 |
| 111 | 2024-06-17 | 52.84 | 40.86 | 11.98 |
| 112 | 2024-06-18 | 53.09 | 41.72 | 11.37 |
| 113 | 2024-06-20 | 52.31 | 42.00 | 10.31 |
| 114 | 2024-06-21 | 52.37 | 44.14 | 8.23 |
| 115 | 2024-06-24 | 52.49 | 42.97 | 9.52 |
| 116 | 2024-06-25 | 52.93 | 44.80 | 8.13 |
| 117 | 2024-06-26 | 52.56 | 45.41 | 7.15 |
| 118 | 2024-06-27 | 52.56 | 46.34 | 6.22 |
| 119 | 2024-06-28 | 52.47 | 46.54 | 5.93 |
| 120 | 2024-07-01 | 52.30 | 47.29 | 5.01 |
| 121 | 2024-07-02 | 51.98 | 46.13 | 5.85 |
| 122 | 2024-07-03 | 51.93 | 46.93 | 5.00 |
| 123 | 2024-07-05 | 51.36 | 47.34 | 4.02 |
| 124 | 2024-07-08 | 51.22 | 47.02 | 4.20 |
| 125 | 2024-07-09 | 51.06 | 46.10 | 4.96 |
| 126 | 2024-07-10 | 50.92 | 47.89 | 3.03 |
| 127 | 2024-07-11 | 50.45 | 47.38 | 3.07 |
| 128 | 2024-07-12 | 50.98 | 44.69 | 6.29 |
| 129 | 2024-07-15 | 50.63 | 54.35 | 3.72 |
| 130 | 2024-07-16 | 49.45 | 55.87 | 6.42 |
| 131 | 2024-07-17 | 54.37 | 53.08 | 1.29 |
| 132 | 2024-07-18 | 55.63 | 53.46 | 2.17 |
| 133 | 2024-07-19 | 54.89 | 56.20 | 1.31 |
| 134 | 2024-07-22 | 54.50 | 55.27 | 0.77 |
| 135 | 2024-07-23 | 54.85 | 54.17 | 0.68 |
| 136 | 2024-07-24 | 54.53 | 54.53 | 0.00 |
| 137 | 2024-07-25 | 53.91 | 52.76 | 1.15 |
| 138 | 2024-07-26 | 54.11 | 51.63 | 2.48 |
| 139 | 2024-07-29 | 53.50 | 51.16 | 2.34 |
| 140 | 2024-07-30 | 53.14 | 52.35 | 0.79 |
| 141 | 2024-07-31 | 52.96 | 52.02 | 0.94 |
| 142 | 2024-08-01 | 53.56 | 53.17 | 0.39 |
| 143 | 2024-08-02 | 53.48 | 53.05 | 0.43 |
| 144 | 2024-08-05 | 54.55 | 51.67 | 2.88 |
| 145 | 2024-08-06 | 54.51 | 51.38 | 3.13 |
| 146 | 2024-08-07 | 54.18 | 51.62 | 2.56 |
| 147 | 2024-08-08 | 53.93 | 51.54 | 2.39 |
| 148 | 2024-08-09 | 53.98 | 49.68 | 4.30 |
| 149 | 2024-08-12 | 53.89 | 50.19 | 3.70 |
| 150 | 2024-08-13 | 53.35 | 48.73 | 4.62 |
| 151 | 2024-08-14 | 53.47 | 50.18 | 3.29 |
| 152 | 2024-08-15 | 53.10 | 49.48 | 3.62 |
| 153 | 2024-08-16 | 53.59 | 49.96 | 3.63 |
| 154 | 2024-08-19 | 53.37 | 53.85 | 0.48 |
| 155 | 2024-08-20 | 54.13 | 54.37 | 0.24 |
| 156 | 2024-08-21 | 54.99 | 54.76 | 0.23 |
| 157 | 2024-08-22 | 55.22 | 54.97 | 0.25 |
| 158 | 2024-08-23 | 55.20 | 54.36 | 0.84 |
| 159 | 2024-08-26 | 55.07 | 54.35 | 0.72 |
| 160 | 2024-08-27 | 54.62 | 54.43 | 0.19 |
| 161 | 2024-08-28 | 55.69 | 55.29 | 0.40 |
| 162 | 2024-08-29 | 55.62 | 55.43 | 0.19 |
| 163 | 2024-08-30 | 56.07 | 54.85 | 1.22 |
| 164 | 2024-09-03 | 56.17 | 53.05 | 3.12 |
| 165 | 2024-09-04 | 55.47 | 55.63 | 0.16 |
| 166 | 2024-09-05 | 54.39 | 55.09 | 0.70 |
| 167 | 2024-09-06 | 55.14 | 53.91 | 1.23 |
| 168 | 2024-09-09 | 54.91 | 54.73 | 0.18 |
| 169 | 2024-09-10 | 54.22 | 54.61 | 0.39 |
| 170 | 2024-09-11 | 54.31 | 53.74 | 0.57 |
| 171 | 2024-09-12 | 54.10 | 56.91 | 2.81 |
| 172 | 2024-09-13 | 53.57 | 56.00 | 2.43 |
| 173 | 2024-09-16 | 54.65 | 56.45 | 1.80 |
| 174 | 2024-09-17 | 54.34 | 60.23 | 5.89 |
| 175 | 2024-09-18 | 54.46 | 61.21 | 6.75 |
| 176 | 2024-09-19 | 56.18 | 60.53 | 4.35 |
| 177 | 2024-09-20 | 56.60 | 60.15 | 3.55 |
| 178 | 2024-09-23 | 56.30 | 59.91 | 3.61 |
| 179 | 2024-09-24 | 56.17 | 60.91 | 4.74 |
| 180 | 2024-09-25 | 55.93 | 61.14 | 5.21 |
| 181 | 2024-09-26 | 57.06 | 59.88 | 2.82 |
| 182 | 2024-09-27 | 57.10 | 59.96 | 2.86 |
| 183 | 2024-09-30 | 56.60 | 60.90 | 4.30 |
| 184 | 2024-10-01 | 56.56 | 62.12 | 5.56 |
| 185 | 2024-10-02 | 56.88 | 64.79 | 7.91 |
| 186 | 2024-10-03 | 57.40 | 66.61 | 9.21 |
| 187 | 2024-10-04 | 58.55 | 66.37 | 7.82 |
| 188 | 2024-10-07 | 59.12 | 66.39 | 7.27 |
| 189 | 2024-10-08 | 59.21 | 63.78 | 4.57 |
| 190 | 2024-10-09 | 59.20 | 63.77 | 4.57 |
| 191 | 2024-10-10 | 58.11 | 63.02 | 4.91 |
| 192 | 2024-10-11 | 57.89 | 63.87 | 5.98 |
| 193 | 2024-10-14 | 57.20 | 66.55 | 9.35 |
| 194 | 2024-10-15 | 57.35 | 66.00 | 8.65 |
| 195 | 2024-10-16 | 58.29 | 67.20 | 8.91 |
| 196 | 2024-10-17 | 57.75 | 66.76 | 9.01 |
| 197 | 2024-10-18 | 58.40 | 61.52 | 3.12 |
| 198 | 2024-10-21 | 58.40 | 60.68 | 2.28 |
| 199 | 2024-10-22 | 56.24 | 61.02 | 4.78 |
| 200 | 2024-10-23 | 55.44 | 58.63 | 3.19 |
| 201 | 2024-10-24 | 56.03 | 59.05 | 3.02 |
| 202 | 2024-10-25 | 55.90 | 59.18 | 3.28 |
| 203 | 2024-10-28 | 56.04 | 57.29 | 1.25 |
| 204 | 2024-10-29 | 56.53 | 55.64 | 0.89 |
| 205 | 2024-10-30 | 56.28 | 59.83 | 3.55 |
| 206 | 2024-10-31 | 55.90 | 60.01 | 4.11 |
| 207 | 2024-11-01 | 57.07 | 60.49 | 3.42 |
| 208 | 2024-11-04 | 57.39 | 59.74 | 2.35 |
| 209 | 2024-11-05 | 57.43 | 58.02 | 0.59 |
| 210 | 2024-11-06 | 56.95 | 56.46 | 0.49 |
| 211 | 2024-11-07 | 56.31 | 56.34 | 0.03 |
| 212 | 2024-11-08 | 55.67 | 56.42 | 0.75 |
| 213 | 2024-11-11 | 55.47 | 59.76 | 4.29 |
| 214 | 2024-11-12 | 55.48 | 61.61 | 6.13 |
| 215 | 2024-11-13 | 56.34 | 60.58 | 4.24 |
| 216 | 2024-11-14 | 56.90 | 61.62 | 4.72 |
| 217 | 2024-11-15 | 56.67 | 59.92 | 3.25 |
| 218 | 2024-11-18 | 56.50 | 57.39 | 0.89 |
| 219 | 2024-11-19 | 55.78 | 61.19 | 5.41 |
| 220 | 2024-11-20 | 54.68 | 62.95 | 8.27 |
| 221 | 2024-11-21 | 55.61 | 64.17 | 8.56 |
| 222 | 2024-11-22 | 56.17 | 63.00 | 6.83 |
| 223 | 2024-11-25 | 56.71 | 65.06 | 8.35 |
| 224 | 2024-11-26 | 57.61 | 63.68 | 6.07 |
| 225 | 2024-11-27 | 57.77 | 63.68 | 5.91 |
--------------------------------------------------------------------