diff --git a/src/nn_tests/RNN-LSTM/l.txt b/src/nn_tests/RNN-LSTM/l.txt new file mode 100644 index 0000000..e69de29 diff --git a/src/nn_tests/RNN-LSTM/output.txt b/src/nn_tests/RNN-LSTM/output.txt new file mode 100644 index 0000000..0b936a9 --- /dev/null +++ b/src/nn_tests/RNN-LSTM/output.txt @@ -0,0 +1,39519 @@ +Total valid daily bars used: 227 +First day: 2024-01-08 O=59.23 H=60.68 L=58.82 C=59.64 V=124629 +Last day: 2024-11-29 O=64.45 H=64.45 L=63.00 C=63.77 V=62082 + +Target Min: 40.86, Target Max: 74.47 + +Normalized Targets (First 5 Samples): +Sample 0: 0.933 +Sample 1: 0.930 +Sample 2: 0.965 +Sample 3: 1.000 +Sample 4: 0.534 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.677, o_gate[0] = 0.522, c_hat[0] = -0.501 + c_state[0] = -0.032, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.066, f_gate[0] = 0.703, o_gate[0] = 0.514, c_hat[0] = -0.465 + c_state[0] = -0.053, h_state[0] = -0.027 +Time Step 2: + i_gate[0] = 0.064, f_gate[0] = 0.701, o_gate[0] = 0.521, c_hat[0] = -0.467 + c_state[0] = -0.067, h_state[0] = -0.035 +Time Step 3: + i_gate[0] = 0.060, f_gate[0] = 0.730, o_gate[0] = 0.502, c_hat[0] = -0.485 + c_state[0] = -0.078, h_state[0] = -0.039 +Time Step 4: + i_gate[0] = 0.055, f_gate[0] = 0.746, o_gate[0] = 0.530, c_hat[0] = -0.576 + c_state[0] = -0.090, h_state[0] = -0.048 +Backward Time Step 4: + Gradient di[0] = -0.013, df[0] = 0.061, dc_hat[0] = 0.016 + Gradient do_[0] = -0.050 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.061, dc_hat[0] = 0.019 + Gradient do_[0] = -0.047 +Backward Time Step 2: + Gradient di[0] = -0.016, df[0] = 0.085, dc_hat[0] = 0.029 + Gradient do_[0] = -0.050 +Backward Time Step 1: + Gradient di[0] = -0.016, df[0] = 0.083, dc_hat[0] = 0.029 + Gradient do_[0] = -0.040 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.077, dc_hat[0] = 0.025 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.676, o_gate[0] = 0.523, c_hat[0] = -0.504 + c_state[0] = -0.033, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.066, f_gate[0] = 0.702, o_gate[0] = 0.516, c_hat[0] = -0.470 + c_state[0] = -0.054, h_state[0] = -0.028 +Time Step 2: + i_gate[0] = 0.065, f_gate[0] = 0.700, o_gate[0] = 0.522, c_hat[0] = -0.472 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 3: + i_gate[0] = 0.061, f_gate[0] = 0.729, o_gate[0] = 0.503, c_hat[0] = -0.489 + c_state[0] = -0.079, h_state[0] = -0.040 +Time Step 4: + i_gate[0] = 0.056, f_gate[0] = 0.745, o_gate[0] = 0.530, c_hat[0] = -0.580 + c_state[0] = -0.091, h_state[0] = -0.048 +Backward Time Step 4: + Gradient di[0] = -0.013, df[0] = 0.060, dc_hat[0] = 0.016 + Gradient do_[0] = -0.050 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.060, dc_hat[0] = 0.019 + Gradient do_[0] = -0.048 +Backward Time Step 2: + Gradient di[0] = -0.016, df[0] = 0.085, dc_hat[0] = 0.029 + Gradient do_[0] = -0.050 +Backward Time Step 1: + Gradient di[0] = -0.016, df[0] = 0.083, dc_hat[0] = 0.029 + Gradient do_[0] = -0.040 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.077, dc_hat[0] = 0.025 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.675, o_gate[0] = 0.524, c_hat[0] = -0.508 + c_state[0] = -0.033, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.066, f_gate[0] = 0.701, o_gate[0] = 0.517, c_hat[0] = -0.474 + c_state[0] = -0.054, h_state[0] = -0.028 +Time Step 2: + i_gate[0] = 0.065, f_gate[0] = 0.699, o_gate[0] = 0.523, c_hat[0] = -0.476 + c_state[0] = -0.069, h_state[0] = -0.036 +Time Step 3: + i_gate[0] = 0.061, f_gate[0] = 0.728, o_gate[0] = 0.504, c_hat[0] = -0.493 + c_state[0] = -0.080, h_state[0] = -0.040 +Time Step 4: + i_gate[0] = 0.056, f_gate[0] = 0.743, o_gate[0] = 0.531, c_hat[0] = -0.583 + c_state[0] = -0.092, h_state[0] = -0.049 +Backward Time Step 4: + Gradient di[0] = -0.013, df[0] = 0.059, dc_hat[0] = 0.015 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.060, dc_hat[0] = 0.019 + Gradient do_[0] = -0.048 +Backward Time Step 2: + Gradient di[0] = -0.017, df[0] = 0.085, dc_hat[0] = 0.029 + Gradient do_[0] = -0.050 +Backward Time Step 1: + Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.029 + Gradient do_[0] = -0.041 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.077, dc_hat[0] = 0.025 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.674, o_gate[0] = 0.525, c_hat[0] = -0.511 + c_state[0] = -0.033, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.066, f_gate[0] = 0.700, o_gate[0] = 0.518, c_hat[0] = -0.478 + c_state[0] = -0.055, h_state[0] = -0.028 +Time Step 2: + i_gate[0] = 0.065, f_gate[0] = 0.698, o_gate[0] = 0.524, c_hat[0] = -0.481 + c_state[0] = -0.070, h_state[0] = -0.036 +Time Step 3: + i_gate[0] = 0.061, f_gate[0] = 0.727, o_gate[0] = 0.504, c_hat[0] = -0.497 + c_state[0] = -0.081, h_state[0] = -0.041 +Time Step 4: + i_gate[0] = 0.056, f_gate[0] = 0.742, o_gate[0] = 0.531, c_hat[0] = -0.587 + c_state[0] = -0.093, h_state[0] = -0.049 +Backward Time Step 4: + Gradient di[0] = -0.013, df[0] = 0.058, dc_hat[0] = 0.015 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.060, dc_hat[0] = 0.019 + Gradient do_[0] = -0.049 +Backward Time Step 2: + Gradient di[0] = -0.017, df[0] = 0.084, dc_hat[0] = 0.029 + Gradient do_[0] = -0.051 +Backward Time Step 1: + Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.029 + Gradient do_[0] = -0.041 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.077, dc_hat[0] = 0.025 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.673, o_gate[0] = 0.526, c_hat[0] = -0.515 + c_state[0] = -0.033, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.066, f_gate[0] = 0.699, o_gate[0] = 0.519, c_hat[0] = -0.483 + c_state[0] = -0.055, h_state[0] = -0.029 +Time Step 2: + i_gate[0] = 0.065, f_gate[0] = 0.697, o_gate[0] = 0.525, c_hat[0] = -0.485 + c_state[0] = -0.070, h_state[0] = -0.037 +Time Step 3: + i_gate[0] = 0.062, f_gate[0] = 0.726, o_gate[0] = 0.505, c_hat[0] = -0.501 + c_state[0] = -0.082, h_state[0] = -0.041 +Time Step 4: + i_gate[0] = 0.057, f_gate[0] = 0.741, o_gate[0] = 0.532, c_hat[0] = -0.590 + c_state[0] = -0.094, h_state[0] = -0.050 +Backward Time Step 4: + Gradient di[0] = -0.013, df[0] = 0.058, dc_hat[0] = 0.015 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.060, dc_hat[0] = 0.019 + Gradient do_[0] = -0.049 +Backward Time Step 2: + Gradient di[0] = -0.017, df[0] = 0.084, dc_hat[0] = 0.029 + Gradient do_[0] = -0.051 +Backward Time Step 1: + Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.029 + Gradient do_[0] = -0.041 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.077, dc_hat[0] = 0.025 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.672, o_gate[0] = 0.528, c_hat[0] = -0.518 + c_state[0] = -0.034, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.066, f_gate[0] = 0.698, o_gate[0] = 0.520, c_hat[0] = -0.487 + c_state[0] = -0.056, h_state[0] = -0.029 +Time Step 2: + i_gate[0] = 0.066, f_gate[0] = 0.696, o_gate[0] = 0.526, c_hat[0] = -0.489 + c_state[0] = -0.071, h_state[0] = -0.037 +Time Step 3: + i_gate[0] = 0.062, f_gate[0] = 0.724, o_gate[0] = 0.506, c_hat[0] = -0.505 + c_state[0] = -0.083, h_state[0] = -0.042 +Time Step 4: + i_gate[0] = 0.057, f_gate[0] = 0.739, o_gate[0] = 0.532, c_hat[0] = -0.593 + c_state[0] = -0.095, h_state[0] = -0.050 +Backward Time Step 4: + Gradient di[0] = -0.013, df[0] = 0.057, dc_hat[0] = 0.015 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.060, dc_hat[0] = 0.019 + Gradient do_[0] = -0.050 +Backward Time Step 2: + Gradient di[0] = -0.017, df[0] = 0.084, dc_hat[0] = 0.028 + Gradient do_[0] = -0.051 +Backward Time Step 1: + Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.029 + Gradient do_[0] = -0.041 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.077, dc_hat[0] = 0.025 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.671, o_gate[0] = 0.529, c_hat[0] = -0.521 + c_state[0] = -0.034, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.067, f_gate[0] = 0.698, o_gate[0] = 0.521, c_hat[0] = -0.491 + c_state[0] = -0.056, h_state[0] = -0.029 +Time Step 2: + i_gate[0] = 0.066, f_gate[0] = 0.695, o_gate[0] = 0.527, c_hat[0] = -0.493 + c_state[0] = -0.072, h_state[0] = -0.038 +Time Step 3: + i_gate[0] = 0.062, f_gate[0] = 0.723, o_gate[0] = 0.507, c_hat[0] = -0.509 + c_state[0] = -0.084, h_state[0] = -0.042 +Time Step 4: + i_gate[0] = 0.057, f_gate[0] = 0.738, o_gate[0] = 0.533, c_hat[0] = -0.597 + c_state[0] = -0.096, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.013, df[0] = 0.056, dc_hat[0] = 0.015 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.059, dc_hat[0] = 0.019 + Gradient do_[0] = -0.050 +Backward Time Step 2: + Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.028 + Gradient do_[0] = -0.052 +Backward Time Step 1: + Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.028 + Gradient do_[0] = -0.042 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.077, dc_hat[0] = 0.025 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.670, o_gate[0] = 0.530, c_hat[0] = -0.524 + c_state[0] = -0.034, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.067, f_gate[0] = 0.697, o_gate[0] = 0.523, c_hat[0] = -0.495 + c_state[0] = -0.057, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.066, f_gate[0] = 0.694, o_gate[0] = 0.528, c_hat[0] = -0.497 + c_state[0] = -0.072, h_state[0] = -0.038 +Time Step 3: + i_gate[0] = 0.063, f_gate[0] = 0.722, o_gate[0] = 0.507, c_hat[0] = -0.513 + c_state[0] = -0.084, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.058, f_gate[0] = 0.737, o_gate[0] = 0.533, c_hat[0] = -0.600 + c_state[0] = -0.097, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.013, df[0] = 0.056, dc_hat[0] = 0.014 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.059, dc_hat[0] = 0.019 + Gradient do_[0] = -0.051 +Backward Time Step 2: + Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.028 + Gradient do_[0] = -0.052 +Backward Time Step 1: + Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.028 + Gradient do_[0] = -0.042 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.025 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.669, o_gate[0] = 0.531, c_hat[0] = -0.527 + c_state[0] = -0.034, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.067, f_gate[0] = 0.696, o_gate[0] = 0.524, c_hat[0] = -0.499 + c_state[0] = -0.057, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.066, f_gate[0] = 0.693, o_gate[0] = 0.529, c_hat[0] = -0.501 + c_state[0] = -0.073, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.063, f_gate[0] = 0.721, o_gate[0] = 0.508, c_hat[0] = -0.517 + c_state[0] = -0.085, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.058, f_gate[0] = 0.736, o_gate[0] = 0.534, c_hat[0] = -0.603 + c_state[0] = -0.098, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.013, df[0] = 0.055, dc_hat[0] = 0.014 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.059, dc_hat[0] = 0.019 + Gradient do_[0] = -0.051 +Backward Time Step 2: + Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.028 + Gradient do_[0] = -0.052 +Backward Time Step 1: + Gradient di[0] = -0.017, df[0] = 0.083, dc_hat[0] = 0.028 + Gradient do_[0] = -0.042 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.025 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.668, o_gate[0] = 0.532, c_hat[0] = -0.530 + c_state[0] = -0.034, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.067, f_gate[0] = 0.695, o_gate[0] = 0.525, c_hat[0] = -0.503 + c_state[0] = -0.058, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.067, f_gate[0] = 0.692, o_gate[0] = 0.530, c_hat[0] = -0.505 + c_state[0] = -0.074, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.063, f_gate[0] = 0.720, o_gate[0] = 0.509, c_hat[0] = -0.520 + c_state[0] = -0.086, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.058, f_gate[0] = 0.734, o_gate[0] = 0.534, c_hat[0] = -0.606 + c_state[0] = -0.098, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.013, df[0] = 0.054, dc_hat[0] = 0.014 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.059, dc_hat[0] = 0.019 + Gradient do_[0] = -0.052 +Backward Time Step 2: + Gradient di[0] = -0.018, df[0] = 0.082, dc_hat[0] = 0.028 + Gradient do_[0] = -0.052 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.082, dc_hat[0] = 0.028 + Gradient do_[0] = -0.043 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.024 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.668, o_gate[0] = 0.533, c_hat[0] = -0.533 + c_state[0] = -0.035, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.067, f_gate[0] = 0.694, o_gate[0] = 0.526, c_hat[0] = -0.507 + c_state[0] = -0.058, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.067, f_gate[0] = 0.691, o_gate[0] = 0.531, c_hat[0] = -0.509 + c_state[0] = -0.074, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.509, c_hat[0] = -0.524 + c_state[0] = -0.087, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.059, f_gate[0] = 0.733, o_gate[0] = 0.535, c_hat[0] = -0.609 + c_state[0] = -0.099, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.013, df[0] = 0.054, dc_hat[0] = 0.014 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.013, df[0] = 0.058, dc_hat[0] = 0.018 + Gradient do_[0] = -0.052 +Backward Time Step 2: + Gradient di[0] = -0.018, df[0] = 0.082, dc_hat[0] = 0.028 + Gradient do_[0] = -0.052 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.082, dc_hat[0] = 0.028 + Gradient do_[0] = -0.043 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.024 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.667, o_gate[0] = 0.534, c_hat[0] = -0.536 + c_state[0] = -0.035, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.068, f_gate[0] = 0.693, o_gate[0] = 0.527, c_hat[0] = -0.510 + c_state[0] = -0.059, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.067, f_gate[0] = 0.691, o_gate[0] = 0.532, c_hat[0] = -0.513 + c_state[0] = -0.075, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.510, c_hat[0] = -0.527 + c_state[0] = -0.087, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.059, f_gate[0] = 0.732, o_gate[0] = 0.535, c_hat[0] = -0.612 + c_state[0] = -0.100, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.013, df[0] = 0.058, dc_hat[0] = 0.018 + Gradient do_[0] = -0.052 +Backward Time Step 2: + Gradient di[0] = -0.018, df[0] = 0.081, dc_hat[0] = 0.027 + Gradient do_[0] = -0.053 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.082, dc_hat[0] = 0.028 + Gradient do_[0] = -0.043 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.024 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.666, o_gate[0] = 0.535, c_hat[0] = -0.539 + c_state[0] = -0.035, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.068, f_gate[0] = 0.693, o_gate[0] = 0.528, c_hat[0] = -0.514 + c_state[0] = -0.059, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.067, f_gate[0] = 0.690, o_gate[0] = 0.532, c_hat[0] = -0.516 + c_state[0] = -0.076, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.064, f_gate[0] = 0.717, o_gate[0] = 0.511, c_hat[0] = -0.531 + c_state[0] = -0.088, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.059, f_gate[0] = 0.731, o_gate[0] = 0.535, c_hat[0] = -0.615 + c_state[0] = -0.101, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.012, df[0] = 0.052, dc_hat[0] = 0.013 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.013, df[0] = 0.058, dc_hat[0] = 0.018 + Gradient do_[0] = -0.053 +Backward Time Step 2: + Gradient di[0] = -0.018, df[0] = 0.081, dc_hat[0] = 0.027 + Gradient do_[0] = -0.053 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.082, dc_hat[0] = 0.028 + Gradient do_[0] = -0.043 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.024 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.665, o_gate[0] = 0.536, c_hat[0] = -0.541 + c_state[0] = -0.035, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.068, f_gate[0] = 0.692, o_gate[0] = 0.529, c_hat[0] = -0.517 + c_state[0] = -0.060, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.068, f_gate[0] = 0.689, o_gate[0] = 0.533, c_hat[0] = -0.520 + c_state[0] = -0.076, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.064, f_gate[0] = 0.717, o_gate[0] = 0.511, c_hat[0] = -0.534 + c_state[0] = -0.089, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.060, f_gate[0] = 0.730, o_gate[0] = 0.536, c_hat[0] = -0.618 + c_state[0] = -0.102, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.012, df[0] = 0.052, dc_hat[0] = 0.013 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.013, df[0] = 0.057, dc_hat[0] = 0.018 + Gradient do_[0] = -0.053 +Backward Time Step 2: + Gradient di[0] = -0.018, df[0] = 0.080, dc_hat[0] = 0.027 + Gradient do_[0] = -0.053 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.082, dc_hat[0] = 0.028 + Gradient do_[0] = -0.043 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.024 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.665, o_gate[0] = 0.536, c_hat[0] = -0.544 + c_state[0] = -0.036, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.068, f_gate[0] = 0.691, o_gate[0] = 0.530, c_hat[0] = -0.520 + c_state[0] = -0.060, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.068, f_gate[0] = 0.688, o_gate[0] = 0.534, c_hat[0] = -0.523 + c_state[0] = -0.077, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.065, f_gate[0] = 0.716, o_gate[0] = 0.512, c_hat[0] = -0.537 + c_state[0] = -0.090, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.060, f_gate[0] = 0.729, o_gate[0] = 0.536, c_hat[0] = -0.621 + c_state[0] = -0.103, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.012, df[0] = 0.051, dc_hat[0] = 0.013 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.013, df[0] = 0.057, dc_hat[0] = 0.018 + Gradient do_[0] = -0.053 +Backward Time Step 2: + Gradient di[0] = -0.018, df[0] = 0.080, dc_hat[0] = 0.027 + Gradient do_[0] = -0.053 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.081, dc_hat[0] = 0.027 + Gradient do_[0] = -0.043 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.024 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.664, o_gate[0] = 0.537, c_hat[0] = -0.546 + c_state[0] = -0.036, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.068, f_gate[0] = 0.691, o_gate[0] = 0.531, c_hat[0] = -0.524 + c_state[0] = -0.060, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.068, f_gate[0] = 0.688, o_gate[0] = 0.535, c_hat[0] = -0.526 + c_state[0] = -0.077, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.065, f_gate[0] = 0.715, o_gate[0] = 0.512, c_hat[0] = -0.540 + c_state[0] = -0.090, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.060, f_gate[0] = 0.728, o_gate[0] = 0.536, c_hat[0] = -0.623 + c_state[0] = -0.103, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.012, df[0] = 0.050, dc_hat[0] = 0.013 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.013, df[0] = 0.056, dc_hat[0] = 0.018 + Gradient do_[0] = -0.053 +Backward Time Step 2: + Gradient di[0] = -0.018, df[0] = 0.079, dc_hat[0] = 0.026 + Gradient do_[0] = -0.053 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.081, dc_hat[0] = 0.027 + Gradient do_[0] = -0.043 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.078, dc_hat[0] = 0.024 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.663, o_gate[0] = 0.538, c_hat[0] = -0.548 + c_state[0] = -0.036, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.068, f_gate[0] = 0.690, o_gate[0] = 0.531, c_hat[0] = -0.527 + c_state[0] = -0.061, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.068, f_gate[0] = 0.687, o_gate[0] = 0.535, c_hat[0] = -0.529 + c_state[0] = -0.078, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.065, f_gate[0] = 0.714, o_gate[0] = 0.513, c_hat[0] = -0.543 + c_state[0] = -0.091, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.060, f_gate[0] = 0.728, o_gate[0] = 0.537, c_hat[0] = -0.626 + c_state[0] = -0.104, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.012, df[0] = 0.050, dc_hat[0] = 0.013 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.013, df[0] = 0.056, dc_hat[0] = 0.018 + Gradient do_[0] = -0.053 +Backward Time Step 2: + Gradient di[0] = -0.018, df[0] = 0.079, dc_hat[0] = 0.026 + Gradient do_[0] = -0.052 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.081, dc_hat[0] = 0.027 + Gradient do_[0] = -0.044 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.024 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.663, o_gate[0] = 0.539, c_hat[0] = -0.550 + c_state[0] = -0.036, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.069, f_gate[0] = 0.689, o_gate[0] = 0.532, c_hat[0] = -0.529 + c_state[0] = -0.061, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.069, f_gate[0] = 0.686, o_gate[0] = 0.536, c_hat[0] = -0.532 + c_state[0] = -0.078, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.065, f_gate[0] = 0.713, o_gate[0] = 0.513, c_hat[0] = -0.546 + c_state[0] = -0.092, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.061, f_gate[0] = 0.727, o_gate[0] = 0.537, c_hat[0] = -0.628 + c_state[0] = -0.105, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.012 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.013, df[0] = 0.055, dc_hat[0] = 0.017 + Gradient do_[0] = -0.053 +Backward Time Step 2: + Gradient di[0] = -0.018, df[0] = 0.078, dc_hat[0] = 0.026 + Gradient do_[0] = -0.052 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.080, dc_hat[0] = 0.027 + Gradient do_[0] = -0.044 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.024 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.662, o_gate[0] = 0.539, c_hat[0] = -0.553 + c_state[0] = -0.036, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.069, f_gate[0] = 0.689, o_gate[0] = 0.533, c_hat[0] = -0.532 + c_state[0] = -0.062, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.069, f_gate[0] = 0.686, o_gate[0] = 0.536, c_hat[0] = -0.535 + c_state[0] = -0.079, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.066, f_gate[0] = 0.713, o_gate[0] = 0.513, c_hat[0] = -0.548 + c_state[0] = -0.092, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.061, f_gate[0] = 0.726, o_gate[0] = 0.537, c_hat[0] = -0.631 + c_state[0] = -0.105, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012 + Gradient do_[0] = -0.049 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.054, dc_hat[0] = 0.017 + Gradient do_[0] = -0.053 +Backward Time Step 2: + Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.026 + Gradient do_[0] = -0.052 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.080, dc_hat[0] = 0.027 + Gradient do_[0] = -0.044 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.024 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.662, o_gate[0] = 0.540, c_hat[0] = -0.555 + c_state[0] = -0.036, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.069, f_gate[0] = 0.688, o_gate[0] = 0.533, c_hat[0] = -0.535 + c_state[0] = -0.062, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.069, f_gate[0] = 0.685, o_gate[0] = 0.537, c_hat[0] = -0.538 + c_state[0] = -0.080, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.066, f_gate[0] = 0.712, o_gate[0] = 0.514, c_hat[0] = -0.551 + c_state[0] = -0.093, h_state[0] = -0.048 +Time Step 4: + i_gate[0] = 0.061, f_gate[0] = 0.725, o_gate[0] = 0.537, c_hat[0] = -0.633 + c_state[0] = -0.106, h_state[0] = -0.057 +Backward Time Step 4: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012 + Gradient do_[0] = -0.048 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.053, dc_hat[0] = 0.017 + Gradient do_[0] = -0.052 +Backward Time Step 2: + Gradient di[0] = -0.018, df[0] = 0.076, dc_hat[0] = 0.025 + Gradient do_[0] = -0.052 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.080, dc_hat[0] = 0.026 + Gradient do_[0] = -0.044 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.024 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.661, o_gate[0] = 0.540, c_hat[0] = -0.556 + c_state[0] = -0.036, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.069, f_gate[0] = 0.688, o_gate[0] = 0.534, c_hat[0] = -0.538 + c_state[0] = -0.062, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.069, f_gate[0] = 0.685, o_gate[0] = 0.537, c_hat[0] = -0.541 + c_state[0] = -0.080, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.066, f_gate[0] = 0.711, o_gate[0] = 0.514, c_hat[0] = -0.554 + c_state[0] = -0.094, h_state[0] = -0.048 +Time Step 4: + i_gate[0] = 0.061, f_gate[0] = 0.724, o_gate[0] = 0.537, c_hat[0] = -0.635 + c_state[0] = -0.107, h_state[0] = -0.057 +Backward Time Step 4: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012 + Gradient do_[0] = -0.048 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.052, dc_hat[0] = 0.017 + Gradient do_[0] = -0.052 +Backward Time Step 2: + Gradient di[0] = -0.018, df[0] = 0.076, dc_hat[0] = 0.025 + Gradient do_[0] = -0.051 +Backward Time Step 1: + Gradient di[0] = -0.019, df[0] = 0.079, dc_hat[0] = 0.026 + Gradient do_[0] = -0.044 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.024 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.660, o_gate[0] = 0.541, c_hat[0] = -0.558 + c_state[0] = -0.037, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.069, f_gate[0] = 0.687, o_gate[0] = 0.534, c_hat[0] = -0.540 + c_state[0] = -0.063, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.069, f_gate[0] = 0.684, o_gate[0] = 0.538, c_hat[0] = -0.543 + c_state[0] = -0.081, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.066, f_gate[0] = 0.711, o_gate[0] = 0.514, c_hat[0] = -0.556 + c_state[0] = -0.094, h_state[0] = -0.048 +Time Step 4: + i_gate[0] = 0.062, f_gate[0] = 0.724, o_gate[0] = 0.537, c_hat[0] = -0.638 + c_state[0] = -0.108, h_state[0] = -0.058 +Backward Time Step 4: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012 + Gradient do_[0] = -0.048 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.052, dc_hat[0] = 0.016 + Gradient do_[0] = -0.052 +Backward Time Step 2: + Gradient di[0] = -0.018, df[0] = 0.075, dc_hat[0] = 0.025 + Gradient do_[0] = -0.051 +Backward Time Step 1: + Gradient di[0] = -0.019, df[0] = 0.079, dc_hat[0] = 0.026 + Gradient do_[0] = -0.043 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.023 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.660, o_gate[0] = 0.541, c_hat[0] = -0.560 + c_state[0] = -0.037, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.069, f_gate[0] = 0.687, o_gate[0] = 0.535, c_hat[0] = -0.543 + c_state[0] = -0.063, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.070, f_gate[0] = 0.684, o_gate[0] = 0.538, c_hat[0] = -0.546 + c_state[0] = -0.081, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.067, f_gate[0] = 0.710, o_gate[0] = 0.514, c_hat[0] = -0.558 + c_state[0] = -0.095, h_state[0] = -0.049 +Time Step 4: + i_gate[0] = 0.062, f_gate[0] = 0.723, o_gate[0] = 0.537, c_hat[0] = -0.640 + c_state[0] = -0.108, h_state[0] = -0.058 +Backward Time Step 4: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.011 + Gradient do_[0] = -0.047 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.051, dc_hat[0] = 0.016 + Gradient do_[0] = -0.051 +Backward Time Step 2: + Gradient di[0] = -0.018, df[0] = 0.074, dc_hat[0] = 0.024 + Gradient do_[0] = -0.051 +Backward Time Step 1: + Gradient di[0] = -0.019, df[0] = 0.078, dc_hat[0] = 0.026 + Gradient do_[0] = -0.043 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.023 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.660, o_gate[0] = 0.541, c_hat[0] = -0.562 + c_state[0] = -0.037, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.069, f_gate[0] = 0.687, o_gate[0] = 0.535, c_hat[0] = -0.545 + c_state[0] = -0.063, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.070, f_gate[0] = 0.683, o_gate[0] = 0.538, c_hat[0] = -0.548 + c_state[0] = -0.081, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.067, f_gate[0] = 0.710, o_gate[0] = 0.514, c_hat[0] = -0.561 + c_state[0] = -0.095, h_state[0] = -0.049 +Time Step 4: + i_gate[0] = 0.062, f_gate[0] = 0.722, o_gate[0] = 0.537, c_hat[0] = -0.642 + c_state[0] = -0.109, h_state[0] = -0.058 +Backward Time Step 4: + Gradient di[0] = -0.011, df[0] = 0.044, dc_hat[0] = 0.011 + Gradient do_[0] = -0.047 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.050, dc_hat[0] = 0.016 + Gradient do_[0] = -0.050 +Backward Time Step 2: + Gradient di[0] = -0.018, df[0] = 0.073, dc_hat[0] = 0.024 + Gradient do_[0] = -0.050 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.078, dc_hat[0] = 0.026 + Gradient do_[0] = -0.043 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.076, dc_hat[0] = 0.023 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.659, o_gate[0] = 0.542, c_hat[0] = -0.563 + c_state[0] = -0.037, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.686, o_gate[0] = 0.535, c_hat[0] = -0.547 + c_state[0] = -0.063, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.070, f_gate[0] = 0.683, o_gate[0] = 0.538, c_hat[0] = -0.551 + c_state[0] = -0.082, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.067, f_gate[0] = 0.709, o_gate[0] = 0.514, c_hat[0] = -0.563 + c_state[0] = -0.096, h_state[0] = -0.049 +Time Step 4: + i_gate[0] = 0.062, f_gate[0] = 0.722, o_gate[0] = 0.537, c_hat[0] = -0.644 + c_state[0] = -0.109, h_state[0] = -0.059 +Backward Time Step 4: + Gradient di[0] = -0.011, df[0] = 0.043, dc_hat[0] = 0.011 + Gradient do_[0] = -0.046 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.015 + Gradient do_[0] = -0.050 +Backward Time Step 2: + Gradient di[0] = -0.017, df[0] = 0.072, dc_hat[0] = 0.024 + Gradient do_[0] = -0.050 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.077, dc_hat[0] = 0.025 + Gradient do_[0] = -0.043 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.076, dc_hat[0] = 0.023 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.659, o_gate[0] = 0.542, c_hat[0] = -0.565 + c_state[0] = -0.037, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.686, o_gate[0] = 0.536, c_hat[0] = -0.550 + c_state[0] = -0.064, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.070, f_gate[0] = 0.682, o_gate[0] = 0.538, c_hat[0] = -0.553 + c_state[0] = -0.082, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.067, f_gate[0] = 0.709, o_gate[0] = 0.514, c_hat[0] = -0.565 + c_state[0] = -0.096, h_state[0] = -0.049 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.721, o_gate[0] = 0.537, c_hat[0] = -0.646 + c_state[0] = -0.110, h_state[0] = -0.059 +Backward Time Step 4: + Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.011 + Gradient do_[0] = -0.045 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.015 + Gradient do_[0] = -0.049 +Backward Time Step 2: + Gradient di[0] = -0.017, df[0] = 0.071, dc_hat[0] = 0.023 + Gradient do_[0] = -0.049 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.076, dc_hat[0] = 0.025 + Gradient do_[0] = -0.043 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.076, dc_hat[0] = 0.023 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.658, o_gate[0] = 0.542, c_hat[0] = -0.566 + c_state[0] = -0.037, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.685, o_gate[0] = 0.536, c_hat[0] = -0.552 + c_state[0] = -0.064, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.070, f_gate[0] = 0.682, o_gate[0] = 0.538, c_hat[0] = -0.555 + c_state[0] = -0.083, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.068, f_gate[0] = 0.708, o_gate[0] = 0.514, c_hat[0] = -0.567 + c_state[0] = -0.097, h_state[0] = -0.050 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.721, o_gate[0] = 0.537, c_hat[0] = -0.648 + c_state[0] = -0.111, h_state[0] = -0.059 +Backward Time Step 4: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.010 + Gradient do_[0] = -0.045 +Backward Time Step 3: + Gradient di[0] = -0.011, df[0] = 0.046, dc_hat[0] = 0.015 + Gradient do_[0] = -0.048 +Backward Time Step 2: + Gradient di[0] = -0.017, df[0] = 0.070, dc_hat[0] = 0.023 + Gradient do_[0] = -0.049 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.076, dc_hat[0] = 0.025 + Gradient do_[0] = -0.043 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.076, dc_hat[0] = 0.023 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.658, o_gate[0] = 0.542, c_hat[0] = -0.568 + c_state[0] = -0.037, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.685, o_gate[0] = 0.536, c_hat[0] = -0.554 + c_state[0] = -0.064, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.538, c_hat[0] = -0.557 + c_state[0] = -0.083, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.068, f_gate[0] = 0.708, o_gate[0] = 0.514, c_hat[0] = -0.569 + c_state[0] = -0.097, h_state[0] = -0.050 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.720, o_gate[0] = 0.537, c_hat[0] = -0.650 + c_state[0] = -0.111, h_state[0] = -0.059 +Backward Time Step 4: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.010 + Gradient do_[0] = -0.044 +Backward Time Step 3: + Gradient di[0] = -0.011, df[0] = 0.045, dc_hat[0] = 0.014 + Gradient do_[0] = -0.048 +Backward Time Step 2: + Gradient di[0] = -0.017, df[0] = 0.069, dc_hat[0] = 0.023 + Gradient do_[0] = -0.048 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.075, dc_hat[0] = 0.025 + Gradient do_[0] = -0.043 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.075, dc_hat[0] = 0.023 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.658, o_gate[0] = 0.542, c_hat[0] = -0.569 + c_state[0] = -0.037, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.685, o_gate[0] = 0.536, c_hat[0] = -0.556 + c_state[0] = -0.065, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.538, c_hat[0] = -0.559 + c_state[0] = -0.084, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.068, f_gate[0] = 0.707, o_gate[0] = 0.513, c_hat[0] = -0.571 + c_state[0] = -0.098, h_state[0] = -0.050 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.720, o_gate[0] = 0.536, c_hat[0] = -0.651 + c_state[0] = -0.112, h_state[0] = -0.060 +Backward Time Step 4: + Gradient di[0] = -0.010, df[0] = 0.039, dc_hat[0] = 0.010 + Gradient do_[0] = -0.043 +Backward Time Step 3: + Gradient di[0] = -0.011, df[0] = 0.044, dc_hat[0] = 0.014 + Gradient do_[0] = -0.047 +Backward Time Step 2: + Gradient di[0] = -0.017, df[0] = 0.068, dc_hat[0] = 0.022 + Gradient do_[0] = -0.048 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.074, dc_hat[0] = 0.024 + Gradient do_[0] = -0.042 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.075, dc_hat[0] = 0.022 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.657, o_gate[0] = 0.542, c_hat[0] = -0.571 + c_state[0] = -0.037, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.685, o_gate[0] = 0.536, c_hat[0] = -0.557 + c_state[0] = -0.065, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.538, c_hat[0] = -0.561 + c_state[0] = -0.084, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.068, f_gate[0] = 0.707, o_gate[0] = 0.513, c_hat[0] = -0.573 + c_state[0] = -0.098, h_state[0] = -0.050 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.536, c_hat[0] = -0.653 + c_state[0] = -0.112, h_state[0] = -0.060 +Backward Time Step 4: + Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.010 + Gradient do_[0] = -0.043 +Backward Time Step 3: + Gradient di[0] = -0.011, df[0] = 0.043, dc_hat[0] = 0.014 + Gradient do_[0] = -0.046 +Backward Time Step 2: + Gradient di[0] = -0.017, df[0] = 0.067, dc_hat[0] = 0.022 + Gradient do_[0] = -0.047 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.074, dc_hat[0] = 0.024 + Gradient do_[0] = -0.042 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.075, dc_hat[0] = 0.022 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.657, o_gate[0] = 0.542, c_hat[0] = -0.572 + c_state[0] = -0.038, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.684, o_gate[0] = 0.536, c_hat[0] = -0.559 + c_state[0] = -0.065, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.538, c_hat[0] = -0.563 + c_state[0] = -0.084, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.068, f_gate[0] = 0.707, o_gate[0] = 0.513, c_hat[0] = -0.575 + c_state[0] = -0.099, h_state[0] = -0.050 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.536, c_hat[0] = -0.655 + c_state[0] = -0.113, h_state[0] = -0.060 +Backward Time Step 4: + Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.009 + Gradient do_[0] = -0.042 +Backward Time Step 3: + Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013 + Gradient do_[0] = -0.045 +Backward Time Step 2: + Gradient di[0] = -0.017, df[0] = 0.066, dc_hat[0] = 0.022 + Gradient do_[0] = -0.047 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.073, dc_hat[0] = 0.024 + Gradient do_[0] = -0.042 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.074, dc_hat[0] = 0.022 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.657, o_gate[0] = 0.542, c_hat[0] = -0.573 + c_state[0] = -0.038, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.684, o_gate[0] = 0.536, c_hat[0] = -0.561 + c_state[0] = -0.065, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.538, c_hat[0] = -0.564 + c_state[0] = -0.085, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.706, o_gate[0] = 0.512, c_hat[0] = -0.576 + c_state[0] = -0.099, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.535, c_hat[0] = -0.656 + c_state[0] = -0.113, h_state[0] = -0.060 +Backward Time Step 4: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.009 + Gradient do_[0] = -0.041 +Backward Time Step 3: + Gradient di[0] = -0.010, df[0] = 0.041, dc_hat[0] = 0.013 + Gradient do_[0] = -0.044 +Backward Time Step 2: + Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.021 + Gradient do_[0] = -0.046 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.073, dc_hat[0] = 0.024 + Gradient do_[0] = -0.042 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.074, dc_hat[0] = 0.022 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.656, o_gate[0] = 0.542, c_hat[0] = -0.574 + c_state[0] = -0.038, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.684, o_gate[0] = 0.536, c_hat[0] = -0.562 + c_state[0] = -0.065, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.538, c_hat[0] = -0.566 + c_state[0] = -0.085, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.706, o_gate[0] = 0.512, c_hat[0] = -0.578 + c_state[0] = -0.100, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.535, c_hat[0] = -0.657 + c_state[0] = -0.114, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.009 + Gradient do_[0] = -0.040 +Backward Time Step 3: + Gradient di[0] = -0.010, df[0] = 0.040, dc_hat[0] = 0.012 + Gradient do_[0] = -0.043 +Backward Time Step 2: + Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.021 + Gradient do_[0] = -0.045 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.072, dc_hat[0] = 0.023 + Gradient do_[0] = -0.041 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.074, dc_hat[0] = 0.022 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.656, o_gate[0] = 0.541, c_hat[0] = -0.575 + c_state[0] = -0.038, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.684, o_gate[0] = 0.535, c_hat[0] = -0.564 + c_state[0] = -0.066, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.680, o_gate[0] = 0.537, c_hat[0] = -0.567 + c_state[0] = -0.085, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.706, o_gate[0] = 0.512, c_hat[0] = -0.579 + c_state[0] = -0.100, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.534, c_hat[0] = -0.659 + c_state[0] = -0.114, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.009, df[0] = 0.035, dc_hat[0] = 0.009 + Gradient do_[0] = -0.040 +Backward Time Step 3: + Gradient di[0] = -0.010, df[0] = 0.039, dc_hat[0] = 0.012 + Gradient do_[0] = -0.042 +Backward Time Step 2: + Gradient di[0] = -0.016, df[0] = 0.063, dc_hat[0] = 0.021 + Gradient do_[0] = -0.045 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.071, dc_hat[0] = 0.023 + Gradient do_[0] = -0.041 +Backward Time Step 0: + Gradient di[0] = -0.018, df[0] = 0.073, dc_hat[0] = 0.022 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.656, o_gate[0] = 0.541, c_hat[0] = -0.576 + c_state[0] = -0.038, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.683, o_gate[0] = 0.535, c_hat[0] = -0.565 + c_state[0] = -0.066, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.680, o_gate[0] = 0.537, c_hat[0] = -0.569 + c_state[0] = -0.085, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.706, o_gate[0] = 0.511, c_hat[0] = -0.580 + c_state[0] = -0.100, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.534, c_hat[0] = -0.660 + c_state[0] = -0.114, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.008 + Gradient do_[0] = -0.039 +Backward Time Step 3: + Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012 + Gradient do_[0] = -0.041 +Backward Time Step 2: + Gradient di[0] = -0.016, df[0] = 0.062, dc_hat[0] = 0.020 + Gradient do_[0] = -0.044 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.071, dc_hat[0] = 0.023 + Gradient do_[0] = -0.041 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.073, dc_hat[0] = 0.022 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.656, o_gate[0] = 0.541, c_hat[0] = -0.577 + c_state[0] = -0.038, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.683, o_gate[0] = 0.535, c_hat[0] = -0.567 + c_state[0] = -0.066, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.679, o_gate[0] = 0.536, c_hat[0] = -0.570 + c_state[0] = -0.086, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.511, c_hat[0] = -0.582 + c_state[0] = -0.101, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.533, c_hat[0] = -0.661 + c_state[0] = -0.115, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.008 + Gradient do_[0] = -0.038 +Backward Time Step 3: + Gradient di[0] = -0.009, df[0] = 0.037, dc_hat[0] = 0.012 + Gradient do_[0] = -0.041 +Backward Time Step 2: + Gradient di[0] = -0.016, df[0] = 0.061, dc_hat[0] = 0.020 + Gradient do_[0] = -0.044 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.070, dc_hat[0] = 0.023 + Gradient do_[0] = -0.041 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.073, dc_hat[0] = 0.022 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.656, o_gate[0] = 0.541, c_hat[0] = -0.578 + c_state[0] = -0.038, h_state[0] = -0.021 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.683, o_gate[0] = 0.535, c_hat[0] = -0.568 + c_state[0] = -0.066, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.679, o_gate[0] = 0.536, c_hat[0] = -0.571 + c_state[0] = -0.086, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.510, c_hat[0] = -0.583 + c_state[0] = -0.101, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.533, c_hat[0] = -0.663 + c_state[0] = -0.115, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.008 + Gradient do_[0] = -0.037 +Backward Time Step 3: + Gradient di[0] = -0.009, df[0] = 0.036, dc_hat[0] = 0.011 + Gradient do_[0] = -0.040 +Backward Time Step 2: + Gradient di[0] = -0.016, df[0] = 0.060, dc_hat[0] = 0.020 + Gradient do_[0] = -0.043 +Backward Time Step 1: + Gradient di[0] = -0.018, df[0] = 0.069, dc_hat[0] = 0.022 + Gradient do_[0] = -0.040 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.072, dc_hat[0] = 0.021 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.655, o_gate[0] = 0.540, c_hat[0] = -0.579 + c_state[0] = -0.038, h_state[0] = -0.021 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.683, o_gate[0] = 0.534, c_hat[0] = -0.569 + c_state[0] = -0.066, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.679, o_gate[0] = 0.536, c_hat[0] = -0.573 + c_state[0] = -0.086, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.509, c_hat[0] = -0.584 + c_state[0] = -0.101, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.532, c_hat[0] = -0.664 + c_state[0] = -0.116, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.008 + Gradient do_[0] = -0.036 +Backward Time Step 3: + Gradient di[0] = -0.009, df[0] = 0.035, dc_hat[0] = 0.011 + Gradient do_[0] = -0.039 +Backward Time Step 2: + Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.019 + Gradient do_[0] = -0.042 +Backward Time Step 1: + Gradient di[0] = -0.017, df[0] = 0.069, dc_hat[0] = 0.022 + Gradient do_[0] = -0.040 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.072, dc_hat[0] = 0.021 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.655, o_gate[0] = 0.540, c_hat[0] = -0.580 + c_state[0] = -0.038, h_state[0] = -0.021 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.683, o_gate[0] = 0.534, c_hat[0] = -0.571 + c_state[0] = -0.066, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.679, o_gate[0] = 0.535, c_hat[0] = -0.574 + c_state[0] = -0.087, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.509, c_hat[0] = -0.585 + c_state[0] = -0.102, h_state[0] = -0.052 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.532, c_hat[0] = -0.665 + c_state[0] = -0.116, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.008 + Gradient do_[0] = -0.036 +Backward Time Step 3: + Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.011 + Gradient do_[0] = -0.038 +Backward Time Step 2: + Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.019 + Gradient do_[0] = -0.042 +Backward Time Step 1: + Gradient di[0] = -0.017, df[0] = 0.068, dc_hat[0] = 0.022 + Gradient do_[0] = -0.040 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.072, dc_hat[0] = 0.021 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.655, o_gate[0] = 0.539, c_hat[0] = -0.581 + c_state[0] = -0.038, h_state[0] = -0.021 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.683, o_gate[0] = 0.533, c_hat[0] = -0.572 + c_state[0] = -0.067, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.679, o_gate[0] = 0.534, c_hat[0] = -0.575 + c_state[0] = -0.087, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.705, o_gate[0] = 0.508, c_hat[0] = -0.586 + c_state[0] = -0.102, h_state[0] = -0.052 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.531, c_hat[0] = -0.666 + c_state[0] = -0.116, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.007 + Gradient do_[0] = -0.035 +Backward Time Step 3: + Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010 + Gradient do_[0] = -0.037 +Backward Time Step 2: + Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.019 + Gradient do_[0] = -0.041 +Backward Time Step 1: + Gradient di[0] = -0.017, df[0] = 0.068, dc_hat[0] = 0.022 + Gradient do_[0] = -0.040 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.071, dc_hat[0] = 0.021 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.655, o_gate[0] = 0.539, c_hat[0] = -0.581 + c_state[0] = -0.038, h_state[0] = -0.021 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.683, o_gate[0] = 0.533, c_hat[0] = -0.573 + c_state[0] = -0.067, h_state[0] = -0.036 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.679, o_gate[0] = 0.534, c_hat[0] = -0.576 + c_state[0] = -0.087, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.705, o_gate[0] = 0.507, c_hat[0] = -0.587 + c_state[0] = -0.102, h_state[0] = -0.052 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.530, c_hat[0] = -0.667 + c_state[0] = -0.117, h_state[0] = -0.062 +Backward Time Step 4: + Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.007 + Gradient do_[0] = -0.034 +Backward Time Step 3: + Gradient di[0] = -0.008, df[0] = 0.032, dc_hat[0] = 0.010 + Gradient do_[0] = -0.036 +Backward Time Step 2: + Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.019 + Gradient do_[0] = -0.041 +Backward Time Step 1: + Gradient di[0] = -0.017, df[0] = 0.067, dc_hat[0] = 0.022 + Gradient do_[0] = -0.039 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.071, dc_hat[0] = 0.021 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.655, o_gate[0] = 0.539, c_hat[0] = -0.582 + c_state[0] = -0.038, h_state[0] = -0.021 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.532, c_hat[0] = -0.574 + c_state[0] = -0.067, h_state[0] = -0.036 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.679, o_gate[0] = 0.533, c_hat[0] = -0.577 + c_state[0] = -0.087, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.507, c_hat[0] = -0.588 + c_state[0] = -0.102, h_state[0] = -0.052 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.529, c_hat[0] = -0.668 + c_state[0] = -0.117, h_state[0] = -0.062 +Backward Time Step 4: + Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.007 + Gradient do_[0] = -0.033 +Backward Time Step 3: + Gradient di[0] = -0.008, df[0] = 0.031, dc_hat[0] = 0.010 + Gradient do_[0] = -0.035 +Backward Time Step 2: + Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.018 + Gradient do_[0] = -0.040 +Backward Time Step 1: + Gradient di[0] = -0.017, df[0] = 0.066, dc_hat[0] = 0.021 + Gradient do_[0] = -0.039 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.071, dc_hat[0] = 0.021 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.655, o_gate[0] = 0.538, c_hat[0] = -0.583 + c_state[0] = -0.038, h_state[0] = -0.021 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.532, c_hat[0] = -0.575 + c_state[0] = -0.067, h_state[0] = -0.036 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.533, c_hat[0] = -0.578 + c_state[0] = -0.087, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.506, c_hat[0] = -0.589 + c_state[0] = -0.103, h_state[0] = -0.052 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.529, c_hat[0] = -0.668 + c_state[0] = -0.117, h_state[0] = -0.062 +Backward Time Step 4: + Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.007 + Gradient do_[0] = -0.033 +Backward Time Step 3: + Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009 + Gradient do_[0] = -0.035 +Backward Time Step 2: + Gradient di[0] = -0.015, df[0] = 0.055, dc_hat[0] = 0.018 + Gradient do_[0] = -0.040 +Backward Time Step 1: + Gradient di[0] = -0.017, df[0] = 0.066, dc_hat[0] = 0.021 + Gradient do_[0] = -0.039 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.071, dc_hat[0] = 0.021 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.654, o_gate[0] = 0.537, c_hat[0] = -0.583 + c_state[0] = -0.038, h_state[0] = -0.021 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.531, c_hat[0] = -0.576 + c_state[0] = -0.067, h_state[0] = -0.036 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.532, c_hat[0] = -0.578 + c_state[0] = -0.087, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.505, c_hat[0] = -0.590 + c_state[0] = -0.103, h_state[0] = -0.052 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.528, c_hat[0] = -0.669 + c_state[0] = -0.117, h_state[0] = -0.062 +Backward Time Step 4: + Gradient di[0] = -0.007, df[0] = 0.027, dc_hat[0] = 0.007 + Gradient do_[0] = -0.032 +Backward Time Step 3: + Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009 + Gradient do_[0] = -0.034 +Backward Time Step 2: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.018 + Gradient do_[0] = -0.039 +Backward Time Step 1: + Gradient di[0] = -0.017, df[0] = 0.065, dc_hat[0] = 0.021 + Gradient do_[0] = -0.039 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.070, dc_hat[0] = 0.021 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.654, o_gate[0] = 0.537, c_hat[0] = -0.584 + c_state[0] = -0.038, h_state[0] = -0.021 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.531, c_hat[0] = -0.576 + c_state[0] = -0.067, h_state[0] = -0.036 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.531, c_hat[0] = -0.579 + c_state[0] = -0.088, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.504, c_hat[0] = -0.590 + c_state[0] = -0.103, h_state[0] = -0.052 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.527, c_hat[0] = -0.670 + c_state[0] = -0.118, h_state[0] = -0.062 +Backward Time Step 4: + Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.006 + Gradient do_[0] = -0.031 +Backward Time Step 3: + Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.009 + Gradient do_[0] = -0.033 +Backward Time Step 2: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.018 + Gradient do_[0] = -0.039 +Backward Time Step 1: + Gradient di[0] = -0.017, df[0] = 0.065, dc_hat[0] = 0.021 + Gradient do_[0] = -0.038 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.070, dc_hat[0] = 0.020 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.654, o_gate[0] = 0.536, c_hat[0] = -0.585 + c_state[0] = -0.038, h_state[0] = -0.021 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.530, c_hat[0] = -0.577 + c_state[0] = -0.067, h_state[0] = -0.036 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.530, c_hat[0] = -0.580 + c_state[0] = -0.088, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.503, c_hat[0] = -0.591 + c_state[0] = -0.103, h_state[0] = -0.052 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.526, c_hat[0] = -0.671 + c_state[0] = -0.118, h_state[0] = -0.062 +Backward Time Step 4: + Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.006 + Gradient do_[0] = -0.031 +Backward Time Step 3: + Gradient di[0] = -0.007, df[0] = 0.028, dc_hat[0] = 0.009 + Gradient do_[0] = -0.033 +Backward Time Step 2: + Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.017 + Gradient do_[0] = -0.038 +Backward Time Step 1: + Gradient di[0] = -0.017, df[0] = 0.064, dc_hat[0] = 0.021 + Gradient do_[0] = -0.038 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.070, dc_hat[0] = 0.020 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.654, o_gate[0] = 0.536, c_hat[0] = -0.585 + c_state[0] = -0.038, h_state[0] = -0.021 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.529, c_hat[0] = -0.578 + c_state[0] = -0.067, h_state[0] = -0.036 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.530, c_hat[0] = -0.580 + c_state[0] = -0.088, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.502, c_hat[0] = -0.592 + c_state[0] = -0.103, h_state[0] = -0.052 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.525, c_hat[0] = -0.671 + c_state[0] = -0.118, h_state[0] = -0.062 +Backward Time Step 4: + Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.006 + Gradient do_[0] = -0.030 +Backward Time Step 3: + Gradient di[0] = -0.007, df[0] = 0.028, dc_hat[0] = 0.009 + Gradient do_[0] = -0.032 +Backward Time Step 2: + Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.017 + Gradient do_[0] = -0.038 +Backward Time Step 1: + Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.020 + Gradient do_[0] = -0.038 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.069, dc_hat[0] = 0.020 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.654, o_gate[0] = 0.535, c_hat[0] = -0.586 + c_state[0] = -0.038, h_state[0] = -0.021 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.529, c_hat[0] = -0.578 + c_state[0] = -0.067, h_state[0] = -0.036 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.529, c_hat[0] = -0.581 + c_state[0] = -0.088, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.502, c_hat[0] = -0.592 + c_state[0] = -0.103, h_state[0] = -0.052 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.524, c_hat[0] = -0.672 + c_state[0] = -0.118, h_state[0] = -0.062 +Backward Time Step 4: + Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.006 + Gradient do_[0] = -0.030 +Backward Time Step 3: + Gradient di[0] = -0.007, df[0] = 0.027, dc_hat[0] = 0.008 + Gradient do_[0] = -0.031 +Backward Time Step 2: + Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.017 + Gradient do_[0] = -0.037 +Backward Time Step 1: + Gradient di[0] = -0.016, df[0] = 0.063, dc_hat[0] = 0.020 + Gradient do_[0] = -0.038 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.069, dc_hat[0] = 0.020 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.654, o_gate[0] = 0.534, c_hat[0] = -0.586 + c_state[0] = -0.038, h_state[0] = -0.021 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.528, c_hat[0] = -0.579 + c_state[0] = -0.067, h_state[0] = -0.036 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.528, c_hat[0] = -0.582 + c_state[0] = -0.088, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.501, c_hat[0] = -0.593 + c_state[0] = -0.104, h_state[0] = -0.052 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.523, c_hat[0] = -0.672 + c_state[0] = -0.118, h_state[0] = -0.062 +Backward Time Step 4: + Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.006 + Gradient do_[0] = -0.029 +Backward Time Step 3: + Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.008 + Gradient do_[0] = -0.031 +Backward Time Step 2: + Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.017 + Gradient do_[0] = -0.037 +Backward Time Step 1: + Gradient di[0] = -0.016, df[0] = 0.063, dc_hat[0] = 0.020 + Gradient do_[0] = -0.038 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.069, dc_hat[0] = 0.020 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.066, f_gate[0] = 0.654, o_gate[0] = 0.534, c_hat[0] = -0.587 + c_state[0] = -0.038, h_state[0] = -0.021 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.527, c_hat[0] = -0.580 + c_state[0] = -0.068, h_state[0] = -0.036 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.527, c_hat[0] = -0.582 + c_state[0] = -0.088, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.500, c_hat[0] = -0.593 + c_state[0] = -0.104, h_state[0] = -0.052 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.522, c_hat[0] = -0.673 + c_state[0] = -0.118, h_state[0] = -0.062 +Backward Time Step 4: + Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.006 + Gradient do_[0] = -0.029 +Backward Time Step 3: + Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.008 + Gradient do_[0] = -0.030 +Backward Time Step 2: + Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.017 + Gradient do_[0] = -0.037 +Backward Time Step 1: + Gradient di[0] = -0.016, df[0] = 0.062, dc_hat[0] = 0.020 + Gradient do_[0] = -0.038 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.069, dc_hat[0] = 0.020 + Gradient do_[0] = -0.024 +Epoch 50, Train Loss=0.070382, Weight Norm=8.702058 +Sample Predictions at Epoch 50: + Day 192 (2024-10-11) => Predicted: 65.196, Actual: 63.870, Error: 1.33 + Day 193 (2024-10-14) => Predicted: 65.778, Actual: 66.550, Error: 0.77 + Day 194 (2024-10-15) => Predicted: 66.128, Actual: 66.000, Error: 0.13 + Day 195 (2024-10-16) => Predicted: 66.036, Actual: 67.200, Error: 1.16 + Day 196 (2024-10-17) => Predicted: 65.810, Actual: 66.760, Error: 0.95 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.654, o_gate[0] = 0.533, c_hat[0] = -0.587 + c_state[0] = -0.038, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.526, c_hat[0] = -0.580 + c_state[0] = -0.068, h_state[0] = -0.036 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.526, c_hat[0] = -0.582 + c_state[0] = -0.088, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.499, c_hat[0] = -0.594 + c_state[0] = -0.104, h_state[0] = -0.052 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.522, c_hat[0] = -0.673 + c_state[0] = -0.118, h_state[0] = -0.062 +Backward Time Step 4: + Gradient di[0] = -0.006, df[0] = 0.023, dc_hat[0] = 0.006 + Gradient do_[0] = -0.028 +Backward Time Step 3: + Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.008 + Gradient do_[0] = -0.030 +Backward Time Step 2: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 1: + Gradient di[0] = -0.016, df[0] = 0.062, dc_hat[0] = 0.020 + Gradient do_[0] = -0.037 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.069, dc_hat[0] = 0.020 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.654, o_gate[0] = 0.532, c_hat[0] = -0.588 + c_state[0] = -0.038, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.525, c_hat[0] = -0.581 + c_state[0] = -0.068, h_state[0] = -0.036 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.525, c_hat[0] = -0.583 + c_state[0] = -0.088, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.498, c_hat[0] = -0.594 + c_state[0] = -0.104, h_state[0] = -0.052 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.521, c_hat[0] = -0.674 + c_state[0] = -0.119, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.005 + Gradient do_[0] = -0.028 +Backward Time Step 3: + Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.008 + Gradient do_[0] = -0.029 +Backward Time Step 2: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 1: + Gradient di[0] = -0.016, df[0] = 0.062, dc_hat[0] = 0.020 + Gradient do_[0] = -0.037 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.068, dc_hat[0] = 0.020 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.654, o_gate[0] = 0.532, c_hat[0] = -0.588 + c_state[0] = -0.038, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.525, c_hat[0] = -0.581 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.524, c_hat[0] = -0.583 + c_state[0] = -0.088, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.497, c_hat[0] = -0.594 + c_state[0] = -0.104, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.520, c_hat[0] = -0.674 + c_state[0] = -0.119, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.005 + Gradient do_[0] = -0.027 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = 0.024, dc_hat[0] = 0.008 + Gradient do_[0] = -0.029 +Backward Time Step 2: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 1: + Gradient di[0] = -0.016, df[0] = 0.061, dc_hat[0] = 0.020 + Gradient do_[0] = -0.037 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.068, dc_hat[0] = 0.020 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.654, o_gate[0] = 0.531, c_hat[0] = -0.589 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.524, c_hat[0] = -0.582 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.523, c_hat[0] = -0.584 + c_state[0] = -0.088, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.496, c_hat[0] = -0.595 + c_state[0] = -0.104, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.519, c_hat[0] = -0.674 + c_state[0] = -0.119, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.005 + Gradient do_[0] = -0.027 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = 0.024, dc_hat[0] = 0.007 + Gradient do_[0] = -0.028 +Backward Time Step 2: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.016 + Gradient do_[0] = -0.035 +Backward Time Step 1: + Gradient di[0] = -0.016, df[0] = 0.061, dc_hat[0] = 0.019 + Gradient do_[0] = -0.037 +Backward Time Step 0: + Gradient di[0] = -0.017, df[0] = 0.068, dc_hat[0] = 0.020 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.530, c_hat[0] = -0.589 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.523, c_hat[0] = -0.582 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.522, c_hat[0] = -0.584 + c_state[0] = -0.089, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.495, c_hat[0] = -0.595 + c_state[0] = -0.104, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.518, c_hat[0] = -0.675 + c_state[0] = -0.119, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.005 + Gradient do_[0] = -0.027 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = 0.023, dc_hat[0] = 0.007 + Gradient do_[0] = -0.028 +Backward Time Step 2: + Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.016 + Gradient do_[0] = -0.035 +Backward Time Step 1: + Gradient di[0] = -0.016, df[0] = 0.061, dc_hat[0] = 0.019 + Gradient do_[0] = -0.037 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.068, dc_hat[0] = 0.020 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.530, c_hat[0] = -0.589 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.522, c_hat[0] = -0.582 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.521, c_hat[0] = -0.584 + c_state[0] = -0.089, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.494, c_hat[0] = -0.595 + c_state[0] = -0.104, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.517, c_hat[0] = -0.675 + c_state[0] = -0.119, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.005 + Gradient do_[0] = -0.026 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = 0.023, dc_hat[0] = 0.007 + Gradient do_[0] = -0.027 +Backward Time Step 2: + Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.016 + Gradient do_[0] = -0.035 +Backward Time Step 1: + Gradient di[0] = -0.016, df[0] = 0.060, dc_hat[0] = 0.019 + Gradient do_[0] = -0.037 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.068, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.529, c_hat[0] = -0.590 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.521, c_hat[0] = -0.583 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.521, c_hat[0] = -0.584 + c_state[0] = -0.089, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.493, c_hat[0] = -0.595 + c_state[0] = -0.104, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.516, c_hat[0] = -0.675 + c_state[0] = -0.119, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.005 + Gradient do_[0] = -0.026 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = 0.023, dc_hat[0] = 0.007 + Gradient do_[0] = -0.027 +Backward Time Step 2: + Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.015 + Gradient do_[0] = -0.035 +Backward Time Step 1: + Gradient di[0] = -0.016, df[0] = 0.060, dc_hat[0] = 0.019 + Gradient do_[0] = -0.037 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.067, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.528, c_hat[0] = -0.590 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.520, c_hat[0] = -0.583 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.520, c_hat[0] = -0.585 + c_state[0] = -0.089, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.492, c_hat[0] = -0.596 + c_state[0] = -0.104, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.515, c_hat[0] = -0.676 + c_state[0] = -0.119, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.006, df[0] = 0.020, dc_hat[0] = 0.005 + Gradient do_[0] = -0.026 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.007 + Gradient do_[0] = -0.027 +Backward Time Step 2: + Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.015 + Gradient do_[0] = -0.034 +Backward Time Step 1: + Gradient di[0] = -0.016, df[0] = 0.060, dc_hat[0] = 0.019 + Gradient do_[0] = -0.037 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.067, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.527, c_hat[0] = -0.591 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.520, c_hat[0] = -0.583 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.519, c_hat[0] = -0.585 + c_state[0] = -0.089, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.491, c_hat[0] = -0.596 + c_state[0] = -0.104, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.514, c_hat[0] = -0.676 + c_state[0] = -0.119, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.006, df[0] = 0.020, dc_hat[0] = 0.005 + Gradient do_[0] = -0.025 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.007 + Gradient do_[0] = -0.027 +Backward Time Step 2: + Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.015 + Gradient do_[0] = -0.034 +Backward Time Step 1: + Gradient di[0] = -0.016, df[0] = 0.059, dc_hat[0] = 0.019 + Gradient do_[0] = -0.037 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.067, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.527, c_hat[0] = -0.591 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.519, c_hat[0] = -0.584 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.518, c_hat[0] = -0.585 + c_state[0] = -0.089, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.490, c_hat[0] = -0.596 + c_state[0] = -0.104, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.513, c_hat[0] = -0.676 + c_state[0] = -0.119, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.006, df[0] = 0.020, dc_hat[0] = 0.005 + Gradient do_[0] = -0.025 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.007 + Gradient do_[0] = -0.026 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.015 + Gradient do_[0] = -0.034 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.019 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.067, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.526, c_hat[0] = -0.591 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.518, c_hat[0] = -0.584 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.517, c_hat[0] = -0.585 + c_state[0] = -0.089, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.489, c_hat[0] = -0.596 + c_state[0] = -0.104, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.512, c_hat[0] = -0.676 + c_state[0] = -0.119, h_state[0] = -0.061 +Backward Time Step 4: + Gradient di[0] = -0.006, df[0] = 0.020, dc_hat[0] = 0.005 + Gradient do_[0] = -0.025 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.007 + Gradient do_[0] = -0.026 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.015 + Gradient do_[0] = -0.034 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.019 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.067, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.525, c_hat[0] = -0.592 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.517, c_hat[0] = -0.584 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.516, c_hat[0] = -0.585 + c_state[0] = -0.089, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.488, c_hat[0] = -0.596 + c_state[0] = -0.104, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.511, c_hat[0] = -0.677 + c_state[0] = -0.119, h_state[0] = -0.060 +Backward Time Step 4: + Gradient di[0] = -0.006, df[0] = 0.019, dc_hat[0] = 0.005 + Gradient do_[0] = -0.025 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.007 + Gradient do_[0] = -0.026 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.015 + Gradient do_[0] = -0.034 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.019 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.067, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.524, c_hat[0] = -0.592 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.516, c_hat[0] = -0.585 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.515, c_hat[0] = -0.586 + c_state[0] = -0.089, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.487, c_hat[0] = -0.597 + c_state[0] = -0.104, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.510, c_hat[0] = -0.677 + c_state[0] = -0.119, h_state[0] = -0.060 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.005 + Gradient do_[0] = -0.025 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.007 + Gradient do_[0] = -0.026 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.015 + Gradient do_[0] = -0.034 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.019 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.524, c_hat[0] = -0.592 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.515, c_hat[0] = -0.585 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.514, c_hat[0] = -0.586 + c_state[0] = -0.089, h_state[0] = -0.046 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.486, c_hat[0] = -0.597 + c_state[0] = -0.104, h_state[0] = -0.051 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.509, c_hat[0] = -0.677 + c_state[0] = -0.119, h_state[0] = -0.060 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006 + Gradient do_[0] = -0.026 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.015 + Gradient do_[0] = -0.034 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.523, c_hat[0] = -0.593 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.514, c_hat[0] = -0.585 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.513, c_hat[0] = -0.586 + c_state[0] = -0.089, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.485, c_hat[0] = -0.597 + c_state[0] = -0.104, h_state[0] = -0.050 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.508, c_hat[0] = -0.677 + c_state[0] = -0.119, h_state[0] = -0.060 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.015 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.522, c_hat[0] = -0.593 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.514, c_hat[0] = -0.585 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.512, c_hat[0] = -0.586 + c_state[0] = -0.089, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.484, c_hat[0] = -0.597 + c_state[0] = -0.104, h_state[0] = -0.050 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.507, c_hat[0] = -0.677 + c_state[0] = -0.119, h_state[0] = -0.060 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.021, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.015 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.521, c_hat[0] = -0.593 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.513, c_hat[0] = -0.585 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.511, c_hat[0] = -0.586 + c_state[0] = -0.089, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.483, c_hat[0] = -0.597 + c_state[0] = -0.104, h_state[0] = -0.050 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.506, c_hat[0] = -0.678 + c_state[0] = -0.119, h_state[0] = -0.060 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.021, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.015 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.521, c_hat[0] = -0.593 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.512, c_hat[0] = -0.586 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.510, c_hat[0] = -0.586 + c_state[0] = -0.089, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.482, c_hat[0] = -0.597 + c_state[0] = -0.104, h_state[0] = -0.050 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.505, c_hat[0] = -0.678 + c_state[0] = -0.119, h_state[0] = -0.060 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.520, c_hat[0] = -0.594 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.511, c_hat[0] = -0.586 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.509, c_hat[0] = -0.586 + c_state[0] = -0.089, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.481, c_hat[0] = -0.597 + c_state[0] = -0.104, h_state[0] = -0.050 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.504, c_hat[0] = -0.678 + c_state[0] = -0.119, h_state[0] = -0.060 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.519, c_hat[0] = -0.594 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.510, c_hat[0] = -0.586 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.508, c_hat[0] = -0.587 + c_state[0] = -0.089, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.480, c_hat[0] = -0.598 + c_state[0] = -0.104, h_state[0] = -0.050 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.503, c_hat[0] = -0.678 + c_state[0] = -0.119, h_state[0] = -0.060 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.518, c_hat[0] = -0.594 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.509, c_hat[0] = -0.586 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.507, c_hat[0] = -0.587 + c_state[0] = -0.089, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.479, c_hat[0] = -0.598 + c_state[0] = -0.104, h_state[0] = -0.050 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.717, o_gate[0] = 0.502, c_hat[0] = -0.678 + c_state[0] = -0.119, h_state[0] = -0.059 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.066, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.518, c_hat[0] = -0.595 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.508, c_hat[0] = -0.587 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.506, c_hat[0] = -0.587 + c_state[0] = -0.089, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.478, c_hat[0] = -0.598 + c_state[0] = -0.104, h_state[0] = -0.050 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.501, c_hat[0] = -0.678 + c_state[0] = -0.119, h_state[0] = -0.059 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.517, c_hat[0] = -0.595 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.507, c_hat[0] = -0.587 + c_state[0] = -0.068, h_state[0] = -0.035 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.506, c_hat[0] = -0.587 + c_state[0] = -0.089, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.477, c_hat[0] = -0.598 + c_state[0] = -0.104, h_state[0] = -0.050 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.500, c_hat[0] = -0.679 + c_state[0] = -0.119, h_state[0] = -0.059 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.516, c_hat[0] = -0.595 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.507, c_hat[0] = -0.587 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.505, c_hat[0] = -0.587 + c_state[0] = -0.089, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.476, c_hat[0] = -0.598 + c_state[0] = -0.104, h_state[0] = -0.049 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.499, c_hat[0] = -0.679 + c_state[0] = -0.119, h_state[0] = -0.059 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.019 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.515, c_hat[0] = -0.596 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.506, c_hat[0] = -0.587 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.504, c_hat[0] = -0.587 + c_state[0] = -0.089, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.475, c_hat[0] = -0.598 + c_state[0] = -0.104, h_state[0] = -0.049 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.498, c_hat[0] = -0.679 + c_state[0] = -0.119, h_state[0] = -0.059 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.515, c_hat[0] = -0.596 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.505, c_hat[0] = -0.587 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.503, c_hat[0] = -0.587 + c_state[0] = -0.089, h_state[0] = -0.045 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.474, c_hat[0] = -0.598 + c_state[0] = -0.104, h_state[0] = -0.049 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.497, c_hat[0] = -0.679 + c_state[0] = -0.119, h_state[0] = -0.059 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.057, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.514, c_hat[0] = -0.596 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.504, c_hat[0] = -0.588 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.678, o_gate[0] = 0.502, c_hat[0] = -0.588 + c_state[0] = -0.089, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.474, c_hat[0] = -0.599 + c_state[0] = -0.104, h_state[0] = -0.049 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.496, c_hat[0] = -0.679 + c_state[0] = -0.119, h_state[0] = -0.059 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.513, c_hat[0] = -0.596 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.503, c_hat[0] = -0.588 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.501, c_hat[0] = -0.588 + c_state[0] = -0.089, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.473, c_hat[0] = -0.599 + c_state[0] = -0.104, h_state[0] = -0.049 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.495, c_hat[0] = -0.679 + c_state[0] = -0.119, h_state[0] = -0.059 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.513, c_hat[0] = -0.597 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.502, c_hat[0] = -0.588 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.500, c_hat[0] = -0.588 + c_state[0] = -0.089, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.472, c_hat[0] = -0.599 + c_state[0] = -0.104, h_state[0] = -0.049 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.494, c_hat[0] = -0.680 + c_state[0] = -0.119, h_state[0] = -0.058 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.512, c_hat[0] = -0.597 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.502, c_hat[0] = -0.588 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.499, c_hat[0] = -0.588 + c_state[0] = -0.089, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.471, c_hat[0] = -0.599 + c_state[0] = -0.104, h_state[0] = -0.049 +Time Step 4: + i_gate[0] = 0.065, f_gate[0] = 0.718, o_gate[0] = 0.494, c_hat[0] = -0.680 + c_state[0] = -0.119, h_state[0] = -0.058 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.653, o_gate[0] = 0.511, c_hat[0] = -0.597 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.682, o_gate[0] = 0.501, c_hat[0] = -0.589 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.498, c_hat[0] = -0.588 + c_state[0] = -0.089, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.470, c_hat[0] = -0.599 + c_state[0] = -0.104, h_state[0] = -0.049 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.493, c_hat[0] = -0.680 + c_state[0] = -0.119, h_state[0] = -0.058 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.043, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.510, c_hat[0] = -0.597 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.500, c_hat[0] = -0.589 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.498, c_hat[0] = -0.588 + c_state[0] = -0.089, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.704, o_gate[0] = 0.469, c_hat[0] = -0.599 + c_state[0] = -0.104, h_state[0] = -0.049 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.492, c_hat[0] = -0.680 + c_state[0] = -0.119, h_state[0] = -0.058 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.043, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.018 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.510, c_hat[0] = -0.598 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.499, c_hat[0] = -0.589 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.497, c_hat[0] = -0.589 + c_state[0] = -0.089, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.468, c_hat[0] = -0.600 + c_state[0] = -0.104, h_state[0] = -0.049 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.491, c_hat[0] = -0.680 + c_state[0] = -0.119, h_state[0] = -0.058 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.043, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.509, c_hat[0] = -0.598 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.498, c_hat[0] = -0.589 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.496, c_hat[0] = -0.589 + c_state[0] = -0.089, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.467, c_hat[0] = -0.600 + c_state[0] = -0.104, h_state[0] = -0.049 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.490, c_hat[0] = -0.681 + c_state[0] = -0.119, h_state[0] = -0.058 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.043, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.065, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.508, c_hat[0] = -0.598 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.498, c_hat[0] = -0.590 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.495, c_hat[0] = -0.589 + c_state[0] = -0.089, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.466, c_hat[0] = -0.600 + c_state[0] = -0.104, h_state[0] = -0.048 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.489, c_hat[0] = -0.681 + c_state[0] = -0.119, h_state[0] = -0.058 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.043, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.508, c_hat[0] = -0.599 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.497, c_hat[0] = -0.590 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.494, c_hat[0] = -0.589 + c_state[0] = -0.089, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.466, c_hat[0] = -0.600 + c_state[0] = -0.104, h_state[0] = -0.048 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.488, c_hat[0] = -0.681 + c_state[0] = -0.119, h_state[0] = -0.058 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.020, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.043, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.056, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.507, c_hat[0] = -0.599 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.496, c_hat[0] = -0.590 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.493, c_hat[0] = -0.589 + c_state[0] = -0.089, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.465, c_hat[0] = -0.600 + c_state[0] = -0.104, h_state[0] = -0.048 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.488, c_hat[0] = -0.681 + c_state[0] = -0.119, h_state[0] = -0.058 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.014 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.055, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.506, c_hat[0] = -0.599 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.495, c_hat[0] = -0.590 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.493, c_hat[0] = -0.589 + c_state[0] = -0.089, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.464, c_hat[0] = -0.601 + c_state[0] = -0.104, h_state[0] = -0.048 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.487, c_hat[0] = -0.681 + c_state[0] = -0.119, h_state[0] = -0.057 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.055, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.506, c_hat[0] = -0.599 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.495, c_hat[0] = -0.591 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.492, c_hat[0] = -0.590 + c_state[0] = -0.089, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.463, c_hat[0] = -0.601 + c_state[0] = -0.104, h_state[0] = -0.048 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.486, c_hat[0] = -0.682 + c_state[0] = -0.119, h_state[0] = -0.057 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = 0.055, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.505, c_hat[0] = -0.600 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.494, c_hat[0] = -0.591 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.491, c_hat[0] = -0.590 + c_state[0] = -0.089, h_state[0] = -0.044 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.462, c_hat[0] = -0.601 + c_state[0] = -0.104, h_state[0] = -0.048 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.485, c_hat[0] = -0.682 + c_state[0] = -0.119, h_state[0] = -0.057 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.504, c_hat[0] = -0.600 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.493, c_hat[0] = -0.591 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.490, c_hat[0] = -0.590 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.462, c_hat[0] = -0.601 + c_state[0] = -0.104, h_state[0] = -0.048 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.718, o_gate[0] = 0.484, c_hat[0] = -0.682 + c_state[0] = -0.119, h_state[0] = -0.057 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.504, c_hat[0] = -0.600 + c_state[0] = -0.039, h_state[0] = -0.020 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.492, c_hat[0] = -0.591 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.490, c_hat[0] = -0.590 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.461, c_hat[0] = -0.602 + c_state[0] = -0.104, h_state[0] = -0.048 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.483, c_hat[0] = -0.682 + c_state[0] = -0.119, h_state[0] = -0.057 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.503, c_hat[0] = -0.601 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.492, c_hat[0] = -0.592 + c_state[0] = -0.068, h_state[0] = -0.034 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.489, c_hat[0] = -0.591 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.460, c_hat[0] = -0.602 + c_state[0] = -0.104, h_state[0] = -0.048 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.483, c_hat[0] = -0.682 + c_state[0] = -0.119, h_state[0] = -0.057 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.502, c_hat[0] = -0.601 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.491, c_hat[0] = -0.592 + c_state[0] = -0.068, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.488, c_hat[0] = -0.591 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.459, c_hat[0] = -0.602 + c_state[0] = -0.104, h_state[0] = -0.048 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.482, c_hat[0] = -0.683 + c_state[0] = -0.119, h_state[0] = -0.057 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.502, c_hat[0] = -0.601 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.490, c_hat[0] = -0.592 + c_state[0] = -0.068, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.487, c_hat[0] = -0.591 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.458, c_hat[0] = -0.602 + c_state[0] = -0.104, h_state[0] = -0.048 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.481, c_hat[0] = -0.683 + c_state[0] = -0.119, h_state[0] = -0.057 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.501, c_hat[0] = -0.602 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.490, c_hat[0] = -0.592 + c_state[0] = -0.068, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.486, c_hat[0] = -0.591 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.458, c_hat[0] = -0.603 + c_state[0] = -0.104, h_state[0] = -0.048 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.480, c_hat[0] = -0.683 + c_state[0] = -0.119, h_state[0] = -0.057 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.042, dc_hat[0] = 0.013 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.064, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.065, f_gate[0] = 0.652, o_gate[0] = 0.500, c_hat[0] = -0.602 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.489, c_hat[0] = -0.593 + c_state[0] = -0.068, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.486, c_hat[0] = -0.592 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.457, c_hat[0] = -0.603 + c_state[0] = -0.104, h_state[0] = -0.048 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.479, c_hat[0] = -0.683 + c_state[0] = -0.119, h_state[0] = -0.057 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.063, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.652, o_gate[0] = 0.500, c_hat[0] = -0.602 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.488, c_hat[0] = -0.593 + c_state[0] = -0.068, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.485, c_hat[0] = -0.592 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.456, c_hat[0] = -0.603 + c_state[0] = -0.104, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.479, c_hat[0] = -0.684 + c_state[0] = -0.119, h_state[0] = -0.057 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.063, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.652, o_gate[0] = 0.499, c_hat[0] = -0.602 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.487, c_hat[0] = -0.593 + c_state[0] = -0.068, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.484, c_hat[0] = -0.592 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.455, c_hat[0] = -0.603 + c_state[0] = -0.104, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.478, c_hat[0] = -0.684 + c_state[0] = -0.119, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.063, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.652, o_gate[0] = 0.499, c_hat[0] = -0.603 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.487, c_hat[0] = -0.594 + c_state[0] = -0.068, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.484, c_hat[0] = -0.592 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.455, c_hat[0] = -0.604 + c_state[0] = -0.104, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.477, c_hat[0] = -0.684 + c_state[0] = -0.119, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.016, df[0] = 0.063, dc_hat[0] = 0.018 + Gradient do_[0] = -0.024 +Epoch 100, Train Loss=0.032092, Weight Norm=8.704198 +Sample Predictions at Epoch 100: + Day 192 (2024-10-11) => Predicted: 61.580, Actual: 63.870, Error: 2.29 + Day 193 (2024-10-14) => Predicted: 62.203, Actual: 66.550, Error: 4.35 + Day 194 (2024-10-15) => Predicted: 62.507, Actual: 66.000, Error: 3.49 + Day 195 (2024-10-16) => Predicted: 62.350, Actual: 67.200, Error: 4.85 + Day 196 (2024-10-17) => Predicted: 62.186, Actual: 66.760, Error: 4.57 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.652, o_gate[0] = 0.498, c_hat[0] = -0.603 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.486, c_hat[0] = -0.594 + c_state[0] = -0.068, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.483, c_hat[0] = -0.593 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.454, c_hat[0] = -0.604 + c_state[0] = -0.104, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.476, c_hat[0] = -0.684 + c_state[0] = -0.119, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.063, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.652, o_gate[0] = 0.497, c_hat[0] = -0.603 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.485, c_hat[0] = -0.594 + c_state[0] = -0.068, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.482, c_hat[0] = -0.593 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.453, c_hat[0] = -0.604 + c_state[0] = -0.104, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.476, c_hat[0] = -0.685 + c_state[0] = -0.119, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.063, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.652, o_gate[0] = 0.497, c_hat[0] = -0.604 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.485, c_hat[0] = -0.595 + c_state[0] = -0.068, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.481, c_hat[0] = -0.593 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.453, c_hat[0] = -0.604 + c_state[0] = -0.104, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.475, c_hat[0] = -0.685 + c_state[0] = -0.119, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.063, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.496, c_hat[0] = -0.604 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.484, c_hat[0] = -0.595 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.481, c_hat[0] = -0.594 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.452, c_hat[0] = -0.605 + c_state[0] = -0.104, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.474, c_hat[0] = -0.685 + c_state[0] = -0.119, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.063, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.496, c_hat[0] = -0.604 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.483, c_hat[0] = -0.595 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.480, c_hat[0] = -0.594 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.451, c_hat[0] = -0.605 + c_state[0] = -0.105, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.474, c_hat[0] = -0.686 + c_state[0] = -0.119, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.013 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.017 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.063, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.495, c_hat[0] = -0.604 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.483, c_hat[0] = -0.595 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.479, c_hat[0] = -0.594 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.450, c_hat[0] = -0.605 + c_state[0] = -0.105, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.473, c_hat[0] = -0.686 + c_state[0] = -0.119, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.019, dc_hat[0] = 0.006 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.013 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.063, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.494, c_hat[0] = -0.605 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.482, c_hat[0] = -0.596 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.479, c_hat[0] = -0.594 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.450, c_hat[0] = -0.606 + c_state[0] = -0.105, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.472, c_hat[0] = -0.686 + c_state[0] = -0.119, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.013 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.494, c_hat[0] = -0.605 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.482, c_hat[0] = -0.596 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.478, c_hat[0] = -0.595 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.449, c_hat[0] = -0.606 + c_state[0] = -0.105, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.471, c_hat[0] = -0.686 + c_state[0] = -0.119, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.013 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.493, c_hat[0] = -0.605 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.481, c_hat[0] = -0.596 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.477, c_hat[0] = -0.595 + c_state[0] = -0.089, h_state[0] = -0.043 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.448, c_hat[0] = -0.606 + c_state[0] = -0.105, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.471, c_hat[0] = -0.687 + c_state[0] = -0.119, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.013 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.493, c_hat[0] = -0.606 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.480, c_hat[0] = -0.597 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.477, c_hat[0] = -0.595 + c_state[0] = -0.089, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.448, c_hat[0] = -0.607 + c_state[0] = -0.105, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.470, c_hat[0] = -0.687 + c_state[0] = -0.119, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.013 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.492, c_hat[0] = -0.606 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.480, c_hat[0] = -0.597 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.476, c_hat[0] = -0.596 + c_state[0] = -0.089, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.447, c_hat[0] = -0.607 + c_state[0] = -0.105, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.469, c_hat[0] = -0.687 + c_state[0] = -0.119, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005 + Gradient do_[0] = -0.025 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.492, c_hat[0] = -0.606 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.681, o_gate[0] = 0.479, c_hat[0] = -0.597 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.475, c_hat[0] = -0.596 + c_state[0] = -0.089, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.446, c_hat[0] = -0.607 + c_state[0] = -0.105, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.719, o_gate[0] = 0.469, c_hat[0] = -0.688 + c_state[0] = -0.119, h_state[0] = -0.056 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.491, c_hat[0] = -0.606 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.478, c_hat[0] = -0.598 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.475, c_hat[0] = -0.596 + c_state[0] = -0.089, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.446, c_hat[0] = -0.608 + c_state[0] = -0.105, h_state[0] = -0.047 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.468, c_hat[0] = -0.688 + c_state[0] = -0.119, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.053, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.490, c_hat[0] = -0.607 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.478, c_hat[0] = -0.598 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.678, o_gate[0] = 0.474, c_hat[0] = -0.597 + c_state[0] = -0.089, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.445, c_hat[0] = -0.608 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.467, c_hat[0] = -0.688 + c_state[0] = -0.119, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.490, c_hat[0] = -0.607 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.477, c_hat[0] = -0.598 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.474, c_hat[0] = -0.597 + c_state[0] = -0.089, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.444, c_hat[0] = -0.608 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.467, c_hat[0] = -0.688 + c_state[0] = -0.119, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.489, c_hat[0] = -0.607 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.477, c_hat[0] = -0.599 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.473, c_hat[0] = -0.597 + c_state[0] = -0.089, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.444, c_hat[0] = -0.609 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.466, c_hat[0] = -0.689 + c_state[0] = -0.119, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.062, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.489, c_hat[0] = -0.608 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.476, c_hat[0] = -0.599 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.472, c_hat[0] = -0.598 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.705, o_gate[0] = 0.443, c_hat[0] = -0.609 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.465, c_hat[0] = -0.689 + c_state[0] = -0.119, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.488, c_hat[0] = -0.608 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.475, c_hat[0] = -0.599 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.472, c_hat[0] = -0.598 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.443, c_hat[0] = -0.609 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.465, c_hat[0] = -0.689 + c_state[0] = -0.119, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.488, c_hat[0] = -0.608 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.475, c_hat[0] = -0.600 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.471, c_hat[0] = -0.598 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.442, c_hat[0] = -0.610 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.464, c_hat[0] = -0.690 + c_state[0] = -0.119, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.039, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.651, o_gate[0] = 0.487, c_hat[0] = -0.608 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.474, c_hat[0] = -0.600 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.471, c_hat[0] = -0.599 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.441, c_hat[0] = -0.610 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.464, c_hat[0] = -0.690 + c_state[0] = -0.119, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.018, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.039, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.487, c_hat[0] = -0.609 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.474, c_hat[0] = -0.600 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.470, c_hat[0] = -0.599 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.441, c_hat[0] = -0.610 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.463, c_hat[0] = -0.690 + c_state[0] = -0.119, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.039, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.486, c_hat[0] = -0.609 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.473, c_hat[0] = -0.601 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.469, c_hat[0] = -0.599 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.440, c_hat[0] = -0.611 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.462, c_hat[0] = -0.691 + c_state[0] = -0.119, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.039, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.052, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.486, c_hat[0] = -0.609 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.473, c_hat[0] = -0.601 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.469, c_hat[0] = -0.600 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.439, c_hat[0] = -0.611 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.462, c_hat[0] = -0.691 + c_state[0] = -0.119, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.485, c_hat[0] = -0.610 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.472, c_hat[0] = -0.601 + c_state[0] = -0.069, h_state[0] = -0.033 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.468, c_hat[0] = -0.600 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.439, c_hat[0] = -0.611 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.461, c_hat[0] = -0.691 + c_state[0] = -0.119, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.016, dc_hat[0] = 0.004 + Gradient do_[0] = -0.024 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.485, c_hat[0] = -0.610 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.472, c_hat[0] = -0.602 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.468, c_hat[0] = -0.600 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.438, c_hat[0] = -0.612 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.460, c_hat[0] = -0.692 + c_state[0] = -0.119, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.016, dc_hat[0] = 0.004 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.017 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.484, c_hat[0] = -0.610 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.471, c_hat[0] = -0.602 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.467, c_hat[0] = -0.601 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.438, c_hat[0] = -0.612 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.460, c_hat[0] = -0.692 + c_state[0] = -0.119, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.016, dc_hat[0] = 0.004 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.016 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.061, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.484, c_hat[0] = -0.610 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.470, c_hat[0] = -0.602 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.466, c_hat[0] = -0.601 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.437, c_hat[0] = -0.612 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.459, c_hat[0] = -0.692 + c_state[0] = -0.120, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.016, dc_hat[0] = 0.004 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.483, c_hat[0] = -0.611 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.470, c_hat[0] = -0.603 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.466, c_hat[0] = -0.601 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.437, c_hat[0] = -0.613 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.459, c_hat[0] = -0.692 + c_state[0] = -0.120, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.016, dc_hat[0] = 0.004 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = 0.017, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.483, c_hat[0] = -0.611 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.469, c_hat[0] = -0.603 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.465, c_hat[0] = -0.602 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.436, c_hat[0] = -0.613 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.458, c_hat[0] = -0.693 + c_state[0] = -0.120, h_state[0] = -0.055 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.016, dc_hat[0] = 0.004 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.017, dc_hat[0] = 0.005 + Gradient do_[0] = -0.024 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.482, c_hat[0] = -0.611 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.469, c_hat[0] = -0.603 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.465, c_hat[0] = -0.602 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.435, c_hat[0] = -0.613 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.457, c_hat[0] = -0.693 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.016, dc_hat[0] = 0.004 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.017, dc_hat[0] = 0.005 + Gradient do_[0] = -0.023 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.038, dc_hat[0] = 0.012 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.014, df[0] = 0.051, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.482, c_hat[0] = -0.612 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.680, o_gate[0] = 0.468, c_hat[0] = -0.604 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.464, c_hat[0] = -0.602 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.435, c_hat[0] = -0.614 + c_state[0] = -0.105, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.457, c_hat[0] = -0.693 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = 0.016, dc_hat[0] = 0.004 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.017, dc_hat[0] = 0.005 + Gradient do_[0] = -0.023 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.012 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.481, c_hat[0] = -0.612 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.679, o_gate[0] = 0.468, c_hat[0] = -0.604 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.464, c_hat[0] = -0.603 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.434, c_hat[0] = -0.614 + c_state[0] = -0.106, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.456, c_hat[0] = -0.694 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.004 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.017, dc_hat[0] = 0.005 + Gradient do_[0] = -0.023 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.012 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.481, c_hat[0] = -0.612 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.467, c_hat[0] = -0.604 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.463, c_hat[0] = -0.603 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.434, c_hat[0] = -0.614 + c_state[0] = -0.106, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.456, c_hat[0] = -0.694 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.004 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005 + Gradient do_[0] = -0.023 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.480, c_hat[0] = -0.612 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.467, c_hat[0] = -0.605 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.463, c_hat[0] = -0.603 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.433, c_hat[0] = -0.615 + c_state[0] = -0.106, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.455, c_hat[0] = -0.694 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.004 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005 + Gradient do_[0] = -0.023 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.480, c_hat[0] = -0.613 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.466, c_hat[0] = -0.605 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.462, c_hat[0] = -0.604 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.433, c_hat[0] = -0.615 + c_state[0] = -0.106, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.455, c_hat[0] = -0.695 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.004 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005 + Gradient do_[0] = -0.023 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.060, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.650, o_gate[0] = 0.479, c_hat[0] = -0.613 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.466, c_hat[0] = -0.605 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.462, c_hat[0] = -0.604 + c_state[0] = -0.090, h_state[0] = -0.042 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.432, c_hat[0] = -0.615 + c_state[0] = -0.106, h_state[0] = -0.046 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.454, c_hat[0] = -0.695 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.004 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005 + Gradient do_[0] = -0.023 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.479, c_hat[0] = -0.613 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.465, c_hat[0] = -0.606 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.461, c_hat[0] = -0.604 + c_state[0] = -0.090, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.432, c_hat[0] = -0.616 + c_state[0] = -0.106, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.453, c_hat[0] = -0.695 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.003 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005 + Gradient do_[0] = -0.023 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.478, c_hat[0] = -0.613 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.465, c_hat[0] = -0.606 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.460, c_hat[0] = -0.605 + c_state[0] = -0.090, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.431, c_hat[0] = -0.616 + c_state[0] = -0.106, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.453, c_hat[0] = -0.696 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005 + Gradient do_[0] = -0.023 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.478, c_hat[0] = -0.614 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.464, c_hat[0] = -0.606 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.460, c_hat[0] = -0.605 + c_state[0] = -0.090, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.430, c_hat[0] = -0.616 + c_state[0] = -0.106, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.452, c_hat[0] = -0.696 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005 + Gradient do_[0] = -0.023 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.477, c_hat[0] = -0.614 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.464, c_hat[0] = -0.607 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.677, o_gate[0] = 0.459, c_hat[0] = -0.605 + c_state[0] = -0.090, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.430, c_hat[0] = -0.617 + c_state[0] = -0.106, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.452, c_hat[0] = -0.696 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005 + Gradient do_[0] = -0.023 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.477, c_hat[0] = -0.614 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.463, c_hat[0] = -0.607 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.459, c_hat[0] = -0.606 + c_state[0] = -0.090, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.429, c_hat[0] = -0.617 + c_state[0] = -0.106, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.451, c_hat[0] = -0.696 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003 + Gradient do_[0] = -0.022 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005 + Gradient do_[0] = -0.023 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011 + Gradient do_[0] = -0.030 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.477, c_hat[0] = -0.614 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.463, c_hat[0] = -0.607 + c_state[0] = -0.069, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.458, c_hat[0] = -0.606 + c_state[0] = -0.090, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.429, c_hat[0] = -0.618 + c_state[0] = -0.106, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.451, c_hat[0] = -0.697 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003 + Gradient do_[0] = -0.022 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005 + Gradient do_[0] = -0.023 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011 + Gradient do_[0] = -0.030 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.476, c_hat[0] = -0.615 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.462, c_hat[0] = -0.608 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.458, c_hat[0] = -0.606 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.428, c_hat[0] = -0.618 + c_state[0] = -0.106, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.450, c_hat[0] = -0.697 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003 + Gradient do_[0] = -0.022 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.005 + Gradient do_[0] = -0.022 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011 + Gradient do_[0] = -0.030 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.476, c_hat[0] = -0.615 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.462, c_hat[0] = -0.608 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.457, c_hat[0] = -0.607 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.428, c_hat[0] = -0.618 + c_state[0] = -0.106, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.450, c_hat[0] = -0.697 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003 + Gradient do_[0] = -0.022 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.016, dc_hat[0] = 0.004 + Gradient do_[0] = -0.022 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011 + Gradient do_[0] = -0.030 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.475, c_hat[0] = -0.615 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.461, c_hat[0] = -0.608 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.457, c_hat[0] = -0.607 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.427, c_hat[0] = -0.619 + c_state[0] = -0.106, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.449, c_hat[0] = -0.698 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003 + Gradient do_[0] = -0.022 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004 + Gradient do_[0] = -0.022 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011 + Gradient do_[0] = -0.030 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.059, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.475, c_hat[0] = -0.616 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.461, c_hat[0] = -0.609 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.456, c_hat[0] = -0.607 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.427, c_hat[0] = -0.619 + c_state[0] = -0.106, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.449, c_hat[0] = -0.698 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003 + Gradient do_[0] = -0.022 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004 + Gradient do_[0] = -0.022 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011 + Gradient do_[0] = -0.030 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.474, c_hat[0] = -0.616 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.460, c_hat[0] = -0.609 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.456, c_hat[0] = -0.608 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.426, c_hat[0] = -0.619 + c_state[0] = -0.106, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.448, c_hat[0] = -0.698 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003 + Gradient do_[0] = -0.022 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004 + Gradient do_[0] = -0.022 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.011 + Gradient do_[0] = -0.030 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.015 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.474, c_hat[0] = -0.616 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.460, c_hat[0] = -0.609 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.455, c_hat[0] = -0.608 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.426, c_hat[0] = -0.620 + c_state[0] = -0.106, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.448, c_hat[0] = -0.699 + c_state[0] = -0.120, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003 + Gradient do_[0] = -0.022 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004 + Gradient do_[0] = -0.022 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.011 + Gradient do_[0] = -0.030 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.014 + Gradient do_[0] = -0.036 +Backward Time Step 0: + Gradient di[0] = -0.015, df[0] = 0.058, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.473, c_hat[0] = -0.616 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.679, o_gate[0] = 0.459, c_hat[0] = -0.610 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.455, c_hat[0] = -0.608 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.425, c_hat[0] = -0.620 + c_state[0] = -0.106, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.447, c_hat[0] = -0.699 + c_state[0] = -0.121, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003 + Gradient do_[0] = -0.022 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004 + Gradient do_[0] = -0.022 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.011 + Gradient do_[0] = -0.030 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.058, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.473, c_hat[0] = -0.617 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.459, c_hat[0] = -0.610 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.455, c_hat[0] = -0.609 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.425, c_hat[0] = -0.620 + c_state[0] = -0.106, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.447, c_hat[0] = -0.699 + c_state[0] = -0.121, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.003 + Gradient do_[0] = -0.022 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004 + Gradient do_[0] = -0.022 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.011 + Gradient do_[0] = -0.030 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.058, dc_hat[0] = 0.016 + Gradient do_[0] = -0.024 +Epoch 150, Train Loss=0.022807, Weight Norm=8.707580 +Sample Predictions at Epoch 150: + Day 192 (2024-10-11) => Predicted: 59.591, Actual: 63.870, Error: 4.28 + Day 193 (2024-10-14) => Predicted: 60.233, Actual: 66.550, Error: 6.32 + Day 194 (2024-10-15) => Predicted: 60.512, Actual: 66.000, Error: 5.49 + Day 195 (2024-10-16) => Predicted: 60.396, Actual: 67.200, Error: 6.80 + Day 196 (2024-10-17) => Predicted: 60.265, Actual: 66.760, Error: 6.49 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.649, o_gate[0] = 0.473, c_hat[0] = -0.617 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.459, c_hat[0] = -0.610 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.454, c_hat[0] = -0.609 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.424, c_hat[0] = -0.621 + c_state[0] = -0.106, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.719, o_gate[0] = 0.446, c_hat[0] = -0.699 + c_state[0] = -0.121, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003 + Gradient do_[0] = -0.022 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004 + Gradient do_[0] = -0.022 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.011 + Gradient do_[0] = -0.030 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.058, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.472, c_hat[0] = -0.617 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.458, c_hat[0] = -0.611 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.454, c_hat[0] = -0.609 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.424, c_hat[0] = -0.621 + c_state[0] = -0.106, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.446, c_hat[0] = -0.700 + c_state[0] = -0.121, h_state[0] = -0.054 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003 + Gradient do_[0] = -0.022 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004 + Gradient do_[0] = -0.022 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.011 + Gradient do_[0] = -0.030 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.058, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.472, c_hat[0] = -0.617 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.458, c_hat[0] = -0.611 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.453, c_hat[0] = -0.610 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.704, o_gate[0] = 0.423, c_hat[0] = -0.621 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.445, c_hat[0] = -0.700 + c_state[0] = -0.121, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003 + Gradient do_[0] = -0.022 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004 + Gradient do_[0] = -0.022 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.011 + Gradient do_[0] = -0.030 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.058, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.471, c_hat[0] = -0.618 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.457, c_hat[0] = -0.611 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.453, c_hat[0] = -0.610 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.423, c_hat[0] = -0.622 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.445, c_hat[0] = -0.700 + c_state[0] = -0.121, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003 + Gradient do_[0] = -0.021 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004 + Gradient do_[0] = -0.022 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.011 + Gradient do_[0] = -0.030 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.058, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.471, c_hat[0] = -0.618 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.457, c_hat[0] = -0.612 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.452, c_hat[0] = -0.610 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.423, c_hat[0] = -0.622 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.444, c_hat[0] = -0.701 + c_state[0] = -0.121, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003 + Gradient do_[0] = -0.021 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.015, dc_hat[0] = 0.004 + Gradient do_[0] = -0.021 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.035, dc_hat[0] = 0.011 + Gradient do_[0] = -0.029 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.048, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.471, c_hat[0] = -0.618 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.456, c_hat[0] = -0.612 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.452, c_hat[0] = -0.611 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.422, c_hat[0] = -0.622 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.444, c_hat[0] = -0.701 + c_state[0] = -0.121, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003 + Gradient do_[0] = -0.021 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004 + Gradient do_[0] = -0.021 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.470, c_hat[0] = -0.618 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.456, c_hat[0] = -0.612 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.451, c_hat[0] = -0.611 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.422, c_hat[0] = -0.623 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.443, c_hat[0] = -0.701 + c_state[0] = -0.121, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003 + Gradient do_[0] = -0.021 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004 + Gradient do_[0] = -0.021 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.470, c_hat[0] = -0.619 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.455, c_hat[0] = -0.613 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.451, c_hat[0] = -0.611 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.421, c_hat[0] = -0.623 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.443, c_hat[0] = -0.702 + c_state[0] = -0.121, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003 + Gradient do_[0] = -0.021 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004 + Gradient do_[0] = -0.021 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.469, c_hat[0] = -0.619 + c_state[0] = -0.039, h_state[0] = -0.019 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.455, c_hat[0] = -0.613 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.450, c_hat[0] = -0.612 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.421, c_hat[0] = -0.623 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.442, c_hat[0] = -0.702 + c_state[0] = -0.121, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003 + Gradient do_[0] = -0.021 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004 + Gradient do_[0] = -0.021 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.469, c_hat[0] = -0.619 + c_state[0] = -0.039, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.455, c_hat[0] = -0.613 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.450, c_hat[0] = -0.612 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.420, c_hat[0] = -0.624 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.442, c_hat[0] = -0.702 + c_state[0] = -0.121, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003 + Gradient do_[0] = -0.021 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004 + Gradient do_[0] = -0.021 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.469, c_hat[0] = -0.619 + c_state[0] = -0.039, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.454, c_hat[0] = -0.614 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.676, o_gate[0] = 0.450, c_hat[0] = -0.612 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.420, c_hat[0] = -0.624 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.441, c_hat[0] = -0.703 + c_state[0] = -0.121, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003 + Gradient do_[0] = -0.021 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004 + Gradient do_[0] = -0.021 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.468, c_hat[0] = -0.619 + c_state[0] = -0.039, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.454, c_hat[0] = -0.614 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.449, c_hat[0] = -0.613 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.419, c_hat[0] = -0.624 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.441, c_hat[0] = -0.703 + c_state[0] = -0.121, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003 + Gradient do_[0] = -0.021 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004 + Gradient do_[0] = -0.021 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.468, c_hat[0] = -0.620 + c_state[0] = -0.039, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.070, f_gate[0] = 0.678, o_gate[0] = 0.453, c_hat[0] = -0.614 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.449, c_hat[0] = -0.613 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.419, c_hat[0] = -0.625 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.440, c_hat[0] = -0.703 + c_state[0] = -0.121, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.003 + Gradient do_[0] = -0.021 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004 + Gradient do_[0] = -0.021 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.034, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.467, c_hat[0] = -0.620 + c_state[0] = -0.039, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.678, o_gate[0] = 0.453, c_hat[0] = -0.615 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.448, c_hat[0] = -0.613 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.418, c_hat[0] = -0.625 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.440, c_hat[0] = -0.703 + c_state[0] = -0.121, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003 + Gradient do_[0] = -0.021 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004 + Gradient do_[0] = -0.021 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.047, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.057, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.467, c_hat[0] = -0.620 + c_state[0] = -0.039, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.678, o_gate[0] = 0.453, c_hat[0] = -0.615 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.448, c_hat[0] = -0.614 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.418, c_hat[0] = -0.625 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.439, c_hat[0] = -0.704 + c_state[0] = -0.121, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003 + Gradient do_[0] = -0.021 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004 + Gradient do_[0] = -0.021 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.046, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.467, c_hat[0] = -0.620 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.678, o_gate[0] = 0.452, c_hat[0] = -0.615 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.447, c_hat[0] = -0.614 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.418, c_hat[0] = -0.626 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.439, c_hat[0] = -0.704 + c_state[0] = -0.121, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003 + Gradient do_[0] = -0.020 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004 + Gradient do_[0] = -0.020 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.046, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.648, o_gate[0] = 0.466, c_hat[0] = -0.621 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.452, c_hat[0] = -0.616 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.447, c_hat[0] = -0.614 + c_state[0] = -0.091, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.417, c_hat[0] = -0.626 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.439, c_hat[0] = -0.704 + c_state[0] = -0.121, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003 + Gradient do_[0] = -0.020 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.014, dc_hat[0] = 0.004 + Gradient do_[0] = -0.020 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.046, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.466, c_hat[0] = -0.621 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.451, c_hat[0] = -0.616 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.447, c_hat[0] = -0.615 + c_state[0] = -0.092, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.417, c_hat[0] = -0.626 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.438, c_hat[0] = -0.705 + c_state[0] = -0.121, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003 + Gradient do_[0] = -0.020 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.004 + Gradient do_[0] = -0.020 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.046, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.465, c_hat[0] = -0.621 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.451, c_hat[0] = -0.616 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.446, c_hat[0] = -0.615 + c_state[0] = -0.092, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.416, c_hat[0] = -0.627 + c_state[0] = -0.107, h_state[0] = -0.045 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.438, c_hat[0] = -0.705 + c_state[0] = -0.122, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003 + Gradient do_[0] = -0.020 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.004 + Gradient do_[0] = -0.020 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010 + Gradient do_[0] = -0.028 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.046, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.465, c_hat[0] = -0.621 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.451, c_hat[0] = -0.616 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.446, c_hat[0] = -0.615 + c_state[0] = -0.092, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.416, c_hat[0] = -0.627 + c_state[0] = -0.107, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.437, c_hat[0] = -0.705 + c_state[0] = -0.122, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003 + Gradient do_[0] = -0.020 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.004 + Gradient do_[0] = -0.020 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010 + Gradient do_[0] = -0.028 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.046, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.465, c_hat[0] = -0.622 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.450, c_hat[0] = -0.617 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.445, c_hat[0] = -0.616 + c_state[0] = -0.092, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.415, c_hat[0] = -0.627 + c_state[0] = -0.107, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.437, c_hat[0] = -0.705 + c_state[0] = -0.122, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003 + Gradient do_[0] = -0.020 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.004 + Gradient do_[0] = -0.020 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010 + Gradient do_[0] = -0.028 +Backward Time Step 1: + Gradient di[0] = -0.013, df[0] = 0.046, dc_hat[0] = 0.014 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.464, c_hat[0] = -0.622 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.450, c_hat[0] = -0.617 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.445, c_hat[0] = -0.616 + c_state[0] = -0.092, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.415, c_hat[0] = -0.628 + c_state[0] = -0.107, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.436, c_hat[0] = -0.706 + c_state[0] = -0.122, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003 + Gradient do_[0] = -0.020 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.004 + Gradient do_[0] = -0.020 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.033, dc_hat[0] = 0.010 + Gradient do_[0] = -0.028 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.013 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.464, c_hat[0] = -0.622 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.449, c_hat[0] = -0.617 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.444, c_hat[0] = -0.616 + c_state[0] = -0.092, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.415, c_hat[0] = -0.628 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.436, c_hat[0] = -0.706 + c_state[0] = -0.122, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003 + Gradient do_[0] = -0.020 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.004 + Gradient do_[0] = -0.020 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.010 + Gradient do_[0] = -0.028 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.013 + Gradient do_[0] = -0.035 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.464, c_hat[0] = -0.622 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.449, c_hat[0] = -0.618 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.444, c_hat[0] = -0.617 + c_state[0] = -0.092, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.414, c_hat[0] = -0.628 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.436, c_hat[0] = -0.706 + c_state[0] = -0.122, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003 + Gradient do_[0] = -0.020 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.004 + Gradient do_[0] = -0.020 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.010 + Gradient do_[0] = -0.028 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.056, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.463, c_hat[0] = -0.622 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.449, c_hat[0] = -0.618 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.444, c_hat[0] = -0.617 + c_state[0] = -0.092, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.414, c_hat[0] = -0.629 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.435, c_hat[0] = -0.707 + c_state[0] = -0.122, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003 + Gradient do_[0] = -0.020 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.004 + Gradient do_[0] = -0.020 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.010 + Gradient do_[0] = -0.028 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.463, c_hat[0] = -0.623 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.448, c_hat[0] = -0.618 + c_state[0] = -0.070, h_state[0] = -0.032 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.443, c_hat[0] = -0.617 + c_state[0] = -0.092, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.413, c_hat[0] = -0.629 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.435, c_hat[0] = -0.707 + c_state[0] = -0.122, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.013, dc_hat[0] = 0.003 + Gradient do_[0] = -0.020 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.013, dc_hat[0] = 0.004 + Gradient do_[0] = -0.019 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.010 + Gradient do_[0] = -0.028 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.463, c_hat[0] = -0.623 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.448, c_hat[0] = -0.619 + c_state[0] = -0.070, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.443, c_hat[0] = -0.618 + c_state[0] = -0.092, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.413, c_hat[0] = -0.629 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.434, c_hat[0] = -0.707 + c_state[0] = -0.122, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.013, dc_hat[0] = 0.004 + Gradient do_[0] = -0.019 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.010 + Gradient do_[0] = -0.028 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.462, c_hat[0] = -0.623 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.447, c_hat[0] = -0.619 + c_state[0] = -0.070, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.443, c_hat[0] = -0.618 + c_state[0] = -0.092, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.413, c_hat[0] = -0.630 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.434, c_hat[0] = -0.707 + c_state[0] = -0.122, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.013, dc_hat[0] = 0.004 + Gradient do_[0] = -0.019 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.010 + Gradient do_[0] = -0.028 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.015 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.462, c_hat[0] = -0.623 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.447, c_hat[0] = -0.619 + c_state[0] = -0.070, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.442, c_hat[0] = -0.618 + c_state[0] = -0.092, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.703, o_gate[0] = 0.412, c_hat[0] = -0.630 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.433, c_hat[0] = -0.708 + c_state[0] = -0.122, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.013, dc_hat[0] = 0.004 + Gradient do_[0] = -0.019 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.010 + Gradient do_[0] = -0.028 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.014 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.461, c_hat[0] = -0.624 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.447, c_hat[0] = -0.619 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.442, c_hat[0] = -0.619 + c_state[0] = -0.092, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.412, c_hat[0] = -0.630 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.433, c_hat[0] = -0.708 + c_state[0] = -0.122, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.009 + Gradient do_[0] = -0.028 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.014 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.461, c_hat[0] = -0.624 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.446, c_hat[0] = -0.620 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.675, o_gate[0] = 0.441, c_hat[0] = -0.619 + c_state[0] = -0.092, h_state[0] = -0.041 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.411, c_hat[0] = -0.631 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.433, c_hat[0] = -0.708 + c_state[0] = -0.122, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.032, dc_hat[0] = 0.009 + Gradient do_[0] = -0.028 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.014 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.461, c_hat[0] = -0.624 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.446, c_hat[0] = -0.620 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.441, c_hat[0] = -0.619 + c_state[0] = -0.092, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.411, c_hat[0] = -0.631 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.432, c_hat[0] = -0.709 + c_state[0] = -0.122, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009 + Gradient do_[0] = -0.027 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.014 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.647, o_gate[0] = 0.460, c_hat[0] = -0.624 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.446, c_hat[0] = -0.620 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.441, c_hat[0] = -0.620 + c_state[0] = -0.092, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.411, c_hat[0] = -0.631 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.432, c_hat[0] = -0.709 + c_state[0] = -0.122, h_state[0] = -0.053 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009 + Gradient do_[0] = -0.027 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.014 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.646, o_gate[0] = 0.460, c_hat[0] = -0.624 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.677, o_gate[0] = 0.445, c_hat[0] = -0.621 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.440, c_hat[0] = -0.620 + c_state[0] = -0.092, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.410, c_hat[0] = -0.632 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.431, c_hat[0] = -0.709 + c_state[0] = -0.122, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009 + Gradient do_[0] = -0.027 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.055, dc_hat[0] = 0.014 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.646, o_gate[0] = 0.460, c_hat[0] = -0.625 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.445, c_hat[0] = -0.621 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.440, c_hat[0] = -0.620 + c_state[0] = -0.092, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.410, c_hat[0] = -0.632 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.431, c_hat[0] = -0.709 + c_state[0] = -0.122, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009 + Gradient do_[0] = -0.027 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.646, o_gate[0] = 0.459, c_hat[0] = -0.625 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.445, c_hat[0] = -0.621 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.439, c_hat[0] = -0.621 + c_state[0] = -0.092, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.409, c_hat[0] = -0.632 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.431, c_hat[0] = -0.710 + c_state[0] = -0.122, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009 + Gradient do_[0] = -0.027 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.646, o_gate[0] = 0.459, c_hat[0] = -0.625 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.444, c_hat[0] = -0.622 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.439, c_hat[0] = -0.621 + c_state[0] = -0.092, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.409, c_hat[0] = -0.633 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.718, o_gate[0] = 0.430, c_hat[0] = -0.710 + c_state[0] = -0.122, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.019 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.018 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009 + Gradient do_[0] = -0.027 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.646, o_gate[0] = 0.459, c_hat[0] = -0.625 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.444, c_hat[0] = -0.622 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.439, c_hat[0] = -0.621 + c_state[0] = -0.092, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.409, c_hat[0] = -0.633 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.430, c_hat[0] = -0.710 + c_state[0] = -0.122, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.002 + Gradient do_[0] = -0.018 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.018 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009 + Gradient do_[0] = -0.027 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014 + Gradient do_[0] = -0.024 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.646, o_gate[0] = 0.458, c_hat[0] = -0.625 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.443, c_hat[0] = -0.622 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.438, c_hat[0] = -0.622 + c_state[0] = -0.092, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.408, c_hat[0] = -0.633 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.429, c_hat[0] = -0.711 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.002 + Gradient do_[0] = -0.018 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.018 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009 + Gradient do_[0] = -0.027 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.064, f_gate[0] = 0.646, o_gate[0] = 0.458, c_hat[0] = -0.626 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.443, c_hat[0] = -0.622 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.438, c_hat[0] = -0.622 + c_state[0] = -0.092, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.408, c_hat[0] = -0.634 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.429, c_hat[0] = -0.711 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002 + Gradient do_[0] = -0.018 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.018 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.031, dc_hat[0] = 0.009 + Gradient do_[0] = -0.027 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.458, c_hat[0] = -0.626 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.443, c_hat[0] = -0.623 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.438, c_hat[0] = -0.622 + c_state[0] = -0.092, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.408, c_hat[0] = -0.634 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.429, c_hat[0] = -0.711 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002 + Gradient do_[0] = -0.018 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.012, dc_hat[0] = 0.003 + Gradient do_[0] = -0.018 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.030, dc_hat[0] = 0.009 + Gradient do_[0] = -0.027 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.457, c_hat[0] = -0.626 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.442, c_hat[0] = -0.623 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.437, c_hat[0] = -0.622 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.407, c_hat[0] = -0.634 + c_state[0] = -0.108, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.428, c_hat[0] = -0.711 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002 + Gradient do_[0] = -0.018 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003 + Gradient do_[0] = -0.018 +Backward Time Step 2: + Gradient di[0] = -0.009, df[0] = 0.030, dc_hat[0] = 0.009 + Gradient do_[0] = -0.027 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.044, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.457, c_hat[0] = -0.626 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.442, c_hat[0] = -0.623 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.437, c_hat[0] = -0.623 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.407, c_hat[0] = -0.634 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.428, c_hat[0] = -0.712 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002 + Gradient do_[0] = -0.018 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003 + Gradient do_[0] = -0.018 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009 + Gradient do_[0] = -0.027 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.457, c_hat[0] = -0.626 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.442, c_hat[0] = -0.624 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.437, c_hat[0] = -0.623 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.406, c_hat[0] = -0.635 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.427, c_hat[0] = -0.712 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002 + Gradient do_[0] = -0.018 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003 + Gradient do_[0] = -0.018 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009 + Gradient do_[0] = -0.026 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.014, df[0] = 0.054, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.456, c_hat[0] = -0.627 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.441, c_hat[0] = -0.624 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.436, c_hat[0] = -0.623 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.406, c_hat[0] = -0.635 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.427, c_hat[0] = -0.712 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002 + Gradient do_[0] = -0.018 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003 + Gradient do_[0] = -0.018 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009 + Gradient do_[0] = -0.026 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.054, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.456, c_hat[0] = -0.627 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.441, c_hat[0] = -0.624 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.436, c_hat[0] = -0.624 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.406, c_hat[0] = -0.635 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.427, c_hat[0] = -0.712 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002 + Gradient do_[0] = -0.018 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003 + Gradient do_[0] = -0.018 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009 + Gradient do_[0] = -0.026 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.013 + Gradient do_[0] = -0.034 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.456, c_hat[0] = -0.627 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.441, c_hat[0] = -0.624 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.436, c_hat[0] = -0.624 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.405, c_hat[0] = -0.636 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.426, c_hat[0] = -0.713 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002 + Gradient do_[0] = -0.018 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003 + Gradient do_[0] = -0.017 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009 + Gradient do_[0] = -0.026 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.013 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.455, c_hat[0] = -0.627 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.440, c_hat[0] = -0.625 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.435, c_hat[0] = -0.624 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.405, c_hat[0] = -0.636 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.426, c_hat[0] = -0.713 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002 + Gradient do_[0] = -0.018 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003 + Gradient do_[0] = -0.017 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009 + Gradient do_[0] = -0.026 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.455, c_hat[0] = -0.627 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.440, c_hat[0] = -0.625 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.435, c_hat[0] = -0.625 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.405, c_hat[0] = -0.636 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.426, c_hat[0] = -0.713 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002 + Gradient do_[0] = -0.017 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003 + Gradient do_[0] = -0.017 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009 + Gradient do_[0] = -0.026 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.646, o_gate[0] = 0.455, c_hat[0] = -0.628 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.440, c_hat[0] = -0.625 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.674, o_gate[0] = 0.434, c_hat[0] = -0.625 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.404, c_hat[0] = -0.637 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.425, c_hat[0] = -0.713 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002 + Gradient do_[0] = -0.017 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003 + Gradient do_[0] = -0.017 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.030, dc_hat[0] = 0.009 + Gradient do_[0] = -0.026 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Epoch 200, Train Loss=0.018094, Weight Norm=8.711754 +Sample Predictions at Epoch 200: + Day 192 (2024-10-11) => Predicted: 58.925, Actual: 63.870, Error: 4.95 + Day 193 (2024-10-14) => Predicted: 59.571, Actual: 66.550, Error: 6.98 + Day 194 (2024-10-15) => Predicted: 59.838, Actual: 66.000, Error: 6.16 + Day 195 (2024-10-16) => Predicted: 59.802, Actual: 67.200, Error: 7.40 + Day 196 (2024-10-17) => Predicted: 59.701, Actual: 66.760, Error: 7.06 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.454, c_hat[0] = -0.628 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.676, o_gate[0] = 0.439, c_hat[0] = -0.625 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.434, c_hat[0] = -0.625 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.404, c_hat[0] = -0.637 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.425, c_hat[0] = -0.714 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002 + Gradient do_[0] = -0.017 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003 + Gradient do_[0] = -0.017 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.009 + Gradient do_[0] = -0.026 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.043, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.454, c_hat[0] = -0.628 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.439, c_hat[0] = -0.626 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.434, c_hat[0] = -0.625 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.404, c_hat[0] = -0.637 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.425, c_hat[0] = -0.714 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.002 + Gradient do_[0] = -0.017 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003 + Gradient do_[0] = -0.017 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.009 + Gradient do_[0] = -0.026 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.454, c_hat[0] = -0.628 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.439, c_hat[0] = -0.626 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.433, c_hat[0] = -0.626 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.702, o_gate[0] = 0.403, c_hat[0] = -0.638 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.424, c_hat[0] = -0.714 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002 + Gradient do_[0] = -0.017 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.011, dc_hat[0] = 0.003 + Gradient do_[0] = -0.017 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.009 + Gradient do_[0] = -0.026 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.454, c_hat[0] = -0.628 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.438, c_hat[0] = -0.626 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.433, c_hat[0] = -0.626 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.403, c_hat[0] = -0.638 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.424, c_hat[0] = -0.714 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002 + Gradient do_[0] = -0.017 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003 + Gradient do_[0] = -0.017 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.009 + Gradient do_[0] = -0.026 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.453, c_hat[0] = -0.628 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.438, c_hat[0] = -0.627 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.433, c_hat[0] = -0.626 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.403, c_hat[0] = -0.638 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.423, c_hat[0] = -0.715 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002 + Gradient do_[0] = -0.017 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003 + Gradient do_[0] = -0.017 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.009 + Gradient do_[0] = -0.026 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.053, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.453, c_hat[0] = -0.629 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.438, c_hat[0] = -0.627 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.432, c_hat[0] = -0.627 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.402, c_hat[0] = -0.638 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.423, c_hat[0] = -0.715 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002 + Gradient do_[0] = -0.017 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003 + Gradient do_[0] = -0.017 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.009 + Gradient do_[0] = -0.026 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.453, c_hat[0] = -0.629 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.437, c_hat[0] = -0.627 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.432, c_hat[0] = -0.627 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.402, c_hat[0] = -0.639 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.423, c_hat[0] = -0.715 + c_state[0] = -0.123, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002 + Gradient do_[0] = -0.017 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003 + Gradient do_[0] = -0.016 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.009 + Gradient do_[0] = -0.025 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.452, c_hat[0] = -0.629 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.437, c_hat[0] = -0.627 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.432, c_hat[0] = -0.627 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.402, c_hat[0] = -0.639 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.422, c_hat[0] = -0.716 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002 + Gradient do_[0] = -0.017 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003 + Gradient do_[0] = -0.016 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.008 + Gradient do_[0] = -0.025 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.452, c_hat[0] = -0.629 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.437, c_hat[0] = -0.628 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.432, c_hat[0] = -0.628 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.401, c_hat[0] = -0.639 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.422, c_hat[0] = -0.716 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002 + Gradient do_[0] = -0.017 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003 + Gradient do_[0] = -0.016 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.008 + Gradient do_[0] = -0.025 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.452, c_hat[0] = -0.629 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.437, c_hat[0] = -0.628 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.431, c_hat[0] = -0.628 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.401, c_hat[0] = -0.640 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.422, c_hat[0] = -0.716 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002 + Gradient do_[0] = -0.016 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003 + Gradient do_[0] = -0.016 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.029, dc_hat[0] = 0.008 + Gradient do_[0] = -0.025 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.014 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.451, c_hat[0] = -0.630 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.436, c_hat[0] = -0.628 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.431, c_hat[0] = -0.628 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.401, c_hat[0] = -0.640 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.421, c_hat[0] = -0.716 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002 + Gradient do_[0] = -0.016 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003 + Gradient do_[0] = -0.016 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008 + Gradient do_[0] = -0.025 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.042, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.451, c_hat[0] = -0.630 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.436, c_hat[0] = -0.628 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.431, c_hat[0] = -0.628 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.400, c_hat[0] = -0.640 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.421, c_hat[0] = -0.717 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002 + Gradient do_[0] = -0.016 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003 + Gradient do_[0] = -0.016 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008 + Gradient do_[0] = -0.025 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.041, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.451, c_hat[0] = -0.630 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.436, c_hat[0] = -0.629 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.430, c_hat[0] = -0.629 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.400, c_hat[0] = -0.640 + c_state[0] = -0.109, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.421, c_hat[0] = -0.717 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002 + Gradient do_[0] = -0.016 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003 + Gradient do_[0] = -0.016 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008 + Gradient do_[0] = -0.025 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.041, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.451, c_hat[0] = -0.630 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.435, c_hat[0] = -0.629 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.430, c_hat[0] = -0.629 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.400, c_hat[0] = -0.641 + c_state[0] = -0.110, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.420, c_hat[0] = -0.717 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002 + Gradient do_[0] = -0.016 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003 + Gradient do_[0] = -0.016 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008 + Gradient do_[0] = -0.025 +Backward Time Step 1: + Gradient di[0] = -0.012, df[0] = 0.041, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.450, c_hat[0] = -0.630 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.435, c_hat[0] = -0.629 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.430, c_hat[0] = -0.629 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.399, c_hat[0] = -0.641 + c_state[0] = -0.110, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.420, c_hat[0] = -0.717 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002 + Gradient do_[0] = -0.016 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003 + Gradient do_[0] = -0.016 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008 + Gradient do_[0] = -0.025 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.450, c_hat[0] = -0.630 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.435, c_hat[0] = -0.629 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.429, c_hat[0] = -0.630 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.399, c_hat[0] = -0.641 + c_state[0] = -0.110, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.420, c_hat[0] = -0.718 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002 + Gradient do_[0] = -0.016 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.003 + Gradient do_[0] = -0.016 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008 + Gradient do_[0] = -0.025 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.450, c_hat[0] = -0.631 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.434, c_hat[0] = -0.630 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.429, c_hat[0] = -0.630 + c_state[0] = -0.093, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.399, c_hat[0] = -0.642 + c_state[0] = -0.110, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.419, c_hat[0] = -0.718 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.010, dc_hat[0] = 0.002 + Gradient do_[0] = -0.016 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.003 + Gradient do_[0] = -0.015 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008 + Gradient do_[0] = -0.025 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.012 + Gradient do_[0] = -0.033 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.052, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.645, o_gate[0] = 0.449, c_hat[0] = -0.631 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.434, c_hat[0] = -0.630 + c_state[0] = -0.071, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.429, c_hat[0] = -0.630 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.398, c_hat[0] = -0.642 + c_state[0] = -0.110, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.717, o_gate[0] = 0.419, c_hat[0] = -0.718 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.016 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.003 + Gradient do_[0] = -0.015 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008 + Gradient do_[0] = -0.025 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.449, c_hat[0] = -0.631 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.675, o_gate[0] = 0.434, c_hat[0] = -0.630 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.428, c_hat[0] = -0.630 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.398, c_hat[0] = -0.642 + c_state[0] = -0.110, h_state[0] = -0.044 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.419, c_hat[0] = -0.718 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.016 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.003 + Gradient do_[0] = -0.015 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008 + Gradient do_[0] = -0.025 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.449, c_hat[0] = -0.631 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.434, c_hat[0] = -0.630 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.673, o_gate[0] = 0.428, c_hat[0] = -0.631 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.398, c_hat[0] = -0.642 + c_state[0] = -0.110, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.418, c_hat[0] = -0.718 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.016 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.003 + Gradient do_[0] = -0.015 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.028, dc_hat[0] = 0.008 + Gradient do_[0] = -0.024 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.449, c_hat[0] = -0.631 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.433, c_hat[0] = -0.631 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.428, c_hat[0] = -0.631 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.397, c_hat[0] = -0.643 + c_state[0] = -0.110, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.418, c_hat[0] = -0.719 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.003 + Gradient do_[0] = -0.015 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008 + Gradient do_[0] = -0.024 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.448, c_hat[0] = -0.631 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.433, c_hat[0] = -0.631 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.427, c_hat[0] = -0.631 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.397, c_hat[0] = -0.643 + c_state[0] = -0.110, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.418, c_hat[0] = -0.719 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008 + Gradient do_[0] = -0.024 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.041, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.448, c_hat[0] = -0.632 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.433, c_hat[0] = -0.631 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.427, c_hat[0] = -0.631 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.397, c_hat[0] = -0.643 + c_state[0] = -0.110, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.417, c_hat[0] = -0.719 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008 + Gradient do_[0] = -0.024 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.448, c_hat[0] = -0.632 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.432, c_hat[0] = -0.631 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.427, c_hat[0] = -0.632 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.396, c_hat[0] = -0.644 + c_state[0] = -0.110, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.417, c_hat[0] = -0.719 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008 + Gradient do_[0] = -0.024 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.447, c_hat[0] = -0.632 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.432, c_hat[0] = -0.632 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.427, c_hat[0] = -0.632 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.396, c_hat[0] = -0.644 + c_state[0] = -0.110, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.417, c_hat[0] = -0.720 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008 + Gradient do_[0] = -0.024 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.447, c_hat[0] = -0.632 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.432, c_hat[0] = -0.632 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.426, c_hat[0] = -0.632 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.396, c_hat[0] = -0.644 + c_state[0] = -0.110, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.416, c_hat[0] = -0.720 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008 + Gradient do_[0] = -0.024 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.012 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.447, c_hat[0] = -0.632 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.432, c_hat[0] = -0.632 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.426, c_hat[0] = -0.633 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.701, o_gate[0] = 0.396, c_hat[0] = -0.644 + c_state[0] = -0.110, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.416, c_hat[0] = -0.720 + c_state[0] = -0.124, h_state[0] = -0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008 + Gradient do_[0] = -0.024 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.011 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.447, c_hat[0] = -0.632 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.431, c_hat[0] = -0.632 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.426, c_hat[0] = -0.633 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.395, c_hat[0] = -0.645 + c_state[0] = -0.110, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.416, c_hat[0] = -0.720 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008 + Gradient do_[0] = -0.024 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.011 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.051, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.446, c_hat[0] = -0.633 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.431, c_hat[0] = -0.633 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.425, c_hat[0] = -0.633 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.395, c_hat[0] = -0.645 + c_state[0] = -0.110, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.415, c_hat[0] = -0.721 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008 + Gradient do_[0] = -0.024 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.011 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.446, c_hat[0] = -0.633 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.431, c_hat[0] = -0.633 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.425, c_hat[0] = -0.633 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.395, c_hat[0] = -0.645 + c_state[0] = -0.110, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.415, c_hat[0] = -0.721 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008 + Gradient do_[0] = -0.024 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.011 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.446, c_hat[0] = -0.633 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.430, c_hat[0] = -0.633 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.425, c_hat[0] = -0.634 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.394, c_hat[0] = -0.645 + c_state[0] = -0.110, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.415, c_hat[0] = -0.721 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = 0.009, dc_hat[0] = 0.002 + Gradient do_[0] = -0.015 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.027, dc_hat[0] = 0.008 + Gradient do_[0] = -0.024 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.011 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.446, c_hat[0] = -0.633 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.430, c_hat[0] = -0.633 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.425, c_hat[0] = -0.634 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.394, c_hat[0] = -0.646 + c_state[0] = -0.110, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.415, c_hat[0] = -0.721 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.026, dc_hat[0] = 0.008 + Gradient do_[0] = -0.023 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.040, dc_hat[0] = 0.011 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.445, c_hat[0] = -0.633 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.430, c_hat[0] = -0.634 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.424, c_hat[0] = -0.634 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.394, c_hat[0] = -0.646 + c_state[0] = -0.110, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.414, c_hat[0] = -0.722 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.026, dc_hat[0] = 0.008 + Gradient do_[0] = -0.023 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.445, c_hat[0] = -0.633 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.430, c_hat[0] = -0.634 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.424, c_hat[0] = -0.634 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.393, c_hat[0] = -0.646 + c_state[0] = -0.110, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.414, c_hat[0] = -0.722 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.026, dc_hat[0] = 0.008 + Gradient do_[0] = -0.023 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.445, c_hat[0] = -0.633 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.429, c_hat[0] = -0.634 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.424, c_hat[0] = -0.635 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.393, c_hat[0] = -0.647 + c_state[0] = -0.110, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.414, c_hat[0] = -0.722 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = 0.026, dc_hat[0] = 0.008 + Gradient do_[0] = -0.023 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.444, c_hat[0] = -0.634 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.429, c_hat[0] = -0.634 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.423, c_hat[0] = -0.635 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.393, c_hat[0] = -0.647 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.413, c_hat[0] = -0.722 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.008 + Gradient do_[0] = -0.023 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.644, o_gate[0] = 0.444, c_hat[0] = -0.634 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.429, c_hat[0] = -0.635 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.423, c_hat[0] = -0.635 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.393, c_hat[0] = -0.647 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.413, c_hat[0] = -0.722 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.008 + Gradient do_[0] = -0.023 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.444, c_hat[0] = -0.634 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.674, o_gate[0] = 0.429, c_hat[0] = -0.635 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.423, c_hat[0] = -0.635 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.392, c_hat[0] = -0.647 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.413, c_hat[0] = -0.723 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.008 + Gradient do_[0] = -0.023 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013 + Gradient do_[0] = -0.023 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.444, c_hat[0] = -0.634 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.428, c_hat[0] = -0.635 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.423, c_hat[0] = -0.636 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.392, c_hat[0] = -0.648 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.412, c_hat[0] = -0.723 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.007 + Gradient do_[0] = -0.023 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.443, c_hat[0] = -0.634 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.428, c_hat[0] = -0.635 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.672, o_gate[0] = 0.422, c_hat[0] = -0.636 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.392, c_hat[0] = -0.648 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.412, c_hat[0] = -0.723 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.007 + Gradient do_[0] = -0.023 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.050, dc_hat[0] = 0.013 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.443, c_hat[0] = -0.634 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.428, c_hat[0] = -0.635 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.422, c_hat[0] = -0.636 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.391, c_hat[0] = -0.648 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.412, c_hat[0] = -0.723 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.007 + Gradient do_[0] = -0.023 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.013 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.443, c_hat[0] = -0.635 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.427, c_hat[0] = -0.636 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.422, c_hat[0] = -0.637 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.391, c_hat[0] = -0.648 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.412, c_hat[0] = -0.724 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.014 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.026, dc_hat[0] = 0.007 + Gradient do_[0] = -0.023 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.013 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.443, c_hat[0] = -0.635 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.427, c_hat[0] = -0.636 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.421, c_hat[0] = -0.637 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.391, c_hat[0] = -0.649 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.411, c_hat[0] = -0.724 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007 + Gradient do_[0] = -0.023 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.013 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.442, c_hat[0] = -0.635 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.427, c_hat[0] = -0.636 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.421, c_hat[0] = -0.637 + c_state[0] = -0.094, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.391, c_hat[0] = -0.649 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.411, c_hat[0] = -0.724 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007 + Gradient do_[0] = -0.023 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.039, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.013 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.442, c_hat[0] = -0.635 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.427, c_hat[0] = -0.636 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.421, c_hat[0] = -0.637 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.390, c_hat[0] = -0.649 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.411, c_hat[0] = -0.724 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007 + Gradient do_[0] = -0.022 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.013 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.442, c_hat[0] = -0.635 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.426, c_hat[0] = -0.637 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.421, c_hat[0] = -0.638 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.390, c_hat[0] = -0.649 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.410, c_hat[0] = -0.724 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007 + Gradient do_[0] = -0.022 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.013, df[0] = 0.049, dc_hat[0] = 0.013 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.442, c_hat[0] = -0.635 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.426, c_hat[0] = -0.637 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.420, c_hat[0] = -0.638 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.390, c_hat[0] = -0.650 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.410, c_hat[0] = -0.725 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.008, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007 + Gradient do_[0] = -0.022 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.013 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.441, c_hat[0] = -0.635 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.426, c_hat[0] = -0.637 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.420, c_hat[0] = -0.638 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.390, c_hat[0] = -0.650 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.716, o_gate[0] = 0.410, c_hat[0] = -0.725 + c_state[0] = -0.125, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007 + Gradient do_[0] = -0.022 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.441, c_hat[0] = -0.636 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.426, c_hat[0] = -0.637 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.420, c_hat[0] = -0.638 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.389, c_hat[0] = -0.650 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.715, o_gate[0] = 0.410, c_hat[0] = -0.725 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.012 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007 + Gradient do_[0] = -0.022 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.441, c_hat[0] = -0.636 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.425, c_hat[0] = -0.638 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.420, c_hat[0] = -0.639 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.389, c_hat[0] = -0.650 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.715, o_gate[0] = 0.409, c_hat[0] = -0.725 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.012 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007 + Gradient do_[0] = -0.022 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Epoch 250, Train Loss=0.015130, Weight Norm=8.716688 +Sample Predictions at Epoch 250: + Day 192 (2024-10-11) => Predicted: 58.722, Actual: 63.870, Error: 5.15 + Day 193 (2024-10-14) => Predicted: 59.364, Actual: 66.550, Error: 7.19 + Day 194 (2024-10-15) => Predicted: 59.624, Actual: 66.000, Error: 6.38 + Day 195 (2024-10-16) => Predicted: 59.677, Actual: 67.200, Error: 7.52 + Day 196 (2024-10-17) => Predicted: 59.600, Actual: 66.760, Error: 7.16 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.441, c_hat[0] = -0.636 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.425, c_hat[0] = -0.638 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.419, c_hat[0] = -0.639 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.700, o_gate[0] = 0.389, c_hat[0] = -0.651 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.715, o_gate[0] = 0.409, c_hat[0] = -0.726 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.013 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.012 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007 + Gradient do_[0] = -0.022 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.440, c_hat[0] = -0.636 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.425, c_hat[0] = -0.638 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.419, c_hat[0] = -0.639 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.388, c_hat[0] = -0.651 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.715, o_gate[0] = 0.409, c_hat[0] = -0.726 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001 + Gradient do_[0] = -0.013 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.012 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007 + Gradient do_[0] = -0.022 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.440, c_hat[0] = -0.636 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.425, c_hat[0] = -0.638 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.419, c_hat[0] = -0.639 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.388, c_hat[0] = -0.651 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.715, o_gate[0] = 0.408, c_hat[0] = -0.726 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001 + Gradient do_[0] = -0.013 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.012 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.025, dc_hat[0] = 0.007 + Gradient do_[0] = -0.022 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.049, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.440, c_hat[0] = -0.636 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.424, c_hat[0] = -0.638 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.419, c_hat[0] = -0.640 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.388, c_hat[0] = -0.651 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.715, o_gate[0] = 0.408, c_hat[0] = -0.726 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001 + Gradient do_[0] = -0.012 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.012 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007 + Gradient do_[0] = -0.022 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.440, c_hat[0] = -0.636 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.424, c_hat[0] = -0.639 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.640 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.388, c_hat[0] = -0.652 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.063, f_gate[0] = 0.715, o_gate[0] = 0.408, c_hat[0] = -0.726 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001 + Gradient do_[0] = -0.012 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.012 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007 + Gradient do_[0] = -0.022 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.038, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.439, c_hat[0] = -0.637 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.424, c_hat[0] = -0.639 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.640 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.387, c_hat[0] = -0.652 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.408, c_hat[0] = -0.727 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001 + Gradient do_[0] = -0.012 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.012 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007 + Gradient do_[0] = -0.022 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.037, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.643, o_gate[0] = 0.439, c_hat[0] = -0.637 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.673, o_gate[0] = 0.424, c_hat[0] = -0.639 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.640 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.387, c_hat[0] = -0.652 + c_state[0] = -0.111, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.407, c_hat[0] = -0.727 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001 + Gradient do_[0] = -0.012 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.012 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007 + Gradient do_[0] = -0.022 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.037, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.439, c_hat[0] = -0.637 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.423, c_hat[0] = -0.639 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.640 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.387, c_hat[0] = -0.652 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.407, c_hat[0] = -0.727 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001 + Gradient do_[0] = -0.012 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.002 + Gradient do_[0] = -0.012 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007 + Gradient do_[0] = -0.021 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.037, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.439, c_hat[0] = -0.637 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.423, c_hat[0] = -0.639 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.417, c_hat[0] = -0.641 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.387, c_hat[0] = -0.653 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.407, c_hat[0] = -0.727 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001 + Gradient do_[0] = -0.012 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002 + Gradient do_[0] = -0.011 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007 + Gradient do_[0] = -0.021 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.037, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.439, c_hat[0] = -0.637 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.423, c_hat[0] = -0.640 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.072, f_gate[0] = 0.671, o_gate[0] = 0.417, c_hat[0] = -0.641 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.386, c_hat[0] = -0.653 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.407, c_hat[0] = -0.727 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001 + Gradient do_[0] = -0.012 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002 + Gradient do_[0] = -0.011 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007 + Gradient do_[0] = -0.021 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.037, dc_hat[0] = 0.011 + Gradient do_[0] = -0.031 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.438, c_hat[0] = -0.637 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.423, c_hat[0] = -0.640 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.417, c_hat[0] = -0.641 + c_state[0] = -0.095, h_state[0] = -0.040 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.386, c_hat[0] = -0.653 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.406, c_hat[0] = -0.728 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001 + Gradient do_[0] = -0.012 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002 + Gradient do_[0] = -0.011 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007 + Gradient do_[0] = -0.021 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.037, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.438, c_hat[0] = -0.637 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.422, c_hat[0] = -0.640 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.417, c_hat[0] = -0.641 + c_state[0] = -0.095, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.386, c_hat[0] = -0.653 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.406, c_hat[0] = -0.728 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001 + Gradient do_[0] = -0.012 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002 + Gradient do_[0] = -0.011 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007 + Gradient do_[0] = -0.021 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.037, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.438, c_hat[0] = -0.637 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.422, c_hat[0] = -0.640 + c_state[0] = -0.072, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.416, c_hat[0] = -0.642 + c_state[0] = -0.095, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.386, c_hat[0] = -0.654 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.406, c_hat[0] = -0.728 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.007, dc_hat[0] = 0.001 + Gradient do_[0] = -0.012 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002 + Gradient do_[0] = -0.011 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007 + Gradient do_[0] = -0.021 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = 0.037, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.438, c_hat[0] = -0.638 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.422, c_hat[0] = -0.641 + c_state[0] = -0.073, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.416, c_hat[0] = -0.642 + c_state[0] = -0.095, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.385, c_hat[0] = -0.654 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.405, c_hat[0] = -0.728 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.012 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002 + Gradient do_[0] = -0.011 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007 + Gradient do_[0] = -0.021 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.437, c_hat[0] = -0.638 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.422, c_hat[0] = -0.641 + c_state[0] = -0.073, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.416, c_hat[0] = -0.642 + c_state[0] = -0.095, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.385, c_hat[0] = -0.654 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.405, c_hat[0] = -0.728 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.012 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002 + Gradient do_[0] = -0.011 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.024, dc_hat[0] = 0.007 + Gradient do_[0] = -0.021 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.437, c_hat[0] = -0.638 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.422, c_hat[0] = -0.641 + c_state[0] = -0.073, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.416, c_hat[0] = -0.642 + c_state[0] = -0.095, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.385, c_hat[0] = -0.654 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.405, c_hat[0] = -0.729 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.011 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002 + Gradient do_[0] = -0.011 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007 + Gradient do_[0] = -0.021 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.048, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.437, c_hat[0] = -0.638 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.421, c_hat[0] = -0.641 + c_state[0] = -0.073, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.415, c_hat[0] = -0.643 + c_state[0] = -0.095, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.385, c_hat[0] = -0.655 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.405, c_hat[0] = -0.729 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.011 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002 + Gradient do_[0] = -0.011 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007 + Gradient do_[0] = -0.021 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.037, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.437, c_hat[0] = -0.638 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.421, c_hat[0] = -0.641 + c_state[0] = -0.073, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.415, c_hat[0] = -0.643 + c_state[0] = -0.095, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.384, c_hat[0] = -0.655 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.404, c_hat[0] = -0.729 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.011 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002 + Gradient do_[0] = -0.011 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007 + Gradient do_[0] = -0.021 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.436, c_hat[0] = -0.638 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.421, c_hat[0] = -0.642 + c_state[0] = -0.073, h_state[0] = -0.031 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.415, c_hat[0] = -0.643 + c_state[0] = -0.095, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.384, c_hat[0] = -0.655 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.404, c_hat[0] = -0.729 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.011 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002 + Gradient do_[0] = -0.011 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007 + Gradient do_[0] = -0.021 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.436, c_hat[0] = -0.638 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.421, c_hat[0] = -0.642 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.415, c_hat[0] = -0.643 + c_state[0] = -0.095, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.384, c_hat[0] = -0.655 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.404, c_hat[0] = -0.729 + c_state[0] = -0.126, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.011 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002 + Gradient do_[0] = -0.010 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007 + Gradient do_[0] = -0.021 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.436, c_hat[0] = -0.639 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.420, c_hat[0] = -0.642 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.414, c_hat[0] = -0.644 + c_state[0] = -0.095, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.384, c_hat[0] = -0.655 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.404, c_hat[0] = -0.730 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.011 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002 + Gradient do_[0] = -0.010 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007 + Gradient do_[0] = -0.020 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.436, c_hat[0] = -0.639 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.420, c_hat[0] = -0.642 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.414, c_hat[0] = -0.644 + c_state[0] = -0.095, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.383, c_hat[0] = -0.656 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.403, c_hat[0] = -0.730 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.011 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002 + Gradient do_[0] = -0.010 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007 + Gradient do_[0] = -0.020 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.436, c_hat[0] = -0.639 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.420, c_hat[0] = -0.642 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.414, c_hat[0] = -0.644 + c_state[0] = -0.095, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.383, c_hat[0] = -0.656 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.403, c_hat[0] = -0.730 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.011 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.002 + Gradient do_[0] = -0.010 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007 + Gradient do_[0] = -0.020 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.435, c_hat[0] = -0.639 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.420, c_hat[0] = -0.643 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.414, c_hat[0] = -0.644 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.383, c_hat[0] = -0.656 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.403, c_hat[0] = -0.730 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.011 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007 + Gradient do_[0] = -0.020 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.435, c_hat[0] = -0.639 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.419, c_hat[0] = -0.643 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.414, c_hat[0] = -0.644 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.383, c_hat[0] = -0.656 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.403, c_hat[0] = -0.730 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.011 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.007 + Gradient do_[0] = -0.020 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.435, c_hat[0] = -0.639 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.419, c_hat[0] = -0.643 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.413, c_hat[0] = -0.645 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.699, o_gate[0] = 0.382, c_hat[0] = -0.657 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.402, c_hat[0] = -0.731 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.011 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.006 + Gradient do_[0] = -0.020 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.642, o_gate[0] = 0.435, c_hat[0] = -0.639 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.672, o_gate[0] = 0.419, c_hat[0] = -0.643 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.413, c_hat[0] = -0.645 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.382, c_hat[0] = -0.657 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.402, c_hat[0] = -0.731 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.023, dc_hat[0] = 0.006 + Gradient do_[0] = -0.020 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.435, c_hat[0] = -0.639 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.419, c_hat[0] = -0.643 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.413, c_hat[0] = -0.645 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.382, c_hat[0] = -0.657 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.715, o_gate[0] = 0.402, c_hat[0] = -0.731 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.022, dc_hat[0] = 0.006 + Gradient do_[0] = -0.020 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.434, c_hat[0] = -0.640 + c_state[0] = -0.040, h_state[0] = -0.018 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.419, c_hat[0] = -0.644 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.413, c_hat[0] = -0.645 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.382, c_hat[0] = -0.657 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.402, c_hat[0] = -0.731 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.022, dc_hat[0] = 0.006 + Gradient do_[0] = -0.020 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.434, c_hat[0] = -0.640 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.644 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.412, c_hat[0] = -0.646 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.382, c_hat[0] = -0.658 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.401, c_hat[0] = -0.731 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.022, dc_hat[0] = 0.006 + Gradient do_[0] = -0.020 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.036, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.047, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.434, c_hat[0] = -0.640 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.644 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.412, c_hat[0] = -0.646 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.381, c_hat[0] = -0.658 + c_state[0] = -0.112, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.401, c_hat[0] = -0.732 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.006, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.022, dc_hat[0] = 0.006 + Gradient do_[0] = -0.020 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010 + Gradient do_[0] = -0.030 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.434, c_hat[0] = -0.640 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.644 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.670, o_gate[0] = 0.412, c_hat[0] = -0.646 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.381, c_hat[0] = -0.658 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.401, c_hat[0] = -0.732 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.022, dc_hat[0] = 0.006 + Gradient do_[0] = -0.020 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.433, c_hat[0] = -0.640 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.644 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.412, c_hat[0] = -0.646 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.381, c_hat[0] = -0.658 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.401, c_hat[0] = -0.732 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = 0.022, dc_hat[0] = 0.006 + Gradient do_[0] = -0.020 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.433, c_hat[0] = -0.640 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.418, c_hat[0] = -0.645 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.411, c_hat[0] = -0.646 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.381, c_hat[0] = -0.658 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.401, c_hat[0] = -0.732 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.006 + Gradient do_[0] = -0.019 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.433, c_hat[0] = -0.640 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.417, c_hat[0] = -0.645 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.411, c_hat[0] = -0.647 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.380, c_hat[0] = -0.659 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.400, c_hat[0] = -0.732 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.006 + Gradient do_[0] = -0.019 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.433, c_hat[0] = -0.640 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.417, c_hat[0] = -0.645 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.411, c_hat[0] = -0.647 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.380, c_hat[0] = -0.659 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.400, c_hat[0] = -0.733 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.006 + Gradient do_[0] = -0.019 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012 + Gradient do_[0] = -0.022 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.433, c_hat[0] = -0.640 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.417, c_hat[0] = -0.645 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.411, c_hat[0] = -0.647 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.380, c_hat[0] = -0.659 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.400, c_hat[0] = -0.733 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.006 + Gradient do_[0] = -0.019 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012 + Gradient do_[0] = -0.021 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.432, c_hat[0] = -0.641 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.417, c_hat[0] = -0.645 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.411, c_hat[0] = -0.647 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.380, c_hat[0] = -0.659 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.400, c_hat[0] = -0.733 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.010 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.006 + Gradient do_[0] = -0.019 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012 + Gradient do_[0] = -0.021 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.432, c_hat[0] = -0.641 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.416, c_hat[0] = -0.645 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.410, c_hat[0] = -0.648 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.380, c_hat[0] = -0.660 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.399, c_hat[0] = -0.733 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.006 + Gradient do_[0] = -0.019 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012 + Gradient do_[0] = -0.021 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.432, c_hat[0] = -0.641 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.416, c_hat[0] = -0.646 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.410, c_hat[0] = -0.648 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.379, c_hat[0] = -0.660 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.399, c_hat[0] = -0.733 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.022, dc_hat[0] = 0.006 + Gradient do_[0] = -0.019 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012 + Gradient do_[0] = -0.021 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.432, c_hat[0] = -0.641 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.416, c_hat[0] = -0.646 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.410, c_hat[0] = -0.648 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.379, c_hat[0] = -0.660 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.399, c_hat[0] = -0.733 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001 + Gradient do_[0] = -0.008 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006 + Gradient do_[0] = -0.019 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.012 + Gradient do_[0] = -0.021 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.432, c_hat[0] = -0.641 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.416, c_hat[0] = -0.646 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.410, c_hat[0] = -0.648 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.379, c_hat[0] = -0.660 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.399, c_hat[0] = -0.734 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001 + Gradient do_[0] = -0.008 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006 + Gradient do_[0] = -0.019 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.011 + Gradient do_[0] = -0.021 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.431, c_hat[0] = -0.641 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.416, c_hat[0] = -0.646 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.410, c_hat[0] = -0.648 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.069, f_gate[0] = 0.698, o_gate[0] = 0.379, c_hat[0] = -0.660 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.398, c_hat[0] = -0.734 + c_state[0] = -0.127, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001 + Gradient do_[0] = -0.008 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006 + Gradient do_[0] = -0.019 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.035, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.011 + Gradient do_[0] = -0.021 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.431, c_hat[0] = -0.641 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.415, c_hat[0] = -0.646 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.409, c_hat[0] = -0.649 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.698, o_gate[0] = 0.378, c_hat[0] = -0.661 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.398, c_hat[0] = -0.734 + c_state[0] = -0.128, h_state[0] = -0.051 +Backward Time Step 4: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001 + Gradient do_[0] = -0.008 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006 + Gradient do_[0] = -0.019 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.034, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.046, dc_hat[0] = 0.011 + Gradient do_[0] = -0.021 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.431, c_hat[0] = -0.641 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.415, c_hat[0] = -0.647 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.409, c_hat[0] = -0.649 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.698, o_gate[0] = 0.378, c_hat[0] = -0.661 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.398, c_hat[0] = -0.734 + c_state[0] = -0.128, h_state[0] = -0.050 +Backward Time Step 4: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001 + Gradient do_[0] = -0.008 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006 + Gradient do_[0] = -0.019 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.034, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.011 + Gradient do_[0] = -0.021 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.431, c_hat[0] = -0.641 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.415, c_hat[0] = -0.647 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.409, c_hat[0] = -0.649 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.698, o_gate[0] = 0.378, c_hat[0] = -0.661 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.398, c_hat[0] = -0.734 + c_state[0] = -0.128, h_state[0] = -0.050 +Backward Time Step 4: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001 + Gradient do_[0] = -0.008 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006 + Gradient do_[0] = -0.019 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.034, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.011 + Gradient do_[0] = -0.021 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.431, c_hat[0] = -0.642 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.415, c_hat[0] = -0.647 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.409, c_hat[0] = -0.649 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.698, o_gate[0] = 0.378, c_hat[0] = -0.661 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.398, c_hat[0] = -0.735 + c_state[0] = -0.128, h_state[0] = -0.050 +Backward Time Step 4: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001 + Gradient do_[0] = -0.008 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006 + Gradient do_[0] = -0.018 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.034, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.011 + Gradient do_[0] = -0.021 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.430, c_hat[0] = -0.642 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.415, c_hat[0] = -0.647 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.409, c_hat[0] = -0.649 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.698, o_gate[0] = 0.378, c_hat[0] = -0.661 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.397, c_hat[0] = -0.735 + c_state[0] = -0.128, h_state[0] = -0.050 +Backward Time Step 4: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001 + Gradient do_[0] = -0.008 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006 + Gradient do_[0] = -0.018 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.034, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.011 + Gradient do_[0] = -0.021 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.430, c_hat[0] = -0.642 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.671, o_gate[0] = 0.414, c_hat[0] = -0.647 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.408, c_hat[0] = -0.650 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.698, o_gate[0] = 0.377, c_hat[0] = -0.662 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.397, c_hat[0] = -0.735 + c_state[0] = -0.128, h_state[0] = -0.050 +Backward Time Step 4: + Gradient di[0] = -0.001, df[0] = 0.005, dc_hat[0] = 0.001 + Gradient do_[0] = -0.009 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001 + Gradient do_[0] = -0.008 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006 + Gradient do_[0] = -0.018 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.034, dc_hat[0] = 0.010 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.011 + Gradient do_[0] = -0.021 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.641, o_gate[0] = 0.430, c_hat[0] = -0.642 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.670, o_gate[0] = 0.414, c_hat[0] = -0.647 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.408, c_hat[0] = -0.650 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.698, o_gate[0] = 0.377, c_hat[0] = -0.662 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.397, c_hat[0] = -0.735 + c_state[0] = -0.128, h_state[0] = -0.050 +Backward Time Step 4: + Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001 + Gradient do_[0] = -0.008 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = 0.004, dc_hat[0] = 0.001 + Gradient do_[0] = -0.008 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = 0.021, dc_hat[0] = 0.006 + Gradient do_[0] = -0.018 +Backward Time Step 1: + Gradient di[0] = -0.010, df[0] = 0.034, dc_hat[0] = 0.009 + Gradient do_[0] = -0.029 +Backward Time Step 0: + Gradient di[0] = -0.012, df[0] = 0.045, dc_hat[0] = 0.011 + Gradient do_[0] = -0.021 +Epoch 300, Train Loss=0.013052, Weight Norm=8.722285 +Sample Predictions at Epoch 300: + Day 192 (2024-10-11) => Predicted: 58.728, Actual: 63.870, Error: 5.14 + Day 193 (2024-10-14) => Predicted: 59.362, Actual: 66.550, Error: 7.19 + Day 194 (2024-10-15) => Predicted: 59.616, Actual: 66.000, Error: 6.38 + Day 195 (2024-10-16) => Predicted: 59.759, Actual: 67.200, Error: 7.44 + Day 196 (2024-10-17) => Predicted: 59.704, Actual: 66.760, Error: 7.06 +Time Step 0: + i_gate[0] = 0.063, f_gate[0] = 0.640, o_gate[0] = 0.430, c_hat[0] = -0.642 + c_state[0] = -0.040, h_state[0] = -0.017 +Time Step 1: + i_gate[0] = 0.071, f_gate[0] = 0.670, o_gate[0] = 0.414, c_hat[0] = -0.648 + c_state[0] = -0.073, h_state[0] = -0.030 +Time Step 2: + i_gate[0] = 0.073, f_gate[0] = 0.669, o_gate[0] = 0.408, c_hat[0] = -0.650 + c_state[0] = -0.096, h_state[0] = -0.039 +Time Step 3: + i_gate[0] = 0.070, f_gate[0] = 0.698, o_gate[0] = 0.377, c_hat[0] = -0.662 + c_state[0] = -0.113, h_state[0] = -0.043 +Time Step 4: + i_gate[0] = 0.064, f_gate[0] = 0.714, o_gate[0] = 0.397, c_hat[0] = -0.735 + c_state[0] = -0.128, h_state[0] = -0.050 + +Validation (last 30 days): + Day 197 (2024-10-18) => Predict=60.821, Actual=61.520, Error=0.70 + Day 198 (2024-10-21) => Predict=59.374, Actual=60.680, Error=1.31 + Day 199 (2024-10-22) => Predict=59.129, Actual=61.020, Error=1.89 + Day 200 (2024-10-23) => Predict=58.994, Actual=58.630, Error=0.36 + Day 201 (2024-10-24) => Predict=59.008, Actual=59.050, Error=0.04 + Day 202 (2024-10-25) => Predict=57.945, Actual=59.180, Error=1.24 + Day 203 (2024-10-28) => Predict=58.681, Actual=57.290, Error=1.39 + Day 204 (2024-10-29) => Predict=58.279, Actual=55.640, Error=2.64 + Day 205 (2024-10-30) => Predict=57.079, Actual=59.830, Error=2.75 + Day 206 (2024-10-31) => Predict=56.069, Actual=60.010, Error=3.94 + Day 207 (2024-11-01) => Predict=54.329, Actual=60.490, Error=6.16 + Day 208 (2024-11-04) => Predict=53.864, Actual=59.740, Error=5.88 + Day 209 (2024-11-05) => Predict=53.572, Actual=58.020, Error=4.45 + Day 210 (2024-11-06) => Predict=53.235, Actual=56.460, Error=3.22 + Day 211 (2024-11-07) => Predict=52.301, Actual=56.340, Error=4.04 + Day 212 (2024-11-08) => Predict=51.879, Actual=56.420, Error=4.54 + Day 213 (2024-11-11) => Predict=51.576, Actual=59.760, Error=8.18 + Day 214 (2024-11-12) => Predict=51.059, Actual=61.610, Error=10.55 + Day 215 (2024-11-13) => Predict=50.151, Actual=60.580, Error=10.43 + Day 216 (2024-11-14) => Predict=49.912, Actual=61.620, Error=11.71 + Day 217 (2024-11-15) => Predict=48.988, Actual=59.920, Error=10.93 + Day 218 (2024-11-18) => Predict=47.550, Actual=57.390, Error=9.84 + Day 219 (2024-11-19) => Predict=47.246, Actual=61.190, Error=13.94 + Day 220 (2024-11-20) => Predict=46.853, Actual=62.950, Error=16.10 + Day 221 (2024-11-21) => Predict=47.210, Actual=64.170, Error=16.96 + Day 222 (2024-11-22) => Predict=47.339, Actual=63.000, Error=15.66 + Day 223 (2024-11-25) => Predict=47.516, Actual=65.060, Error=17.54 + Day 224 (2024-11-26) => Predict=50.052, Actual=63.680, Error=13.63 + Day 225 (2024-11-27) => Predict=51.703, Actual=63.680, Error=11.98 +Total valid daily bars used: 227 +First day: 2024-01-08 O=59.23 H=60.68 L=58.82 C=59.64 V=124629 +Last day: 2024-11-29 O=64.45 H=64.45 L=63.00 C=63.77 V=62082 + +Target Min: 40.86, Target Max: 74.47 + +Normalized Targets (First 5 Samples): +Sample 0: 0.933 +Sample 1: 0.930 +Sample 2: 0.965 +Sample 3: 1.000 +Sample 4: 0.534 +Time Step 0: + i_gate[0] = 0.539, f_gate[0] = 0.688, o_gate[0] = 0.411, c_hat[0] = 0.666 + c_state[0] = 0.359, h_state[0] = 0.141 +Time Step 1: + i_gate[0] = 0.484, f_gate[0] = 0.667, o_gate[0] = 0.398, c_hat[0] = 0.726 + c_state[0] = 0.591, h_state[0] = 0.211 +Time Step 2: + i_gate[0] = 0.451, f_gate[0] = 0.655, o_gate[0] = 0.395, c_hat[0] = 0.737 + c_state[0] = 0.720, h_state[0] = 0.243 +Time Step 3: + i_gate[0] = 0.422, f_gate[0] = 0.639, o_gate[0] = 0.400, c_hat[0] = 0.799 + c_state[0] = 0.796, h_state[0] = 0.265 +Time Step 4: + i_gate[0] = 0.398, f_gate[0] = 0.636, o_gate[0] = 0.410, c_hat[0] = 0.794 + c_state[0] = 0.823, h_state[0] = 0.277 +Backward Time Step 4: + Gradient di[0] = 0.031, df[0] = 0.024, dc_hat[0] = 0.024 + Gradient do_[0] = 0.306 +Backward Time Step 3: + Gradient di[0] = 0.033, df[0] = 0.025, dc_hat[0] = 0.026 + Gradient do_[0] = 0.347 +Backward Time Step 2: + Gradient di[0] = 0.057, df[0] = 0.046, dc_hat[0] = 0.064 + Gradient do_[0] = 0.476 +Backward Time Step 1: + Gradient di[0] = 0.075, df[0] = 0.061, dc_hat[0] = 0.095 + Gradient do_[0] = 0.504 +Backward Time Step 0: + Gradient di[0] = 0.085, df[0] = 0.076, dc_hat[0] = 0.155 + Gradient do_[0] = 0.320 +Epoch 1, Train Loss=0.036383, Weight Norm=12.005491 +Sample Predictions at Epoch 1: +------------------------------------------------------------- +| Day | Date | Predicted Close | Actual Close | Error | +------------------------------------------------------------- +| 192 | 2024-10-11 | 49.44 | 63.87 | 14.43 | +| 193 | 2024-10-14 | 49.80 | 66.55 | 16.75 | +| 194 | 2024-10-15 | 49.96 | 66.00 | 16.04 | +| 195 | 2024-10-16 | 49.04 | 67.20 | 18.16 | +| 196 | 2024-10-17 | 49.31 | 66.76 | 17.45 | +------------------------------------------------------------- +Time Step 0: + i_gate[0] = 0.538, f_gate[0] = 0.686, o_gate[0] = 0.399, c_hat[0] = 0.658 + c_state[0] = 0.354, h_state[0] = 0.135 +Time Step 1: + i_gate[0] = 0.482, f_gate[0] = 0.665, o_gate[0] = 0.383, c_hat[0] = 0.718 + c_state[0] = 0.581, h_state[0] = 0.201 +Time Step 2: + i_gate[0] = 0.449, f_gate[0] = 0.652, o_gate[0] = 0.379, c_hat[0] = 0.730 + c_state[0] = 0.707, h_state[0] = 0.231 +Time Step 3: + i_gate[0] = 0.419, f_gate[0] = 0.635, o_gate[0] = 0.383, c_hat[0] = 0.794 + c_state[0] = 0.782, h_state[0] = 0.250 +Time Step 4: + i_gate[0] = 0.396, f_gate[0] = 0.632, o_gate[0] = 0.391, c_hat[0] = 0.790 + c_state[0] = 0.806, h_state[0] = 0.261 +Backward Time Step 4: + Gradient di[0] = 0.027, df[0] = 0.021, dc_hat[0] = 0.021 + Gradient do_[0] = 0.267 +Backward Time Step 3: + Gradient di[0] = 0.029, df[0] = 0.022, dc_hat[0] = 0.023 + Gradient do_[0] = 0.308 +Backward Time Step 2: + Gradient di[0] = 0.051, df[0] = 0.042, dc_hat[0] = 0.060 + Gradient do_[0] = 0.439 +Backward Time Step 1: + Gradient di[0] = 0.069, df[0] = 0.057, dc_hat[0] = 0.090 + Gradient do_[0] = 0.474 +Backward Time Step 0: + Gradient di[0] = 0.080, df[0] = 0.072, dc_hat[0] = 0.148 + Gradient do_[0] = 0.304 +Time Step 0: + i_gate[0] = 0.536, f_gate[0] = 0.685, o_gate[0] = 0.387, c_hat[0] = 0.652 + c_state[0] = 0.349, h_state[0] = 0.130 +Time Step 1: + i_gate[0] = 0.480, f_gate[0] = 0.664, o_gate[0] = 0.370, c_hat[0] = 0.712 + c_state[0] = 0.574, h_state[0] = 0.191 +Time Step 2: + i_gate[0] = 0.447, f_gate[0] = 0.650, o_gate[0] = 0.364, c_hat[0] = 0.725 + c_state[0] = 0.697, h_state[0] = 0.219 +Time Step 3: + i_gate[0] = 0.417, f_gate[0] = 0.633, o_gate[0] = 0.367, c_hat[0] = 0.791 + c_state[0] = 0.771, h_state[0] = 0.237 +Time Step 4: + i_gate[0] = 0.393, f_gate[0] = 0.629, o_gate[0] = 0.373, c_hat[0] = 0.787 + c_state[0] = 0.794, h_state[0] = 0.246 +Backward Time Step 4: + Gradient di[0] = 0.023, df[0] = 0.018, dc_hat[0] = 0.018 + Gradient do_[0] = 0.234 +Backward Time Step 3: + Gradient di[0] = 0.025, df[0] = 0.019, dc_hat[0] = 0.020 + Gradient do_[0] = 0.275 +Backward Time Step 2: + Gradient di[0] = 0.047, df[0] = 0.039, dc_hat[0] = 0.055 + Gradient do_[0] = 0.407 +Backward Time Step 1: + Gradient di[0] = 0.064, df[0] = 0.053, dc_hat[0] = 0.084 + Gradient do_[0] = 0.448 +Backward Time Step 0: + Gradient di[0] = 0.074, df[0] = 0.068, dc_hat[0] = 0.141 + Gradient do_[0] = 0.291 +Time Step 0: + i_gate[0] = 0.535, f_gate[0] = 0.684, o_gate[0] = 0.375, c_hat[0] = 0.648 + c_state[0] = 0.346, h_state[0] = 0.125 +Time Step 1: + i_gate[0] = 0.478, f_gate[0] = 0.662, o_gate[0] = 0.356, c_hat[0] = 0.709 + c_state[0] = 0.568, h_state[0] = 0.183 +Time Step 2: + i_gate[0] = 0.445, f_gate[0] = 0.648, o_gate[0] = 0.349, c_hat[0] = 0.722 + c_state[0] = 0.690, h_state[0] = 0.209 +Time Step 3: + i_gate[0] = 0.415, f_gate[0] = 0.631, o_gate[0] = 0.351, c_hat[0] = 0.789 + c_state[0] = 0.763, h_state[0] = 0.225 +Time Step 4: + i_gate[0] = 0.392, f_gate[0] = 0.626, o_gate[0] = 0.356, c_hat[0] = 0.785 + c_state[0] = 0.785, h_state[0] = 0.233 +Backward Time Step 4: + Gradient di[0] = 0.020, df[0] = 0.015, dc_hat[0] = 0.016 + Gradient do_[0] = 0.206 +Backward Time Step 3: + Gradient di[0] = 0.022, df[0] = 0.017, dc_hat[0] = 0.018 + Gradient do_[0] = 0.246 +Backward Time Step 2: + Gradient di[0] = 0.043, df[0] = 0.035, dc_hat[0] = 0.051 + Gradient do_[0] = 0.381 +Backward Time Step 1: + Gradient di[0] = 0.059, df[0] = 0.049, dc_hat[0] = 0.079 + Gradient do_[0] = 0.426 +Backward Time Step 0: + Gradient di[0] = 0.070, df[0] = 0.064, dc_hat[0] = 0.134 + Gradient do_[0] = 0.280 +Time Step 0: + i_gate[0] = 0.534, f_gate[0] = 0.683, o_gate[0] = 0.364, c_hat[0] = 0.645 + c_state[0] = 0.344, h_state[0] = 0.121 +Time Step 1: + i_gate[0] = 0.476, f_gate[0] = 0.661, o_gate[0] = 0.343, c_hat[0] = 0.707 + c_state[0] = 0.565, h_state[0] = 0.175 +Time Step 2: + i_gate[0] = 0.444, f_gate[0] = 0.647, o_gate[0] = 0.335, c_hat[0] = 0.721 + c_state[0] = 0.685, h_state[0] = 0.199 +Time Step 3: + i_gate[0] = 0.414, f_gate[0] = 0.629, o_gate[0] = 0.336, c_hat[0] = 0.789 + c_state[0] = 0.757, h_state[0] = 0.215 +Time Step 4: + i_gate[0] = 0.390, f_gate[0] = 0.624, o_gate[0] = 0.340, c_hat[0] = 0.786 + c_state[0] = 0.779, h_state[0] = 0.221 +Backward Time Step 4: + Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.014 + Gradient do_[0] = 0.184 +Backward Time Step 3: + Gradient di[0] = 0.019, df[0] = 0.015, dc_hat[0] = 0.016 + Gradient do_[0] = 0.223 +Backward Time Step 2: + Gradient di[0] = 0.039, df[0] = 0.032, dc_hat[0] = 0.047 + Gradient do_[0] = 0.359 +Backward Time Step 1: + Gradient di[0] = 0.055, df[0] = 0.046, dc_hat[0] = 0.074 + Gradient do_[0] = 0.408 +Backward Time Step 0: + Gradient di[0] = 0.066, df[0] = 0.061, dc_hat[0] = 0.128 + Gradient do_[0] = 0.271 +Time Step 0: + i_gate[0] = 0.533, f_gate[0] = 0.683, o_gate[0] = 0.353, c_hat[0] = 0.644 + c_state[0] = 0.343, h_state[0] = 0.117 +Time Step 1: + i_gate[0] = 0.475, f_gate[0] = 0.661, o_gate[0] = 0.331, c_hat[0] = 0.706 + c_state[0] = 0.562, h_state[0] = 0.169 +Time Step 2: + i_gate[0] = 0.442, f_gate[0] = 0.646, o_gate[0] = 0.322, c_hat[0] = 0.722 + c_state[0] = 0.682, h_state[0] = 0.191 +Time Step 3: + i_gate[0] = 0.412, f_gate[0] = 0.628, o_gate[0] = 0.322, c_hat[0] = 0.790 + c_state[0] = 0.754, h_state[0] = 0.205 +Time Step 4: + i_gate[0] = 0.388, f_gate[0] = 0.623, o_gate[0] = 0.324, c_hat[0] = 0.787 + c_state[0] = 0.775, h_state[0] = 0.211 +Backward Time Step 4: + Gradient di[0] = 0.015, df[0] = 0.012, dc_hat[0] = 0.012 + Gradient do_[0] = 0.166 +Backward Time Step 3: + Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.014 + Gradient do_[0] = 0.204 +Backward Time Step 2: + Gradient di[0] = 0.036, df[0] = 0.030, dc_hat[0] = 0.043 + Gradient do_[0] = 0.342 +Backward Time Step 1: + Gradient di[0] = 0.051, df[0] = 0.043, dc_hat[0] = 0.069 + Gradient do_[0] = 0.394 +Backward Time Step 0: + Gradient di[0] = 0.062, df[0] = 0.057, dc_hat[0] = 0.121 + Gradient do_[0] = 0.264 +Time Step 0: + i_gate[0] = 0.533, f_gate[0] = 0.683, o_gate[0] = 0.343, c_hat[0] = 0.644 + c_state[0] = 0.343, h_state[0] = 0.113 +Time Step 1: + i_gate[0] = 0.474, f_gate[0] = 0.660, o_gate[0] = 0.320, c_hat[0] = 0.707 + c_state[0] = 0.562, h_state[0] = 0.163 +Time Step 2: + i_gate[0] = 0.441, f_gate[0] = 0.645, o_gate[0] = 0.310, c_hat[0] = 0.723 + c_state[0] = 0.681, h_state[0] = 0.184 +Time Step 3: + i_gate[0] = 0.411, f_gate[0] = 0.627, o_gate[0] = 0.309, c_hat[0] = 0.791 + c_state[0] = 0.752, h_state[0] = 0.197 +Time Step 4: + i_gate[0] = 0.387, f_gate[0] = 0.621, o_gate[0] = 0.311, c_hat[0] = 0.789 + c_state[0] = 0.773, h_state[0] = 0.202 +Backward Time Step 4: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.011 + Gradient do_[0] = 0.154 +Backward Time Step 3: + Gradient di[0] = 0.015, df[0] = 0.012, dc_hat[0] = 0.012 + Gradient do_[0] = 0.191 +Backward Time Step 2: + Gradient di[0] = 0.034, df[0] = 0.028, dc_hat[0] = 0.040 + Gradient do_[0] = 0.330 +Backward Time Step 1: + Gradient di[0] = 0.049, df[0] = 0.041, dc_hat[0] = 0.065 + Gradient do_[0] = 0.384 +Backward Time Step 0: + Gradient di[0] = 0.060, df[0] = 0.055, dc_hat[0] = 0.116 + Gradient do_[0] = 0.260 +Time Step 0: + i_gate[0] = 0.532, f_gate[0] = 0.682, o_gate[0] = 0.334, c_hat[0] = 0.645 + c_state[0] = 0.343, h_state[0] = 0.110 +Time Step 1: + i_gate[0] = 0.474, f_gate[0] = 0.660, o_gate[0] = 0.310, c_hat[0] = 0.708 + c_state[0] = 0.562, h_state[0] = 0.158 +Time Step 2: + i_gate[0] = 0.441, f_gate[0] = 0.644, o_gate[0] = 0.300, c_hat[0] = 0.725 + c_state[0] = 0.681, h_state[0] = 0.177 +Time Step 3: + i_gate[0] = 0.410, f_gate[0] = 0.626, o_gate[0] = 0.298, c_hat[0] = 0.793 + c_state[0] = 0.752, h_state[0] = 0.190 +Time Step 4: + i_gate[0] = 0.386, f_gate[0] = 0.620, o_gate[0] = 0.300, c_hat[0] = 0.791 + c_state[0] = 0.772, h_state[0] = 0.194 +Backward Time Step 4: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.010 + Gradient do_[0] = 0.147 +Backward Time Step 3: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.011 + Gradient do_[0] = 0.183 +Backward Time Step 2: + Gradient di[0] = 0.032, df[0] = 0.027, dc_hat[0] = 0.038 + Gradient do_[0] = 0.323 +Backward Time Step 1: + Gradient di[0] = 0.046, df[0] = 0.039, dc_hat[0] = 0.062 + Gradient do_[0] = 0.379 +Backward Time Step 0: + Gradient di[0] = 0.058, df[0] = 0.053, dc_hat[0] = 0.111 + Gradient do_[0] = 0.257 +Time Step 0: + i_gate[0] = 0.532, f_gate[0] = 0.682, o_gate[0] = 0.326, c_hat[0] = 0.646 + c_state[0] = 0.344, h_state[0] = 0.108 +Time Step 1: + i_gate[0] = 0.473, f_gate[0] = 0.659, o_gate[0] = 0.301, c_hat[0] = 0.710 + c_state[0] = 0.562, h_state[0] = 0.154 +Time Step 2: + i_gate[0] = 0.440, f_gate[0] = 0.644, o_gate[0] = 0.291, c_hat[0] = 0.727 + c_state[0] = 0.682, h_state[0] = 0.172 +Time Step 3: + i_gate[0] = 0.409, f_gate[0] = 0.625, o_gate[0] = 0.289, c_hat[0] = 0.796 + c_state[0] = 0.752, h_state[0] = 0.184 +Time Step 4: + i_gate[0] = 0.385, f_gate[0] = 0.619, o_gate[0] = 0.290, c_hat[0] = 0.794 + c_state[0] = 0.772, h_state[0] = 0.188 +Backward Time Step 4: + Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.009 + Gradient do_[0] = 0.143 +Backward Time Step 3: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.011 + Gradient do_[0] = 0.179 +Backward Time Step 2: + Gradient di[0] = 0.031, df[0] = 0.026, dc_hat[0] = 0.036 + Gradient do_[0] = 0.319 +Backward Time Step 1: + Gradient di[0] = 0.045, df[0] = 0.038, dc_hat[0] = 0.060 + Gradient do_[0] = 0.376 +Backward Time Step 0: + Gradient di[0] = 0.056, df[0] = 0.052, dc_hat[0] = 0.108 + Gradient do_[0] = 0.256 +Time Step 0: + i_gate[0] = 0.531, f_gate[0] = 0.682, o_gate[0] = 0.319, c_hat[0] = 0.648 + c_state[0] = 0.344, h_state[0] = 0.106 +Time Step 1: + i_gate[0] = 0.473, f_gate[0] = 0.659, o_gate[0] = 0.294, c_hat[0] = 0.712 + c_state[0] = 0.564, h_state[0] = 0.150 +Time Step 2: + i_gate[0] = 0.439, f_gate[0] = 0.643, o_gate[0] = 0.283, c_hat[0] = 0.730 + c_state[0] = 0.683, h_state[0] = 0.168 +Time Step 3: + i_gate[0] = 0.409, f_gate[0] = 0.624, o_gate[0] = 0.280, c_hat[0] = 0.798 + c_state[0] = 0.752, h_state[0] = 0.179 +Time Step 4: + i_gate[0] = 0.384, f_gate[0] = 0.619, o_gate[0] = 0.281, c_hat[0] = 0.797 + c_state[0] = 0.772, h_state[0] = 0.182 +Backward Time Step 4: + Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.009 + Gradient do_[0] = 0.141 +Backward Time Step 3: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.010 + Gradient do_[0] = 0.177 +Backward Time Step 2: + Gradient di[0] = 0.030, df[0] = 0.025, dc_hat[0] = 0.034 + Gradient do_[0] = 0.318 +Backward Time Step 1: + Gradient di[0] = 0.044, df[0] = 0.036, dc_hat[0] = 0.057 + Gradient do_[0] = 0.374 +Backward Time Step 0: + Gradient di[0] = 0.055, df[0] = 0.050, dc_hat[0] = 0.105 + Gradient do_[0] = 0.256 +Time Step 0: + i_gate[0] = 0.531, f_gate[0] = 0.683, o_gate[0] = 0.312, c_hat[0] = 0.651 + c_state[0] = 0.345, h_state[0] = 0.104 +Time Step 1: + i_gate[0] = 0.472, f_gate[0] = 0.659, o_gate[0] = 0.286, c_hat[0] = 0.715 + c_state[0] = 0.565, h_state[0] = 0.147 +Time Step 2: + i_gate[0] = 0.438, f_gate[0] = 0.643, o_gate[0] = 0.275, c_hat[0] = 0.733 + c_state[0] = 0.684, h_state[0] = 0.164 +Time Step 3: + i_gate[0] = 0.408, f_gate[0] = 0.624, o_gate[0] = 0.273, c_hat[0] = 0.801 + c_state[0] = 0.753, h_state[0] = 0.174 +Time Step 4: + i_gate[0] = 0.383, f_gate[0] = 0.618, o_gate[0] = 0.272, c_hat[0] = 0.800 + c_state[0] = 0.772, h_state[0] = 0.177 +Backward Time Step 4: + Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.008 + Gradient do_[0] = 0.140 +Backward Time Step 3: + Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.009 + Gradient do_[0] = 0.175 +Backward Time Step 2: + Gradient di[0] = 0.029, df[0] = 0.024, dc_hat[0] = 0.033 + Gradient do_[0] = 0.317 +Backward Time Step 1: + Gradient di[0] = 0.043, df[0] = 0.035, dc_hat[0] = 0.055 + Gradient do_[0] = 0.374 +Backward Time Step 0: + Gradient di[0] = 0.054, df[0] = 0.049, dc_hat[0] = 0.102 + Gradient do_[0] = 0.257 +Time Step 0: + i_gate[0] = 0.531, f_gate[0] = 0.683, o_gate[0] = 0.305, c_hat[0] = 0.653 + c_state[0] = 0.347, h_state[0] = 0.102 +Time Step 1: + i_gate[0] = 0.471, f_gate[0] = 0.659, o_gate[0] = 0.279, c_hat[0] = 0.717 + c_state[0] = 0.567, h_state[0] = 0.143 +Time Step 2: + i_gate[0] = 0.437, f_gate[0] = 0.643, o_gate[0] = 0.268, c_hat[0] = 0.736 + c_state[0] = 0.686, h_state[0] = 0.159 +Time Step 3: + i_gate[0] = 0.407, f_gate[0] = 0.623, o_gate[0] = 0.265, c_hat[0] = 0.804 + c_state[0] = 0.755, h_state[0] = 0.169 +Time Step 4: + i_gate[0] = 0.382, f_gate[0] = 0.617, o_gate[0] = 0.264, c_hat[0] = 0.803 + c_state[0] = 0.773, h_state[0] = 0.171 +Backward Time Step 4: + Gradient di[0] = 0.011, df[0] = 0.008, dc_hat[0] = 0.008 + Gradient do_[0] = 0.139 +Backward Time Step 3: + Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.009 + Gradient do_[0] = 0.174 +Backward Time Step 2: + Gradient di[0] = 0.028, df[0] = 0.023, dc_hat[0] = 0.031 + Gradient do_[0] = 0.317 +Backward Time Step 1: + Gradient di[0] = 0.041, df[0] = 0.034, dc_hat[0] = 0.053 + Gradient do_[0] = 0.374 +Backward Time Step 0: + Gradient di[0] = 0.053, df[0] = 0.048, dc_hat[0] = 0.099 + Gradient do_[0] = 0.258 +Time Step 0: + i_gate[0] = 0.530, f_gate[0] = 0.683, o_gate[0] = 0.299, c_hat[0] = 0.656 + c_state[0] = 0.348, h_state[0] = 0.100 +Time Step 1: + i_gate[0] = 0.471, f_gate[0] = 0.659, o_gate[0] = 0.272, c_hat[0] = 0.720 + c_state[0] = 0.568, h_state[0] = 0.140 +Time Step 2: + i_gate[0] = 0.437, f_gate[0] = 0.642, o_gate[0] = 0.260, c_hat[0] = 0.739 + c_state[0] = 0.688, h_state[0] = 0.155 +Time Step 3: + i_gate[0] = 0.406, f_gate[0] = 0.623, o_gate[0] = 0.257, c_hat[0] = 0.807 + c_state[0] = 0.756, h_state[0] = 0.164 +Time Step 4: + i_gate[0] = 0.381, f_gate[0] = 0.616, o_gate[0] = 0.256, c_hat[0] = 0.806 + c_state[0] = 0.773, h_state[0] = 0.166 +Backward Time Step 4: + Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.007 + Gradient do_[0] = 0.138 +Backward Time Step 3: + Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.008 + Gradient do_[0] = 0.173 +Backward Time Step 2: + Gradient di[0] = 0.028, df[0] = 0.022, dc_hat[0] = 0.030 + Gradient do_[0] = 0.316 +Backward Time Step 1: + Gradient di[0] = 0.040, df[0] = 0.033, dc_hat[0] = 0.051 + Gradient do_[0] = 0.374 +Backward Time Step 0: + Gradient di[0] = 0.052, df[0] = 0.047, dc_hat[0] = 0.096 + Gradient do_[0] = 0.259 +Time Step 0: + i_gate[0] = 0.530, f_gate[0] = 0.683, o_gate[0] = 0.292, c_hat[0] = 0.659 + c_state[0] = 0.349, h_state[0] = 0.098 +Time Step 1: + i_gate[0] = 0.470, f_gate[0] = 0.659, o_gate[0] = 0.265, c_hat[0] = 0.723 + c_state[0] = 0.570, h_state[0] = 0.137 +Time Step 2: + i_gate[0] = 0.436, f_gate[0] = 0.642, o_gate[0] = 0.253, c_hat[0] = 0.742 + c_state[0] = 0.690, h_state[0] = 0.151 +Time Step 3: + i_gate[0] = 0.405, f_gate[0] = 0.622, o_gate[0] = 0.249, c_hat[0] = 0.810 + c_state[0] = 0.757, h_state[0] = 0.159 +Time Step 4: + i_gate[0] = 0.380, f_gate[0] = 0.615, o_gate[0] = 0.247, c_hat[0] = 0.810 + c_state[0] = 0.774, h_state[0] = 0.161 +Backward Time Step 4: + Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.007 + Gradient do_[0] = 0.137 +Backward Time Step 3: + Gradient di[0] = 0.011, df[0] = 0.008, dc_hat[0] = 0.008 + Gradient do_[0] = 0.172 +Backward Time Step 2: + Gradient di[0] = 0.027, df[0] = 0.022, dc_hat[0] = 0.029 + Gradient do_[0] = 0.316 +Backward Time Step 1: + Gradient di[0] = 0.039, df[0] = 0.032, dc_hat[0] = 0.049 + Gradient do_[0] = 0.374 +Backward Time Step 0: + Gradient di[0] = 0.051, df[0] = 0.046, dc_hat[0] = 0.093 + Gradient do_[0] = 0.260 +Time Step 0: + i_gate[0] = 0.530, f_gate[0] = 0.683, o_gate[0] = 0.285, c_hat[0] = 0.662 + c_state[0] = 0.350, h_state[0] = 0.096 +Time Step 1: + i_gate[0] = 0.470, f_gate[0] = 0.659, o_gate[0] = 0.258, c_hat[0] = 0.726 + c_state[0] = 0.572, h_state[0] = 0.133 +Time Step 2: + i_gate[0] = 0.435, f_gate[0] = 0.642, o_gate[0] = 0.245, c_hat[0] = 0.746 + c_state[0] = 0.691, h_state[0] = 0.147 +Time Step 3: + i_gate[0] = 0.404, f_gate[0] = 0.622, o_gate[0] = 0.241, c_hat[0] = 0.812 + c_state[0] = 0.758, h_state[0] = 0.154 +Time Step 4: + i_gate[0] = 0.379, f_gate[0] = 0.615, o_gate[0] = 0.239, c_hat[0] = 0.813 + c_state[0] = 0.774, h_state[0] = 0.155 +Backward Time Step 4: + Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.006 + Gradient do_[0] = 0.135 +Backward Time Step 3: + Gradient di[0] = 0.011, df[0] = 0.008, dc_hat[0] = 0.007 + Gradient do_[0] = 0.170 +Backward Time Step 2: + Gradient di[0] = 0.026, df[0] = 0.021, dc_hat[0] = 0.027 + Gradient do_[0] = 0.314 +Backward Time Step 1: + Gradient di[0] = 0.038, df[0] = 0.031, dc_hat[0] = 0.047 + Gradient do_[0] = 0.374 +Backward Time Step 0: + Gradient di[0] = 0.050, df[0] = 0.045, dc_hat[0] = 0.090 + Gradient do_[0] = 0.260 +Time Step 0: + i_gate[0] = 0.529, f_gate[0] = 0.683, o_gate[0] = 0.278, c_hat[0] = 0.664 + c_state[0] = 0.351, h_state[0] = 0.094 +Time Step 1: + i_gate[0] = 0.469, f_gate[0] = 0.659, o_gate[0] = 0.250, c_hat[0] = 0.729 + c_state[0] = 0.573, h_state[0] = 0.130 +Time Step 2: + i_gate[0] = 0.434, f_gate[0] = 0.641, o_gate[0] = 0.237, c_hat[0] = 0.749 + c_state[0] = 0.693, h_state[0] = 0.142 +Time Step 3: + i_gate[0] = 0.403, f_gate[0] = 0.621, o_gate[0] = 0.233, c_hat[0] = 0.815 + c_state[0] = 0.759, h_state[0] = 0.149 +Time Step 4: + i_gate[0] = 0.378, f_gate[0] = 0.614, o_gate[0] = 0.230, c_hat[0] = 0.816 + c_state[0] = 0.774, h_state[0] = 0.150 +Backward Time Step 4: + Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.006 + Gradient do_[0] = 0.133 +Backward Time Step 3: + Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.007 + Gradient do_[0] = 0.167 +Backward Time Step 2: + Gradient di[0] = 0.025, df[0] = 0.020, dc_hat[0] = 0.026 + Gradient do_[0] = 0.313 +Backward Time Step 1: + Gradient di[0] = 0.037, df[0] = 0.030, dc_hat[0] = 0.045 + Gradient do_[0] = 0.373 +Backward Time Step 0: + Gradient di[0] = 0.049, df[0] = 0.044, dc_hat[0] = 0.087 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.529, f_gate[0] = 0.683, o_gate[0] = 0.271, c_hat[0] = 0.667 + c_state[0] = 0.353, h_state[0] = 0.092 +Time Step 1: + i_gate[0] = 0.468, f_gate[0] = 0.659, o_gate[0] = 0.243, c_hat[0] = 0.731 + c_state[0] = 0.575, h_state[0] = 0.126 +Time Step 2: + i_gate[0] = 0.433, f_gate[0] = 0.641, o_gate[0] = 0.230, c_hat[0] = 0.752 + c_state[0] = 0.694, h_state[0] = 0.138 +Time Step 3: + i_gate[0] = 0.402, f_gate[0] = 0.621, o_gate[0] = 0.225, c_hat[0] = 0.818 + c_state[0] = 0.760, h_state[0] = 0.144 +Time Step 4: + i_gate[0] = 0.377, f_gate[0] = 0.613, o_gate[0] = 0.222, c_hat[0] = 0.819 + c_state[0] = 0.774, h_state[0] = 0.144 +Backward Time Step 4: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005 + Gradient do_[0] = 0.130 +Backward Time Step 3: + Gradient di[0] = 0.010, df[0] = 0.007, dc_hat[0] = 0.007 + Gradient do_[0] = 0.164 +Backward Time Step 2: + Gradient di[0] = 0.024, df[0] = 0.019, dc_hat[0] = 0.025 + Gradient do_[0] = 0.311 +Backward Time Step 1: + Gradient di[0] = 0.036, df[0] = 0.029, dc_hat[0] = 0.043 + Gradient do_[0] = 0.372 +Backward Time Step 0: + Gradient di[0] = 0.048, df[0] = 0.042, dc_hat[0] = 0.084 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.529, f_gate[0] = 0.684, o_gate[0] = 0.264, c_hat[0] = 0.669 + c_state[0] = 0.354, h_state[0] = 0.090 +Time Step 1: + i_gate[0] = 0.468, f_gate[0] = 0.659, o_gate[0] = 0.236, c_hat[0] = 0.734 + c_state[0] = 0.576, h_state[0] = 0.123 +Time Step 2: + i_gate[0] = 0.432, f_gate[0] = 0.641, o_gate[0] = 0.222, c_hat[0] = 0.755 + c_state[0] = 0.696, h_state[0] = 0.134 +Time Step 3: + i_gate[0] = 0.401, f_gate[0] = 0.620, o_gate[0] = 0.217, c_hat[0] = 0.821 + c_state[0] = 0.760, h_state[0] = 0.139 +Time Step 4: + i_gate[0] = 0.376, f_gate[0] = 0.612, o_gate[0] = 0.214, c_hat[0] = 0.822 + c_state[0] = 0.775, h_state[0] = 0.139 +Backward Time Step 4: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005 + Gradient do_[0] = 0.127 +Backward Time Step 3: + Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.006 + Gradient do_[0] = 0.161 +Backward Time Step 2: + Gradient di[0] = 0.023, df[0] = 0.018, dc_hat[0] = 0.023 + Gradient do_[0] = 0.309 +Backward Time Step 1: + Gradient di[0] = 0.035, df[0] = 0.028, dc_hat[0] = 0.041 + Gradient do_[0] = 0.371 +Backward Time Step 0: + Gradient di[0] = 0.047, df[0] = 0.041, dc_hat[0] = 0.081 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.528, f_gate[0] = 0.684, o_gate[0] = 0.257, c_hat[0] = 0.671 + c_state[0] = 0.355, h_state[0] = 0.088 +Time Step 1: + i_gate[0] = 0.467, f_gate[0] = 0.659, o_gate[0] = 0.229, c_hat[0] = 0.736 + c_state[0] = 0.578, h_state[0] = 0.119 +Time Step 2: + i_gate[0] = 0.432, f_gate[0] = 0.640, o_gate[0] = 0.215, c_hat[0] = 0.757 + c_state[0] = 0.697, h_state[0] = 0.129 +Time Step 3: + i_gate[0] = 0.400, f_gate[0] = 0.620, o_gate[0] = 0.209, c_hat[0] = 0.823 + c_state[0] = 0.761, h_state[0] = 0.134 +Time Step 4: + i_gate[0] = 0.375, f_gate[0] = 0.612, o_gate[0] = 0.206, c_hat[0] = 0.825 + c_state[0] = 0.774, h_state[0] = 0.133 +Backward Time Step 4: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005 + Gradient do_[0] = 0.124 +Backward Time Step 3: + Gradient di[0] = 0.009, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.157 +Backward Time Step 2: + Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.022 + Gradient do_[0] = 0.307 +Backward Time Step 1: + Gradient di[0] = 0.034, df[0] = 0.027, dc_hat[0] = 0.039 + Gradient do_[0] = 0.369 +Backward Time Step 0: + Gradient di[0] = 0.045, df[0] = 0.040, dc_hat[0] = 0.079 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.528, f_gate[0] = 0.684, o_gate[0] = 0.251, c_hat[0] = 0.674 + c_state[0] = 0.356, h_state[0] = 0.086 +Time Step 1: + i_gate[0] = 0.467, f_gate[0] = 0.659, o_gate[0] = 0.222, c_hat[0] = 0.738 + c_state[0] = 0.579, h_state[0] = 0.116 +Time Step 2: + i_gate[0] = 0.431, f_gate[0] = 0.640, o_gate[0] = 0.208, c_hat[0] = 0.760 + c_state[0] = 0.698, h_state[0] = 0.125 +Time Step 3: + i_gate[0] = 0.399, f_gate[0] = 0.619, o_gate[0] = 0.202, c_hat[0] = 0.825 + c_state[0] = 0.761, h_state[0] = 0.130 +Time Step 4: + i_gate[0] = 0.374, f_gate[0] = 0.611, o_gate[0] = 0.198, c_hat[0] = 0.828 + c_state[0] = 0.774, h_state[0] = 0.129 +Backward Time Step 4: + Gradient di[0] = 0.007, df[0] = 0.005, dc_hat[0] = 0.004 + Gradient do_[0] = 0.121 +Backward Time Step 3: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005 + Gradient do_[0] = 0.154 +Backward Time Step 2: + Gradient di[0] = 0.021, df[0] = 0.017, dc_hat[0] = 0.021 + Gradient do_[0] = 0.304 +Backward Time Step 1: + Gradient di[0] = 0.032, df[0] = 0.026, dc_hat[0] = 0.038 + Gradient do_[0] = 0.368 +Backward Time Step 0: + Gradient di[0] = 0.044, df[0] = 0.039, dc_hat[0] = 0.076 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.528, f_gate[0] = 0.684, o_gate[0] = 0.244, c_hat[0] = 0.676 + c_state[0] = 0.356, h_state[0] = 0.084 +Time Step 1: + i_gate[0] = 0.466, f_gate[0] = 0.658, o_gate[0] = 0.215, c_hat[0] = 0.741 + c_state[0] = 0.580, h_state[0] = 0.113 +Time Step 2: + i_gate[0] = 0.430, f_gate[0] = 0.640, o_gate[0] = 0.201, c_hat[0] = 0.763 + c_state[0] = 0.699, h_state[0] = 0.121 +Time Step 3: + i_gate[0] = 0.398, f_gate[0] = 0.618, o_gate[0] = 0.195, c_hat[0] = 0.828 + c_state[0] = 0.762, h_state[0] = 0.125 +Time Step 4: + i_gate[0] = 0.373, f_gate[0] = 0.610, o_gate[0] = 0.191, c_hat[0] = 0.830 + c_state[0] = 0.774, h_state[0] = 0.124 +Backward Time Step 4: + Gradient di[0] = 0.007, df[0] = 0.005, dc_hat[0] = 0.004 + Gradient do_[0] = 0.118 +Backward Time Step 3: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005 + Gradient do_[0] = 0.151 +Backward Time Step 2: + Gradient di[0] = 0.021, df[0] = 0.016, dc_hat[0] = 0.020 + Gradient do_[0] = 0.302 +Backward Time Step 1: + Gradient di[0] = 0.031, df[0] = 0.025, dc_hat[0] = 0.036 + Gradient do_[0] = 0.366 +Backward Time Step 0: + Gradient di[0] = 0.043, df[0] = 0.038, dc_hat[0] = 0.074 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.527, f_gate[0] = 0.684, o_gate[0] = 0.238, c_hat[0] = 0.677 + c_state[0] = 0.357, h_state[0] = 0.082 +Time Step 1: + i_gate[0] = 0.466, f_gate[0] = 0.658, o_gate[0] = 0.209, c_hat[0] = 0.743 + c_state[0] = 0.581, h_state[0] = 0.109 +Time Step 2: + i_gate[0] = 0.429, f_gate[0] = 0.639, o_gate[0] = 0.195, c_hat[0] = 0.765 + c_state[0] = 0.700, h_state[0] = 0.118 +Time Step 3: + i_gate[0] = 0.397, f_gate[0] = 0.618, o_gate[0] = 0.189, c_hat[0] = 0.830 + c_state[0] = 0.762, h_state[0] = 0.121 +Time Step 4: + i_gate[0] = 0.372, f_gate[0] = 0.609, o_gate[0] = 0.184, c_hat[0] = 0.833 + c_state[0] = 0.774, h_state[0] = 0.119 +Backward Time Step 4: + Gradient di[0] = 0.006, df[0] = 0.005, dc_hat[0] = 0.004 + Gradient do_[0] = 0.115 +Backward Time Step 3: + Gradient di[0] = 0.007, df[0] = 0.005, dc_hat[0] = 0.005 + Gradient do_[0] = 0.148 +Backward Time Step 2: + Gradient di[0] = 0.020, df[0] = 0.016, dc_hat[0] = 0.019 + Gradient do_[0] = 0.299 +Backward Time Step 1: + Gradient di[0] = 0.030, df[0] = 0.024, dc_hat[0] = 0.034 + Gradient do_[0] = 0.365 +Backward Time Step 0: + Gradient di[0] = 0.042, df[0] = 0.037, dc_hat[0] = 0.071 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.527, f_gate[0] = 0.684, o_gate[0] = 0.233, c_hat[0] = 0.679 + c_state[0] = 0.358, h_state[0] = 0.080 +Time Step 1: + i_gate[0] = 0.465, f_gate[0] = 0.658, o_gate[0] = 0.203, c_hat[0] = 0.745 + c_state[0] = 0.582, h_state[0] = 0.107 +Time Step 2: + i_gate[0] = 0.429, f_gate[0] = 0.639, o_gate[0] = 0.189, c_hat[0] = 0.767 + c_state[0] = 0.701, h_state[0] = 0.114 +Time Step 3: + i_gate[0] = 0.397, f_gate[0] = 0.617, o_gate[0] = 0.183, c_hat[0] = 0.832 + c_state[0] = 0.763, h_state[0] = 0.117 +Time Step 4: + i_gate[0] = 0.371, f_gate[0] = 0.609, o_gate[0] = 0.177, c_hat[0] = 0.835 + c_state[0] = 0.774, h_state[0] = 0.115 +Backward Time Step 4: + Gradient di[0] = 0.006, df[0] = 0.004, dc_hat[0] = 0.003 + Gradient do_[0] = 0.113 +Backward Time Step 3: + Gradient di[0] = 0.007, df[0] = 0.005, dc_hat[0] = 0.004 + Gradient do_[0] = 0.145 +Backward Time Step 2: + Gradient di[0] = 0.019, df[0] = 0.015, dc_hat[0] = 0.018 + Gradient do_[0] = 0.297 +Backward Time Step 1: + Gradient di[0] = 0.029, df[0] = 0.024, dc_hat[0] = 0.033 + Gradient do_[0] = 0.364 +Backward Time Step 0: + Gradient di[0] = 0.041, df[0] = 0.036, dc_hat[0] = 0.069 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.527, f_gate[0] = 0.684, o_gate[0] = 0.227, c_hat[0] = 0.681 + c_state[0] = 0.359, h_state[0] = 0.078 +Time Step 1: + i_gate[0] = 0.465, f_gate[0] = 0.658, o_gate[0] = 0.198, c_hat[0] = 0.746 + c_state[0] = 0.583, h_state[0] = 0.104 +Time Step 2: + i_gate[0] = 0.428, f_gate[0] = 0.639, o_gate[0] = 0.183, c_hat[0] = 0.769 + c_state[0] = 0.702, h_state[0] = 0.111 +Time Step 3: + i_gate[0] = 0.396, f_gate[0] = 0.617, o_gate[0] = 0.177, c_hat[0] = 0.834 + c_state[0] = 0.763, h_state[0] = 0.114 +Time Step 4: + i_gate[0] = 0.370, f_gate[0] = 0.608, o_gate[0] = 0.171, c_hat[0] = 0.837 + c_state[0] = 0.774, h_state[0] = 0.111 +Backward Time Step 4: + Gradient di[0] = 0.006, df[0] = 0.004, dc_hat[0] = 0.003 + Gradient do_[0] = 0.110 +Backward Time Step 3: + Gradient di[0] = 0.007, df[0] = 0.005, dc_hat[0] = 0.004 + Gradient do_[0] = 0.142 +Backward Time Step 2: + Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.017 + Gradient do_[0] = 0.295 +Backward Time Step 1: + Gradient di[0] = 0.029, df[0] = 0.023, dc_hat[0] = 0.032 + Gradient do_[0] = 0.362 +Backward Time Step 0: + Gradient di[0] = 0.040, df[0] = 0.035, dc_hat[0] = 0.067 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.527, f_gate[0] = 0.684, o_gate[0] = 0.222, c_hat[0] = 0.683 + c_state[0] = 0.360, h_state[0] = 0.077 +Time Step 1: + i_gate[0] = 0.464, f_gate[0] = 0.658, o_gate[0] = 0.192, c_hat[0] = 0.748 + c_state[0] = 0.584, h_state[0] = 0.101 +Time Step 2: + i_gate[0] = 0.427, f_gate[0] = 0.638, o_gate[0] = 0.178, c_hat[0] = 0.771 + c_state[0] = 0.702, h_state[0] = 0.108 +Time Step 3: + i_gate[0] = 0.395, f_gate[0] = 0.616, o_gate[0] = 0.171, c_hat[0] = 0.836 + c_state[0] = 0.763, h_state[0] = 0.110 +Time Step 4: + i_gate[0] = 0.370, f_gate[0] = 0.607, o_gate[0] = 0.166, c_hat[0] = 0.839 + c_state[0] = 0.773, h_state[0] = 0.108 +Backward Time Step 4: + Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.003 + Gradient do_[0] = 0.108 +Backward Time Step 3: + Gradient di[0] = 0.006, df[0] = 0.005, dc_hat[0] = 0.004 + Gradient do_[0] = 0.139 +Backward Time Step 2: + Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.016 + Gradient do_[0] = 0.293 +Backward Time Step 1: + Gradient di[0] = 0.028, df[0] = 0.022, dc_hat[0] = 0.031 + Gradient do_[0] = 0.361 +Backward Time Step 0: + Gradient di[0] = 0.039, df[0] = 0.034, dc_hat[0] = 0.065 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.527, f_gate[0] = 0.684, o_gate[0] = 0.217, c_hat[0] = 0.684 + c_state[0] = 0.360, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.464, f_gate[0] = 0.658, o_gate[0] = 0.187, c_hat[0] = 0.750 + c_state[0] = 0.585, h_state[0] = 0.099 +Time Step 2: + i_gate[0] = 0.427, f_gate[0] = 0.638, o_gate[0] = 0.173, c_hat[0] = 0.773 + c_state[0] = 0.703, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.394, f_gate[0] = 0.616, o_gate[0] = 0.166, c_hat[0] = 0.837 + c_state[0] = 0.763, h_state[0] = 0.107 +Time Step 4: + i_gate[0] = 0.369, f_gate[0] = 0.607, o_gate[0] = 0.160, c_hat[0] = 0.841 + c_state[0] = 0.773, h_state[0] = 0.104 +Backward Time Step 4: + Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.003 + Gradient do_[0] = 0.106 +Backward Time Step 3: + Gradient di[0] = 0.006, df[0] = 0.004, dc_hat[0] = 0.004 + Gradient do_[0] = 0.137 +Backward Time Step 2: + Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.016 + Gradient do_[0] = 0.291 +Backward Time Step 1: + Gradient di[0] = 0.027, df[0] = 0.021, dc_hat[0] = 0.029 + Gradient do_[0] = 0.360 +Backward Time Step 0: + Gradient di[0] = 0.039, df[0] = 0.033, dc_hat[0] = 0.063 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.526, f_gate[0] = 0.684, o_gate[0] = 0.212, c_hat[0] = 0.685 + c_state[0] = 0.361, h_state[0] = 0.073 +Time Step 1: + i_gate[0] = 0.463, f_gate[0] = 0.658, o_gate[0] = 0.183, c_hat[0] = 0.751 + c_state[0] = 0.585, h_state[0] = 0.096 +Time Step 2: + i_gate[0] = 0.426, f_gate[0] = 0.637, o_gate[0] = 0.168, c_hat[0] = 0.775 + c_state[0] = 0.704, h_state[0] = 0.102 +Time Step 3: + i_gate[0] = 0.394, f_gate[0] = 0.615, o_gate[0] = 0.161, c_hat[0] = 0.839 + c_state[0] = 0.763, h_state[0] = 0.104 +Time Step 4: + i_gate[0] = 0.368, f_gate[0] = 0.606, o_gate[0] = 0.155, c_hat[0] = 0.843 + c_state[0] = 0.773, h_state[0] = 0.101 +Backward Time Step 4: + Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.003 + Gradient do_[0] = 0.104 +Backward Time Step 3: + Gradient di[0] = 0.006, df[0] = 0.004, dc_hat[0] = 0.003 + Gradient do_[0] = 0.134 +Backward Time Step 2: + Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.015 + Gradient do_[0] = 0.289 +Backward Time Step 1: + Gradient di[0] = 0.026, df[0] = 0.021, dc_hat[0] = 0.028 + Gradient do_[0] = 0.359 +Backward Time Step 0: + Gradient di[0] = 0.038, df[0] = 0.033, dc_hat[0] = 0.062 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.526, f_gate[0] = 0.685, o_gate[0] = 0.208, c_hat[0] = 0.687 + c_state[0] = 0.361, h_state[0] = 0.072 +Time Step 1: + i_gate[0] = 0.463, f_gate[0] = 0.657, o_gate[0] = 0.178, c_hat[0] = 0.753 + c_state[0] = 0.586, h_state[0] = 0.094 +Time Step 2: + i_gate[0] = 0.426, f_gate[0] = 0.637, o_gate[0] = 0.164, c_hat[0] = 0.777 + c_state[0] = 0.704, h_state[0] = 0.099 +Time Step 3: + i_gate[0] = 0.393, f_gate[0] = 0.615, o_gate[0] = 0.157, c_hat[0] = 0.841 + c_state[0] = 0.763, h_state[0] = 0.101 +Time Step 4: + i_gate[0] = 0.367, f_gate[0] = 0.605, o_gate[0] = 0.151, c_hat[0] = 0.844 + c_state[0] = 0.772, h_state[0] = 0.098 +Backward Time Step 4: + Gradient di[0] = 0.005, df[0] = 0.003, dc_hat[0] = 0.003 + Gradient do_[0] = 0.102 +Backward Time Step 3: + Gradient di[0] = 0.006, df[0] = 0.004, dc_hat[0] = 0.003 + Gradient do_[0] = 0.132 +Backward Time Step 2: + Gradient di[0] = 0.016, df[0] = 0.013, dc_hat[0] = 0.014 + Gradient do_[0] = 0.288 +Backward Time Step 1: + Gradient di[0] = 0.026, df[0] = 0.020, dc_hat[0] = 0.027 + Gradient do_[0] = 0.358 +Backward Time Step 0: + Gradient di[0] = 0.037, df[0] = 0.032, dc_hat[0] = 0.060 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.526, f_gate[0] = 0.685, o_gate[0] = 0.203, c_hat[0] = 0.688 + c_state[0] = 0.362, h_state[0] = 0.071 +Time Step 1: + i_gate[0] = 0.462, f_gate[0] = 0.657, o_gate[0] = 0.174, c_hat[0] = 0.754 + c_state[0] = 0.587, h_state[0] = 0.092 +Time Step 2: + i_gate[0] = 0.425, f_gate[0] = 0.637, o_gate[0] = 0.159, c_hat[0] = 0.779 + c_state[0] = 0.705, h_state[0] = 0.097 +Time Step 3: + i_gate[0] = 0.393, f_gate[0] = 0.614, o_gate[0] = 0.152, c_hat[0] = 0.842 + c_state[0] = 0.763, h_state[0] = 0.098 +Time Step 4: + i_gate[0] = 0.367, f_gate[0] = 0.605, o_gate[0] = 0.146, c_hat[0] = 0.846 + c_state[0] = 0.772, h_state[0] = 0.095 +Backward Time Step 4: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002 + Gradient do_[0] = 0.100 +Backward Time Step 3: + Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.003 + Gradient do_[0] = 0.130 +Backward Time Step 2: + Gradient di[0] = 0.016, df[0] = 0.012, dc_hat[0] = 0.014 + Gradient do_[0] = 0.286 +Backward Time Step 1: + Gradient di[0] = 0.025, df[0] = 0.020, dc_hat[0] = 0.026 + Gradient do_[0] = 0.357 +Backward Time Step 0: + Gradient di[0] = 0.036, df[0] = 0.031, dc_hat[0] = 0.058 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.526, f_gate[0] = 0.685, o_gate[0] = 0.199, c_hat[0] = 0.689 + c_state[0] = 0.363, h_state[0] = 0.069 +Time Step 1: + i_gate[0] = 0.462, f_gate[0] = 0.657, o_gate[0] = 0.170, c_hat[0] = 0.756 + c_state[0] = 0.587, h_state[0] = 0.090 +Time Step 2: + i_gate[0] = 0.424, f_gate[0] = 0.636, o_gate[0] = 0.155, c_hat[0] = 0.780 + c_state[0] = 0.705, h_state[0] = 0.094 +Time Step 3: + i_gate[0] = 0.392, f_gate[0] = 0.614, o_gate[0] = 0.148, c_hat[0] = 0.844 + c_state[0] = 0.763, h_state[0] = 0.095 +Time Step 4: + i_gate[0] = 0.366, f_gate[0] = 0.604, o_gate[0] = 0.142, c_hat[0] = 0.848 + c_state[0] = 0.771, h_state[0] = 0.092 +Backward Time Step 4: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002 + Gradient do_[0] = 0.098 +Backward Time Step 3: + Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.003 + Gradient do_[0] = 0.127 +Backward Time Step 2: + Gradient di[0] = 0.015, df[0] = 0.012, dc_hat[0] = 0.013 + Gradient do_[0] = 0.284 +Backward Time Step 1: + Gradient di[0] = 0.024, df[0] = 0.019, dc_hat[0] = 0.026 + Gradient do_[0] = 0.355 +Backward Time Step 0: + Gradient di[0] = 0.036, df[0] = 0.031, dc_hat[0] = 0.057 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.526, f_gate[0] = 0.685, o_gate[0] = 0.195, c_hat[0] = 0.691 + c_state[0] = 0.363, h_state[0] = 0.068 +Time Step 1: + i_gate[0] = 0.462, f_gate[0] = 0.657, o_gate[0] = 0.166, c_hat[0] = 0.757 + c_state[0] = 0.588, h_state[0] = 0.088 +Time Step 2: + i_gate[0] = 0.424, f_gate[0] = 0.636, o_gate[0] = 0.151, c_hat[0] = 0.782 + c_state[0] = 0.705, h_state[0] = 0.092 +Time Step 3: + i_gate[0] = 0.391, f_gate[0] = 0.613, o_gate[0] = 0.144, c_hat[0] = 0.845 + c_state[0] = 0.763, h_state[0] = 0.093 +Time Step 4: + i_gate[0] = 0.366, f_gate[0] = 0.603, o_gate[0] = 0.138, c_hat[0] = 0.849 + c_state[0] = 0.771, h_state[0] = 0.089 +Backward Time Step 4: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002 + Gradient do_[0] = 0.097 +Backward Time Step 3: + Gradient di[0] = 0.005, df[0] = 0.003, dc_hat[0] = 0.003 + Gradient do_[0] = 0.125 +Backward Time Step 2: + Gradient di[0] = 0.015, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.282 +Backward Time Step 1: + Gradient di[0] = 0.024, df[0] = 0.019, dc_hat[0] = 0.025 + Gradient do_[0] = 0.354 +Backward Time Step 0: + Gradient di[0] = 0.035, df[0] = 0.030, dc_hat[0] = 0.056 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.526, f_gate[0] = 0.685, o_gate[0] = 0.192, c_hat[0] = 0.692 + c_state[0] = 0.364, h_state[0] = 0.067 +Time Step 1: + i_gate[0] = 0.461, f_gate[0] = 0.657, o_gate[0] = 0.162, c_hat[0] = 0.758 + c_state[0] = 0.589, h_state[0] = 0.086 +Time Step 2: + i_gate[0] = 0.423, f_gate[0] = 0.636, o_gate[0] = 0.148, c_hat[0] = 0.783 + c_state[0] = 0.706, h_state[0] = 0.090 +Time Step 3: + i_gate[0] = 0.391, f_gate[0] = 0.613, o_gate[0] = 0.141, c_hat[0] = 0.846 + c_state[0] = 0.763, h_state[0] = 0.090 +Time Step 4: + i_gate[0] = 0.365, f_gate[0] = 0.603, o_gate[0] = 0.134, c_hat[0] = 0.851 + c_state[0] = 0.770, h_state[0] = 0.087 +Backward Time Step 4: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002 + Gradient do_[0] = 0.095 +Backward Time Step 3: + Gradient di[0] = 0.005, df[0] = 0.003, dc_hat[0] = 0.003 + Gradient do_[0] = 0.123 +Backward Time Step 2: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.012 + Gradient do_[0] = 0.281 +Backward Time Step 1: + Gradient di[0] = 0.023, df[0] = 0.018, dc_hat[0] = 0.024 + Gradient do_[0] = 0.353 +Backward Time Step 0: + Gradient di[0] = 0.034, df[0] = 0.029, dc_hat[0] = 0.054 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.526, f_gate[0] = 0.685, o_gate[0] = 0.188, c_hat[0] = 0.693 + c_state[0] = 0.364, h_state[0] = 0.066 +Time Step 1: + i_gate[0] = 0.461, f_gate[0] = 0.656, o_gate[0] = 0.159, c_hat[0] = 0.759 + c_state[0] = 0.589, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.423, f_gate[0] = 0.635, o_gate[0] = 0.144, c_hat[0] = 0.785 + c_state[0] = 0.706, h_state[0] = 0.088 +Time Step 3: + i_gate[0] = 0.390, f_gate[0] = 0.612, o_gate[0] = 0.137, c_hat[0] = 0.847 + c_state[0] = 0.763, h_state[0] = 0.088 +Time Step 4: + i_gate[0] = 0.364, f_gate[0] = 0.602, o_gate[0] = 0.130, c_hat[0] = 0.852 + c_state[0] = 0.770, h_state[0] = 0.084 +Backward Time Step 4: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002 + Gradient do_[0] = 0.094 +Backward Time Step 3: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002 + Gradient do_[0] = 0.121 +Backward Time Step 2: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.012 + Gradient do_[0] = 0.279 +Backward Time Step 1: + Gradient di[0] = 0.023, df[0] = 0.018, dc_hat[0] = 0.023 + Gradient do_[0] = 0.352 +Backward Time Step 0: + Gradient di[0] = 0.034, df[0] = 0.029, dc_hat[0] = 0.053 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.185, c_hat[0] = 0.694 + c_state[0] = 0.364, h_state[0] = 0.065 +Time Step 1: + i_gate[0] = 0.461, f_gate[0] = 0.656, o_gate[0] = 0.155, c_hat[0] = 0.761 + c_state[0] = 0.589, h_state[0] = 0.082 +Time Step 2: + i_gate[0] = 0.422, f_gate[0] = 0.635, o_gate[0] = 0.141, c_hat[0] = 0.786 + c_state[0] = 0.706, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.390, f_gate[0] = 0.612, o_gate[0] = 0.134, c_hat[0] = 0.849 + c_state[0] = 0.763, h_state[0] = 0.086 +Time Step 4: + i_gate[0] = 0.364, f_gate[0] = 0.601, o_gate[0] = 0.127, c_hat[0] = 0.853 + c_state[0] = 0.769, h_state[0] = 0.082 +Backward Time Step 4: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002 + Gradient do_[0] = 0.092 +Backward Time Step 3: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002 + Gradient do_[0] = 0.119 +Backward Time Step 2: + Gradient di[0] = 0.014, df[0] = 0.010, dc_hat[0] = 0.011 + Gradient do_[0] = 0.278 +Backward Time Step 1: + Gradient di[0] = 0.022, df[0] = 0.017, dc_hat[0] = 0.023 + Gradient do_[0] = 0.351 +Backward Time Step 0: + Gradient di[0] = 0.033, df[0] = 0.028, dc_hat[0] = 0.052 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.182, c_hat[0] = 0.695 + c_state[0] = 0.365, h_state[0] = 0.063 +Time Step 1: + i_gate[0] = 0.460, f_gate[0] = 0.656, o_gate[0] = 0.152, c_hat[0] = 0.762 + c_state[0] = 0.590, h_state[0] = 0.081 +Time Step 2: + i_gate[0] = 0.422, f_gate[0] = 0.634, o_gate[0] = 0.138, c_hat[0] = 0.787 + c_state[0] = 0.706, h_state[0] = 0.084 +Time Step 3: + i_gate[0] = 0.389, f_gate[0] = 0.611, o_gate[0] = 0.131, c_hat[0] = 0.850 + c_state[0] = 0.762, h_state[0] = 0.084 +Time Step 4: + i_gate[0] = 0.363, f_gate[0] = 0.601, o_gate[0] = 0.124, c_hat[0] = 0.854 + c_state[0] = 0.768, h_state[0] = 0.080 +Backward Time Step 4: + Gradient di[0] = 0.003, df[0] = 0.003, dc_hat[0] = 0.002 + Gradient do_[0] = 0.091 +Backward Time Step 3: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002 + Gradient do_[0] = 0.118 +Backward Time Step 2: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.011 + Gradient do_[0] = 0.276 +Backward Time Step 1: + Gradient di[0] = 0.022, df[0] = 0.017, dc_hat[0] = 0.022 + Gradient do_[0] = 0.350 +Backward Time Step 0: + Gradient di[0] = 0.032, df[0] = 0.028, dc_hat[0] = 0.051 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.179, c_hat[0] = 0.696 + c_state[0] = 0.365, h_state[0] = 0.062 +Time Step 1: + i_gate[0] = 0.460, f_gate[0] = 0.656, o_gate[0] = 0.149, c_hat[0] = 0.763 + c_state[0] = 0.590, h_state[0] = 0.079 +Time Step 2: + i_gate[0] = 0.421, f_gate[0] = 0.634, o_gate[0] = 0.135, c_hat[0] = 0.788 + c_state[0] = 0.707, h_state[0] = 0.082 +Time Step 3: + i_gate[0] = 0.389, f_gate[0] = 0.611, o_gate[0] = 0.128, c_hat[0] = 0.851 + c_state[0] = 0.762, h_state[0] = 0.082 +Time Step 4: + i_gate[0] = 0.363, f_gate[0] = 0.600, o_gate[0] = 0.121, c_hat[0] = 0.855 + c_state[0] = 0.768, h_state[0] = 0.078 +Backward Time Step 4: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = 0.090 +Backward Time Step 3: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002 + Gradient do_[0] = 0.116 +Backward Time Step 2: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.011 + Gradient do_[0] = 0.275 +Backward Time Step 1: + Gradient di[0] = 0.021, df[0] = 0.016, dc_hat[0] = 0.021 + Gradient do_[0] = 0.349 +Backward Time Step 0: + Gradient di[0] = 0.032, df[0] = 0.027, dc_hat[0] = 0.050 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.176, c_hat[0] = 0.696 + c_state[0] = 0.366, h_state[0] = 0.062 +Time Step 1: + i_gate[0] = 0.460, f_gate[0] = 0.656, o_gate[0] = 0.146, c_hat[0] = 0.764 + c_state[0] = 0.591, h_state[0] = 0.078 +Time Step 2: + i_gate[0] = 0.421, f_gate[0] = 0.634, o_gate[0] = 0.132, c_hat[0] = 0.790 + c_state[0] = 0.707, h_state[0] = 0.080 +Time Step 3: + i_gate[0] = 0.388, f_gate[0] = 0.610, o_gate[0] = 0.125, c_hat[0] = 0.852 + c_state[0] = 0.762, h_state[0] = 0.080 +Time Step 4: + i_gate[0] = 0.362, f_gate[0] = 0.600, o_gate[0] = 0.118, c_hat[0] = 0.857 + c_state[0] = 0.767, h_state[0] = 0.076 +Backward Time Step 4: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = 0.088 +Backward Time Step 3: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002 + Gradient do_[0] = 0.114 +Backward Time Step 2: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.010 + Gradient do_[0] = 0.273 +Backward Time Step 1: + Gradient di[0] = 0.021, df[0] = 0.016, dc_hat[0] = 0.021 + Gradient do_[0] = 0.348 +Backward Time Step 0: + Gradient di[0] = 0.031, df[0] = 0.027, dc_hat[0] = 0.049 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.173, c_hat[0] = 0.697 + c_state[0] = 0.366, h_state[0] = 0.061 +Time Step 1: + i_gate[0] = 0.459, f_gate[0] = 0.656, o_gate[0] = 0.144, c_hat[0] = 0.764 + c_state[0] = 0.591, h_state[0] = 0.076 +Time Step 2: + i_gate[0] = 0.421, f_gate[0] = 0.633, o_gate[0] = 0.130, c_hat[0] = 0.791 + c_state[0] = 0.707, h_state[0] = 0.079 +Time Step 3: + i_gate[0] = 0.388, f_gate[0] = 0.610, o_gate[0] = 0.122, c_hat[0] = 0.853 + c_state[0] = 0.761, h_state[0] = 0.079 +Time Step 4: + i_gate[0] = 0.362, f_gate[0] = 0.599, o_gate[0] = 0.116, c_hat[0] = 0.858 + c_state[0] = 0.766, h_state[0] = 0.074 +Backward Time Step 4: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = 0.087 +Backward Time Step 3: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002 + Gradient do_[0] = 0.113 +Backward Time Step 2: + Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.010 + Gradient do_[0] = 0.272 +Backward Time Step 1: + Gradient di[0] = 0.020, df[0] = 0.016, dc_hat[0] = 0.020 + Gradient do_[0] = 0.348 +Backward Time Step 0: + Gradient di[0] = 0.031, df[0] = 0.026, dc_hat[0] = 0.048 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.170, c_hat[0] = 0.698 + c_state[0] = 0.366, h_state[0] = 0.060 +Time Step 1: + i_gate[0] = 0.459, f_gate[0] = 0.655, o_gate[0] = 0.141, c_hat[0] = 0.765 + c_state[0] = 0.591, h_state[0] = 0.075 +Time Step 2: + i_gate[0] = 0.420, f_gate[0] = 0.633, o_gate[0] = 0.127, c_hat[0] = 0.792 + c_state[0] = 0.707, h_state[0] = 0.077 +Time Step 3: + i_gate[0] = 0.387, f_gate[0] = 0.609, o_gate[0] = 0.120, c_hat[0] = 0.854 + c_state[0] = 0.761, h_state[0] = 0.077 +Time Step 4: + i_gate[0] = 0.361, f_gate[0] = 0.599, o_gate[0] = 0.113, c_hat[0] = 0.859 + c_state[0] = 0.766, h_state[0] = 0.073 +Backward Time Step 4: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.086 +Backward Time Step 3: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002 + Gradient do_[0] = 0.111 +Backward Time Step 2: + Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.010 + Gradient do_[0] = 0.271 +Backward Time Step 1: + Gradient di[0] = 0.020, df[0] = 0.015, dc_hat[0] = 0.020 + Gradient do_[0] = 0.347 +Backward Time Step 0: + Gradient di[0] = 0.031, df[0] = 0.026, dc_hat[0] = 0.047 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.168, c_hat[0] = 0.699 + c_state[0] = 0.367, h_state[0] = 0.059 +Time Step 1: + i_gate[0] = 0.459, f_gate[0] = 0.655, o_gate[0] = 0.139, c_hat[0] = 0.766 + c_state[0] = 0.592, h_state[0] = 0.074 +Time Step 2: + i_gate[0] = 0.420, f_gate[0] = 0.633, o_gate[0] = 0.125, c_hat[0] = 0.793 + c_state[0] = 0.707, h_state[0] = 0.076 +Time Step 3: + i_gate[0] = 0.387, f_gate[0] = 0.609, o_gate[0] = 0.118, c_hat[0] = 0.855 + c_state[0] = 0.761, h_state[0] = 0.075 +Time Step 4: + i_gate[0] = 0.361, f_gate[0] = 0.598, o_gate[0] = 0.111, c_hat[0] = 0.859 + c_state[0] = 0.765, h_state[0] = 0.071 +Backward Time Step 4: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.085 +Backward Time Step 3: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.002 + Gradient do_[0] = 0.110 +Backward Time Step 2: + Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.010 + Gradient do_[0] = 0.270 +Backward Time Step 1: + Gradient di[0] = 0.020, df[0] = 0.015, dc_hat[0] = 0.019 + Gradient do_[0] = 0.346 +Backward Time Step 0: + Gradient di[0] = 0.030, df[0] = 0.026, dc_hat[0] = 0.046 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.166, c_hat[0] = 0.700 + c_state[0] = 0.367, h_state[0] = 0.058 +Time Step 1: + i_gate[0] = 0.458, f_gate[0] = 0.655, o_gate[0] = 0.137, c_hat[0] = 0.767 + c_state[0] = 0.592, h_state[0] = 0.073 +Time Step 2: + i_gate[0] = 0.419, f_gate[0] = 0.632, o_gate[0] = 0.123, c_hat[0] = 0.794 + c_state[0] = 0.707, h_state[0] = 0.075 +Time Step 3: + i_gate[0] = 0.386, f_gate[0] = 0.608, o_gate[0] = 0.116, c_hat[0] = 0.855 + c_state[0] = 0.760, h_state[0] = 0.074 +Time Step 4: + i_gate[0] = 0.360, f_gate[0] = 0.598, o_gate[0] = 0.109, c_hat[0] = 0.860 + c_state[0] = 0.764, h_state[0] = 0.070 +Backward Time Step 4: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.085 +Backward Time Step 3: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = 0.109 +Backward Time Step 2: + Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.009 + Gradient do_[0] = 0.269 +Backward Time Step 1: + Gradient di[0] = 0.019, df[0] = 0.015, dc_hat[0] = 0.019 + Gradient do_[0] = 0.345 +Backward Time Step 0: + Gradient di[0] = 0.030, df[0] = 0.025, dc_hat[0] = 0.046 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.163, c_hat[0] = 0.700 + c_state[0] = 0.367, h_state[0] = 0.057 +Time Step 1: + i_gate[0] = 0.458, f_gate[0] = 0.655, o_gate[0] = 0.135, c_hat[0] = 0.768 + c_state[0] = 0.592, h_state[0] = 0.072 +Time Step 2: + i_gate[0] = 0.419, f_gate[0] = 0.632, o_gate[0] = 0.121, c_hat[0] = 0.795 + c_state[0] = 0.707, h_state[0] = 0.074 +Time Step 3: + i_gate[0] = 0.386, f_gate[0] = 0.608, o_gate[0] = 0.114, c_hat[0] = 0.856 + c_state[0] = 0.760, h_state[0] = 0.073 +Time Step 4: + i_gate[0] = 0.359, f_gate[0] = 0.597, o_gate[0] = 0.107, c_hat[0] = 0.861 + c_state[0] = 0.763, h_state[0] = 0.069 +Backward Time Step 4: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.084 +Backward Time Step 3: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = 0.108 +Backward Time Step 2: + Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.009 + Gradient do_[0] = 0.268 +Backward Time Step 1: + Gradient di[0] = 0.019, df[0] = 0.015, dc_hat[0] = 0.019 + Gradient do_[0] = 0.345 +Backward Time Step 0: + Gradient di[0] = 0.029, df[0] = 0.025, dc_hat[0] = 0.045 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.525, f_gate[0] = 0.685, o_gate[0] = 0.161, c_hat[0] = 0.701 + c_state[0] = 0.368, h_state[0] = 0.057 +Time Step 1: + i_gate[0] = 0.458, f_gate[0] = 0.655, o_gate[0] = 0.133, c_hat[0] = 0.769 + c_state[0] = 0.593, h_state[0] = 0.071 +Time Step 2: + i_gate[0] = 0.418, f_gate[0] = 0.632, o_gate[0] = 0.119, c_hat[0] = 0.795 + c_state[0] = 0.707, h_state[0] = 0.072 +Time Step 3: + i_gate[0] = 0.385, f_gate[0] = 0.607, o_gate[0] = 0.112, c_hat[0] = 0.857 + c_state[0] = 0.759, h_state[0] = 0.072 +Time Step 4: + i_gate[0] = 0.359, f_gate[0] = 0.597, o_gate[0] = 0.105, c_hat[0] = 0.862 + c_state[0] = 0.763, h_state[0] = 0.067 +Backward Time Step 4: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.083 +Backward Time Step 3: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = 0.107 +Backward Time Step 2: + Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.009 + Gradient do_[0] = 0.267 +Backward Time Step 1: + Gradient di[0] = 0.019, df[0] = 0.014, dc_hat[0] = 0.018 + Gradient do_[0] = 0.344 +Backward Time Step 0: + Gradient di[0] = 0.029, df[0] = 0.025, dc_hat[0] = 0.044 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.685, o_gate[0] = 0.160, c_hat[0] = 0.702 + c_state[0] = 0.368, h_state[0] = 0.056 +Time Step 1: + i_gate[0] = 0.457, f_gate[0] = 0.655, o_gate[0] = 0.131, c_hat[0] = 0.769 + c_state[0] = 0.593, h_state[0] = 0.070 +Time Step 2: + i_gate[0] = 0.418, f_gate[0] = 0.632, o_gate[0] = 0.117, c_hat[0] = 0.796 + c_state[0] = 0.707, h_state[0] = 0.071 +Time Step 3: + i_gate[0] = 0.385, f_gate[0] = 0.607, o_gate[0] = 0.110, c_hat[0] = 0.858 + c_state[0] = 0.759, h_state[0] = 0.070 +Time Step 4: + i_gate[0] = 0.358, f_gate[0] = 0.596, o_gate[0] = 0.103, c_hat[0] = 0.863 + c_state[0] = 0.762, h_state[0] = 0.066 +Backward Time Step 4: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.083 +Backward Time Step 3: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = 0.106 +Backward Time Step 2: + Gradient di[0] = 0.011, df[0] = 0.008, dc_hat[0] = 0.009 + Gradient do_[0] = 0.266 +Backward Time Step 1: + Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.018 + Gradient do_[0] = 0.344 +Backward Time Step 0: + Gradient di[0] = 0.029, df[0] = 0.024, dc_hat[0] = 0.044 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.685, o_gate[0] = 0.158, c_hat[0] = 0.702 + c_state[0] = 0.368, h_state[0] = 0.056 +Time Step 1: + i_gate[0] = 0.457, f_gate[0] = 0.655, o_gate[0] = 0.129, c_hat[0] = 0.770 + c_state[0] = 0.593, h_state[0] = 0.069 +Time Step 2: + i_gate[0] = 0.418, f_gate[0] = 0.631, o_gate[0] = 0.115, c_hat[0] = 0.797 + c_state[0] = 0.707, h_state[0] = 0.070 +Time Step 3: + i_gate[0] = 0.384, f_gate[0] = 0.607, o_gate[0] = 0.108, c_hat[0] = 0.858 + c_state[0] = 0.759, h_state[0] = 0.069 +Time Step 4: + i_gate[0] = 0.358, f_gate[0] = 0.596, o_gate[0] = 0.101, c_hat[0] = 0.864 + c_state[0] = 0.761, h_state[0] = 0.065 +Backward Time Step 4: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.082 +Backward Time Step 3: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = 0.105 +Backward Time Step 2: + Gradient di[0] = 0.011, df[0] = 0.008, dc_hat[0] = 0.009 + Gradient do_[0] = 0.266 +Backward Time Step 1: + Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.018 + Gradient do_[0] = 0.343 +Backward Time Step 0: + Gradient di[0] = 0.028, df[0] = 0.024, dc_hat[0] = 0.043 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.156, c_hat[0] = 0.703 + c_state[0] = 0.369, h_state[0] = 0.055 +Time Step 1: + i_gate[0] = 0.457, f_gate[0] = 0.654, o_gate[0] = 0.128, c_hat[0] = 0.770 + c_state[0] = 0.593, h_state[0] = 0.068 +Time Step 2: + i_gate[0] = 0.417, f_gate[0] = 0.631, o_gate[0] = 0.114, c_hat[0] = 0.798 + c_state[0] = 0.707, h_state[0] = 0.069 +Time Step 3: + i_gate[0] = 0.384, f_gate[0] = 0.606, o_gate[0] = 0.107, c_hat[0] = 0.859 + c_state[0] = 0.758, h_state[0] = 0.068 +Time Step 4: + i_gate[0] = 0.358, f_gate[0] = 0.595, o_gate[0] = 0.100, c_hat[0] = 0.864 + c_state[0] = 0.760, h_state[0] = 0.064 +Backward Time Step 4: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.082 +Backward Time Step 3: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = 0.104 +Backward Time Step 2: + Gradient di[0] = 0.011, df[0] = 0.008, dc_hat[0] = 0.008 + Gradient do_[0] = 0.265 +Backward Time Step 1: + Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.017 + Gradient do_[0] = 0.343 +Backward Time Step 0: + Gradient di[0] = 0.028, df[0] = 0.024, dc_hat[0] = 0.043 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.154, c_hat[0] = 0.703 + c_state[0] = 0.369, h_state[0] = 0.054 +Time Step 1: + i_gate[0] = 0.457, f_gate[0] = 0.654, o_gate[0] = 0.126, c_hat[0] = 0.771 + c_state[0] = 0.593, h_state[0] = 0.067 +Time Step 2: + i_gate[0] = 0.417, f_gate[0] = 0.631, o_gate[0] = 0.112, c_hat[0] = 0.798 + c_state[0] = 0.707, h_state[0] = 0.068 +Time Step 3: + i_gate[0] = 0.383, f_gate[0] = 0.606, o_gate[0] = 0.105, c_hat[0] = 0.860 + c_state[0] = 0.758, h_state[0] = 0.067 +Time Step 4: + i_gate[0] = 0.357, f_gate[0] = 0.595, o_gate[0] = 0.098, c_hat[0] = 0.865 + c_state[0] = 0.759, h_state[0] = 0.063 +Backward Time Step 4: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.081 +Backward Time Step 3: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.103 +Backward Time Step 2: + Gradient di[0] = 0.011, df[0] = 0.008, dc_hat[0] = 0.008 + Gradient do_[0] = 0.264 +Backward Time Step 1: + Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.017 + Gradient do_[0] = 0.342 +Backward Time Step 0: + Gradient di[0] = 0.028, df[0] = 0.023, dc_hat[0] = 0.042 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.153, c_hat[0] = 0.704 + c_state[0] = 0.369, h_state[0] = 0.054 +Time Step 1: + i_gate[0] = 0.456, f_gate[0] = 0.654, o_gate[0] = 0.125, c_hat[0] = 0.772 + c_state[0] = 0.594, h_state[0] = 0.066 +Time Step 2: + i_gate[0] = 0.416, f_gate[0] = 0.630, o_gate[0] = 0.111, c_hat[0] = 0.799 + c_state[0] = 0.707, h_state[0] = 0.068 +Time Step 3: + i_gate[0] = 0.383, f_gate[0] = 0.605, o_gate[0] = 0.104, c_hat[0] = 0.860 + c_state[0] = 0.757, h_state[0] = 0.066 +Time Step 4: + i_gate[0] = 0.357, f_gate[0] = 0.594, o_gate[0] = 0.097, c_hat[0] = 0.866 + c_state[0] = 0.758, h_state[0] = 0.062 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.081 +Backward Time Step 3: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.102 +Backward Time Step 2: + Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.008 + Gradient do_[0] = 0.263 +Backward Time Step 1: + Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.017 + Gradient do_[0] = 0.341 +Backward Time Step 0: + Gradient di[0] = 0.028, df[0] = 0.023, dc_hat[0] = 0.041 + Gradient do_[0] = 0.264 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.151, c_hat[0] = 0.704 + c_state[0] = 0.369, h_state[0] = 0.053 +Time Step 1: + i_gate[0] = 0.456, f_gate[0] = 0.654, o_gate[0] = 0.123, c_hat[0] = 0.772 + c_state[0] = 0.594, h_state[0] = 0.066 +Time Step 2: + i_gate[0] = 0.416, f_gate[0] = 0.630, o_gate[0] = 0.110, c_hat[0] = 0.800 + c_state[0] = 0.707, h_state[0] = 0.067 +Time Step 3: + i_gate[0] = 0.382, f_gate[0] = 0.605, o_gate[0] = 0.103, c_hat[0] = 0.861 + c_state[0] = 0.757, h_state[0] = 0.066 +Time Step 4: + i_gate[0] = 0.356, f_gate[0] = 0.594, o_gate[0] = 0.096, c_hat[0] = 0.866 + c_state[0] = 0.758, h_state[0] = 0.061 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.080 +Backward Time Step 3: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.101 +Backward Time Step 2: + Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.008 + Gradient do_[0] = 0.263 +Backward Time Step 1: + Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.016 + Gradient do_[0] = 0.341 +Backward Time Step 0: + Gradient di[0] = 0.027, df[0] = 0.023, dc_hat[0] = 0.041 + Gradient do_[0] = 0.264 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.150, c_hat[0] = 0.705 + c_state[0] = 0.369, h_state[0] = 0.053 +Time Step 1: + i_gate[0] = 0.456, f_gate[0] = 0.654, o_gate[0] = 0.122, c_hat[0] = 0.773 + c_state[0] = 0.594, h_state[0] = 0.065 +Time Step 2: + i_gate[0] = 0.416, f_gate[0] = 0.630, o_gate[0] = 0.108, c_hat[0] = 0.800 + c_state[0] = 0.707, h_state[0] = 0.066 +Time Step 3: + i_gate[0] = 0.382, f_gate[0] = 0.605, o_gate[0] = 0.101, c_hat[0] = 0.861 + c_state[0] = 0.756, h_state[0] = 0.065 +Time Step 4: + i_gate[0] = 0.356, f_gate[0] = 0.593, o_gate[0] = 0.094, c_hat[0] = 0.867 + c_state[0] = 0.757, h_state[0] = 0.060 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.079 +Backward Time Step 3: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.101 +Backward Time Step 2: + Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.008 + Gradient do_[0] = 0.262 +Backward Time Step 1: + Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.016 + Gradient do_[0] = 0.340 +Backward Time Step 0: + Gradient di[0] = 0.027, df[0] = 0.023, dc_hat[0] = 0.041 + Gradient do_[0] = 0.264 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.148, c_hat[0] = 0.705 + c_state[0] = 0.370, h_state[0] = 0.052 +Time Step 1: + i_gate[0] = 0.456, f_gate[0] = 0.654, o_gate[0] = 0.120, c_hat[0] = 0.773 + c_state[0] = 0.594, h_state[0] = 0.064 +Time Step 2: + i_gate[0] = 0.415, f_gate[0] = 0.630, o_gate[0] = 0.107, c_hat[0] = 0.801 + c_state[0] = 0.707, h_state[0] = 0.065 +Time Step 3: + i_gate[0] = 0.381, f_gate[0] = 0.604, o_gate[0] = 0.100, c_hat[0] = 0.862 + c_state[0] = 0.756, h_state[0] = 0.064 +Time Step 4: + i_gate[0] = 0.355, f_gate[0] = 0.593, o_gate[0] = 0.093, c_hat[0] = 0.867 + c_state[0] = 0.756, h_state[0] = 0.059 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.079 +Backward Time Step 3: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.100 +Backward Time Step 2: + Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.008 + Gradient do_[0] = 0.261 +Backward Time Step 1: + Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.016 + Gradient do_[0] = 0.340 +Backward Time Step 0: + Gradient di[0] = 0.027, df[0] = 0.022, dc_hat[0] = 0.040 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.147, c_hat[0] = 0.706 + c_state[0] = 0.370, h_state[0] = 0.052 +Time Step 1: + i_gate[0] = 0.455, f_gate[0] = 0.654, o_gate[0] = 0.119, c_hat[0] = 0.774 + c_state[0] = 0.594, h_state[0] = 0.063 +Time Step 2: + i_gate[0] = 0.415, f_gate[0] = 0.629, o_gate[0] = 0.106, c_hat[0] = 0.802 + c_state[0] = 0.706, h_state[0] = 0.064 +Time Step 3: + i_gate[0] = 0.381, f_gate[0] = 0.604, o_gate[0] = 0.099, c_hat[0] = 0.862 + c_state[0] = 0.755, h_state[0] = 0.063 +Time Step 4: + i_gate[0] = 0.355, f_gate[0] = 0.592, o_gate[0] = 0.092, c_hat[0] = 0.868 + c_state[0] = 0.755, h_state[0] = 0.059 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.078 +Backward Time Step 3: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.099 +Backward Time Step 2: + Gradient di[0] = 0.010, df[0] = 0.007, dc_hat[0] = 0.008 + Gradient do_[0] = 0.260 +Backward Time Step 1: + Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.016 + Gradient do_[0] = 0.339 +Backward Time Step 0: + Gradient di[0] = 0.027, df[0] = 0.022, dc_hat[0] = 0.040 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.146, c_hat[0] = 0.706 + c_state[0] = 0.370, h_state[0] = 0.052 +Time Step 1: + i_gate[0] = 0.455, f_gate[0] = 0.654, o_gate[0] = 0.118, c_hat[0] = 0.774 + c_state[0] = 0.594, h_state[0] = 0.063 +Time Step 2: + i_gate[0] = 0.415, f_gate[0] = 0.629, o_gate[0] = 0.105, c_hat[0] = 0.802 + c_state[0] = 0.706, h_state[0] = 0.064 +Time Step 3: + i_gate[0] = 0.381, f_gate[0] = 0.603, o_gate[0] = 0.098, c_hat[0] = 0.863 + c_state[0] = 0.755, h_state[0] = 0.062 +Time Step 4: + i_gate[0] = 0.354, f_gate[0] = 0.592, o_gate[0] = 0.091, c_hat[0] = 0.868 + c_state[0] = 0.754, h_state[0] = 0.058 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.078 +Backward Time Step 3: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.098 +Backward Time Step 2: + Gradient di[0] = 0.010, df[0] = 0.007, dc_hat[0] = 0.007 + Gradient do_[0] = 0.259 +Backward Time Step 1: + Gradient di[0] = 0.016, df[0] = 0.013, dc_hat[0] = 0.016 + Gradient do_[0] = 0.338 +Backward Time Step 0: + Gradient di[0] = 0.026, df[0] = 0.022, dc_hat[0] = 0.039 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.144, c_hat[0] = 0.707 + c_state[0] = 0.370, h_state[0] = 0.051 +Time Step 1: + i_gate[0] = 0.455, f_gate[0] = 0.654, o_gate[0] = 0.117, c_hat[0] = 0.775 + c_state[0] = 0.594, h_state[0] = 0.062 +Time Step 2: + i_gate[0] = 0.414, f_gate[0] = 0.629, o_gate[0] = 0.103, c_hat[0] = 0.803 + c_state[0] = 0.706, h_state[0] = 0.063 +Time Step 3: + i_gate[0] = 0.380, f_gate[0] = 0.603, o_gate[0] = 0.096, c_hat[0] = 0.863 + c_state[0] = 0.754, h_state[0] = 0.062 +Time Step 4: + i_gate[0] = 0.354, f_gate[0] = 0.591, o_gate[0] = 0.090, c_hat[0] = 0.869 + c_state[0] = 0.753, h_state[0] = 0.057 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.077 +Backward Time Step 3: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.097 +Backward Time Step 2: + Gradient di[0] = 0.010, df[0] = 0.007, dc_hat[0] = 0.007 + Gradient do_[0] = 0.258 +Backward Time Step 1: + Gradient di[0] = 0.016, df[0] = 0.012, dc_hat[0] = 0.015 + Gradient do_[0] = 0.338 +Backward Time Step 0: + Gradient di[0] = 0.026, df[0] = 0.022, dc_hat[0] = 0.039 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.143, c_hat[0] = 0.707 + c_state[0] = 0.370, h_state[0] = 0.051 +Time Step 1: + i_gate[0] = 0.455, f_gate[0] = 0.653, o_gate[0] = 0.116, c_hat[0] = 0.775 + c_state[0] = 0.594, h_state[0] = 0.062 +Time Step 2: + i_gate[0] = 0.414, f_gate[0] = 0.629, o_gate[0] = 0.102, c_hat[0] = 0.803 + c_state[0] = 0.706, h_state[0] = 0.062 +Time Step 3: + i_gate[0] = 0.380, f_gate[0] = 0.603, o_gate[0] = 0.095, c_hat[0] = 0.864 + c_state[0] = 0.753, h_state[0] = 0.061 +Time Step 4: + i_gate[0] = 0.353, f_gate[0] = 0.591, o_gate[0] = 0.088, c_hat[0] = 0.869 + c_state[0] = 0.752, h_state[0] = 0.056 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.076 +Backward Time Step 3: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.096 +Backward Time Step 2: + Gradient di[0] = 0.010, df[0] = 0.007, dc_hat[0] = 0.007 + Gradient do_[0] = 0.257 +Backward Time Step 1: + Gradient di[0] = 0.016, df[0] = 0.012, dc_hat[0] = 0.015 + Gradient do_[0] = 0.337 +Backward Time Step 0: + Gradient di[0] = 0.026, df[0] = 0.022, dc_hat[0] = 0.038 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.142, c_hat[0] = 0.708 + c_state[0] = 0.371, h_state[0] = 0.050 +Time Step 1: + i_gate[0] = 0.454, f_gate[0] = 0.653, o_gate[0] = 0.114, c_hat[0] = 0.775 + c_state[0] = 0.594, h_state[0] = 0.061 +Time Step 2: + i_gate[0] = 0.414, f_gate[0] = 0.628, o_gate[0] = 0.101, c_hat[0] = 0.804 + c_state[0] = 0.706, h_state[0] = 0.062 +Time Step 3: + i_gate[0] = 0.379, f_gate[0] = 0.602, o_gate[0] = 0.094, c_hat[0] = 0.864 + c_state[0] = 0.753, h_state[0] = 0.060 +Time Step 4: + i_gate[0] = 0.353, f_gate[0] = 0.590, o_gate[0] = 0.087, c_hat[0] = 0.870 + c_state[0] = 0.751, h_state[0] = 0.056 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.076 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.095 +Backward Time Step 2: + Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.007 + Gradient do_[0] = 0.256 +Backward Time Step 1: + Gradient di[0] = 0.016, df[0] = 0.012, dc_hat[0] = 0.015 + Gradient do_[0] = 0.336 +Backward Time Step 0: + Gradient di[0] = 0.026, df[0] = 0.021, dc_hat[0] = 0.038 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.141, c_hat[0] = 0.708 + c_state[0] = 0.371, h_state[0] = 0.050 +Time Step 1: + i_gate[0] = 0.454, f_gate[0] = 0.653, o_gate[0] = 0.113, c_hat[0] = 0.776 + c_state[0] = 0.594, h_state[0] = 0.060 +Time Step 2: + i_gate[0] = 0.413, f_gate[0] = 0.628, o_gate[0] = 0.100, c_hat[0] = 0.804 + c_state[0] = 0.706, h_state[0] = 0.061 +Time Step 3: + i_gate[0] = 0.379, f_gate[0] = 0.602, o_gate[0] = 0.093, c_hat[0] = 0.864 + c_state[0] = 0.752, h_state[0] = 0.059 +Time Step 4: + i_gate[0] = 0.352, f_gate[0] = 0.590, o_gate[0] = 0.086, c_hat[0] = 0.870 + c_state[0] = 0.750, h_state[0] = 0.055 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.075 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.094 +Backward Time Step 2: + Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.007 + Gradient do_[0] = 0.256 +Backward Time Step 1: + Gradient di[0] = 0.016, df[0] = 0.012, dc_hat[0] = 0.015 + Gradient do_[0] = 0.336 +Backward Time Step 0: + Gradient di[0] = 0.025, df[0] = 0.021, dc_hat[0] = 0.038 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.140, c_hat[0] = 0.708 + c_state[0] = 0.371, h_state[0] = 0.050 +Time Step 1: + i_gate[0] = 0.454, f_gate[0] = 0.653, o_gate[0] = 0.112, c_hat[0] = 0.776 + c_state[0] = 0.594, h_state[0] = 0.060 +Time Step 2: + i_gate[0] = 0.413, f_gate[0] = 0.628, o_gate[0] = 0.099, c_hat[0] = 0.804 + c_state[0] = 0.705, h_state[0] = 0.060 +Time Step 3: + i_gate[0] = 0.379, f_gate[0] = 0.602, o_gate[0] = 0.092, c_hat[0] = 0.865 + c_state[0] = 0.752, h_state[0] = 0.059 +Time Step 4: + i_gate[0] = 0.352, f_gate[0] = 0.589, o_gate[0] = 0.085, c_hat[0] = 0.871 + c_state[0] = 0.749, h_state[0] = 0.054 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.074 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.093 +Backward Time Step 2: + Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.007 + Gradient do_[0] = 0.255 +Backward Time Step 1: + Gradient di[0] = 0.015, df[0] = 0.012, dc_hat[0] = 0.015 + Gradient do_[0] = 0.335 +Backward Time Step 0: + Gradient di[0] = 0.025, df[0] = 0.021, dc_hat[0] = 0.037 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.138, c_hat[0] = 0.709 + c_state[0] = 0.371, h_state[0] = 0.049 +Time Step 1: + i_gate[0] = 0.454, f_gate[0] = 0.653, o_gate[0] = 0.111, c_hat[0] = 0.776 + c_state[0] = 0.595, h_state[0] = 0.059 +Time Step 2: + i_gate[0] = 0.413, f_gate[0] = 0.628, o_gate[0] = 0.098, c_hat[0] = 0.805 + c_state[0] = 0.705, h_state[0] = 0.060 +Time Step 3: + i_gate[0] = 0.378, f_gate[0] = 0.601, o_gate[0] = 0.091, c_hat[0] = 0.865 + c_state[0] = 0.751, h_state[0] = 0.058 +Time Step 4: + i_gate[0] = 0.351, f_gate[0] = 0.589, o_gate[0] = 0.085, c_hat[0] = 0.871 + c_state[0] = 0.748, h_state[0] = 0.054 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.074 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.092 +Backward Time Step 2: + Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.007 + Gradient do_[0] = 0.254 +Backward Time Step 1: + Gradient di[0] = 0.015, df[0] = 0.012, dc_hat[0] = 0.014 + Gradient do_[0] = 0.334 +Backward Time Step 0: + Gradient di[0] = 0.025, df[0] = 0.021, dc_hat[0] = 0.037 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.137, c_hat[0] = 0.709 + c_state[0] = 0.371, h_state[0] = 0.049 +Time Step 1: + i_gate[0] = 0.453, f_gate[0] = 0.653, o_gate[0] = 0.110, c_hat[0] = 0.777 + c_state[0] = 0.595, h_state[0] = 0.059 +Time Step 2: + i_gate[0] = 0.412, f_gate[0] = 0.627, o_gate[0] = 0.097, c_hat[0] = 0.805 + c_state[0] = 0.705, h_state[0] = 0.059 +Time Step 3: + i_gate[0] = 0.378, f_gate[0] = 0.601, o_gate[0] = 0.091, c_hat[0] = 0.866 + c_state[0] = 0.750, h_state[0] = 0.058 +Time Step 4: + i_gate[0] = 0.351, f_gate[0] = 0.588, o_gate[0] = 0.084, c_hat[0] = 0.871 + c_state[0] = 0.747, h_state[0] = 0.053 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.073 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.091 +Backward Time Step 2: + Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.007 + Gradient do_[0] = 0.253 +Backward Time Step 1: + Gradient di[0] = 0.015, df[0] = 0.012, dc_hat[0] = 0.014 + Gradient do_[0] = 0.333 +Backward Time Step 0: + Gradient di[0] = 0.025, df[0] = 0.021, dc_hat[0] = 0.036 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.136, c_hat[0] = 0.709 + c_state[0] = 0.371, h_state[0] = 0.048 +Time Step 1: + i_gate[0] = 0.453, f_gate[0] = 0.653, o_gate[0] = 0.109, c_hat[0] = 0.777 + c_state[0] = 0.595, h_state[0] = 0.058 +Time Step 2: + i_gate[0] = 0.412, f_gate[0] = 0.627, o_gate[0] = 0.096, c_hat[0] = 0.806 + c_state[0] = 0.705, h_state[0] = 0.059 +Time Step 3: + i_gate[0] = 0.377, f_gate[0] = 0.600, o_gate[0] = 0.090, c_hat[0] = 0.866 + c_state[0] = 0.750, h_state[0] = 0.057 +Time Step 4: + i_gate[0] = 0.350, f_gate[0] = 0.588, o_gate[0] = 0.083, c_hat[0] = 0.872 + c_state[0] = 0.746, h_state[0] = 0.052 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.073 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.090 +Backward Time Step 2: + Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.007 + Gradient do_[0] = 0.252 +Backward Time Step 1: + Gradient di[0] = 0.015, df[0] = 0.012, dc_hat[0] = 0.014 + Gradient do_[0] = 0.333 +Backward Time Step 0: + Gradient di[0] = 0.025, df[0] = 0.021, dc_hat[0] = 0.036 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.686, o_gate[0] = 0.135, c_hat[0] = 0.710 + c_state[0] = 0.372, h_state[0] = 0.048 +Time Step 1: + i_gate[0] = 0.453, f_gate[0] = 0.653, o_gate[0] = 0.108, c_hat[0] = 0.777 + c_state[0] = 0.595, h_state[0] = 0.058 +Time Step 2: + i_gate[0] = 0.412, f_gate[0] = 0.627, o_gate[0] = 0.096, c_hat[0] = 0.806 + c_state[0] = 0.704, h_state[0] = 0.058 +Time Step 3: + i_gate[0] = 0.377, f_gate[0] = 0.600, o_gate[0] = 0.089, c_hat[0] = 0.866 + c_state[0] = 0.749, h_state[0] = 0.056 +Time Step 4: + i_gate[0] = 0.350, f_gate[0] = 0.587, o_gate[0] = 0.082, c_hat[0] = 0.872 + c_state[0] = 0.745, h_state[0] = 0.052 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.072 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.089 +Backward Time Step 2: + Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.006 + Gradient do_[0] = 0.251 +Backward Time Step 1: + Gradient di[0] = 0.015, df[0] = 0.011, dc_hat[0] = 0.014 + Gradient do_[0] = 0.332 +Backward Time Step 0: + Gradient di[0] = 0.024, df[0] = 0.020, dc_hat[0] = 0.036 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.134, c_hat[0] = 0.710 + c_state[0] = 0.372, h_state[0] = 0.048 +Time Step 1: + i_gate[0] = 0.453, f_gate[0] = 0.653, o_gate[0] = 0.107, c_hat[0] = 0.778 + c_state[0] = 0.595, h_state[0] = 0.057 +Time Step 2: + i_gate[0] = 0.411, f_gate[0] = 0.627, o_gate[0] = 0.095, c_hat[0] = 0.806 + c_state[0] = 0.704, h_state[0] = 0.057 +Time Step 3: + i_gate[0] = 0.376, f_gate[0] = 0.600, o_gate[0] = 0.088, c_hat[0] = 0.867 + c_state[0] = 0.749, h_state[0] = 0.056 +Time Step 4: + i_gate[0] = 0.349, f_gate[0] = 0.587, o_gate[0] = 0.081, c_hat[0] = 0.873 + c_state[0] = 0.744, h_state[0] = 0.051 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.071 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.089 +Backward Time Step 2: + Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.006 + Gradient do_[0] = 0.250 +Backward Time Step 1: + Gradient di[0] = 0.015, df[0] = 0.011, dc_hat[0] = 0.014 + Gradient do_[0] = 0.331 +Backward Time Step 0: + Gradient di[0] = 0.024, df[0] = 0.020, dc_hat[0] = 0.035 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.133, c_hat[0] = 0.710 + c_state[0] = 0.372, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.453, f_gate[0] = 0.653, o_gate[0] = 0.107, c_hat[0] = 0.778 + c_state[0] = 0.595, h_state[0] = 0.057 +Time Step 2: + i_gate[0] = 0.411, f_gate[0] = 0.626, o_gate[0] = 0.094, c_hat[0] = 0.807 + c_state[0] = 0.704, h_state[0] = 0.057 +Time Step 3: + i_gate[0] = 0.376, f_gate[0] = 0.599, o_gate[0] = 0.087, c_hat[0] = 0.867 + c_state[0] = 0.748, h_state[0] = 0.055 +Time Step 4: + i_gate[0] = 0.349, f_gate[0] = 0.586, o_gate[0] = 0.080, c_hat[0] = 0.873 + c_state[0] = 0.743, h_state[0] = 0.051 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.071 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.001 + Gradient do_[0] = 0.088 +Backward Time Step 2: + Gradient di[0] = 0.009, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.249 +Backward Time Step 1: + Gradient di[0] = 0.015, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.331 +Backward Time Step 0: + Gradient di[0] = 0.024, df[0] = 0.020, dc_hat[0] = 0.035 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.132, c_hat[0] = 0.711 + c_state[0] = 0.372, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.452, f_gate[0] = 0.652, o_gate[0] = 0.106, c_hat[0] = 0.778 + c_state[0] = 0.595, h_state[0] = 0.056 +Time Step 2: + i_gate[0] = 0.411, f_gate[0] = 0.626, o_gate[0] = 0.093, c_hat[0] = 0.807 + c_state[0] = 0.704, h_state[0] = 0.056 +Time Step 3: + i_gate[0] = 0.376, f_gate[0] = 0.599, o_gate[0] = 0.086, c_hat[0] = 0.867 + c_state[0] = 0.747, h_state[0] = 0.055 +Time Step 4: + i_gate[0] = 0.348, f_gate[0] = 0.586, o_gate[0] = 0.080, c_hat[0] = 0.873 + c_state[0] = 0.742, h_state[0] = 0.050 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.070 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.087 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.249 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.330 +Backward Time Step 0: + Gradient di[0] = 0.024, df[0] = 0.020, dc_hat[0] = 0.035 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.131, c_hat[0] = 0.711 + c_state[0] = 0.372, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.452, f_gate[0] = 0.652, o_gate[0] = 0.105, c_hat[0] = 0.778 + c_state[0] = 0.595, h_state[0] = 0.056 +Time Step 2: + i_gate[0] = 0.410, f_gate[0] = 0.626, o_gate[0] = 0.092, c_hat[0] = 0.807 + c_state[0] = 0.704, h_state[0] = 0.056 +Time Step 3: + i_gate[0] = 0.375, f_gate[0] = 0.599, o_gate[0] = 0.086, c_hat[0] = 0.867 + c_state[0] = 0.747, h_state[0] = 0.054 +Time Step 4: + i_gate[0] = 0.348, f_gate[0] = 0.585, o_gate[0] = 0.079, c_hat[0] = 0.874 + c_state[0] = 0.741, h_state[0] = 0.050 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.070 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.086 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.248 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.330 +Backward Time Step 0: + Gradient di[0] = 0.024, df[0] = 0.020, dc_hat[0] = 0.035 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.131, c_hat[0] = 0.711 + c_state[0] = 0.372, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.452, f_gate[0] = 0.652, o_gate[0] = 0.104, c_hat[0] = 0.779 + c_state[0] = 0.595, h_state[0] = 0.056 +Time Step 2: + i_gate[0] = 0.410, f_gate[0] = 0.626, o_gate[0] = 0.092, c_hat[0] = 0.808 + c_state[0] = 0.703, h_state[0] = 0.056 +Time Step 3: + i_gate[0] = 0.375, f_gate[0] = 0.598, o_gate[0] = 0.085, c_hat[0] = 0.868 + c_state[0] = 0.746, h_state[0] = 0.054 +Time Step 4: + i_gate[0] = 0.347, f_gate[0] = 0.585, o_gate[0] = 0.078, c_hat[0] = 0.874 + c_state[0] = 0.740, h_state[0] = 0.049 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.069 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.085 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.247 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.329 +Backward Time Step 0: + Gradient di[0] = 0.024, df[0] = 0.020, dc_hat[0] = 0.034 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.130, c_hat[0] = 0.712 + c_state[0] = 0.372, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.452, f_gate[0] = 0.652, o_gate[0] = 0.103, c_hat[0] = 0.779 + c_state[0] = 0.595, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.410, f_gate[0] = 0.626, o_gate[0] = 0.091, c_hat[0] = 0.808 + c_state[0] = 0.703, h_state[0] = 0.055 +Time Step 3: + i_gate[0] = 0.374, f_gate[0] = 0.598, o_gate[0] = 0.084, c_hat[0] = 0.868 + c_state[0] = 0.745, h_state[0] = 0.053 +Time Step 4: + i_gate[0] = 0.347, f_gate[0] = 0.584, o_gate[0] = 0.077, c_hat[0] = 0.874 + c_state[0] = 0.739, h_state[0] = 0.049 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.069 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.085 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.246 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.328 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.034 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.129, c_hat[0] = 0.712 + c_state[0] = 0.373, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.452, f_gate[0] = 0.652, o_gate[0] = 0.103, c_hat[0] = 0.779 + c_state[0] = 0.595, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.409, f_gate[0] = 0.625, o_gate[0] = 0.090, c_hat[0] = 0.808 + c_state[0] = 0.703, h_state[0] = 0.055 +Time Step 3: + i_gate[0] = 0.374, f_gate[0] = 0.598, o_gate[0] = 0.084, c_hat[0] = 0.868 + c_state[0] = 0.745, h_state[0] = 0.053 +Time Step 4: + i_gate[0] = 0.346, f_gate[0] = 0.584, o_gate[0] = 0.077, c_hat[0] = 0.874 + c_state[0] = 0.738, h_state[0] = 0.048 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.068 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.084 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.246 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.328 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.034 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.128, c_hat[0] = 0.712 + c_state[0] = 0.373, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.451, f_gate[0] = 0.652, o_gate[0] = 0.102, c_hat[0] = 0.779 + c_state[0] = 0.595, h_state[0] = 0.054 +Time Step 2: + i_gate[0] = 0.409, f_gate[0] = 0.625, o_gate[0] = 0.090, c_hat[0] = 0.808 + c_state[0] = 0.703, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.374, f_gate[0] = 0.597, o_gate[0] = 0.083, c_hat[0] = 0.868 + c_state[0] = 0.744, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.346, f_gate[0] = 0.584, o_gate[0] = 0.076, c_hat[0] = 0.875 + c_state[0] = 0.737, h_state[0] = 0.048 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.068 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.083 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.245 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.327 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.034 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.128, c_hat[0] = 0.713 + c_state[0] = 0.373, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.451, f_gate[0] = 0.652, o_gate[0] = 0.101, c_hat[0] = 0.779 + c_state[0] = 0.595, h_state[0] = 0.054 +Time Step 2: + i_gate[0] = 0.409, f_gate[0] = 0.625, o_gate[0] = 0.089, c_hat[0] = 0.809 + c_state[0] = 0.702, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.373, f_gate[0] = 0.597, o_gate[0] = 0.082, c_hat[0] = 0.869 + c_state[0] = 0.743, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.345, f_gate[0] = 0.583, o_gate[0] = 0.076, c_hat[0] = 0.875 + c_state[0] = 0.736, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.067 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.083 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.244 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.327 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.127, c_hat[0] = 0.713 + c_state[0] = 0.373, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.451, f_gate[0] = 0.652, o_gate[0] = 0.101, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.054 +Time Step 2: + i_gate[0] = 0.408, f_gate[0] = 0.625, o_gate[0] = 0.088, c_hat[0] = 0.809 + c_state[0] = 0.702, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.373, f_gate[0] = 0.597, o_gate[0] = 0.082, c_hat[0] = 0.869 + c_state[0] = 0.743, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.345, f_gate[0] = 0.583, o_gate[0] = 0.075, c_hat[0] = 0.875 + c_state[0] = 0.735, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.067 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.082 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.244 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.326 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.126, c_hat[0] = 0.713 + c_state[0] = 0.373, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.451, f_gate[0] = 0.652, o_gate[0] = 0.100, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.408, f_gate[0] = 0.625, o_gate[0] = 0.088, c_hat[0] = 0.809 + c_state[0] = 0.702, h_state[0] = 0.053 +Time Step 3: + i_gate[0] = 0.372, f_gate[0] = 0.597, o_gate[0] = 0.081, c_hat[0] = 0.869 + c_state[0] = 0.742, h_state[0] = 0.051 +Time Step 4: + i_gate[0] = 0.344, f_gate[0] = 0.582, o_gate[0] = 0.075, c_hat[0] = 0.876 + c_state[0] = 0.734, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.067 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.081 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.243 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.326 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.126, c_hat[0] = 0.713 + c_state[0] = 0.373, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.451, f_gate[0] = 0.652, o_gate[0] = 0.100, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.408, f_gate[0] = 0.625, o_gate[0] = 0.087, c_hat[0] = 0.809 + c_state[0] = 0.702, h_state[0] = 0.053 +Time Step 3: + i_gate[0] = 0.372, f_gate[0] = 0.596, o_gate[0] = 0.081, c_hat[0] = 0.869 + c_state[0] = 0.742, h_state[0] = 0.051 +Time Step 4: + i_gate[0] = 0.344, f_gate[0] = 0.582, o_gate[0] = 0.074, c_hat[0] = 0.876 + c_state[0] = 0.733, h_state[0] = 0.046 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.066 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.081 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.242 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.326 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.125, c_hat[0] = 0.714 + c_state[0] = 0.373, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.450, f_gate[0] = 0.652, o_gate[0] = 0.099, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.407, f_gate[0] = 0.625, o_gate[0] = 0.087, c_hat[0] = 0.809 + c_state[0] = 0.701, h_state[0] = 0.053 +Time Step 3: + i_gate[0] = 0.371, f_gate[0] = 0.596, o_gate[0] = 0.081, c_hat[0] = 0.870 + c_state[0] = 0.741, h_state[0] = 0.051 +Time Step 4: + i_gate[0] = 0.343, f_gate[0] = 0.581, o_gate[0] = 0.074, c_hat[0] = 0.876 + c_state[0] = 0.732, h_state[0] = 0.046 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.066 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.080 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.242 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.325 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.125, c_hat[0] = 0.714 + c_state[0] = 0.373, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.450, f_gate[0] = 0.652, o_gate[0] = 0.099, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.407, f_gate[0] = 0.624, o_gate[0] = 0.087, c_hat[0] = 0.810 + c_state[0] = 0.701, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.371, f_gate[0] = 0.596, o_gate[0] = 0.080, c_hat[0] = 0.870 + c_state[0] = 0.740, h_state[0] = 0.051 +Time Step 4: + i_gate[0] = 0.343, f_gate[0] = 0.581, o_gate[0] = 0.074, c_hat[0] = 0.876 + c_state[0] = 0.730, h_state[0] = 0.046 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.066 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.080 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.241 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.325 +Backward Time Step 0: + Gradient di[0] = 0.022, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.124, c_hat[0] = 0.714 + c_state[0] = 0.374, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.450, f_gate[0] = 0.652, o_gate[0] = 0.098, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.407, f_gate[0] = 0.624, o_gate[0] = 0.086, c_hat[0] = 0.810 + c_state[0] = 0.701, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.370, f_gate[0] = 0.596, o_gate[0] = 0.080, c_hat[0] = 0.870 + c_state[0] = 0.740, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.342, f_gate[0] = 0.581, o_gate[0] = 0.073, c_hat[0] = 0.876 + c_state[0] = 0.729, h_state[0] = 0.046 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.066 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.079 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.241 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.325 +Backward Time Step 0: + Gradient di[0] = 0.022, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.124, c_hat[0] = 0.714 + c_state[0] = 0.374, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.450, f_gate[0] = 0.652, o_gate[0] = 0.098, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.407, f_gate[0] = 0.624, o_gate[0] = 0.086, c_hat[0] = 0.810 + c_state[0] = 0.701, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.370, f_gate[0] = 0.595, o_gate[0] = 0.080, c_hat[0] = 0.870 + c_state[0] = 0.739, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.342, f_gate[0] = 0.580, o_gate[0] = 0.073, c_hat[0] = 0.877 + c_state[0] = 0.728, h_state[0] = 0.045 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.065 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.079 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.241 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.325 +Backward Time Step 0: + Gradient di[0] = 0.022, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.124, c_hat[0] = 0.715 + c_state[0] = 0.374, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.450, f_gate[0] = 0.652, o_gate[0] = 0.098, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.406, f_gate[0] = 0.624, o_gate[0] = 0.086, c_hat[0] = 0.810 + c_state[0] = 0.700, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.370, f_gate[0] = 0.595, o_gate[0] = 0.080, c_hat[0] = 0.870 + c_state[0] = 0.738, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.341, f_gate[0] = 0.580, o_gate[0] = 0.073, c_hat[0] = 0.877 + c_state[0] = 0.727, h_state[0] = 0.045 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.065 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.079 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.240 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.324 +Backward Time Step 0: + Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.032 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.123, c_hat[0] = 0.715 + c_state[0] = 0.374, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.450, f_gate[0] = 0.652, o_gate[0] = 0.098, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.406, f_gate[0] = 0.624, o_gate[0] = 0.086, c_hat[0] = 0.810 + c_state[0] = 0.700, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.369, f_gate[0] = 0.595, o_gate[0] = 0.079, c_hat[0] = 0.870 + c_state[0] = 0.738, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.340, f_gate[0] = 0.580, o_gate[0] = 0.073, c_hat[0] = 0.877 + c_state[0] = 0.726, h_state[0] = 0.045 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.065 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.078 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.240 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.324 +Backward Time Step 0: + Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.032 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.686, o_gate[0] = 0.123, c_hat[0] = 0.715 + c_state[0] = 0.374, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.449, f_gate[0] = 0.652, o_gate[0] = 0.098, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.406, f_gate[0] = 0.624, o_gate[0] = 0.086, c_hat[0] = 0.810 + c_state[0] = 0.700, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.369, f_gate[0] = 0.595, o_gate[0] = 0.079, c_hat[0] = 0.870 + c_state[0] = 0.737, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.340, f_gate[0] = 0.580, o_gate[0] = 0.073, c_hat[0] = 0.877 + c_state[0] = 0.725, h_state[0] = 0.045 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.065 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.078 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005 + Gradient do_[0] = 0.240 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.324 +Backward Time Step 0: + Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.032 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.123, c_hat[0] = 0.716 + c_state[0] = 0.374, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.449, f_gate[0] = 0.653, o_gate[0] = 0.098, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.405, f_gate[0] = 0.625, o_gate[0] = 0.086, c_hat[0] = 0.810 + c_state[0] = 0.700, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.368, f_gate[0] = 0.595, o_gate[0] = 0.079, c_hat[0] = 0.870 + c_state[0] = 0.737, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.339, f_gate[0] = 0.579, o_gate[0] = 0.073, c_hat[0] = 0.877 + c_state[0] = 0.724, h_state[0] = 0.045 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.065 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.078 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005 + Gradient do_[0] = 0.240 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.325 +Backward Time Step 0: + Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.032 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.123, c_hat[0] = 0.716 + c_state[0] = 0.374, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.449, f_gate[0] = 0.653, o_gate[0] = 0.098, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.405, f_gate[0] = 0.625, o_gate[0] = 0.086, c_hat[0] = 0.810 + c_state[0] = 0.699, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.368, f_gate[0] = 0.595, o_gate[0] = 0.080, c_hat[0] = 0.870 + c_state[0] = 0.736, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.338, f_gate[0] = 0.579, o_gate[0] = 0.073, c_hat[0] = 0.877 + c_state[0] = 0.723, h_state[0] = 0.045 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.065 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.078 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005 + Gradient do_[0] = 0.240 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.325 +Backward Time Step 0: + Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.032 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.124, c_hat[0] = 0.716 + c_state[0] = 0.374, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.449, f_gate[0] = 0.653, o_gate[0] = 0.098, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.405, f_gate[0] = 0.625, o_gate[0] = 0.086, c_hat[0] = 0.810 + c_state[0] = 0.699, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.367, f_gate[0] = 0.595, o_gate[0] = 0.080, c_hat[0] = 0.870 + c_state[0] = 0.735, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.337, f_gate[0] = 0.579, o_gate[0] = 0.073, c_hat[0] = 0.877 + c_state[0] = 0.722, h_state[0] = 0.045 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.064 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.078 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.240 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.325 +Backward Time Step 0: + Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.032 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.124, c_hat[0] = 0.716 + c_state[0] = 0.375, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.449, f_gate[0] = 0.653, o_gate[0] = 0.098, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.404, f_gate[0] = 0.625, o_gate[0] = 0.086, c_hat[0] = 0.809 + c_state[0] = 0.699, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.366, f_gate[0] = 0.595, o_gate[0] = 0.080, c_hat[0] = 0.870 + c_state[0] = 0.735, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.337, f_gate[0] = 0.579, o_gate[0] = 0.073, c_hat[0] = 0.877 + c_state[0] = 0.721, h_state[0] = 0.045 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.064 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.077 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.240 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.325 +Backward Time Step 0: + Gradient di[0] = 0.022, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.124, c_hat[0] = 0.717 + c_state[0] = 0.375, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.449, f_gate[0] = 0.654, o_gate[0] = 0.098, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.404, f_gate[0] = 0.625, o_gate[0] = 0.086, c_hat[0] = 0.809 + c_state[0] = 0.699, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.366, f_gate[0] = 0.595, o_gate[0] = 0.080, c_hat[0] = 0.870 + c_state[0] = 0.734, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.336, f_gate[0] = 0.579, o_gate[0] = 0.073, c_hat[0] = 0.877 + c_state[0] = 0.720, h_state[0] = 0.045 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.064 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.077 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.240 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.325 +Backward Time Step 0: + Gradient di[0] = 0.022, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.124, c_hat[0] = 0.717 + c_state[0] = 0.375, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.449, f_gate[0] = 0.654, o_gate[0] = 0.099, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.403, f_gate[0] = 0.626, o_gate[0] = 0.087, c_hat[0] = 0.809 + c_state[0] = 0.699, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.365, f_gate[0] = 0.596, o_gate[0] = 0.081, c_hat[0] = 0.870 + c_state[0] = 0.734, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.335, f_gate[0] = 0.579, o_gate[0] = 0.074, c_hat[0] = 0.877 + c_state[0] = 0.718, h_state[0] = 0.045 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.064 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.077 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.240 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.326 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.125, c_hat[0] = 0.717 + c_state[0] = 0.375, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.448, f_gate[0] = 0.654, o_gate[0] = 0.099, c_hat[0] = 0.780 + c_state[0] = 0.595, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.403, f_gate[0] = 0.626, o_gate[0] = 0.087, c_hat[0] = 0.809 + c_state[0] = 0.698, h_state[0] = 0.053 +Time Step 3: + i_gate[0] = 0.364, f_gate[0] = 0.596, o_gate[0] = 0.081, c_hat[0] = 0.870 + c_state[0] = 0.733, h_state[0] = 0.051 +Time Step 4: + i_gate[0] = 0.334, f_gate[0] = 0.579, o_gate[0] = 0.074, c_hat[0] = 0.877 + c_state[0] = 0.717, h_state[0] = 0.046 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.063 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.077 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.239 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.326 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.125, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.448, f_gate[0] = 0.654, o_gate[0] = 0.100, c_hat[0] = 0.779 + c_state[0] = 0.595, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.403, f_gate[0] = 0.627, o_gate[0] = 0.088, c_hat[0] = 0.808 + c_state[0] = 0.698, h_state[0] = 0.053 +Time Step 3: + i_gate[0] = 0.363, f_gate[0] = 0.596, o_gate[0] = 0.081, c_hat[0] = 0.869 + c_state[0] = 0.732, h_state[0] = 0.051 +Time Step 4: + i_gate[0] = 0.333, f_gate[0] = 0.579, o_gate[0] = 0.075, c_hat[0] = 0.877 + c_state[0] = 0.715, h_state[0] = 0.046 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.063 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.077 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.239 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.326 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.126, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.448, f_gate[0] = 0.655, o_gate[0] = 0.100, c_hat[0] = 0.779 + c_state[0] = 0.595, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.402, f_gate[0] = 0.627, o_gate[0] = 0.088, c_hat[0] = 0.808 + c_state[0] = 0.698, h_state[0] = 0.053 +Time Step 3: + i_gate[0] = 0.363, f_gate[0] = 0.596, o_gate[0] = 0.082, c_hat[0] = 0.869 + c_state[0] = 0.731, h_state[0] = 0.051 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.579, o_gate[0] = 0.075, c_hat[0] = 0.876 + c_state[0] = 0.714, h_state[0] = 0.046 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.063 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.076 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.239 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.326 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.126, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.448, f_gate[0] = 0.655, o_gate[0] = 0.101, c_hat[0] = 0.779 + c_state[0] = 0.595, h_state[0] = 0.054 +Time Step 2: + i_gate[0] = 0.402, f_gate[0] = 0.627, o_gate[0] = 0.089, c_hat[0] = 0.808 + c_state[0] = 0.698, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.362, f_gate[0] = 0.597, o_gate[0] = 0.083, c_hat[0] = 0.869 + c_state[0] = 0.730, h_state[0] = 0.051 +Time Step 4: + i_gate[0] = 0.330, f_gate[0] = 0.579, o_gate[0] = 0.076, c_hat[0] = 0.876 + c_state[0] = 0.712, h_state[0] = 0.046 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.062 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.076 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.239 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.327 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.127, c_hat[0] = 0.718 + c_state[0] = 0.376, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.448, f_gate[0] = 0.656, o_gate[0] = 0.101, c_hat[0] = 0.779 + c_state[0] = 0.595, h_state[0] = 0.054 +Time Step 2: + i_gate[0] = 0.401, f_gate[0] = 0.628, o_gate[0] = 0.089, c_hat[0] = 0.807 + c_state[0] = 0.697, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.361, f_gate[0] = 0.597, o_gate[0] = 0.083, c_hat[0] = 0.869 + c_state[0] = 0.729, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.328, f_gate[0] = 0.579, o_gate[0] = 0.076, c_hat[0] = 0.876 + c_state[0] = 0.710, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.061 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.075 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.239 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.327 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.127, c_hat[0] = 0.719 + c_state[0] = 0.376, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.447, f_gate[0] = 0.656, o_gate[0] = 0.102, c_hat[0] = 0.778 + c_state[0] = 0.595, h_state[0] = 0.054 +Time Step 2: + i_gate[0] = 0.400, f_gate[0] = 0.628, o_gate[0] = 0.090, c_hat[0] = 0.807 + c_state[0] = 0.697, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.359, f_gate[0] = 0.597, o_gate[0] = 0.084, c_hat[0] = 0.869 + c_state[0] = 0.728, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.327, f_gate[0] = 0.579, o_gate[0] = 0.077, c_hat[0] = 0.876 + c_state[0] = 0.708, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.060 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.075 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.238 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.327 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.128, c_hat[0] = 0.719 + c_state[0] = 0.376, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.447, f_gate[0] = 0.656, o_gate[0] = 0.102, c_hat[0] = 0.778 + c_state[0] = 0.595, h_state[0] = 0.054 +Time Step 2: + i_gate[0] = 0.400, f_gate[0] = 0.629, o_gate[0] = 0.090, c_hat[0] = 0.807 + c_state[0] = 0.696, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.358, f_gate[0] = 0.597, o_gate[0] = 0.084, c_hat[0] = 0.868 + c_state[0] = 0.727, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.325, f_gate[0] = 0.579, o_gate[0] = 0.077, c_hat[0] = 0.876 + c_state[0] = 0.706, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.059 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.074 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.237 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.327 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033 + Gradient do_[0] = 0.263 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.128, c_hat[0] = 0.719 + c_state[0] = 0.376, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.447, f_gate[0] = 0.657, o_gate[0] = 0.102, c_hat[0] = 0.778 + c_state[0] = 0.594, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.399, f_gate[0] = 0.629, o_gate[0] = 0.090, c_hat[0] = 0.806 + c_state[0] = 0.696, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.357, f_gate[0] = 0.598, o_gate[0] = 0.084, c_hat[0] = 0.868 + c_state[0] = 0.726, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.324, f_gate[0] = 0.579, o_gate[0] = 0.078, c_hat[0] = 0.876 + c_state[0] = 0.704, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.058 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.072 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.236 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.326 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.128, c_hat[0] = 0.719 + c_state[0] = 0.376, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.447, f_gate[0] = 0.657, o_gate[0] = 0.102, c_hat[0] = 0.778 + c_state[0] = 0.594, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.398, f_gate[0] = 0.629, o_gate[0] = 0.091, c_hat[0] = 0.806 + c_state[0] = 0.695, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.356, f_gate[0] = 0.598, o_gate[0] = 0.085, c_hat[0] = 0.868 + c_state[0] = 0.724, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.322, f_gate[0] = 0.579, o_gate[0] = 0.078, c_hat[0] = 0.876 + c_state[0] = 0.702, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.056 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.070 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.234 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.325 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033 + Gradient do_[0] = 0.262 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.128, c_hat[0] = 0.719 + c_state[0] = 0.376, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.446, f_gate[0] = 0.657, o_gate[0] = 0.103, c_hat[0] = 0.777 + c_state[0] = 0.594, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.398, f_gate[0] = 0.629, o_gate[0] = 0.091, c_hat[0] = 0.806 + c_state[0] = 0.694, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.355, f_gate[0] = 0.597, o_gate[0] = 0.085, c_hat[0] = 0.868 + c_state[0] = 0.723, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.321, f_gate[0] = 0.579, o_gate[0] = 0.078, c_hat[0] = 0.876 + c_state[0] = 0.699, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.054 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.068 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.232 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.324 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033 + Gradient do_[0] = 0.261 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.128, c_hat[0] = 0.719 + c_state[0] = 0.376, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.446, f_gate[0] = 0.657, o_gate[0] = 0.103, c_hat[0] = 0.777 + c_state[0] = 0.594, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.397, f_gate[0] = 0.629, o_gate[0] = 0.091, c_hat[0] = 0.806 + c_state[0] = 0.693, h_state[0] = 0.055 +Time Step 3: + i_gate[0] = 0.354, f_gate[0] = 0.597, o_gate[0] = 0.085, c_hat[0] = 0.868 + c_state[0] = 0.721, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.319, f_gate[0] = 0.578, o_gate[0] = 0.078, c_hat[0] = 0.875 + c_state[0] = 0.697, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.051 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.066 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.230 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.322 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.033 + Gradient do_[0] = 0.260 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.128, c_hat[0] = 0.719 + c_state[0] = 0.376, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.446, f_gate[0] = 0.657, o_gate[0] = 0.103, c_hat[0] = 0.777 + c_state[0] = 0.593, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.396, f_gate[0] = 0.629, o_gate[0] = 0.091, c_hat[0] = 0.805 + c_state[0] = 0.693, h_state[0] = 0.055 +Time Step 3: + i_gate[0] = 0.353, f_gate[0] = 0.597, o_gate[0] = 0.085, c_hat[0] = 0.868 + c_state[0] = 0.720, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.318, f_gate[0] = 0.578, o_gate[0] = 0.078, c_hat[0] = 0.875 + c_state[0] = 0.694, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.049 +Backward Time Step 3: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.063 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.228 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.321 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.260 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.129, c_hat[0] = 0.720 + c_state[0] = 0.376, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.657, o_gate[0] = 0.103, c_hat[0] = 0.777 + c_state[0] = 0.593, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.396, f_gate[0] = 0.629, o_gate[0] = 0.091, c_hat[0] = 0.805 + c_state[0] = 0.692, h_state[0] = 0.055 +Time Step 3: + i_gate[0] = 0.352, f_gate[0] = 0.597, o_gate[0] = 0.085, c_hat[0] = 0.867 + c_state[0] = 0.718, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.316, f_gate[0] = 0.578, o_gate[0] = 0.079, c_hat[0] = 0.875 + c_state[0] = 0.692, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.000 + Gradient do_[0] = 0.045 +Backward Time Step 3: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.059 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.225 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.013 + Gradient do_[0] = 0.319 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.259 +Epoch 100, Train Loss=0.009895, Weight Norm=12.102183 +Sample Predictions at Epoch 100: +------------------------------------------------------------- +| Day | Date | Predicted Close | Actual Close | Error | +------------------------------------------------------------- +| 192 | 2024-10-11 | 65.10 | 63.87 | 1.23 | +| 193 | 2024-10-14 | 65.13 | 66.55 | 1.42 | +| 194 | 2024-10-15 | 65.40 | 66.00 | 0.60 | +| 195 | 2024-10-16 | 65.47 | 67.20 | 1.73 | +| 196 | 2024-10-17 | 65.64 | 66.76 | 1.12 | +------------------------------------------------------------- +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.129, c_hat[0] = 0.720 + c_state[0] = 0.376, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.658, o_gate[0] = 0.103, c_hat[0] = 0.776 + c_state[0] = 0.593, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.395, f_gate[0] = 0.630, o_gate[0] = 0.091, c_hat[0] = 0.805 + c_state[0] = 0.691, h_state[0] = 0.055 +Time Step 3: + i_gate[0] = 0.350, f_gate[0] = 0.597, o_gate[0] = 0.085, c_hat[0] = 0.867 + c_state[0] = 0.716, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.315, f_gate[0] = 0.577, o_gate[0] = 0.079, c_hat[0] = 0.875 + c_state[0] = 0.689, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.000 + Gradient do_[0] = 0.039 +Backward Time Step 3: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.054 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.221 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.010, dc_hat[0] = 0.013 + Gradient do_[0] = 0.318 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.258 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.129, c_hat[0] = 0.720 + c_state[0] = 0.376, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.658, o_gate[0] = 0.103, c_hat[0] = 0.776 + c_state[0] = 0.592, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.394, f_gate[0] = 0.630, o_gate[0] = 0.092, c_hat[0] = 0.804 + c_state[0] = 0.690, h_state[0] = 0.055 +Time Step 3: + i_gate[0] = 0.349, f_gate[0] = 0.597, o_gate[0] = 0.086, c_hat[0] = 0.867 + c_state[0] = 0.715, h_state[0] = 0.053 +Time Step 4: + i_gate[0] = 0.313, f_gate[0] = 0.577, o_gate[0] = 0.079, c_hat[0] = 0.875 + c_state[0] = 0.687, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.000 + Gradient do_[0] = 0.029 +Backward Time Step 3: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.046 +Backward Time Step 2: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.006 + Gradient do_[0] = 0.215 +Backward Time Step 1: + Gradient di[0] = 0.014, df[0] = 0.010, dc_hat[0] = 0.013 + Gradient do_[0] = 0.315 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.257 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.129, c_hat[0] = 0.720 + c_state[0] = 0.376, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.444, f_gate[0] = 0.658, o_gate[0] = 0.104, c_hat[0] = 0.775 + c_state[0] = 0.592, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.393, f_gate[0] = 0.630, o_gate[0] = 0.092, c_hat[0] = 0.804 + c_state[0] = 0.689, h_state[0] = 0.055 +Time Step 3: + i_gate[0] = 0.348, f_gate[0] = 0.597, o_gate[0] = 0.086, c_hat[0] = 0.866 + c_state[0] = 0.713, h_state[0] = 0.053 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.577, o_gate[0] = 0.080, c_hat[0] = 0.874 + c_state[0] = 0.684, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.000 + Gradient do_[0] = 0.011 +Backward Time Step 3: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.000 + Gradient do_[0] = 0.032 +Backward Time Step 2: + Gradient di[0] = 0.007, df[0] = 0.006, dc_hat[0] = 0.005 + Gradient do_[0] = 0.205 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.312 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.256 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.130, c_hat[0] = 0.720 + c_state[0] = 0.376, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.444, f_gate[0] = 0.658, o_gate[0] = 0.104, c_hat[0] = 0.775 + c_state[0] = 0.592, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.393, f_gate[0] = 0.631, o_gate[0] = 0.093, c_hat[0] = 0.803 + c_state[0] = 0.688, h_state[0] = 0.055 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.598, o_gate[0] = 0.087, c_hat[0] = 0.866 + c_state[0] = 0.711, h_state[0] = 0.053 +Time Step 4: + i_gate[0] = 0.310, f_gate[0] = 0.578, o_gate[0] = 0.080, c_hat[0] = 0.874 + c_state[0] = 0.682, h_state[0] = 0.048 +Backward Time Step 4: + Gradient di[0] = 0.000, df[0] = 0.000, dc_hat[0] = 0.000 + Gradient do_[0] = -0.020 +Backward Time Step 3: + Gradient di[0] = 0.001, df[0] = 0.000, dc_hat[0] = 0.000 + Gradient do_[0] = 0.004 +Backward Time Step 2: + Gradient di[0] = 0.007, df[0] = 0.005, dc_hat[0] = 0.005 + Gradient do_[0] = 0.185 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.305 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.254 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.131, c_hat[0] = 0.720 + c_state[0] = 0.376, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.444, f_gate[0] = 0.659, o_gate[0] = 0.105, c_hat[0] = 0.774 + c_state[0] = 0.591, h_state[0] = 0.056 +Time Step 2: + i_gate[0] = 0.392, f_gate[0] = 0.631, o_gate[0] = 0.093, c_hat[0] = 0.802 + c_state[0] = 0.687, h_state[0] = 0.056 +Time Step 3: + i_gate[0] = 0.345, f_gate[0] = 0.599, o_gate[0] = 0.088, c_hat[0] = 0.865 + c_state[0] = 0.710, h_state[0] = 0.053 +Time Step 4: + i_gate[0] = 0.308, f_gate[0] = 0.579, o_gate[0] = 0.081, c_hat[0] = 0.872 + c_state[0] = 0.680, h_state[0] = 0.048 +Backward Time Step 4: + Gradient di[0] = -0.001, df[0] = -0.001, dc_hat[0] = -0.000 + Gradient do_[0] = -0.076 +Backward Time Step 3: + Gradient di[0] = -0.000, df[0] = -0.000, dc_hat[0] = -0.000 + Gradient do_[0] = -0.047 +Backward Time Step 2: + Gradient di[0] = 0.006, df[0] = 0.005, dc_hat[0] = 0.005 + Gradient do_[0] = 0.146 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.290 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.250 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.132, c_hat[0] = 0.720 + c_state[0] = 0.376, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.443, f_gate[0] = 0.660, o_gate[0] = 0.106, c_hat[0] = 0.773 + c_state[0] = 0.591, h_state[0] = 0.056 +Time Step 2: + i_gate[0] = 0.391, f_gate[0] = 0.633, o_gate[0] = 0.095, c_hat[0] = 0.800 + c_state[0] = 0.686, h_state[0] = 0.056 +Time Step 3: + i_gate[0] = 0.344, f_gate[0] = 0.600, o_gate[0] = 0.089, c_hat[0] = 0.863 + c_state[0] = 0.709, h_state[0] = 0.054 +Time Step 4: + i_gate[0] = 0.306, f_gate[0] = 0.581, o_gate[0] = 0.083, c_hat[0] = 0.871 + c_state[0] = 0.678, h_state[0] = 0.049 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.001 + Gradient do_[0] = -0.160 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.001 + Gradient do_[0] = -0.135 +Backward Time Step 2: + Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.004 + Gradient do_[0] = 0.075 +Backward Time Step 1: + Gradient di[0] = 0.012, df[0] = 0.010, dc_hat[0] = 0.011 + Gradient do_[0] = 0.259 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.242 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.134, c_hat[0] = 0.720 + c_state[0] = 0.376, h_state[0] = 0.048 +Time Step 1: + i_gate[0] = 0.443, f_gate[0] = 0.660, o_gate[0] = 0.108, c_hat[0] = 0.771 + c_state[0] = 0.590, h_state[0] = 0.057 +Time Step 2: + i_gate[0] = 0.390, f_gate[0] = 0.634, o_gate[0] = 0.096, c_hat[0] = 0.798 + c_state[0] = 0.685, h_state[0] = 0.057 +Time Step 3: + i_gate[0] = 0.342, f_gate[0] = 0.603, o_gate[0] = 0.091, c_hat[0] = 0.861 + c_state[0] = 0.708, h_state[0] = 0.055 +Time Step 4: + i_gate[0] = 0.304, f_gate[0] = 0.585, o_gate[0] = 0.085, c_hat[0] = 0.868 + c_state[0] = 0.678, h_state[0] = 0.050 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002 + Gradient do_[0] = -0.248 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002 + Gradient do_[0] = -0.247 +Backward Time Step 2: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = -0.024 +Backward Time Step 1: + Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.010 + Gradient do_[0] = 0.207 +Backward Time Step 0: + Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.031 + Gradient do_[0] = 0.228 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.136, c_hat[0] = 0.720 + c_state[0] = 0.376, h_state[0] = 0.049 +Time Step 1: + i_gate[0] = 0.442, f_gate[0] = 0.661, o_gate[0] = 0.110, c_hat[0] = 0.770 + c_state[0] = 0.589, h_state[0] = 0.058 +Time Step 2: + i_gate[0] = 0.389, f_gate[0] = 0.636, o_gate[0] = 0.098, c_hat[0] = 0.795 + c_state[0] = 0.684, h_state[0] = 0.058 +Time Step 3: + i_gate[0] = 0.341, f_gate[0] = 0.606, o_gate[0] = 0.093, c_hat[0] = 0.859 + c_state[0] = 0.707, h_state[0] = 0.056 +Time Step 4: + i_gate[0] = 0.302, f_gate[0] = 0.589, o_gate[0] = 0.087, c_hat[0] = 0.865 + c_state[0] = 0.678, h_state[0] = 0.051 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002 + Gradient do_[0] = -0.264 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.002 + Gradient do_[0] = -0.300 +Backward Time Step 2: + Gradient di[0] = 0.002, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = -0.088 +Backward Time Step 1: + Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.009 + Gradient do_[0] = 0.160 +Backward Time Step 0: + Gradient di[0] = 0.021, df[0] = 0.018, dc_hat[0] = 0.030 + Gradient do_[0] = 0.213 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.138, c_hat[0] = 0.720 + c_state[0] = 0.376, h_state[0] = 0.050 +Time Step 1: + i_gate[0] = 0.442, f_gate[0] = 0.662, o_gate[0] = 0.111, c_hat[0] = 0.768 + c_state[0] = 0.589, h_state[0] = 0.059 +Time Step 2: + i_gate[0] = 0.388, f_gate[0] = 0.638, o_gate[0] = 0.100, c_hat[0] = 0.793 + c_state[0] = 0.683, h_state[0] = 0.059 +Time Step 3: + i_gate[0] = 0.339, f_gate[0] = 0.609, o_gate[0] = 0.095, c_hat[0] = 0.856 + c_state[0] = 0.706, h_state[0] = 0.057 +Time Step 4: + i_gate[0] = 0.300, f_gate[0] = 0.594, o_gate[0] = 0.089, c_hat[0] = 0.861 + c_state[0] = 0.678, h_state[0] = 0.053 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = -0.001, dc_hat[0] = -0.001 + Gradient do_[0] = -0.121 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.001 + Gradient do_[0] = -0.152 +Backward Time Step 2: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.003 + Gradient do_[0] = 0.027 +Backward Time Step 1: + Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.011 + Gradient do_[0] = 0.208 +Backward Time Step 0: + Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.031 + Gradient do_[0] = 0.224 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.138, c_hat[0] = 0.720 + c_state[0] = 0.376, h_state[0] = 0.050 +Time Step 1: + i_gate[0] = 0.442, f_gate[0] = 0.662, o_gate[0] = 0.111, c_hat[0] = 0.767 + c_state[0] = 0.588, h_state[0] = 0.059 +Time Step 2: + i_gate[0] = 0.387, f_gate[0] = 0.638, o_gate[0] = 0.100, c_hat[0] = 0.791 + c_state[0] = 0.682, h_state[0] = 0.059 +Time Step 3: + i_gate[0] = 0.338, f_gate[0] = 0.609, o_gate[0] = 0.095, c_hat[0] = 0.855 + c_state[0] = 0.704, h_state[0] = 0.058 +Time Step 4: + i_gate[0] = 0.300, f_gate[0] = 0.594, o_gate[0] = 0.090, c_hat[0] = 0.860 + c_state[0] = 0.676, h_state[0] = 0.053 +Backward Time Step 4: + Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.002 + Gradient do_[0] = 0.238 +Backward Time Step 3: + Gradient di[0] = 0.006, df[0] = 0.005, dc_hat[0] = 0.003 + Gradient do_[0] = 0.318 +Backward Time Step 2: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.010 + Gradient do_[0] = 0.458 +Backward Time Step 1: + Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.018 + Gradient do_[0] = 0.453 +Backward Time Step 0: + Gradient di[0] = 0.027, df[0] = 0.022, dc_hat[0] = 0.037 + Gradient do_[0] = 0.294 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.136, c_hat[0] = 0.719 + c_state[0] = 0.376, h_state[0] = 0.049 +Time Step 1: + i_gate[0] = 0.442, f_gate[0] = 0.661, o_gate[0] = 0.110, c_hat[0] = 0.768 + c_state[0] = 0.588, h_state[0] = 0.058 +Time Step 2: + i_gate[0] = 0.388, f_gate[0] = 0.635, o_gate[0] = 0.098, c_hat[0] = 0.793 + c_state[0] = 0.681, h_state[0] = 0.058 +Time Step 3: + i_gate[0] = 0.340, f_gate[0] = 0.603, o_gate[0] = 0.093, c_hat[0] = 0.857 + c_state[0] = 0.702, h_state[0] = 0.056 +Time Step 4: + i_gate[0] = 0.302, f_gate[0] = 0.585, o_gate[0] = 0.087, c_hat[0] = 0.864 + c_state[0] = 0.672, h_state[0] = 0.051 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002 + Gradient do_[0] = -0.241 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.002 + Gradient do_[0] = -0.264 +Backward Time Step 2: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = -0.053 +Backward Time Step 1: + Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.010 + Gradient do_[0] = 0.176 +Backward Time Step 0: + Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.030 + Gradient do_[0] = 0.216 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.142, c_hat[0] = 0.720 + c_state[0] = 0.377, h_state[0] = 0.051 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.662, o_gate[0] = 0.116, c_hat[0] = 0.771 + c_state[0] = 0.588, h_state[0] = 0.061 +Time Step 2: + i_gate[0] = 0.384, f_gate[0] = 0.638, o_gate[0] = 0.105, c_hat[0] = 0.797 + c_state[0] = 0.681, h_state[0] = 0.062 +Time Step 3: + i_gate[0] = 0.334, f_gate[0] = 0.608, o_gate[0] = 0.100, c_hat[0] = 0.860 + c_state[0] = 0.701, h_state[0] = 0.060 +Time Step 4: + i_gate[0] = 0.295, f_gate[0] = 0.593, o_gate[0] = 0.095, c_hat[0] = 0.865 + c_state[0] = 0.671, h_state[0] = 0.056 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002 + Gradient do_[0] = -0.242 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = -0.005, dc_hat[0] = -0.003 + Gradient do_[0] = -0.337 +Backward Time Step 2: + Gradient di[0] = 0.000, df[0] = 0.000, dc_hat[0] = 0.000 + Gradient do_[0] = -0.159 +Backward Time Step 1: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.008 + Gradient do_[0] = 0.090 +Backward Time Step 0: + Gradient di[0] = 0.021, df[0] = 0.017, dc_hat[0] = 0.029 + Gradient do_[0] = 0.187 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.144, c_hat[0] = 0.720 + c_state[0] = 0.377, h_state[0] = 0.052 +Time Step 1: + i_gate[0] = 0.439, f_gate[0] = 0.663, o_gate[0] = 0.117, c_hat[0] = 0.769 + c_state[0] = 0.587, h_state[0] = 0.062 +Time Step 2: + i_gate[0] = 0.382, f_gate[0] = 0.640, o_gate[0] = 0.106, c_hat[0] = 0.794 + c_state[0] = 0.679, h_state[0] = 0.063 +Time Step 3: + i_gate[0] = 0.330, f_gate[0] = 0.613, o_gate[0] = 0.102, c_hat[0] = 0.857 + c_state[0] = 0.699, h_state[0] = 0.062 +Time Step 4: + i_gate[0] = 0.292, f_gate[0] = 0.602, o_gate[0] = 0.098, c_hat[0] = 0.860 + c_state[0] = 0.672, h_state[0] = 0.058 +Backward Time Step 4: + Gradient di[0] = -0.001, df[0] = -0.001, dc_hat[0] = -0.000 + Gradient do_[0] = -0.055 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = -0.001, dc_hat[0] = -0.001 + Gradient do_[0] = -0.099 +Backward Time Step 2: + Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.004 + Gradient do_[0] = 0.052 +Backward Time Step 1: + Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.011 + Gradient do_[0] = 0.208 +Backward Time Step 0: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.032 + Gradient do_[0] = 0.220 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.143, c_hat[0] = 0.720 + c_state[0] = 0.377, h_state[0] = 0.051 +Time Step 1: + i_gate[0] = 0.438, f_gate[0] = 0.664, o_gate[0] = 0.116, c_hat[0] = 0.768 + c_state[0] = 0.586, h_state[0] = 0.061 +Time Step 2: + i_gate[0] = 0.380, f_gate[0] = 0.641, o_gate[0] = 0.105, c_hat[0] = 0.792 + c_state[0] = 0.676, h_state[0] = 0.062 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.615, o_gate[0] = 0.101, c_hat[0] = 0.854 + c_state[0] = 0.696, h_state[0] = 0.061 +Time Step 4: + i_gate[0] = 0.289, f_gate[0] = 0.606, o_gate[0] = 0.098, c_hat[0] = 0.857 + c_state[0] = 0.669, h_state[0] = 0.057 +Backward Time Step 4: + Gradient di[0] = 0.008, df[0] = 0.007, dc_hat[0] = 0.004 + Gradient do_[0] = 0.402 +Backward Time Step 3: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.006 + Gradient do_[0] = 0.643 +Backward Time Step 2: + Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.017 + Gradient do_[0] = 0.855 +Backward Time Step 1: + Gradient di[0] = 0.028, df[0] = 0.022, dc_hat[0] = 0.027 + Gradient do_[0] = 0.766 +Backward Time Step 0: + Gradient di[0] = 0.034, df[0] = 0.028, dc_hat[0] = 0.048 + Gradient do_[0] = 0.405 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.137, c_hat[0] = 0.719 + c_state[0] = 0.375, h_state[0] = 0.049 +Time Step 1: + i_gate[0] = 0.438, f_gate[0] = 0.662, o_gate[0] = 0.110, c_hat[0] = 0.766 + c_state[0] = 0.584, h_state[0] = 0.058 +Time Step 2: + i_gate[0] = 0.382, f_gate[0] = 0.637, o_gate[0] = 0.099, c_hat[0] = 0.791 + c_state[0] = 0.674, h_state[0] = 0.058 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.607, o_gate[0] = 0.094, c_hat[0] = 0.855 + c_state[0] = 0.692, h_state[0] = 0.057 +Time Step 4: + i_gate[0] = 0.292, f_gate[0] = 0.594, o_gate[0] = 0.090, c_hat[0] = 0.860 + c_state[0] = 0.663, h_state[0] = 0.052 +Backward Time Step 4: + Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.005 + Gradient do_[0] = 0.606 +Backward Time Step 3: + Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.008 + Gradient do_[0] = 0.892 +Backward Time Step 2: + Gradient di[0] = 0.025, df[0] = 0.020, dc_hat[0] = 0.019 + Gradient do_[0] = 1.045 +Backward Time Step 1: + Gradient di[0] = 0.029, df[0] = 0.023, dc_hat[0] = 0.028 + Gradient do_[0] = 0.845 +Backward Time Step 0: + Gradient di[0] = 0.034, df[0] = 0.028, dc_hat[0] = 0.047 + Gradient do_[0] = 0.420 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.132, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.660, o_gate[0] = 0.105, c_hat[0] = 0.765 + c_state[0] = 0.585, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.386, f_gate[0] = 0.634, o_gate[0] = 0.094, c_hat[0] = 0.791 + c_state[0] = 0.676, h_state[0] = 0.055 +Time Step 3: + i_gate[0] = 0.337, f_gate[0] = 0.602, o_gate[0] = 0.089, c_hat[0] = 0.856 + c_state[0] = 0.695, h_state[0] = 0.054 +Time Step 4: + i_gate[0] = 0.299, f_gate[0] = 0.585, o_gate[0] = 0.084, c_hat[0] = 0.862 + c_state[0] = 0.665, h_state[0] = 0.049 +Backward Time Step 4: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.005 + Gradient do_[0] = 0.732 +Backward Time Step 3: + Gradient di[0] = 0.016, df[0] = 0.012, dc_hat[0] = 0.008 + Gradient do_[0] = 0.900 +Backward Time Step 2: + Gradient di[0] = 0.022, df[0] = 0.017, dc_hat[0] = 0.017 + Gradient do_[0] = 0.953 +Backward Time Step 1: + Gradient di[0] = 0.024, df[0] = 0.019, dc_hat[0] = 0.023 + Gradient do_[0] = 0.698 +Backward Time Step 0: + Gradient di[0] = 0.029, df[0] = 0.024, dc_hat[0] = 0.041 + Gradient do_[0] = 0.359 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.130, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.660, o_gate[0] = 0.105, c_hat[0] = 0.769 + c_state[0] = 0.586, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.385, f_gate[0] = 0.632, o_gate[0] = 0.094, c_hat[0] = 0.797 + c_state[0] = 0.677, h_state[0] = 0.055 +Time Step 3: + i_gate[0] = 0.335, f_gate[0] = 0.599, o_gate[0] = 0.089, c_hat[0] = 0.862 + c_state[0] = 0.694, h_state[0] = 0.053 +Time Step 4: + i_gate[0] = 0.296, f_gate[0] = 0.582, o_gate[0] = 0.084, c_hat[0] = 0.869 + c_state[0] = 0.662, h_state[0] = 0.048 +Backward Time Step 4: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.006 + Gradient do_[0] = 0.814 +Backward Time Step 3: + Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.008 + Gradient do_[0] = 0.998 +Backward Time Step 2: + Gradient di[0] = 0.024, df[0] = 0.019, dc_hat[0] = 0.018 + Gradient do_[0] = 1.057 +Backward Time Step 1: + Gradient di[0] = 0.025, df[0] = 0.020, dc_hat[0] = 0.024 + Gradient do_[0] = 0.757 +Backward Time Step 0: + Gradient di[0] = 0.030, df[0] = 0.025, dc_hat[0] = 0.042 + Gradient do_[0] = 0.381 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.125, c_hat[0] = 0.718 + c_state[0] = 0.374, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.441, f_gate[0] = 0.657, o_gate[0] = 0.099, c_hat[0] = 0.769 + c_state[0] = 0.585, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.388, f_gate[0] = 0.628, o_gate[0] = 0.089, c_hat[0] = 0.797 + c_state[0] = 0.677, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.340, f_gate[0] = 0.593, o_gate[0] = 0.083, c_hat[0] = 0.862 + c_state[0] = 0.694, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.302, f_gate[0] = 0.572, o_gate[0] = 0.077, c_hat[0] = 0.871 + c_state[0] = 0.660, h_state[0] = 0.045 +Backward Time Step 4: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.000 + Gradient do_[0] = 0.044 +Backward Time Step 3: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.000 + Gradient do_[0] = 0.045 +Backward Time Step 2: + Gradient di[0] = 0.007, df[0] = 0.005, dc_hat[0] = 0.005 + Gradient do_[0] = 0.200 +Backward Time Step 1: + Gradient di[0] = 0.012, df[0] = 0.009, dc_hat[0] = 0.011 + Gradient do_[0] = 0.291 +Backward Time Step 0: + Gradient di[0] = 0.021, df[0] = 0.017, dc_hat[0] = 0.030 + Gradient do_[0] = 0.243 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.128, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.659, o_gate[0] = 0.102, c_hat[0] = 0.770 + c_state[0] = 0.586, h_state[0] = 0.054 +Time Step 2: + i_gate[0] = 0.385, f_gate[0] = 0.630, o_gate[0] = 0.091, c_hat[0] = 0.799 + c_state[0] = 0.677, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.336, f_gate[0] = 0.597, o_gate[0] = 0.086, c_hat[0] = 0.864 + c_state[0] = 0.694, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.297, f_gate[0] = 0.579, o_gate[0] = 0.081, c_hat[0] = 0.872 + c_state[0] = 0.661, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.005 + Gradient do_[0] = 0.809 +Backward Time Step 3: + Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.008 + Gradient do_[0] = 0.947 +Backward Time Step 2: + Gradient di[0] = 0.022, df[0] = 0.017, dc_hat[0] = 0.016 + Gradient do_[0] = 0.989 +Backward Time Step 1: + Gradient di[0] = 0.023, df[0] = 0.018, dc_hat[0] = 0.022 + Gradient do_[0] = 0.703 +Backward Time Step 0: + Gradient di[0] = 0.028, df[0] = 0.023, dc_hat[0] = 0.040 + Gradient do_[0] = 0.364 +Time Step 0: + i_gate[0] = 0.521, f_gate[0] = 0.686, o_gate[0] = 0.122, c_hat[0] = 0.717 + c_state[0] = 0.374, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.441, f_gate[0] = 0.656, o_gate[0] = 0.097, c_hat[0] = 0.770 + c_state[0] = 0.585, h_state[0] = 0.051 +Time Step 2: + i_gate[0] = 0.388, f_gate[0] = 0.626, o_gate[0] = 0.086, c_hat[0] = 0.799 + c_state[0] = 0.676, h_state[0] = 0.051 +Time Step 3: + i_gate[0] = 0.340, f_gate[0] = 0.590, o_gate[0] = 0.081, c_hat[0] = 0.865 + c_state[0] = 0.693, h_state[0] = 0.049 +Time Step 4: + i_gate[0] = 0.303, f_gate[0] = 0.568, o_gate[0] = 0.075, c_hat[0] = 0.873 + c_state[0] = 0.658, h_state[0] = 0.043 +Backward Time Step 4: + Gradient di[0] = -0.007, df[0] = -0.005, dc_hat[0] = -0.003 + Gradient do_[0] = -0.428 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = -0.004, dc_hat[0] = -0.003 + Gradient do_[0] = -0.370 +Backward Time Step 2: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = -0.096 +Backward Time Step 1: + Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.009 + Gradient do_[0] = 0.189 +Backward Time Step 0: + Gradient di[0] = 0.019, df[0] = 0.016, dc_hat[0] = 0.028 + Gradient do_[0] = 0.219 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.127, c_hat[0] = 0.718 + c_state[0] = 0.374, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.658, o_gate[0] = 0.102, c_hat[0] = 0.770 + c_state[0] = 0.585, h_state[0] = 0.054 +Time Step 2: + i_gate[0] = 0.385, f_gate[0] = 0.630, o_gate[0] = 0.091, c_hat[0] = 0.800 + c_state[0] = 0.676, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.336, f_gate[0] = 0.596, o_gate[0] = 0.086, c_hat[0] = 0.864 + c_state[0] = 0.693, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.297, f_gate[0] = 0.577, o_gate[0] = 0.081, c_hat[0] = 0.873 + c_state[0] = 0.660, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.007, df[0] = 0.006, dc_hat[0] = 0.003 + Gradient do_[0] = 0.411 +Backward Time Step 3: + Gradient di[0] = 0.009, df[0] = 0.006, dc_hat[0] = 0.004 + Gradient do_[0] = 0.480 +Backward Time Step 2: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.011 + Gradient do_[0] = 0.582 +Backward Time Step 1: + Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.017 + Gradient do_[0] = 0.494 +Backward Time Step 0: + Gradient di[0] = 0.025, df[0] = 0.021, dc_hat[0] = 0.035 + Gradient do_[0] = 0.303 +Time Step 0: + i_gate[0] = 0.521, f_gate[0] = 0.686, o_gate[0] = 0.123, c_hat[0] = 0.717 + c_state[0] = 0.374, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.656, o_gate[0] = 0.098, c_hat[0] = 0.771 + c_state[0] = 0.585, h_state[0] = 0.051 +Time Step 2: + i_gate[0] = 0.387, f_gate[0] = 0.625, o_gate[0] = 0.087, c_hat[0] = 0.801 + c_state[0] = 0.675, h_state[0] = 0.051 +Time Step 3: + i_gate[0] = 0.338, f_gate[0] = 0.589, o_gate[0] = 0.082, c_hat[0] = 0.866 + c_state[0] = 0.690, h_state[0] = 0.049 +Time Step 4: + i_gate[0] = 0.301, f_gate[0] = 0.567, o_gate[0] = 0.076, c_hat[0] = 0.875 + c_state[0] = 0.655, h_state[0] = 0.044 +Backward Time Step 4: + Gradient di[0] = -0.012, df[0] = -0.009, dc_hat[0] = -0.005 + Gradient do_[0] = -0.762 +Backward Time Step 3: + Gradient di[0] = -0.011, df[0] = -0.008, dc_hat[0] = -0.005 + Gradient do_[0] = -0.678 +Backward Time Step 2: + Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.002 + Gradient do_[0] = -0.326 +Backward Time Step 1: + Gradient di[0] = 0.007, df[0] = 0.006, dc_hat[0] = 0.007 + Gradient do_[0] = 0.102 +Backward Time Step 0: + Gradient di[0] = 0.018, df[0] = 0.015, dc_hat[0] = 0.026 + Gradient do_[0] = 0.197 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.128, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.439, f_gate[0] = 0.659, o_gate[0] = 0.103, c_hat[0] = 0.771 + c_state[0] = 0.585, h_state[0] = 0.054 +Time Step 2: + i_gate[0] = 0.384, f_gate[0] = 0.630, o_gate[0] = 0.092, c_hat[0] = 0.801 + c_state[0] = 0.675, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.334, f_gate[0] = 0.596, o_gate[0] = 0.088, c_hat[0] = 0.865 + c_state[0] = 0.691, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.295, f_gate[0] = 0.578, o_gate[0] = 0.082, c_hat[0] = 0.873 + c_state[0] = 0.657, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = -0.007, df[0] = -0.005, dc_hat[0] = -0.003 + Gradient do_[0] = -0.387 +Backward Time Step 3: + Gradient di[0] = -0.008, df[0] = -0.006, dc_hat[0] = -0.004 + Gradient do_[0] = -0.469 +Backward Time Step 2: + Gradient di[0] = -0.002, df[0] = -0.001, dc_hat[0] = -0.001 + Gradient do_[0] = -0.255 +Backward Time Step 1: + Gradient di[0] = 0.006, df[0] = 0.005, dc_hat[0] = 0.006 + Gradient do_[0] = 0.057 +Backward Time Step 0: + Gradient di[0] = 0.018, df[0] = 0.015, dc_hat[0] = 0.025 + Gradient do_[0] = 0.174 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.130, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.438, f_gate[0] = 0.660, o_gate[0] = 0.104, c_hat[0] = 0.769 + c_state[0] = 0.584, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.382, f_gate[0] = 0.633, o_gate[0] = 0.094, c_hat[0] = 0.798 + c_state[0] = 0.674, h_state[0] = 0.055 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.601, o_gate[0] = 0.090, c_hat[0] = 0.862 + c_state[0] = 0.690, h_state[0] = 0.054 +Time Step 4: + i_gate[0] = 0.292, f_gate[0] = 0.586, o_gate[0] = 0.085, c_hat[0] = 0.869 + c_state[0] = 0.659, h_state[0] = 0.049 +Backward Time Step 4: + Gradient di[0] = 0.054, df[0] = 0.043, dc_hat[0] = 0.021 + Gradient do_[0] = 3.052 +Backward Time Step 3: + Gradient di[0] = 0.082, df[0] = 0.062, dc_hat[0] = 0.036 + Gradient do_[0] = 4.449 +Backward Time Step 2: + Gradient di[0] = 0.094, df[0] = 0.073, dc_hat[0] = 0.069 + Gradient do_[0] = 4.583 +Backward Time Step 1: + Gradient di[0] = 0.088, df[0] = 0.069, dc_hat[0] = 0.083 + Gradient do_[0] = 3.049 +Backward Time Step 0: + Gradient di[0] = 0.074, df[0] = 0.061, dc_hat[0] = 0.104 + Gradient do_[0] = 1.150 +Time Step 0: + i_gate[0] = 0.520, f_gate[0] = 0.685, o_gate[0] = 0.125, c_hat[0] = 0.715 + c_state[0] = 0.372, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.657, o_gate[0] = 0.099, c_hat[0] = 0.766 + c_state[0] = 0.582, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.387, f_gate[0] = 0.629, o_gate[0] = 0.089, c_hat[0] = 0.794 + c_state[0] = 0.674, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.339, f_gate[0] = 0.596, o_gate[0] = 0.084, c_hat[0] = 0.860 + c_state[0] = 0.693, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.302, f_gate[0] = 0.578, o_gate[0] = 0.078, c_hat[0] = 0.868 + c_state[0] = 0.663, h_state[0] = 0.045 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = -0.003, dc_hat[0] = -0.001 + Gradient do_[0] = -0.212 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002 + Gradient do_[0] = -0.236 +Backward Time Step 2: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = -0.033 +Backward Time Step 1: + Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.009 + Gradient do_[0] = 0.180 +Backward Time Step 0: + Gradient di[0] = 0.019, df[0] = 0.016, dc_hat[0] = 0.028 + Gradient do_[0] = 0.212 +Time Step 0: + i_gate[0] = 0.520, f_gate[0] = 0.685, o_gate[0] = 0.126, c_hat[0] = 0.715 + c_state[0] = 0.372, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.658, o_gate[0] = 0.101, c_hat[0] = 0.764 + c_state[0] = 0.581, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.387, f_gate[0] = 0.631, o_gate[0] = 0.090, c_hat[0] = 0.792 + c_state[0] = 0.673, h_state[0] = 0.053 +Time Step 3: + i_gate[0] = 0.339, f_gate[0] = 0.598, o_gate[0] = 0.085, c_hat[0] = 0.858 + c_state[0] = 0.693, h_state[0] = 0.051 +Time Step 4: + i_gate[0] = 0.301, f_gate[0] = 0.581, o_gate[0] = 0.080, c_hat[0] = 0.865 + c_state[0] = 0.664, h_state[0] = 0.046 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.001 + Gradient do_[0] = -0.177 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002 + Gradient do_[0] = -0.211 +Backward Time Step 2: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = -0.020 +Backward Time Step 1: + Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.009 + Gradient do_[0] = 0.179 +Backward Time Step 0: + Gradient di[0] = 0.020, df[0] = 0.016, dc_hat[0] = 0.028 + Gradient do_[0] = 0.210 +Time Step 0: + i_gate[0] = 0.521, f_gate[0] = 0.686, o_gate[0] = 0.127, c_hat[0] = 0.715 + c_state[0] = 0.372, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.658, o_gate[0] = 0.102, c_hat[0] = 0.764 + c_state[0] = 0.581, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.387, f_gate[0] = 0.631, o_gate[0] = 0.091, c_hat[0] = 0.791 + c_state[0] = 0.673, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.339, f_gate[0] = 0.599, o_gate[0] = 0.086, c_hat[0] = 0.857 + c_state[0] = 0.693, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.301, f_gate[0] = 0.583, o_gate[0] = 0.081, c_hat[0] = 0.863 + c_state[0] = 0.664, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = -0.018, df[0] = -0.014, dc_hat[0] = -0.007 + Gradient do_[0] = -1.049 +Backward Time Step 3: + Gradient di[0] = -0.023, df[0] = -0.018, dc_hat[0] = -0.011 + Gradient do_[0] = -1.330 +Backward Time Step 2: + Gradient di[0] = -0.016, df[0] = -0.013, dc_hat[0] = -0.013 + Gradient do_[0] = -1.026 +Backward Time Step 1: + Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.005 + Gradient do_[0] = -0.377 +Backward Time Step 0: + Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.015 + Gradient do_[0] = 0.045 +Time Step 0: + i_gate[0] = 0.521, f_gate[0] = 0.686, o_gate[0] = 0.133, c_hat[0] = 0.716 + c_state[0] = 0.373, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.438, f_gate[0] = 0.661, o_gate[0] = 0.107, c_hat[0] = 0.763 + c_state[0] = 0.581, h_state[0] = 0.056 +Time Step 2: + i_gate[0] = 0.384, f_gate[0] = 0.636, o_gate[0] = 0.096, c_hat[0] = 0.790 + c_state[0] = 0.673, h_state[0] = 0.057 +Time Step 3: + i_gate[0] = 0.334, f_gate[0] = 0.607, o_gate[0] = 0.092, c_hat[0] = 0.855 + c_state[0] = 0.693, h_state[0] = 0.055 +Time Step 4: + i_gate[0] = 0.296, f_gate[0] = 0.594, o_gate[0] = 0.087, c_hat[0] = 0.860 + c_state[0] = 0.667, h_state[0] = 0.051 +Backward Time Step 4: + Gradient di[0] = 0.036, df[0] = 0.029, dc_hat[0] = 0.016 + Gradient do_[0] = 2.047 +Backward Time Step 3: + Gradient di[0] = 0.061, df[0] = 0.047, dc_hat[0] = 0.029 + Gradient do_[0] = 3.284 +Backward Time Step 2: + Gradient di[0] = 0.075, df[0] = 0.059, dc_hat[0] = 0.058 + Gradient do_[0] = 3.545 +Backward Time Step 1: + Gradient di[0] = 0.075, df[0] = 0.059, dc_hat[0] = 0.073 + Gradient do_[0] = 2.526 +Backward Time Step 0: + Gradient di[0] = 0.066, df[0] = 0.055, dc_hat[0] = 0.094 + Gradient do_[0] = 0.993 +Time Step 0: + i_gate[0] = 0.520, f_gate[0] = 0.685, o_gate[0] = 0.127, c_hat[0] = 0.714 + c_state[0] = 0.371, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.441, f_gate[0] = 0.659, o_gate[0] = 0.101, c_hat[0] = 0.760 + c_state[0] = 0.580, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.390, f_gate[0] = 0.633, o_gate[0] = 0.091, c_hat[0] = 0.786 + c_state[0] = 0.673, h_state[0] = 0.053 +Time Step 3: + i_gate[0] = 0.342, f_gate[0] = 0.603, o_gate[0] = 0.086, c_hat[0] = 0.852 + c_state[0] = 0.698, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.306, f_gate[0] = 0.587, o_gate[0] = 0.080, c_hat[0] = 0.859 + c_state[0] = 0.672, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = -0.008, df[0] = -0.007, dc_hat[0] = -0.004 + Gradient do_[0] = -0.516 +Backward Time Step 3: + Gradient di[0] = -0.011, df[0] = -0.009, dc_hat[0] = -0.006 + Gradient do_[0] = -0.654 +Backward Time Step 2: + Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.004 + Gradient do_[0] = -0.415 +Backward Time Step 1: + Gradient di[0] = 0.003, df[0] = 0.003, dc_hat[0] = 0.003 + Gradient do_[0] = -0.041 +Backward Time Step 0: + Gradient di[0] = 0.016, df[0] = 0.013, dc_hat[0] = 0.023 + Gradient do_[0] = 0.147 +Time Step 0: + i_gate[0] = 0.520, f_gate[0] = 0.685, o_gate[0] = 0.131, c_hat[0] = 0.714 + c_state[0] = 0.371, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.661, o_gate[0] = 0.105, c_hat[0] = 0.759 + c_state[0] = 0.580, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.388, f_gate[0] = 0.637, o_gate[0] = 0.094, c_hat[0] = 0.784 + c_state[0] = 0.674, h_state[0] = 0.055 +Time Step 3: + i_gate[0] = 0.341, f_gate[0] = 0.609, o_gate[0] = 0.090, c_hat[0] = 0.849 + c_state[0] = 0.699, h_state[0] = 0.054 +Time Step 4: + i_gate[0] = 0.304, f_gate[0] = 0.597, o_gate[0] = 0.084, c_hat[0] = 0.854 + c_state[0] = 0.677, h_state[0] = 0.050 +Backward Time Step 4: + Gradient di[0] = 0.032, df[0] = 0.025, dc_hat[0] = 0.015 + Gradient do_[0] = 1.885 +Backward Time Step 3: + Gradient di[0] = 0.051, df[0] = 0.039, dc_hat[0] = 0.025 + Gradient do_[0] = 2.845 +Backward Time Step 2: + Gradient di[0] = 0.061, df[0] = 0.048, dc_hat[0] = 0.049 + Gradient do_[0] = 2.960 +Backward Time Step 1: + Gradient di[0] = 0.060, df[0] = 0.048, dc_hat[0] = 0.060 + Gradient do_[0] = 2.036 +Backward Time Step 0: + Gradient di[0] = 0.054, df[0] = 0.045, dc_hat[0] = 0.077 + Gradient do_[0] = 0.797 +Time Step 0: + i_gate[0] = 0.519, f_gate[0] = 0.685, o_gate[0] = 0.125, c_hat[0] = 0.713 + c_state[0] = 0.370, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.443, f_gate[0] = 0.659, o_gate[0] = 0.100, c_hat[0] = 0.757 + c_state[0] = 0.579, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.394, f_gate[0] = 0.634, o_gate[0] = 0.089, c_hat[0] = 0.780 + c_state[0] = 0.675, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.349, f_gate[0] = 0.605, o_gate[0] = 0.084, c_hat[0] = 0.847 + c_state[0] = 0.704, h_state[0] = 0.051 +Time Step 4: + i_gate[0] = 0.313, f_gate[0] = 0.590, o_gate[0] = 0.078, c_hat[0] = 0.853 + c_state[0] = 0.682, h_state[0] = 0.046 +Backward Time Step 4: + Gradient di[0] = -0.051, df[0] = -0.040, dc_hat[0] = -0.024 + Gradient do_[0] = -3.259 +Backward Time Step 3: + Gradient di[0] = -0.066, df[0] = -0.050, dc_hat[0] = -0.034 + Gradient do_[0] = -3.942 +Backward Time Step 2: + Gradient di[0] = -0.055, df[0] = -0.043, dc_hat[0] = -0.045 + Gradient do_[0] = -3.142 +Backward Time Step 1: + Gradient di[0] = -0.032, df[0] = -0.026, dc_hat[0] = -0.033 + Gradient do_[0] = -1.414 +Backward Time Step 0: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.006 + Gradient do_[0] = -0.215 +Time Step 0: + i_gate[0] = 0.520, f_gate[0] = 0.685, o_gate[0] = 0.131, c_hat[0] = 0.715 + c_state[0] = 0.372, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.441, f_gate[0] = 0.662, o_gate[0] = 0.105, c_hat[0] = 0.758 + c_state[0] = 0.581, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.390, f_gate[0] = 0.640, o_gate[0] = 0.095, c_hat[0] = 0.782 + c_state[0] = 0.676, h_state[0] = 0.056 +Time Step 3: + i_gate[0] = 0.343, f_gate[0] = 0.613, o_gate[0] = 0.090, c_hat[0] = 0.847 + c_state[0] = 0.705, h_state[0] = 0.055 +Time Step 4: + i_gate[0] = 0.306, f_gate[0] = 0.602, o_gate[0] = 0.085, c_hat[0] = 0.851 + c_state[0] = 0.684, h_state[0] = 0.050 +Backward Time Step 4: + Gradient di[0] = 0.030, df[0] = 0.024, dc_hat[0] = 0.014 + Gradient do_[0] = 1.828 +Backward Time Step 3: + Gradient di[0] = 0.050, df[0] = 0.038, dc_hat[0] = 0.025 + Gradient do_[0] = 2.803 +Backward Time Step 2: + Gradient di[0] = 0.061, df[0] = 0.048, dc_hat[0] = 0.050 + Gradient do_[0] = 2.949 +Backward Time Step 1: + Gradient di[0] = 0.061, df[0] = 0.048, dc_hat[0] = 0.061 + Gradient do_[0] = 2.054 +Backward Time Step 0: + Gradient di[0] = 0.054, df[0] = 0.045, dc_hat[0] = 0.077 + Gradient do_[0] = 0.805 +Time Step 0: + i_gate[0] = 0.520, f_gate[0] = 0.685, o_gate[0] = 0.125, c_hat[0] = 0.713 + c_state[0] = 0.370, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.444, f_gate[0] = 0.660, o_gate[0] = 0.100, c_hat[0] = 0.755 + c_state[0] = 0.580, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.396, f_gate[0] = 0.637, o_gate[0] = 0.089, c_hat[0] = 0.778 + c_state[0] = 0.678, h_state[0] = 0.053 +Time Step 3: + i_gate[0] = 0.351, f_gate[0] = 0.609, o_gate[0] = 0.084, c_hat[0] = 0.845 + c_state[0] = 0.709, h_state[0] = 0.051 +Time Step 4: + i_gate[0] = 0.316, f_gate[0] = 0.595, o_gate[0] = 0.078, c_hat[0] = 0.850 + c_state[0] = 0.690, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = -0.000, df[0] = -0.000, dc_hat[0] = -0.000 + Gradient do_[0] = -0.023 +Backward Time Step 3: + Gradient di[0] = -0.000, df[0] = -0.000, dc_hat[0] = -0.000 + Gradient do_[0] = -0.018 +Backward Time Step 2: + Gradient di[0] = 0.006, df[0] = 0.005, dc_hat[0] = 0.005 + Gradient do_[0] = 0.152 +Backward Time Step 1: + Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.012 + Gradient do_[0] = 0.267 +Backward Time Step 0: + Gradient di[0] = 0.021, df[0] = 0.017, dc_hat[0] = 0.030 + Gradient do_[0] = 0.236 +Time Step 0: + i_gate[0] = 0.520, f_gate[0] = 0.685, o_gate[0] = 0.125, c_hat[0] = 0.713 + c_state[0] = 0.370, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.444, f_gate[0] = 0.660, o_gate[0] = 0.100, c_hat[0] = 0.756 + c_state[0] = 0.580, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.397, f_gate[0] = 0.636, o_gate[0] = 0.090, c_hat[0] = 0.779 + c_state[0] = 0.678, h_state[0] = 0.053 +Time Step 3: + i_gate[0] = 0.353, f_gate[0] = 0.607, o_gate[0] = 0.084, c_hat[0] = 0.845 + c_state[0] = 0.710, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.318, f_gate[0] = 0.592, o_gate[0] = 0.078, c_hat[0] = 0.850 + c_state[0] = 0.691, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = -0.118, df[0] = -0.091, dc_hat[0] = -0.056 + Gradient do_[0] = -7.581 +Backward Time Step 3: + Gradient di[0] = -0.147, df[0] = -0.110, dc_hat[0] = -0.077 + Gradient do_[0] = -8.762 +Backward Time Step 2: + Gradient di[0] = -0.124, df[0] = -0.098, dc_hat[0] = -0.104 + Gradient do_[0] = -6.873 +Backward Time Step 1: + Gradient di[0] = -0.076, df[0] = -0.060, dc_hat[0] = -0.078 + Gradient do_[0] = -3.099 +Backward Time Step 0: + Gradient di[0] = -0.025, df[0] = -0.021, dc_hat[0] = -0.036 + Gradient do_[0] = -0.600 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.130, c_hat[0] = 0.719 + c_state[0] = 0.375, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.444, f_gate[0] = 0.665, o_gate[0] = 0.105, c_hat[0] = 0.762 + c_state[0] = 0.588, h_state[0] = 0.056 +Time Step 2: + i_gate[0] = 0.394, f_gate[0] = 0.644, o_gate[0] = 0.095, c_hat[0] = 0.784 + c_state[0] = 0.687, h_state[0] = 0.057 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.617, o_gate[0] = 0.091, c_hat[0] = 0.849 + c_state[0] = 0.719, h_state[0] = 0.056 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.606, o_gate[0] = 0.085, c_hat[0] = 0.852 + c_state[0] = 0.701, h_state[0] = 0.052 +Backward Time Step 4: + Gradient di[0] = -0.020, df[0] = -0.016, dc_hat[0] = -0.009 + Gradient do_[0] = -1.201 +Backward Time Step 3: + Gradient di[0] = -0.031, df[0] = -0.023, dc_hat[0] = -0.016 + Gradient do_[0] = -1.741 +Backward Time Step 2: + Gradient di[0] = -0.026, df[0] = -0.020, dc_hat[0] = -0.021 + Gradient do_[0] = -1.501 +Backward Time Step 1: + Gradient di[0] = -0.015, df[0] = -0.012, dc_hat[0] = -0.015 + Gradient do_[0] = -0.740 +Backward Time Step 0: + Gradient di[0] = 0.004, df[0] = 0.004, dc_hat[0] = 0.006 + Gradient do_[0] = -0.069 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.136, c_hat[0] = 0.719 + c_state[0] = 0.376, h_state[0] = 0.049 +Time Step 1: + i_gate[0] = 0.442, f_gate[0] = 0.667, o_gate[0] = 0.111, c_hat[0] = 0.762 + c_state[0] = 0.588, h_state[0] = 0.058 +Time Step 2: + i_gate[0] = 0.390, f_gate[0] = 0.647, o_gate[0] = 0.101, c_hat[0] = 0.784 + c_state[0] = 0.686, h_state[0] = 0.060 +Time Step 3: + i_gate[0] = 0.342, f_gate[0] = 0.623, o_gate[0] = 0.097, c_hat[0] = 0.847 + c_state[0] = 0.717, h_state[0] = 0.060 +Time Step 4: + i_gate[0] = 0.306, f_gate[0] = 0.616, o_gate[0] = 0.092, c_hat[0] = 0.848 + c_state[0] = 0.702, h_state[0] = 0.056 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.002 + Gradient do_[0] = -0.297 +Backward Time Step 3: + Gradient di[0] = -0.009, df[0] = -0.007, dc_hat[0] = -0.005 + Gradient do_[0] = -0.498 +Backward Time Step 2: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.003 + Gradient do_[0] = -0.368 +Backward Time Step 1: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = -0.088 +Backward Time Step 0: + Gradient di[0] = 0.015, df[0] = 0.013, dc_hat[0] = 0.022 + Gradient do_[0] = 0.118 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.137, c_hat[0] = 0.719 + c_state[0] = 0.376, h_state[0] = 0.049 +Time Step 1: + i_gate[0] = 0.441, f_gate[0] = 0.668, o_gate[0] = 0.111, c_hat[0] = 0.760 + c_state[0] = 0.586, h_state[0] = 0.059 +Time Step 2: + i_gate[0] = 0.387, f_gate[0] = 0.650, o_gate[0] = 0.101, c_hat[0] = 0.781 + c_state[0] = 0.684, h_state[0] = 0.060 +Time Step 3: + i_gate[0] = 0.338, f_gate[0] = 0.628, o_gate[0] = 0.098, c_hat[0] = 0.844 + c_state[0] = 0.715, h_state[0] = 0.060 +Time Step 4: + i_gate[0] = 0.303, f_gate[0] = 0.625, o_gate[0] = 0.094, c_hat[0] = 0.843 + c_state[0] = 0.702, h_state[0] = 0.057 +Backward Time Step 4: + Gradient di[0] = 0.018, df[0] = 0.014, dc_hat[0] = 0.009 + Gradient do_[0] = 1.004 +Backward Time Step 3: + Gradient di[0] = 0.033, df[0] = 0.026, dc_hat[0] = 0.017 + Gradient do_[0] = 1.789 +Backward Time Step 2: + Gradient di[0] = 0.048, df[0] = 0.038, dc_hat[0] = 0.039 + Gradient do_[0] = 2.196 +Backward Time Step 1: + Gradient di[0] = 0.056, df[0] = 0.044, dc_hat[0] = 0.055 + Gradient do_[0] = 1.787 +Backward Time Step 0: + Gradient di[0] = 0.055, df[0] = 0.045, dc_hat[0] = 0.077 + Gradient do_[0] = 0.781 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.131, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.443, f_gate[0] = 0.667, o_gate[0] = 0.106, c_hat[0] = 0.758 + c_state[0] = 0.586, h_state[0] = 0.056 +Time Step 2: + i_gate[0] = 0.393, f_gate[0] = 0.648, o_gate[0] = 0.096, c_hat[0] = 0.778 + c_state[0] = 0.686, h_state[0] = 0.057 +Time Step 3: + i_gate[0] = 0.346, f_gate[0] = 0.625, o_gate[0] = 0.091, c_hat[0] = 0.842 + c_state[0] = 0.720, h_state[0] = 0.056 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.618, o_gate[0] = 0.086, c_hat[0] = 0.843 + c_state[0] = 0.707, h_state[0] = 0.052 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.001 + Gradient do_[0] = -0.115 +Backward Time Step 3: + Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.002 + Gradient do_[0] = -0.167 +Backward Time Step 2: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = -0.006 +Backward Time Step 1: + Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.009 + Gradient do_[0] = 0.166 +Backward Time Step 0: + Gradient di[0] = 0.020, df[0] = 0.016, dc_hat[0] = 0.028 + Gradient do_[0] = 0.203 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.130, c_hat[0] = 0.717 + c_state[0] = 0.375, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.444, f_gate[0] = 0.666, o_gate[0] = 0.105, c_hat[0] = 0.757 + c_state[0] = 0.586, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.395, f_gate[0] = 0.648, o_gate[0] = 0.095, c_hat[0] = 0.777 + c_state[0] = 0.686, h_state[0] = 0.056 +Time Step 3: + i_gate[0] = 0.349, f_gate[0] = 0.623, o_gate[0] = 0.090, c_hat[0] = 0.841 + c_state[0] = 0.721, h_state[0] = 0.056 +Time Step 4: + i_gate[0] = 0.314, f_gate[0] = 0.615, o_gate[0] = 0.085, c_hat[0] = 0.842 + c_state[0] = 0.708, h_state[0] = 0.052 +Backward Time Step 4: + Gradient di[0] = -0.010, df[0] = -0.008, dc_hat[0] = -0.005 + Gradient do_[0] = -0.587 +Backward Time Step 3: + Gradient di[0] = -0.015, df[0] = -0.011, dc_hat[0] = -0.008 + Gradient do_[0] = -0.848 +Backward Time Step 2: + Gradient di[0] = -0.010, df[0] = -0.008, dc_hat[0] = -0.008 + Gradient do_[0] = -0.663 +Backward Time Step 1: + Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.002 + Gradient do_[0] = -0.249 +Backward Time Step 0: + Gradient di[0] = 0.012, df[0] = 0.010, dc_hat[0] = 0.017 + Gradient do_[0] = 0.075 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.134, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.048 +Time Step 1: + i_gate[0] = 0.444, f_gate[0] = 0.668, o_gate[0] = 0.108, c_hat[0] = 0.756 + c_state[0] = 0.586, h_state[0] = 0.057 +Time Step 2: + i_gate[0] = 0.395, f_gate[0] = 0.651, o_gate[0] = 0.098, c_hat[0] = 0.774 + c_state[0] = 0.687, h_state[0] = 0.059 +Time Step 3: + i_gate[0] = 0.349, f_gate[0] = 0.629, o_gate[0] = 0.094, c_hat[0] = 0.837 + c_state[0] = 0.725, h_state[0] = 0.058 +Time Step 4: + i_gate[0] = 0.315, f_gate[0] = 0.624, o_gate[0] = 0.089, c_hat[0] = 0.836 + c_state[0] = 0.715, h_state[0] = 0.055 +Backward Time Step 4: + Gradient di[0] = -0.008, df[0] = -0.006, dc_hat[0] = -0.004 + Gradient do_[0] = -0.465 +Backward Time Step 3: + Gradient di[0] = -0.013, df[0] = -0.010, dc_hat[0] = -0.007 + Gradient do_[0] = -0.690 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = -0.006, dc_hat[0] = -0.006 + Gradient do_[0] = -0.531 +Backward Time Step 1: + Gradient di[0] = -0.000, df[0] = -0.000, dc_hat[0] = -0.000 + Gradient do_[0] = -0.180 +Backward Time Step 0: + Gradient di[0] = 0.014, df[0] = 0.011, dc_hat[0] = 0.020 + Gradient do_[0] = 0.095 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.136, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.049 +Time Step 1: + i_gate[0] = 0.443, f_gate[0] = 0.669, o_gate[0] = 0.110, c_hat[0] = 0.755 + c_state[0] = 0.585, h_state[0] = 0.058 +Time Step 2: + i_gate[0] = 0.394, f_gate[0] = 0.654, o_gate[0] = 0.101, c_hat[0] = 0.771 + c_state[0] = 0.686, h_state[0] = 0.060 +Time Step 3: + i_gate[0] = 0.348, f_gate[0] = 0.634, o_gate[0] = 0.097, c_hat[0] = 0.833 + c_state[0] = 0.725, h_state[0] = 0.060 +Time Step 4: + i_gate[0] = 0.314, f_gate[0] = 0.631, o_gate[0] = 0.092, c_hat[0] = 0.830 + c_state[0] = 0.717, h_state[0] = 0.057 +Backward Time Step 4: + Gradient di[0] = -0.006, df[0] = -0.005, dc_hat[0] = -0.003 + Gradient do_[0] = -0.322 +Backward Time Step 3: + Gradient di[0] = -0.009, df[0] = -0.007, dc_hat[0] = -0.005 + Gradient do_[0] = -0.489 +Backward Time Step 2: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.003 + Gradient do_[0] = -0.350 +Backward Time Step 1: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.003 + Gradient do_[0] = -0.073 +Backward Time Step 0: + Gradient di[0] = 0.016, df[0] = 0.013, dc_hat[0] = 0.022 + Gradient do_[0] = 0.124 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.138, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.049 +Time Step 1: + i_gate[0] = 0.443, f_gate[0] = 0.669, o_gate[0] = 0.111, c_hat[0] = 0.753 + c_state[0] = 0.585, h_state[0] = 0.059 +Time Step 2: + i_gate[0] = 0.392, f_gate[0] = 0.655, o_gate[0] = 0.102, c_hat[0] = 0.769 + c_state[0] = 0.684, h_state[0] = 0.060 +Time Step 3: + i_gate[0] = 0.346, f_gate[0] = 0.636, o_gate[0] = 0.098, c_hat[0] = 0.830 + c_state[0] = 0.723, h_state[0] = 0.061 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.635, o_gate[0] = 0.094, c_hat[0] = 0.825 + c_state[0] = 0.716, h_state[0] = 0.058 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.003 + Gradient do_[0] = -0.307 +Backward Time Step 3: + Gradient di[0] = -0.009, df[0] = -0.007, dc_hat[0] = -0.005 + Gradient do_[0] = -0.475 +Backward Time Step 2: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.004 + Gradient do_[0] = -0.352 +Backward Time Step 1: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = -0.086 +Backward Time Step 0: + Gradient di[0] = 0.015, df[0] = 0.013, dc_hat[0] = 0.022 + Gradient do_[0] = 0.115 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.139, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.050 +Time Step 1: + i_gate[0] = 0.441, f_gate[0] = 0.670, o_gate[0] = 0.112, c_hat[0] = 0.752 + c_state[0] = 0.584, h_state[0] = 0.059 +Time Step 2: + i_gate[0] = 0.391, f_gate[0] = 0.657, o_gate[0] = 0.102, c_hat[0] = 0.766 + c_state[0] = 0.682, h_state[0] = 0.061 +Time Step 3: + i_gate[0] = 0.344, f_gate[0] = 0.639, o_gate[0] = 0.099, c_hat[0] = 0.827 + c_state[0] = 0.721, h_state[0] = 0.061 +Time Step 4: + i_gate[0] = 0.310, f_gate[0] = 0.640, o_gate[0] = 0.095, c_hat[0] = 0.820 + c_state[0] = 0.715, h_state[0] = 0.058 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.001 + Gradient do_[0] = -0.116 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002 + Gradient do_[0] = -0.184 +Backward Time Step 2: + Gradient di[0] = 0.002, df[0] = 0.002, dc_hat[0] = 0.002 + Gradient do_[0] = -0.045 +Backward Time Step 1: + Gradient di[0] = 0.008, df[0] = 0.007, dc_hat[0] = 0.009 + Gradient do_[0] = 0.131 +Backward Time Step 0: + Gradient di[0] = 0.020, df[0] = 0.016, dc_hat[0] = 0.028 + Gradient do_[0] = 0.187 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.137, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.049 +Time Step 1: + i_gate[0] = 0.442, f_gate[0] = 0.670, o_gate[0] = 0.111, c_hat[0] = 0.751 + c_state[0] = 0.583, h_state[0] = 0.058 +Time Step 2: + i_gate[0] = 0.392, f_gate[0] = 0.656, o_gate[0] = 0.101, c_hat[0] = 0.765 + c_state[0] = 0.682, h_state[0] = 0.060 +Time Step 3: + i_gate[0] = 0.346, f_gate[0] = 0.637, o_gate[0] = 0.097, c_hat[0] = 0.826 + c_state[0] = 0.720, h_state[0] = 0.060 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.637, o_gate[0] = 0.093, c_hat[0] = 0.819 + c_state[0] = 0.714, h_state[0] = 0.057 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.003 + Gradient do_[0] = -0.265 +Backward Time Step 3: + Gradient di[0] = -0.007, df[0] = -0.006, dc_hat[0] = -0.004 + Gradient do_[0] = -0.391 +Backward Time Step 2: + Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.002 + Gradient do_[0] = -0.260 +Backward Time Step 1: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.004 + Gradient do_[0] = -0.023 +Backward Time Step 0: + Gradient di[0] = 0.016, df[0] = 0.014, dc_hat[0] = 0.023 + Gradient do_[0] = 0.134 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.139, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.050 +Time Step 1: + i_gate[0] = 0.441, f_gate[0] = 0.670, o_gate[0] = 0.112, c_hat[0] = 0.751 + c_state[0] = 0.583, h_state[0] = 0.059 +Time Step 2: + i_gate[0] = 0.391, f_gate[0] = 0.658, o_gate[0] = 0.103, c_hat[0] = 0.763 + c_state[0] = 0.682, h_state[0] = 0.061 +Time Step 3: + i_gate[0] = 0.344, f_gate[0] = 0.641, o_gate[0] = 0.099, c_hat[0] = 0.824 + c_state[0] = 0.721, h_state[0] = 0.061 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.642, o_gate[0] = 0.095, c_hat[0] = 0.816 + c_state[0] = 0.717, h_state[0] = 0.059 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002 + Gradient do_[0] = -0.223 +Backward Time Step 3: + Gradient di[0] = -0.007, df[0] = -0.005, dc_hat[0] = -0.004 + Gradient do_[0] = -0.346 +Backward Time Step 2: + Gradient di[0] = -0.002, df[0] = -0.001, dc_hat[0] = -0.001 + Gradient do_[0] = -0.227 +Backward Time Step 1: + Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.005 + Gradient do_[0] = -0.003 +Backward Time Step 0: + Gradient di[0] = 0.017, df[0] = 0.014, dc_hat[0] = 0.024 + Gradient do_[0] = 0.139 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.140, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.050 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.671, o_gate[0] = 0.113, c_hat[0] = 0.750 + c_state[0] = 0.582, h_state[0] = 0.059 +Time Step 2: + i_gate[0] = 0.389, f_gate[0] = 0.659, o_gate[0] = 0.103, c_hat[0] = 0.761 + c_state[0] = 0.679, h_state[0] = 0.061 +Time Step 3: + i_gate[0] = 0.342, f_gate[0] = 0.643, o_gate[0] = 0.100, c_hat[0] = 0.821 + c_state[0] = 0.718, h_state[0] = 0.061 +Time Step 4: + i_gate[0] = 0.309, f_gate[0] = 0.646, o_gate[0] = 0.096, c_hat[0] = 0.811 + c_state[0] = 0.715, h_state[0] = 0.059 +Backward Time Step 4: + Gradient di[0] = -0.001, df[0] = -0.001, dc_hat[0] = -0.001 + Gradient do_[0] = -0.066 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.001 + Gradient do_[0] = -0.111 +Backward Time Step 2: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.003 + Gradient do_[0] = 0.030 +Backward Time Step 1: + Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.010 + Gradient do_[0] = 0.184 +Backward Time Step 0: + Gradient di[0] = 0.021, df[0] = 0.017, dc_hat[0] = 0.030 + Gradient do_[0] = 0.204 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.137, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.049 +Time Step 1: + i_gate[0] = 0.441, f_gate[0] = 0.670, o_gate[0] = 0.111, c_hat[0] = 0.749 + c_state[0] = 0.581, h_state[0] = 0.058 +Time Step 2: + i_gate[0] = 0.390, f_gate[0] = 0.658, o_gate[0] = 0.101, c_hat[0] = 0.760 + c_state[0] = 0.679, h_state[0] = 0.060 +Time Step 3: + i_gate[0] = 0.344, f_gate[0] = 0.641, o_gate[0] = 0.098, c_hat[0] = 0.820 + c_state[0] = 0.717, h_state[0] = 0.060 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.642, o_gate[0] = 0.094, c_hat[0] = 0.811 + c_state[0] = 0.713, h_state[0] = 0.057 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.002 + Gradient do_[0] = -0.214 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = -0.005, dc_hat[0] = -0.004 + Gradient do_[0] = -0.316 +Backward Time Step 2: + Gradient di[0] = -0.001, df[0] = -0.001, dc_hat[0] = -0.001 + Gradient do_[0] = -0.189 +Backward Time Step 1: + Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.005 + Gradient do_[0] = 0.023 +Backward Time Step 0: + Gradient di[0] = 0.017, df[0] = 0.014, dc_hat[0] = 0.024 + Gradient do_[0] = 0.146 +Time Step 0: + i_gate[0] = 0.523, f_gate[0] = 0.687, o_gate[0] = 0.140, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.050 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.671, o_gate[0] = 0.113, c_hat[0] = 0.749 + c_state[0] = 0.581, h_state[0] = 0.059 +Time Step 2: + i_gate[0] = 0.388, f_gate[0] = 0.660, o_gate[0] = 0.103, c_hat[0] = 0.760 + c_state[0] = 0.679, h_state[0] = 0.061 +Time Step 3: + i_gate[0] = 0.342, f_gate[0] = 0.646, o_gate[0] = 0.100, c_hat[0] = 0.819 + c_state[0] = 0.718, h_state[0] = 0.062 +Time Step 4: + i_gate[0] = 0.309, f_gate[0] = 0.650, o_gate[0] = 0.097, c_hat[0] = 0.808 + c_state[0] = 0.716, h_state[0] = 0.059 +Backward Time Step 4: + Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.001 + Gradient do_[0] = -0.132 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.003 + Gradient do_[0] = -0.222 +Backward Time Step 2: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = -0.102 +Backward Time Step 1: + Gradient di[0] = 0.007, df[0] = 0.006, dc_hat[0] = 0.007 + Gradient do_[0] = 0.083 +Backward Time Step 0: + Gradient di[0] = 0.019, df[0] = 0.015, dc_hat[0] = 0.026 + Gradient do_[0] = 0.166 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.139, c_hat[0] = 0.718 + c_state[0] = 0.375, h_state[0] = 0.050 +Time Step 1: + i_gate[0] = 0.439, f_gate[0] = 0.671, o_gate[0] = 0.111, c_hat[0] = 0.747 + c_state[0] = 0.580, h_state[0] = 0.058 +Time Step 2: + i_gate[0] = 0.387, f_gate[0] = 0.660, o_gate[0] = 0.102, c_hat[0] = 0.757 + c_state[0] = 0.676, h_state[0] = 0.060 +Time Step 3: + i_gate[0] = 0.341, f_gate[0] = 0.645, o_gate[0] = 0.099, c_hat[0] = 0.817 + c_state[0] = 0.715, h_state[0] = 0.060 +Time Step 4: + i_gate[0] = 0.308, f_gate[0] = 0.650, o_gate[0] = 0.095, c_hat[0] = 0.805 + c_state[0] = 0.712, h_state[0] = 0.058 +Backward Time Step 4: + Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.003 + Gradient do_[0] = 0.280 +Backward Time Step 3: + Gradient di[0] = 0.008, df[0] = 0.006, dc_hat[0] = 0.005 + Gradient do_[0] = 0.426 +Backward Time Step 2: + Gradient di[0] = 0.016, df[0] = 0.013, dc_hat[0] = 0.015 + Gradient do_[0] = 0.655 +Backward Time Step 1: + Gradient di[0] = 0.023, df[0] = 0.019, dc_hat[0] = 0.025 + Gradient do_[0] = 0.663 +Backward Time Step 0: + Gradient di[0] = 0.032, df[0] = 0.026, dc_hat[0] = 0.045 + Gradient do_[0] = 0.385 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.133, c_hat[0] = 0.717 + c_state[0] = 0.374, h_state[0] = 0.048 +Time Step 1: + i_gate[0] = 0.442, f_gate[0] = 0.670, o_gate[0] = 0.106, c_hat[0] = 0.745 + c_state[0] = 0.580, h_state[0] = 0.056 +Time Step 2: + i_gate[0] = 0.392, f_gate[0] = 0.658, o_gate[0] = 0.097, c_hat[0] = 0.755 + c_state[0] = 0.677, h_state[0] = 0.057 +Time Step 3: + i_gate[0] = 0.346, f_gate[0] = 0.640, o_gate[0] = 0.093, c_hat[0] = 0.817 + c_state[0] = 0.716, h_state[0] = 0.057 +Time Step 4: + i_gate[0] = 0.313, f_gate[0] = 0.642, o_gate[0] = 0.088, c_hat[0] = 0.807 + c_state[0] = 0.712, h_state[0] = 0.054 +Backward Time Step 4: + Gradient di[0] = -0.001, df[0] = -0.001, dc_hat[0] = -0.001 + Gradient do_[0] = -0.065 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.001 + Gradient do_[0] = -0.087 +Backward Time Step 2: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.004 + Gradient do_[0] = 0.070 +Backward Time Step 1: + Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.011 + Gradient do_[0] = 0.213 +Backward Time Step 0: + Gradient di[0] = 0.020, df[0] = 0.017, dc_hat[0] = 0.029 + Gradient do_[0] = 0.214 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.134, c_hat[0] = 0.717 + c_state[0] = 0.375, h_state[0] = 0.048 +Time Step 1: + i_gate[0] = 0.442, f_gate[0] = 0.671, o_gate[0] = 0.107, c_hat[0] = 0.746 + c_state[0] = 0.581, h_state[0] = 0.056 +Time Step 2: + i_gate[0] = 0.393, f_gate[0] = 0.660, o_gate[0] = 0.097, c_hat[0] = 0.756 + c_state[0] = 0.680, h_state[0] = 0.058 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.645, o_gate[0] = 0.094, c_hat[0] = 0.816 + c_state[0] = 0.722, h_state[0] = 0.058 +Time Step 4: + i_gate[0] = 0.315, f_gate[0] = 0.647, o_gate[0] = 0.089, c_hat[0] = 0.805 + c_state[0] = 0.721, h_state[0] = 0.055 +Backward Time Step 4: + Gradient di[0] = -0.006, df[0] = -0.005, dc_hat[0] = -0.004 + Gradient do_[0] = -0.379 +Backward Time Step 3: + Gradient di[0] = -0.010, df[0] = -0.008, dc_hat[0] = -0.006 + Gradient do_[0] = -0.550 +Backward Time Step 2: + Gradient di[0] = -0.006, df[0] = -0.005, dc_hat[0] = -0.005 + Gradient do_[0] = -0.447 +Backward Time Step 1: + Gradient di[0] = -0.000, df[0] = -0.000, dc_hat[0] = -0.000 + Gradient do_[0] = -0.167 +Backward Time Step 0: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.018 + Gradient do_[0] = 0.077 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.135, c_hat[0] = 0.717 + c_state[0] = 0.375, h_state[0] = 0.048 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.672, o_gate[0] = 0.108, c_hat[0] = 0.745 + c_state[0] = 0.580, h_state[0] = 0.056 +Time Step 2: + i_gate[0] = 0.390, f_gate[0] = 0.662, o_gate[0] = 0.098, c_hat[0] = 0.754 + c_state[0] = 0.677, h_state[0] = 0.058 +Time Step 3: + i_gate[0] = 0.344, f_gate[0] = 0.648, o_gate[0] = 0.095, c_hat[0] = 0.813 + c_state[0] = 0.718, h_state[0] = 0.058 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.653, o_gate[0] = 0.091, c_hat[0] = 0.800 + c_state[0] = 0.718, h_state[0] = 0.056 +Backward Time Step 4: + Gradient di[0] = -0.001, df[0] = -0.001, dc_hat[0] = -0.001 + Gradient do_[0] = -0.072 +Backward Time Step 3: + Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.002 + Gradient do_[0] = -0.118 +Backward Time Step 2: + Gradient di[0] = 0.003, df[0] = 0.003, dc_hat[0] = 0.003 + Gradient do_[0] = 0.022 +Backward Time Step 1: + Gradient di[0] = 0.009, df[0] = 0.007, dc_hat[0] = 0.010 + Gradient do_[0] = 0.176 +Backward Time Step 0: + Gradient di[0] = 0.020, df[0] = 0.016, dc_hat[0] = 0.028 + Gradient do_[0] = 0.199 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.132, c_hat[0] = 0.717 + c_state[0] = 0.374, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.441, f_gate[0] = 0.671, o_gate[0] = 0.106, c_hat[0] = 0.744 + c_state[0] = 0.579, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.391, f_gate[0] = 0.660, o_gate[0] = 0.096, c_hat[0] = 0.752 + c_state[0] = 0.676, h_state[0] = 0.056 +Time Step 3: + i_gate[0] = 0.346, f_gate[0] = 0.645, o_gate[0] = 0.092, c_hat[0] = 0.812 + c_state[0] = 0.717, h_state[0] = 0.057 +Time Step 4: + i_gate[0] = 0.314, f_gate[0] = 0.649, o_gate[0] = 0.088, c_hat[0] = 0.799 + c_state[0] = 0.716, h_state[0] = 0.054 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = -0.003, dc_hat[0] = -0.002 + Gradient do_[0] = -0.200 +Backward Time Step 3: + Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.003 + Gradient do_[0] = -0.284 +Backward Time Step 2: + Gradient di[0] = -0.000, df[0] = -0.000, dc_hat[0] = -0.000 + Gradient do_[0] = -0.153 +Backward Time Step 1: + Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.006 + Gradient do_[0] = 0.047 +Backward Time Step 0: + Gradient di[0] = 0.017, df[0] = 0.014, dc_hat[0] = 0.024 + Gradient do_[0] = 0.151 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.133, c_hat[0] = 0.717 + c_state[0] = 0.374, h_state[0] = 0.048 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.672, o_gate[0] = 0.106, c_hat[0] = 0.743 + c_state[0] = 0.579, h_state[0] = 0.056 +Time Step 2: + i_gate[0] = 0.390, f_gate[0] = 0.662, o_gate[0] = 0.097, c_hat[0] = 0.751 + c_state[0] = 0.676, h_state[0] = 0.057 +Time Step 3: + i_gate[0] = 0.344, f_gate[0] = 0.648, o_gate[0] = 0.093, c_hat[0] = 0.810 + c_state[0] = 0.717, h_state[0] = 0.057 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.652, o_gate[0] = 0.089, c_hat[0] = 0.797 + c_state[0] = 0.716, h_state[0] = 0.055 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.003 + Gradient do_[0] = -0.237 +Backward Time Step 3: + Gradient di[0] = -0.006, df[0] = -0.005, dc_hat[0] = -0.004 + Gradient do_[0] = -0.348 +Backward Time Step 2: + Gradient di[0] = -0.002, df[0] = -0.001, dc_hat[0] = -0.002 + Gradient do_[0] = -0.234 +Backward Time Step 1: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.004 + Gradient do_[0] = -0.016 +Backward Time Step 0: + Gradient di[0] = 0.015, df[0] = 0.013, dc_hat[0] = 0.022 + Gradient do_[0] = 0.126 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.687, o_gate[0] = 0.134, c_hat[0] = 0.717 + c_state[0] = 0.374, h_state[0] = 0.048 +Time Step 1: + i_gate[0] = 0.439, f_gate[0] = 0.672, o_gate[0] = 0.107, c_hat[0] = 0.743 + c_state[0] = 0.578, h_state[0] = 0.056 +Time Step 2: + i_gate[0] = 0.388, f_gate[0] = 0.663, o_gate[0] = 0.097, c_hat[0] = 0.749 + c_state[0] = 0.674, h_state[0] = 0.057 +Time Step 3: + i_gate[0] = 0.343, f_gate[0] = 0.650, o_gate[0] = 0.093, c_hat[0] = 0.807 + c_state[0] = 0.715, h_state[0] = 0.057 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.656, o_gate[0] = 0.090, c_hat[0] = 0.793 + c_state[0] = 0.715, h_state[0] = 0.055 +Backward Time Step 4: + Gradient di[0] = 0.012, df[0] = 0.010, dc_hat[0] = 0.008 + Gradient do_[0] = 0.773 +Backward Time Step 3: + Gradient di[0] = 0.019, df[0] = 0.015, dc_hat[0] = 0.012 + Gradient do_[0] = 1.095 +Backward Time Step 2: + Gradient di[0] = 0.030, df[0] = 0.025, dc_hat[0] = 0.028 + Gradient do_[0] = 1.401 +Backward Time Step 1: + Gradient di[0] = 0.038, df[0] = 0.030, dc_hat[0] = 0.040 + Gradient do_[0] = 1.224 +Backward Time Step 0: + Gradient di[0] = 0.043, df[0] = 0.036, dc_hat[0] = 0.061 + Gradient do_[0] = 0.599 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.128, c_hat[0] = 0.715 + c_state[0] = 0.373, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.442, f_gate[0] = 0.671, o_gate[0] = 0.101, c_hat[0] = 0.740 + c_state[0] = 0.577, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.393, f_gate[0] = 0.660, o_gate[0] = 0.091, c_hat[0] = 0.747 + c_state[0] = 0.674, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.349, f_gate[0] = 0.645, o_gate[0] = 0.087, c_hat[0] = 0.807 + c_state[0] = 0.716, h_state[0] = 0.053 +Time Step 4: + i_gate[0] = 0.317, f_gate[0] = 0.647, o_gate[0] = 0.082, c_hat[0] = 0.795 + c_state[0] = 0.715, h_state[0] = 0.051 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.002 + Gradient do_[0] = -0.166 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.003 + Gradient do_[0] = -0.225 +Backward Time Step 2: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = -0.080 +Backward Time Step 1: + Gradient di[0] = 0.007, df[0] = 0.005, dc_hat[0] = 0.007 + Gradient do_[0] = 0.102 +Backward Time Step 0: + Gradient di[0] = 0.017, df[0] = 0.014, dc_hat[0] = 0.025 + Gradient do_[0] = 0.172 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.129, c_hat[0] = 0.715 + c_state[0] = 0.373, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.442, f_gate[0] = 0.672, o_gate[0] = 0.102, c_hat[0] = 0.740 + c_state[0] = 0.577, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.393, f_gate[0] = 0.662, o_gate[0] = 0.092, c_hat[0] = 0.746 + c_state[0] = 0.675, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.349, f_gate[0] = 0.648, o_gate[0] = 0.088, c_hat[0] = 0.806 + c_state[0] = 0.719, h_state[0] = 0.054 +Time Step 4: + i_gate[0] = 0.317, f_gate[0] = 0.651, o_gate[0] = 0.083, c_hat[0] = 0.793 + c_state[0] = 0.719, h_state[0] = 0.051 +Backward Time Step 4: + Gradient di[0] = -0.008, df[0] = -0.007, dc_hat[0] = -0.005 + Gradient do_[0] = -0.522 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = -0.010, dc_hat[0] = -0.008 + Gradient do_[0] = -0.710 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = -0.007, dc_hat[0] = -0.008 + Gradient do_[0] = -0.602 +Backward Time Step 1: + Gradient di[0] = -0.003, df[0] = -0.003, dc_hat[0] = -0.003 + Gradient do_[0] = -0.274 +Backward Time Step 0: + Gradient di[0] = 0.010, df[0] = 0.008, dc_hat[0] = 0.014 + Gradient do_[0] = 0.039 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.132, c_hat[0] = 0.715 + c_state[0] = 0.373, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.672, o_gate[0] = 0.104, c_hat[0] = 0.739 + c_state[0] = 0.576, h_state[0] = 0.054 +Time Step 2: + i_gate[0] = 0.390, f_gate[0] = 0.664, o_gate[0] = 0.094, c_hat[0] = 0.744 + c_state[0] = 0.673, h_state[0] = 0.055 +Time Step 3: + i_gate[0] = 0.345, f_gate[0] = 0.652, o_gate[0] = 0.090, c_hat[0] = 0.803 + c_state[0] = 0.716, h_state[0] = 0.056 +Time Step 4: + i_gate[0] = 0.313, f_gate[0] = 0.658, o_gate[0] = 0.086, c_hat[0] = 0.787 + c_state[0] = 0.718, h_state[0] = 0.053 +Backward Time Step 4: + Gradient di[0] = -0.000, df[0] = -0.000, dc_hat[0] = -0.000 + Gradient do_[0] = 0.002 +Backward Time Step 3: + Gradient di[0] = -0.001, df[0] = -0.000, dc_hat[0] = -0.000 + Gradient do_[0] = -0.012 +Backward Time Step 2: + Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.005 + Gradient do_[0] = 0.140 +Backward Time Step 1: + Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.012 + Gradient do_[0] = 0.262 +Backward Time Step 0: + Gradient di[0] = 0.021, df[0] = 0.017, dc_hat[0] = 0.030 + Gradient do_[0] = 0.229 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.128, c_hat[0] = 0.715 + c_state[0] = 0.373, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.441, f_gate[0] = 0.672, o_gate[0] = 0.101, c_hat[0] = 0.738 + c_state[0] = 0.576, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.393, f_gate[0] = 0.662, o_gate[0] = 0.091, c_hat[0] = 0.743 + c_state[0] = 0.673, h_state[0] = 0.053 +Time Step 3: + i_gate[0] = 0.348, f_gate[0] = 0.648, o_gate[0] = 0.087, c_hat[0] = 0.803 + c_state[0] = 0.715, h_state[0] = 0.053 +Time Step 4: + i_gate[0] = 0.316, f_gate[0] = 0.652, o_gate[0] = 0.082, c_hat[0] = 0.789 + c_state[0] = 0.716, h_state[0] = 0.050 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.003 + Gradient do_[0] = -0.312 +Backward Time Step 3: + Gradient di[0] = -0.007, df[0] = -0.006, dc_hat[0] = -0.005 + Gradient do_[0] = -0.416 +Backward Time Step 2: + Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.003 + Gradient do_[0] = -0.286 +Backward Time Step 1: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.003 + Gradient do_[0] = -0.049 +Backward Time Step 0: + Gradient di[0] = 0.014, df[0] = 0.012, dc_hat[0] = 0.020 + Gradient do_[0] = 0.116 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.131, c_hat[0] = 0.715 + c_state[0] = 0.373, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.673, o_gate[0] = 0.103, c_hat[0] = 0.738 + c_state[0] = 0.576, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.391, f_gate[0] = 0.665, o_gate[0] = 0.093, c_hat[0] = 0.742 + c_state[0] = 0.673, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.346, f_gate[0] = 0.653, o_gate[0] = 0.089, c_hat[0] = 0.801 + c_state[0] = 0.716, h_state[0] = 0.055 +Time Step 4: + i_gate[0] = 0.314, f_gate[0] = 0.659, o_gate[0] = 0.085, c_hat[0] = 0.784 + c_state[0] = 0.718, h_state[0] = 0.052 +Backward Time Step 4: + Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.002 + Gradient do_[0] = -0.165 +Backward Time Step 3: + Gradient di[0] = -0.004, df[0] = -0.003, dc_hat[0] = -0.003 + Gradient do_[0] = -0.237 +Backward Time Step 2: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = -0.103 +Backward Time Step 1: + Gradient di[0] = 0.006, df[0] = 0.005, dc_hat[0] = 0.007 + Gradient do_[0] = 0.083 +Backward Time Step 0: + Gradient di[0] = 0.017, df[0] = 0.014, dc_hat[0] = 0.024 + Gradient do_[0] = 0.163 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.129, c_hat[0] = 0.715 + c_state[0] = 0.373, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.672, o_gate[0] = 0.101, c_hat[0] = 0.737 + c_state[0] = 0.575, h_state[0] = 0.052 +Time Step 2: + i_gate[0] = 0.391, f_gate[0] = 0.664, o_gate[0] = 0.091, c_hat[0] = 0.740 + c_state[0] = 0.672, h_state[0] = 0.053 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.652, o_gate[0] = 0.087, c_hat[0] = 0.799 + c_state[0] = 0.715, h_state[0] = 0.053 +Time Step 4: + i_gate[0] = 0.316, f_gate[0] = 0.657, o_gate[0] = 0.082, c_hat[0] = 0.783 + c_state[0] = 0.717, h_state[0] = 0.051 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = -0.004, dc_hat[0] = -0.003 + Gradient do_[0] = -0.295 +Backward Time Step 3: + Gradient di[0] = -0.007, df[0] = -0.006, dc_hat[0] = -0.005 + Gradient do_[0] = -0.400 +Backward Time Step 2: + Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.003 + Gradient do_[0] = -0.274 +Backward Time Step 1: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.003 + Gradient do_[0] = -0.040 +Backward Time Step 0: + Gradient di[0] = 0.014, df[0] = 0.012, dc_hat[0] = 0.020 + Gradient do_[0] = 0.118 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.131, c_hat[0] = 0.715 + c_state[0] = 0.373, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.439, f_gate[0] = 0.673, o_gate[0] = 0.103, c_hat[0] = 0.737 + c_state[0] = 0.574, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.389, f_gate[0] = 0.666, o_gate[0] = 0.093, c_hat[0] = 0.739 + c_state[0] = 0.670, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.345, f_gate[0] = 0.655, o_gate[0] = 0.089, c_hat[0] = 0.797 + c_state[0] = 0.714, h_state[0] = 0.054 +Time Step 4: + i_gate[0] = 0.313, f_gate[0] = 0.663, o_gate[0] = 0.085, c_hat[0] = 0.778 + c_state[0] = 0.717, h_state[0] = 0.052 +Backward Time Step 4: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.074 +Backward Time Step 3: + Gradient di[0] = 0.001, df[0] = 0.001, dc_hat[0] = 0.001 + Gradient do_[0] = 0.082 +Backward Time Step 2: + Gradient di[0] = 0.007, df[0] = 0.006, dc_hat[0] = 0.007 + Gradient do_[0] = 0.239 +Backward Time Step 1: + Gradient di[0] = 0.013, df[0] = 0.010, dc_hat[0] = 0.014 + Gradient do_[0] = 0.332 +Backward Time Step 0: + Gradient di[0] = 0.022, df[0] = 0.018, dc_hat[0] = 0.032 + Gradient do_[0] = 0.253 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.126, c_hat[0] = 0.714 + c_state[0] = 0.373, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.672, o_gate[0] = 0.099, c_hat[0] = 0.736 + c_state[0] = 0.574, h_state[0] = 0.051 +Time Step 2: + i_gate[0] = 0.392, f_gate[0] = 0.664, o_gate[0] = 0.089, c_hat[0] = 0.739 + c_state[0] = 0.671, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.348, f_gate[0] = 0.651, o_gate[0] = 0.085, c_hat[0] = 0.798 + c_state[0] = 0.714, h_state[0] = 0.052 +Time Step 4: + i_gate[0] = 0.316, f_gate[0] = 0.656, o_gate[0] = 0.080, c_hat[0] = 0.781 + c_state[0] = 0.716, h_state[0] = 0.049 +Backward Time Step 4: + Gradient di[0] = -0.004, df[0] = -0.004, dc_hat[0] = -0.003 + Gradient do_[0] = -0.298 +Backward Time Step 3: + Gradient di[0] = -0.007, df[0] = -0.005, dc_hat[0] = -0.005 + Gradient do_[0] = -0.393 +Backward Time Step 2: + Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.002 + Gradient do_[0] = -0.261 +Backward Time Step 1: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.003 + Gradient do_[0] = -0.032 +Backward Time Step 0: + Gradient di[0] = 0.014, df[0] = 0.012, dc_hat[0] = 0.020 + Gradient do_[0] = 0.120 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.130, c_hat[0] = 0.715 + c_state[0] = 0.373, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.439, f_gate[0] = 0.674, o_gate[0] = 0.102, c_hat[0] = 0.737 + c_state[0] = 0.574, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.389, f_gate[0] = 0.667, o_gate[0] = 0.092, c_hat[0] = 0.739 + c_state[0] = 0.671, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.345, f_gate[0] = 0.657, o_gate[0] = 0.088, c_hat[0] = 0.797 + c_state[0] = 0.715, h_state[0] = 0.054 +Time Step 4: + i_gate[0] = 0.314, f_gate[0] = 0.664, o_gate[0] = 0.084, c_hat[0] = 0.778 + c_state[0] = 0.719, h_state[0] = 0.052 +Backward Time Step 4: + Gradient di[0] = -0.006, df[0] = -0.005, dc_hat[0] = -0.004 + Gradient do_[0] = -0.396 +Backward Time Step 3: + Gradient di[0] = -0.009, df[0] = -0.008, dc_hat[0] = -0.006 + Gradient do_[0] = -0.548 +Backward Time Step 2: + Gradient di[0] = -0.005, df[0] = -0.005, dc_hat[0] = -0.006 + Gradient do_[0] = -0.439 +Backward Time Step 1: + Gradient di[0] = -0.000, df[0] = -0.000, dc_hat[0] = -0.000 + Gradient do_[0] = -0.158 +Backward Time Step 0: + Gradient di[0] = 0.012, df[0] = 0.010, dc_hat[0] = 0.017 + Gradient do_[0] = 0.075 +Time Step 0: + i_gate[0] = 0.522, f_gate[0] = 0.686, o_gate[0] = 0.133, c_hat[0] = 0.715 + c_state[0] = 0.373, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.437, f_gate[0] = 0.674, o_gate[0] = 0.104, c_hat[0] = 0.736 + c_state[0] = 0.573, h_state[0] = 0.054 +Time Step 2: + i_gate[0] = 0.386, f_gate[0] = 0.669, o_gate[0] = 0.094, c_hat[0] = 0.737 + c_state[0] = 0.668, h_state[0] = 0.055 +Time Step 3: + i_gate[0] = 0.341, f_gate[0] = 0.660, o_gate[0] = 0.090, c_hat[0] = 0.792 + c_state[0] = 0.711, h_state[0] = 0.055 +Time Step 4: + i_gate[0] = 0.310, f_gate[0] = 0.670, o_gate[0] = 0.086, c_hat[0] = 0.772 + c_state[0] = 0.715, h_state[0] = 0.053 +Backward Time Step 4: + Gradient di[0] = 0.131, df[0] = 0.118, dc_hat[0] = 0.099 + Gradient do_[0] = 9.057 +Backward Time Step 3: + Gradient di[0] = 0.196, df[0] = 0.163, dc_hat[0] = 0.140 + Gradient do_[0] = 11.981 +Backward Time Step 2: + Gradient di[0] = 0.241, df[0] = 0.204, dc_hat[0] = 0.243 + Gradient do_[0] = 12.870 +Backward Time Step 1: + Gradient di[0] = 0.255, df[0] = 0.209, dc_hat[0] = 0.283 + Gradient do_[0] = 9.439 +Backward Time Step 0: + Gradient di[0] = 0.214, df[0] = 0.178, dc_hat[0] = 0.306 + Gradient do_[0] = 3.567 +Time Step 0: + i_gate[0] = 0.517, f_gate[0] = 0.682, o_gate[0] = 0.127, c_hat[0] = 0.699 + c_state[0] = 0.362, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.435, f_gate[0] = 0.669, o_gate[0] = 0.099, c_hat[0] = 0.721 + c_state[0] = 0.556, h_state[0] = 0.050 +Time Step 2: + i_gate[0] = 0.387, f_gate[0] = 0.661, o_gate[0] = 0.089, c_hat[0] = 0.723 + c_state[0] = 0.647, h_state[0] = 0.051 +Time Step 3: + i_gate[0] = 0.342, f_gate[0] = 0.650, o_gate[0] = 0.085, c_hat[0] = 0.783 + c_state[0] = 0.688, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.310, f_gate[0] = 0.656, o_gate[0] = 0.080, c_hat[0] = 0.763 + c_state[0] = 0.688, h_state[0] = 0.048 +Backward Time Step 4: + Gradient di[0] = 0.004, df[0] = 0.003, dc_hat[0] = 0.003 + Gradient do_[0] = 0.284 +Backward Time Step 3: + Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.004 + Gradient do_[0] = 0.351 +Backward Time Step 2: + Gradient di[0] = 0.012, df[0] = 0.010, dc_hat[0] = 0.012 + Gradient do_[0] = 0.508 +Backward Time Step 1: + Gradient di[0] = 0.017, df[0] = 0.014, dc_hat[0] = 0.020 + Gradient do_[0] = 0.509 +Backward Time Step 0: + Gradient di[0] = 0.025, df[0] = 0.021, dc_hat[0] = 0.037 + Gradient do_[0] = 0.306 +Time Step 0: + i_gate[0] = 0.517, f_gate[0] = 0.682, o_gate[0] = 0.122, c_hat[0] = 0.698 + c_state[0] = 0.361, h_state[0] = 0.042 +Time Step 1: + i_gate[0] = 0.437, f_gate[0] = 0.668, o_gate[0] = 0.095, c_hat[0] = 0.719 + c_state[0] = 0.555, h_state[0] = 0.048 +Time Step 2: + i_gate[0] = 0.390, f_gate[0] = 0.659, o_gate[0] = 0.085, c_hat[0] = 0.722 + c_state[0] = 0.648, h_state[0] = 0.048 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.646, o_gate[0] = 0.080, c_hat[0] = 0.784 + c_state[0] = 0.690, h_state[0] = 0.048 +Time Step 4: + i_gate[0] = 0.315, f_gate[0] = 0.649, o_gate[0] = 0.075, c_hat[0] = 0.767 + c_state[0] = 0.690, h_state[0] = 0.045 +Backward Time Step 4: + Gradient di[0] = -0.008, df[0] = -0.007, dc_hat[0] = -0.006 + Gradient do_[0] = -0.590 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = -0.010, dc_hat[0] = -0.009 + Gradient do_[0] = -0.736 +Backward Time Step 2: + Gradient di[0] = -0.008, df[0] = -0.007, dc_hat[0] = -0.009 + Gradient do_[0] = -0.587 +Backward Time Step 1: + Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.003 + Gradient do_[0] = -0.250 +Backward Time Step 0: + Gradient di[0] = 0.009, df[0] = 0.008, dc_hat[0] = 0.014 + Gradient do_[0] = 0.046 +Time Step 0: + i_gate[0] = 0.517, f_gate[0] = 0.682, o_gate[0] = 0.126, c_hat[0] = 0.699 + c_state[0] = 0.361, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.435, f_gate[0] = 0.669, o_gate[0] = 0.099, c_hat[0] = 0.720 + c_state[0] = 0.555, h_state[0] = 0.050 +Time Step 2: + i_gate[0] = 0.387, f_gate[0] = 0.661, o_gate[0] = 0.088, c_hat[0] = 0.721 + c_state[0] = 0.647, h_state[0] = 0.050 +Time Step 3: + i_gate[0] = 0.344, f_gate[0] = 0.651, o_gate[0] = 0.084, c_hat[0] = 0.782 + c_state[0] = 0.689, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.656, o_gate[0] = 0.079, c_hat[0] = 0.762 + c_state[0] = 0.690, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = -0.005, df[0] = -0.004, dc_hat[0] = -0.004 + Gradient do_[0] = -0.324 +Backward Time Step 3: + Gradient di[0] = -0.007, df[0] = -0.006, dc_hat[0] = -0.006 + Gradient do_[0] = -0.425 +Backward Time Step 2: + Gradient di[0] = -0.003, df[0] = -0.002, dc_hat[0] = -0.003 + Gradient do_[0] = -0.283 +Backward Time Step 1: + Gradient di[0] = 0.003, df[0] = 0.002, dc_hat[0] = 0.003 + Gradient do_[0] = -0.041 +Backward Time Step 0: + Gradient di[0] = 0.014, df[0] = 0.012, dc_hat[0] = 0.021 + Gradient do_[0] = 0.117 +Time Step 0: + i_gate[0] = 0.517, f_gate[0] = 0.682, o_gate[0] = 0.127, c_hat[0] = 0.699 + c_state[0] = 0.361, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.435, f_gate[0] = 0.669, o_gate[0] = 0.099, c_hat[0] = 0.719 + c_state[0] = 0.554, h_state[0] = 0.050 +Time Step 2: + i_gate[0] = 0.386, f_gate[0] = 0.663, o_gate[0] = 0.088, c_hat[0] = 0.719 + c_state[0] = 0.645, h_state[0] = 0.050 +Time Step 3: + i_gate[0] = 0.342, f_gate[0] = 0.653, o_gate[0] = 0.084, c_hat[0] = 0.779 + c_state[0] = 0.687, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.310, f_gate[0] = 0.659, o_gate[0] = 0.079, c_hat[0] = 0.758 + c_state[0] = 0.688, h_state[0] = 0.047 +Backward Time Step 4: + Gradient di[0] = 0.057, df[0] = 0.052, dc_hat[0] = 0.047 + Gradient do_[0] = 4.169 +Backward Time Step 3: + Gradient di[0] = 0.083, df[0] = 0.070, dc_hat[0] = 0.064 + Gradient do_[0] = 5.294 +Backward Time Step 2: + Gradient di[0] = 0.101, df[0] = 0.088, dc_hat[0] = 0.111 + Gradient do_[0] = 5.538 +Backward Time Step 1: + Gradient di[0] = 0.107, df[0] = 0.090, dc_hat[0] = 0.127 + Gradient do_[0] = 4.011 +Backward Time Step 0: + Gradient di[0] = 0.093, df[0] = 0.078, dc_hat[0] = 0.140 + Gradient do_[0] = 1.518 +Time Step 0: + i_gate[0] = 0.515, f_gate[0] = 0.680, o_gate[0] = 0.121, c_hat[0] = 0.692 + c_state[0] = 0.356, h_state[0] = 0.042 +Time Step 1: + i_gate[0] = 0.436, f_gate[0] = 0.666, o_gate[0] = 0.094, c_hat[0] = 0.711 + c_state[0] = 0.547, h_state[0] = 0.047 +Time Step 2: + i_gate[0] = 0.390, f_gate[0] = 0.658, o_gate[0] = 0.084, c_hat[0] = 0.711 + c_state[0] = 0.637, h_state[0] = 0.047 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.646, o_gate[0] = 0.079, c_hat[0] = 0.775 + c_state[0] = 0.680, h_state[0] = 0.046 +Time Step 4: + i_gate[0] = 0.315, f_gate[0] = 0.649, o_gate[0] = 0.073, c_hat[0] = 0.755 + c_state[0] = 0.679, h_state[0] = 0.043 +Backward Time Step 4: + Gradient di[0] = -0.008, df[0] = -0.007, dc_hat[0] = -0.007 + Gradient do_[0] = -0.589 +Backward Time Step 3: + Gradient di[0] = -0.012, df[0] = -0.010, dc_hat[0] = -0.009 + Gradient do_[0] = -0.727 +Backward Time Step 2: + Gradient di[0] = -0.007, df[0] = -0.007, dc_hat[0] = -0.008 + Gradient do_[0] = -0.557 +Backward Time Step 1: + Gradient di[0] = -0.002, df[0] = -0.002, dc_hat[0] = -0.002 + Gradient do_[0] = -0.216 +Backward Time Step 0: + Gradient di[0] = 0.010, df[0] = 0.009, dc_hat[0] = 0.016 + Gradient do_[0] = 0.061 +Time Step 0: + i_gate[0] = 0.515, f_gate[0] = 0.680, o_gate[0] = 0.123, c_hat[0] = 0.692 + c_state[0] = 0.357, h_state[0] = 0.042 +Time Step 1: + i_gate[0] = 0.434, f_gate[0] = 0.667, o_gate[0] = 0.095, c_hat[0] = 0.710 + c_state[0] = 0.546, h_state[0] = 0.047 +Time Step 2: + i_gate[0] = 0.388, f_gate[0] = 0.661, o_gate[0] = 0.085, c_hat[0] = 0.708 + c_state[0] = 0.635, h_state[0] = 0.048 +Time Step 3: + i_gate[0] = 0.345, f_gate[0] = 0.650, o_gate[0] = 0.080, c_hat[0] = 0.770 + c_state[0] = 0.679, h_state[0] = 0.047 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.656, o_gate[0] = 0.074, c_hat[0] = 0.748 + c_state[0] = 0.679, h_state[0] = 0.044 +Backward Time Step 4: + Gradient di[0] = 0.196, df[0] = 0.180, dc_hat[0] = 0.168 + Gradient do_[0] = 14.917 +Backward Time Step 3: + Gradient di[0] = 0.279, df[0] = 0.238, dc_hat[0] = 0.225 + Gradient do_[0] = 18.334 +Backward Time Step 2: + Gradient di[0] = 0.313, df[0] = 0.276, dc_hat[0] = 0.360 + Gradient do_[0] = 17.925 +Backward Time Step 1: + Gradient di[0] = 0.306, df[0] = 0.260, dc_hat[0] = 0.378 + Gradient do_[0] = 12.037 +Backward Time Step 0: + Gradient di[0] = 0.230, df[0] = 0.197, dc_hat[0] = 0.357 + Gradient do_[0] = 4.052 +Time Step 0: + i_gate[0] = 0.509, f_gate[0] = 0.675, o_gate[0] = 0.118, c_hat[0] = 0.671 + c_state[0] = 0.342, h_state[0] = 0.039 +Time Step 1: + i_gate[0] = 0.432, f_gate[0] = 0.660, o_gate[0] = 0.091, c_hat[0] = 0.686 + c_state[0] = 0.521, h_state[0] = 0.044 +Time Step 2: + i_gate[0] = 0.387, f_gate[0] = 0.651, o_gate[0] = 0.081, c_hat[0] = 0.684 + c_state[0] = 0.604, h_state[0] = 0.044 +Time Step 3: + i_gate[0] = 0.345, f_gate[0] = 0.638, o_gate[0] = 0.075, c_hat[0] = 0.752 + c_state[0] = 0.645, h_state[0] = 0.043 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.641, o_gate[0] = 0.069, c_hat[0] = 0.730 + c_state[0] = 0.641, h_state[0] = 0.039 +Backward Time Step 4: + Gradient di[0] = 0.117, df[0] = 0.110, dc_hat[0] = 0.109 + Gradient do_[0] = 8.973 +Backward Time Step 3: + Gradient di[0] = 0.161, df[0] = 0.140, dc_hat[0] = 0.142 + Gradient do_[0] = 10.594 +Backward Time Step 2: + Gradient di[0] = 0.171, df[0] = 0.156, dc_hat[0] = 0.217 + Gradient do_[0] = 9.845 +Backward Time Step 1: + Gradient di[0] = 0.157, df[0] = 0.139, dc_hat[0] = 0.214 + Gradient do_[0] = 6.245 +Backward Time Step 0: + Gradient di[0] = 0.115, df[0] = 0.101, dc_hat[0] = 0.192 + Gradient do_[0] = 2.017 +Time Step 0: + i_gate[0] = 0.505, f_gate[0] = 0.670, o_gate[0] = 0.113, c_hat[0] = 0.652 + c_state[0] = 0.329, h_state[0] = 0.036 +Time Step 1: + i_gate[0] = 0.431, f_gate[0] = 0.654, o_gate[0] = 0.087, c_hat[0] = 0.664 + c_state[0] = 0.502, h_state[0] = 0.040 +Time Step 2: + i_gate[0] = 0.388, f_gate[0] = 0.644, o_gate[0] = 0.077, c_hat[0] = 0.660 + c_state[0] = 0.579, h_state[0] = 0.040 +Time Step 3: + i_gate[0] = 0.348, f_gate[0] = 0.629, o_gate[0] = 0.072, c_hat[0] = 0.735 + c_state[0] = 0.620, h_state[0] = 0.040 +Time Step 4: + i_gate[0] = 0.316, f_gate[0] = 0.629, o_gate[0] = 0.065, c_hat[0] = 0.715 + c_state[0] = 0.616, h_state[0] = 0.036 +Backward Time Step 4: + Gradient di[0] = -56.088, df[0] = -53.271, dc_hat[0] = -55.975 + Gradient do_[0] = -4363.406 +Backward Time Step 3: + Gradient di[0] = -74.442, df[0] = -65.558, dc_hat[0] = -71.455 + Gradient do_[0] = -4935.278 +Backward Time Step 2: + Gradient di[0] = -70.799, df[0] = -66.648, dc_hat[0] = -98.798 + Gradient do_[0] = -4198.427 +Backward Time Step 1: + Gradient di[0] = -57.482, df[0] = -52.277, dc_hat[0] = -85.194 + Gradient do_[0] = -2388.240 +Backward Time Step 0: + Gradient di[0] = -34.825, df[0] = -31.658, dc_hat[0] = -62.105 + Gradient do_[0] = -669.594 +Time Step 0: + i_gate[0] = 0.518, f_gate[0] = 0.681, o_gate[0] = 0.118, c_hat[0] = 0.680 + c_state[0] = 0.352, h_state[0] = 0.040 +Time Step 1: + i_gate[0] = 0.441, f_gate[0] = 0.668, o_gate[0] = 0.092, c_hat[0] = 0.697 + c_state[0] = 0.542, h_state[0] = 0.046 +Time Step 2: + i_gate[0] = 0.396, f_gate[0] = 0.662, o_gate[0] = 0.082, c_hat[0] = 0.697 + c_state[0] = 0.635, h_state[0] = 0.046 +Time Step 3: + i_gate[0] = 0.355, f_gate[0] = 0.651, o_gate[0] = 0.077, c_hat[0] = 0.763 + c_state[0] = 0.685, h_state[0] = 0.046 +Time Step 4: + i_gate[0] = 0.324, f_gate[0] = 0.655, o_gate[0] = 0.071, c_hat[0] = 0.744 + c_state[0] = 0.689, h_state[0] = 0.042 +Backward Time Step 4: + Gradient di[0] = 0.109, df[0] = 0.099, dc_hat[0] = 0.097 + Gradient do_[0] = 8.840 +Backward Time Step 3: + Gradient di[0] = 0.148, df[0] = 0.125, dc_hat[0] = 0.125 + Gradient do_[0] = 10.154 +Backward Time Step 2: + Gradient di[0] = 0.158, df[0] = 0.140, dc_hat[0] = 0.193 + Gradient do_[0] = 9.353 +Backward Time Step 1: + Gradient di[0] = 0.145, df[0] = 0.125, dc_hat[0] = 0.191 + Gradient do_[0] = 5.876 +Backward Time Step 0: + Gradient di[0] = 0.105, df[0] = 0.092, dc_hat[0] = 0.173 + Gradient do_[0] = 1.881 +Time Step 0: + i_gate[0] = 0.514, f_gate[0] = 0.677, o_gate[0] = 0.113, c_hat[0] = 0.661 + c_state[0] = 0.340, h_state[0] = 0.037 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.663, o_gate[0] = 0.088, c_hat[0] = 0.676 + c_state[0] = 0.523, h_state[0] = 0.042 +Time Step 2: + i_gate[0] = 0.397, f_gate[0] = 0.655, o_gate[0] = 0.078, c_hat[0] = 0.675 + c_state[0] = 0.611, h_state[0] = 0.043 +Time Step 3: + i_gate[0] = 0.358, f_gate[0] = 0.642, o_gate[0] = 0.073, c_hat[0] = 0.748 + c_state[0] = 0.660, h_state[0] = 0.042 +Time Step 4: + i_gate[0] = 0.326, f_gate[0] = 0.644, o_gate[0] = 0.066, c_hat[0] = 0.729 + c_state[0] = 0.663, h_state[0] = 0.039 +Backward Time Step 4: + Gradient di[0] = 0.072, df[0] = 0.067, dc_hat[0] = 0.069 + Gradient do_[0] = 5.966 +Backward Time Step 3: + Gradient di[0] = 0.093, df[0] = 0.080, dc_hat[0] = 0.086 + Gradient do_[0] = 6.507 +Backward Time Step 2: + Gradient di[0] = 0.094, df[0] = 0.086, dc_hat[0] = 0.125 + Gradient do_[0] = 5.621 +Backward Time Step 1: + Gradient di[0] = 0.081, df[0] = 0.072, dc_hat[0] = 0.116 + Gradient do_[0] = 3.310 +Backward Time Step 0: + Gradient di[0] = 0.059, df[0] = 0.053, dc_hat[0] = 0.103 + Gradient do_[0] = 1.045 +Time Step 0: + i_gate[0] = 0.512, f_gate[0] = 0.674, o_gate[0] = 0.111, c_hat[0] = 0.645 + c_state[0] = 0.330, h_state[0] = 0.035 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.658, o_gate[0] = 0.086, c_hat[0] = 0.658 + c_state[0] = 0.506, h_state[0] = 0.040 +Time Step 2: + i_gate[0] = 0.399, f_gate[0] = 0.647, o_gate[0] = 0.076, c_hat[0] = 0.657 + c_state[0] = 0.590, h_state[0] = 0.040 +Time Step 3: + i_gate[0] = 0.360, f_gate[0] = 0.631, o_gate[0] = 0.071, c_hat[0] = 0.735 + c_state[0] = 0.637, h_state[0] = 0.040 +Time Step 4: + i_gate[0] = 0.329, f_gate[0] = 0.629, o_gate[0] = 0.064, c_hat[0] = 0.720 + c_state[0] = 0.637, h_state[0] = 0.036 +Backward Time Step 4: + Gradient di[0] = -8.832, df[0] = -8.159, dc_hat[0] = -8.814 + Gradient do_[0] = -713.266 +Backward Time Step 3: + Gradient di[0] = -11.515, df[0] = -9.987, dc_hat[0] = -11.228 + Gradient do_[0] = -788.504 +Backward Time Step 2: + Gradient di[0] = -10.567, df[0] = -9.907, dc_hat[0] = -15.177 + Gradient do_[0] = -645.772 +Backward Time Step 1: + Gradient di[0] = -8.282, df[0] = -7.570, dc_hat[0] = -12.760 + Gradient do_[0] = -354.749 +Backward Time Step 0: + Gradient di[0] = -4.946, df[0] = -4.548, dc_hat[0] = -9.188 + Gradient do_[0] = -98.564 +Time Step 0: + i_gate[0] = 0.500, f_gate[0] = 0.663, o_gate[0] = 0.106, c_hat[0] = 0.615 + c_state[0] = 0.307, h_state[0] = 0.032 +Time Step 1: + i_gate[0] = 0.427, f_gate[0] = 0.647, o_gate[0] = 0.081, c_hat[0] = 0.619 + c_state[0] = 0.463, h_state[0] = 0.035 +Time Step 2: + i_gate[0] = 0.387, f_gate[0] = 0.636, o_gate[0] = 0.072, c_hat[0] = 0.610 + c_state[0] = 0.531, h_state[0] = 0.035 +Time Step 3: + i_gate[0] = 0.349, f_gate[0] = 0.621, o_gate[0] = 0.067, c_hat[0] = 0.688 + c_state[0] = 0.570, h_state[0] = 0.034 +Time Step 4: + i_gate[0] = 0.319, f_gate[0] = 0.620, o_gate[0] = 0.060, c_hat[0] = 0.660 + c_state[0] = 0.564, h_state[0] = 0.031 +Backward Time Step 4: + Gradient di[0] = -10177084.000, df[0] = -10364855.000, dc_hat[0] = -12777321.000 + Gradient do_[0] = -811651072.000 +Backward Time Step 3: + Gradient di[0] = -13232801.000, df[0] = -12364564.000, dc_hat[0] = -15529369.000 + Gradient do_[0] = -888755200.000 +Backward Time Step 2: + Gradient di[0] = -12179485.000, df[0] = -12395052.000, dc_hat[0] = -20466140.000 + Gradient do_[0] = -744923136.000 +Backward Time Step 1: + Gradient di[0] = -9939050.000, df[0] = -9687769.000, dc_hat[0] = -17272750.000 + Gradient do_[0] = -429558976.000 +Backward Time Step 0: + Gradient di[0] = -6349600.000, df[0] = -6121459.000, dc_hat[0] = -12844551.000 + Gradient do_[0] = -127568376.000 +Time Step 0: + i_gate[0] = 0.512, f_gate[0] = 0.674, o_gate[0] = 0.111, c_hat[0] = 0.645 + c_state[0] = 0.330, h_state[0] = 0.035 +Time Step 1: + i_gate[0] = 0.438, f_gate[0] = 0.664, o_gate[0] = 0.086, c_hat[0] = 0.649 + c_state[0] = 0.504, h_state[0] = 0.040 +Time Step 2: + i_gate[0] = 0.398, f_gate[0] = 0.658, o_gate[0] = 0.077, c_hat[0] = 0.635 + c_state[0] = 0.585, h_state[0] = 0.040 +Time Step 3: + i_gate[0] = 0.360, f_gate[0] = 0.648, o_gate[0] = 0.072, c_hat[0] = 0.702 + c_state[0] = 0.632, h_state[0] = 0.040 +Time Step 4: + i_gate[0] = 0.332, f_gate[0] = 0.652, o_gate[0] = 0.066, c_hat[0] = 0.666 + c_state[0] = 0.633, h_state[0] = 0.037 +Backward Time Step 4: + Gradient di[0] = 1830121.000, df[0] = 1833165.750, dc_hat[0] = 2289939.750 + Gradient do_[0] = 153543648.000 +Backward Time Step 3: + Gradient di[0] = 2171240.000, df[0] = 1982507.250, dc_hat[0] = 2450665.000 + Gradient do_[0] = 152030768.000 +Backward Time Step 2: + Gradient di[0] = 2050281.875, df[0] = 1994436.375, dc_hat[0] = 3197066.500 + Gradient do_[0] = 127804216.000 +Backward Time Step 1: + Gradient di[0] = 1785827.250, df[0] = 1654820.375, dc_hat[0] = 2832199.250 + Gradient do_[0] = 77136624.000 +Backward Time Step 0: + Gradient di[0] = 1196132.250, df[0] = 1099798.750, dc_hat[0] = 2222246.250 + Gradient do_[0] = 23799746.000 +Time Step 0: + i_gate[0] = 0.500, f_gate[0] = 0.663, o_gate[0] = 0.106, c_hat[0] = 0.615 + c_state[0] = 0.307, h_state[0] = 0.032 +Time Step 1: + i_gate[0] = 0.428, f_gate[0] = 0.648, o_gate[0] = 0.082, c_hat[0] = 0.615 + c_state[0] = 0.462, h_state[0] = 0.035 +Time Step 2: + i_gate[0] = 0.389, f_gate[0] = 0.639, o_gate[0] = 0.073, c_hat[0] = 0.601 + c_state[0] = 0.529, h_state[0] = 0.035 +Time Step 3: + i_gate[0] = 0.352, f_gate[0] = 0.626, o_gate[0] = 0.068, c_hat[0] = 0.676 + c_state[0] = 0.569, h_state[0] = 0.035 +Time Step 4: + i_gate[0] = 0.323, f_gate[0] = 0.627, o_gate[0] = 0.062, c_hat[0] = 0.640 + c_state[0] = 0.564, h_state[0] = 0.032 +Backward Time Step 4: + Gradient di[0] = -14507731.000, df[0] = -15202469.000, dc_hat[0] = -19762446.000 + Gradient do_[0] = -1152882304.000 +Backward Time Step 3: + Gradient di[0] = -17390244.000, df[0] = -16539071.000, dc_hat[0] = -21584336.000 + Gradient do_[0] = -1158783104.000 +Backward Time Step 2: + Gradient di[0] = -15275506.000, df[0] = -15774069.000, dc_hat[0] = -26616044.000 + Gradient do_[0] = -929913600.000 +Backward Time Step 1: + Gradient di[0] = -12382044.000, df[0] = -12159019.000, dc_hat[0] = -21929236.000 + Gradient do_[0] = -533257888.000 +Backward Time Step 0: + Gradient di[0] = -7962733.000, df[0] = -7676632.500, dc_hat[0] = -16107743.000 + Gradient do_[0] = -159977440.000 +Time Step 0: + i_gate[0] = 0.512, f_gate[0] = 0.674, o_gate[0] = 0.111, c_hat[0] = 0.645 + c_state[0] = 0.330, h_state[0] = 0.035 +Time Step 1: + i_gate[0] = 0.440, f_gate[0] = 0.667, o_gate[0] = 0.087, c_hat[0] = 0.643 + c_state[0] = 0.503, h_state[0] = 0.040 +Time Step 2: + i_gate[0] = 0.401, f_gate[0] = 0.663, o_gate[0] = 0.078, c_hat[0] = 0.622 + c_state[0] = 0.583, h_state[0] = 0.041 +Time Step 3: + i_gate[0] = 0.366, f_gate[0] = 0.657, o_gate[0] = 0.074, c_hat[0] = 0.683 + c_state[0] = 0.633, h_state[0] = 0.042 +Time Step 4: + i_gate[0] = 0.339, f_gate[0] = 0.665, o_gate[0] = 0.069, c_hat[0] = 0.635 + c_state[0] = 0.636, h_state[0] = 0.039 +Backward Time Step 4: + Gradient di[0] = -5284627.500, df[0] = -5499826.000, dc_hat[0] = -7516888.500 + Gradient do_[0] = -443141696.000 +Backward Time Step 3: + Gradient di[0] = -5776481.500, df[0] = -5396417.000, dc_hat[0] = -7108135.500 + Gradient do_[0] = -401221184.000 +Backward Time Step 2: + Gradient di[0] = -4912044.500, df[0] = -4865726.000, dc_hat[0] = -8071287.500 + Gradient do_[0] = -304769280.000 +Backward Time Step 1: + Gradient di[0] = -4016747.000, df[0] = -3753353.750, dc_hat[0] = -6529669.500 + Gradient do_[0] = -172796720.000 +Backward Time Step 0: + Gradient di[0] = -2605996.250, df[0] = -2396115.750, dc_hat[0] = -4841576.500 + Gradient do_[0] = -51852164.000 +Time Step 0: + i_gate[0] = 0.524, f_gate[0] = 0.685, o_gate[0] = 0.116, c_hat[0] = 0.673 + c_state[0] = 0.353, h_state[0] = 0.039 +Time Step 1: + i_gate[0] = 0.450, f_gate[0] = 0.682, o_gate[0] = 0.091, c_hat[0] = 0.674 + c_state[0] = 0.544, h_state[0] = 0.045 +Time Step 2: + i_gate[0] = 0.410, f_gate[0] = 0.682, o_gate[0] = 0.082, c_hat[0] = 0.653 + c_state[0] = 0.639, h_state[0] = 0.046 +Time Step 3: + i_gate[0] = 0.375, f_gate[0] = 0.678, o_gate[0] = 0.078, c_hat[0] = 0.707 + c_state[0] = 0.699, h_state[0] = 0.047 +Time Step 4: + i_gate[0] = 0.350, f_gate[0] = 0.689, o_gate[0] = 0.073, c_hat[0] = 0.661 + c_state[0] = 0.712, h_state[0] = 0.045 +Backward Time Step 4: + Gradient di[0] = -394027.438, df[0] = -387097.312, dc_hat[0] = -516322.500 + Gradient do_[0] = -34995756.000 +Backward Time Step 3: + Gradient di[0] = -386826.500, df[0] = -345401.062, dc_hat[0] = -437262.750 + Gradient do_[0] = -28305248.000 +Backward Time Step 2: + Gradient di[0] = -327092.719, df[0] = -306033.969, dc_hat[0] = -486670.875 + Gradient do_[0] = -20845446.000 +Backward Time Step 1: + Gradient di[0] = -276198.375, df[0] = -244745.938, dc_hat[0] = -405795.938 + Gradient do_[0] = -11949717.000 +Backward Time Step 0: + Gradient di[0] = -190172.344, df[0] = -167523.609, dc_hat[0] = -325352.812 + Gradient do_[0] = -3752702.250 +Time Step 0: + i_gate[0] = 0.537, f_gate[0] = 0.696, o_gate[0] = 0.121, c_hat[0] = 0.699 + c_state[0] = 0.375, h_state[0] = 0.043 +Time Step 1: + i_gate[0] = 0.460, f_gate[0] = 0.698, o_gate[0] = 0.096, c_hat[0] = 0.705 + c_state[0] = 0.586, h_state[0] = 0.051 +Time Step 2: + i_gate[0] = 0.420, f_gate[0] = 0.701, o_gate[0] = 0.087, c_hat[0] = 0.686 + c_state[0] = 0.698, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.384, f_gate[0] = 0.700, o_gate[0] = 0.083, c_hat[0] = 0.734 + c_state[0] = 0.771, h_state[0] = 0.054 +Time Step 4: + i_gate[0] = 0.360, f_gate[0] = 0.713, o_gate[0] = 0.079, c_hat[0] = 0.693 + c_state[0] = 0.799, h_state[0] = 0.052 +Backward Time Step 4: + Gradient di[0] = -612.691, df[0] = -559.560, dc_hat[0] = -717.083 + Gradient do_[0] = -57677.473 +Backward Time Step 3: + Gradient di[0] = -840.684, df[0] = -711.329, dc_hat[0] = -856.875 + Gradient do_[0] = -64818.344 +Backward Time Step 2: + Gradient di[0] = -862.133, df[0] = -758.825, dc_hat[0] = -1148.269 + Gradient do_[0] = -56389.848 +Backward Time Step 1: + Gradient di[0] = -830.599, df[0] = -697.583, dc_hat[0] = -1095.456 + Gradient do_[0] = -36096.195 +Backward Time Step 0: + Gradient di[0] = -627.929, df[0] = -532.063, dc_hat[0] = -991.501 + Gradient do_[0] = -12305.578 +Time Step 0: + i_gate[0] = 0.527, f_gate[0] = 0.686, o_gate[0] = 0.116, c_hat[0] = 0.673 + c_state[0] = 0.355, h_state[0] = 0.040 +Time Step 1: + i_gate[0] = 0.448, f_gate[0] = 0.689, o_gate[0] = 0.090, c_hat[0] = 0.672 + c_state[0] = 0.546, h_state[0] = 0.045 +Time Step 2: + i_gate[0] = 0.408, f_gate[0] = 0.694, o_gate[0] = 0.081, c_hat[0] = 0.639 + c_state[0] = 0.640, h_state[0] = 0.046 +Time Step 3: + i_gate[0] = 0.374, f_gate[0] = 0.696, o_gate[0] = 0.078, c_hat[0] = 0.680 + c_state[0] = 0.700, h_state[0] = 0.047 +Time Step 4: + i_gate[0] = 0.352, f_gate[0] = 0.712, o_gate[0] = 0.074, c_hat[0] = 0.617 + c_state[0] = 0.715, h_state[0] = 0.046 +Backward Time Step 4: + Gradient di[0] = -30159064.000, df[0] = -31308278.000, dc_hat[0] = -46714880.000 + Gradient do_[0] = -2836994048.000 +Backward Time Step 3: + Gradient di[0] = -33148728.000, df[0] = -30640866.000, dc_hat[0] = -41779908.000 + Gradient do_[0] = -2531370496.000 +Backward Time Step 2: + Gradient di[0] = -29900568.000, df[0] = -28534246.000, dc_hat[0] = -46689516.000 + Gradient do_[0] = -1971881728.000 +Backward Time Step 1: + Gradient di[0] = -27403146.000, df[0] = -24341240.000, dc_hat[0] = -40497740.000 + Gradient do_[0] = -1205203968.000 +Backward Time Step 0: + Gradient di[0] = -21644432.000, df[0] = -19071374.000, dc_hat[0] = -37251524.000 + Gradient do_[0] = -427794048.000 +Time Step 0: + i_gate[0] = 0.540, f_gate[0] = 0.697, o_gate[0] = 0.121, c_hat[0] = 0.699 + c_state[0] = 0.377, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.460, f_gate[0] = 0.706, o_gate[0] = 0.095, c_hat[0] = 0.700 + c_state[0] = 0.588, h_state[0] = 0.050 +Time Step 2: + i_gate[0] = 0.419, f_gate[0] = 0.714, o_gate[0] = 0.086, c_hat[0] = 0.670 + c_state[0] = 0.701, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.386, f_gate[0] = 0.718, o_gate[0] = 0.083, c_hat[0] = 0.710 + c_state[0] = 0.777, h_state[0] = 0.054 +Time Step 4: + i_gate[0] = 0.364, f_gate[0] = 0.734, o_gate[0] = 0.080, c_hat[0] = 0.659 + c_state[0] = 0.810, h_state[0] = 0.054 +Backward Time Step 4: + Gradient di[0] = 23767.682, df[0] = 22323.666, dc_hat[0] = 32096.670 + Gradient do_[0] = 2357644.750 +Backward Time Step 3: + Gradient di[0] = 41180.008, df[0] = 35585.730, dc_hat[0] = 46778.484 + Gradient do_[0] = 3311088.000 +Backward Time Step 2: + Gradient di[0] = 46190.203, df[0] = 41303.711, dc_hat[0] = 65493.723 + Gradient do_[0] = 3133149.250 +Backward Time Step 1: + Gradient di[0] = 48082.953, df[0] = 40545.488, dc_hat[0] = 64964.941 + Gradient do_[0] = 2134384.000 +Backward Time Step 0: + Gradient di[0] = 41183.887, df[0] = 34908.141, dc_hat[0] = 65427.656 + Gradient do_[0] = 808568.750 +Time Step 0: + i_gate[0] = 0.552, f_gate[0] = 0.707, o_gate[0] = 0.127, c_hat[0] = 0.724 + c_state[0] = 0.399, h_state[0] = 0.048 +Time Step 1: + i_gate[0] = 0.472, f_gate[0] = 0.715, o_gate[0] = 0.102, c_hat[0] = 0.735 + c_state[0] = 0.632, h_state[0] = 0.057 +Time Step 2: + i_gate[0] = 0.431, f_gate[0] = 0.722, o_gate[0] = 0.093, c_hat[0] = 0.718 + c_state[0] = 0.765, h_state[0] = 0.060 +Time Step 3: + i_gate[0] = 0.396, f_gate[0] = 0.725, o_gate[0] = 0.090, c_hat[0] = 0.761 + c_state[0] = 0.856, h_state[0] = 0.062 +Time Step 4: + i_gate[0] = 0.373, f_gate[0] = 0.740, o_gate[0] = 0.087, c_hat[0] = 0.726 + c_state[0] = 0.905, h_state[0] = 0.062 +Backward Time Step 4: + Gradient di[0] = -29233.355, df[0] = -24491.543, dc_hat[0] = -30383.992 + Gradient do_[0] = -2954344.000 +Backward Time Step 3: + Gradient di[0] = -52481.242, df[0] = -41677.879, dc_hat[0] = -48051.535 + Gradient do_[0] = -4294995.500 +Backward Time Step 2: + Gradient di[0] = -61363.242, df[0] = -50566.457, dc_hat[0] = -72869.398 + Gradient do_[0] = -4136427.750 +Backward Time Step 1: + Gradient di[0] = -65615.867, df[0] = -52181.082, dc_hat[0] = -77656.203 + Gradient do_[0] = -2870320.500 +Backward Time Step 0: + Gradient di[0] = -54277.484, df[0] = -44412.965, dc_hat[0] = -79750.430 + Gradient do_[0] = -1059924.625 +Time Step 0: + i_gate[0] = 0.540, f_gate[0] = 0.697, o_gate[0] = 0.121, c_hat[0] = 0.699 + c_state[0] = 0.377, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.459, f_gate[0] = 0.704, o_gate[0] = 0.095, c_hat[0] = 0.701 + c_state[0] = 0.588, h_state[0] = 0.050 +Time Step 2: + i_gate[0] = 0.417, f_gate[0] = 0.711, o_gate[0] = 0.087, c_hat[0] = 0.674 + c_state[0] = 0.699, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.383, f_gate[0] = 0.714, o_gate[0] = 0.084, c_hat[0] = 0.716 + c_state[0] = 0.773, h_state[0] = 0.054 +Time Step 4: + i_gate[0] = 0.361, f_gate[0] = 0.730, o_gate[0] = 0.080, c_hat[0] = 0.666 + c_state[0] = 0.804, h_state[0] = 0.054 +Backward Time Step 4: + Gradient di[0] = 2297.680, df[0] = 2152.850, dc_hat[0] = 3001.092 + Gradient do_[0] = 223294.453 +Backward Time Step 3: + Gradient di[0] = 3262.886, df[0] = 2814.169, dc_hat[0] = 3602.709 + Gradient do_[0] = 257844.609 +Backward Time Step 2: + Gradient di[0] = 3457.547, df[0] = 3084.812, dc_hat[0] = 4812.047 + Gradient do_[0] = 231493.328 +Backward Time Step 1: + Gradient di[0] = 3578.472, df[0] = 3014.284, dc_hat[0] = 4794.924 + Gradient do_[0] = 157911.969 +Backward Time Step 0: + Gradient di[0] = 3146.943, df[0] = 2667.401, dc_hat[0] = 4999.458 + Gradient do_[0] = 61784.242 +Time Step 0: + i_gate[0] = 0.549, f_gate[0] = 0.707, o_gate[0] = 0.126, c_hat[0] = 0.724 + c_state[0] = 0.397, h_state[0] = 0.048 +Time Step 1: + i_gate[0] = 0.469, f_gate[0] = 0.714, o_gate[0] = 0.101, c_hat[0] = 0.736 + c_state[0] = 0.629, h_state[0] = 0.056 +Time Step 2: + i_gate[0] = 0.427, f_gate[0] = 0.719, o_gate[0] = 0.093, c_hat[0] = 0.720 + c_state[0] = 0.759, h_state[0] = 0.059 +Time Step 3: + i_gate[0] = 0.392, f_gate[0] = 0.720, o_gate[0] = 0.090, c_hat[0] = 0.766 + c_state[0] = 0.846, h_state[0] = 0.062 +Time Step 4: + i_gate[0] = 0.368, f_gate[0] = 0.734, o_gate[0] = 0.086, c_hat[0] = 0.733 + c_state[0] = 0.892, h_state[0] = 0.061 +Backward Time Step 4: + Gradient di[0] = -60609.941, df[0] = -50891.809, dc_hat[0] = -60457.770 + Gradient do_[0] = -5991625.000 +Backward Time Step 3: + Gradient di[0] = -102356.219, df[0] = -81417.055, dc_hat[0] = -90722.039 + Gradient do_[0] = -8224900.500 +Backward Time Step 2: + Gradient di[0] = -119492.172, df[0] = -98569.125, dc_hat[0] = -139545.172 + Gradient do_[0] = -7957394.500 +Backward Time Step 1: + Gradient di[0] = -130363.156, df[0] = -103762.203, dc_hat[0] = -153146.953 + Gradient do_[0] = -5673442.000 +Backward Time Step 0: + Gradient di[0] = -112263.250, df[0] = -91755.070, dc_hat[0] = -163922.516 + Gradient do_[0] = -2187659.000 +Time Step 0: + i_gate[0] = 0.537, f_gate[0] = 0.697, o_gate[0] = 0.121, c_hat[0] = 0.699 + c_state[0] = 0.375, h_state[0] = 0.043 +Time Step 1: + i_gate[0] = 0.455, f_gate[0] = 0.703, o_gate[0] = 0.095, c_hat[0] = 0.702 + c_state[0] = 0.583, h_state[0] = 0.050 +Time Step 2: + i_gate[0] = 0.412, f_gate[0] = 0.708, o_gate[0] = 0.086, c_hat[0] = 0.676 + c_state[0] = 0.692, h_state[0] = 0.052 +Time Step 3: + i_gate[0] = 0.377, f_gate[0] = 0.710, o_gate[0] = 0.084, c_hat[0] = 0.720 + c_state[0] = 0.762, h_state[0] = 0.054 +Time Step 4: + i_gate[0] = 0.354, f_gate[0] = 0.725, o_gate[0] = 0.080, c_hat[0] = 0.672 + c_state[0] = 0.790, h_state[0] = 0.053 +Backward Time Step 4: + Gradient di[0] = 92738.383, df[0] = 87251.102, dc_hat[0] = 117006.883 + Gradient do_[0] = 8777577.000 +Backward Time Step 3: + Gradient di[0] = 129194.812, df[0] = 111758.961, dc_hat[0] = 138556.125 + Gradient do_[0] = 9997700.000 +Backward Time Step 2: + Gradient di[0] = 137160.281, df[0] = 122586.156, dc_hat[0] = 187496.391 + Gradient do_[0] = 9057979.000 +Backward Time Step 1: + Gradient di[0] = 144032.594, df[0] = 121417.914, dc_hat[0] = 190921.922 + Gradient do_[0] = 6316864.500 +Backward Time Step 0: + Gradient di[0] = 129841.461, df[0] = 109960.500, dc_hat[0] = 205019.844 + Gradient do_[0] = 2544487.750 +Time Step 0: + i_gate[0] = 0.546, f_gate[0] = 0.706, o_gate[0] = 0.125, c_hat[0] = 0.724 + c_state[0] = 0.395, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.466, f_gate[0] = 0.711, o_gate[0] = 0.100, c_hat[0] = 0.735 + c_state[0] = 0.623, h_state[0] = 0.056 +Time Step 2: + i_gate[0] = 0.423, f_gate[0] = 0.714, o_gate[0] = 0.092, c_hat[0] = 0.720 + c_state[0] = 0.750, h_state[0] = 0.058 +Time Step 3: + i_gate[0] = 0.386, f_gate[0] = 0.713, o_gate[0] = 0.089, c_hat[0] = 0.769 + c_state[0] = 0.831, h_state[0] = 0.060 +Time Step 4: + i_gate[0] = 0.361, f_gate[0] = 0.726, o_gate[0] = 0.085, c_hat[0] = 0.738 + c_state[0] = 0.870, h_state[0] = 0.059 +Backward Time Step 4: + Gradient di[0] = -73739.633, df[0] = -62558.195, dc_hat[0] = -71340.320 + Gradient do_[0] = -7072098.500 +Backward Time Step 3: + Gradient di[0] = -115174.148, df[0] = -92261.594, dc_hat[0] = -99852.391 + Gradient do_[0] = -9059656.000 +Backward Time Step 2: + Gradient di[0] = -133115.000, df[0] = -110395.594, dc_hat[0] = -153889.094 + Gradient do_[0] = -8771943.000 +Backward Time Step 1: + Gradient di[0] = -146943.406, df[0] = -117314.648, dc_hat[0] = -171686.094 + Gradient do_[0] = -6387727.500 +Backward Time Step 0: + Gradient di[0] = -130555.727, df[0] = -106661.094, dc_hat[0] = -189395.234 + Gradient do_[0] = -2550051.000 +Time Step 0: + i_gate[0] = 0.534, f_gate[0] = 0.696, o_gate[0] = 0.120, c_hat[0] = 0.699 + c_state[0] = 0.373, h_state[0] = 0.043 +Time Step 1: + i_gate[0] = 0.451, f_gate[0] = 0.700, o_gate[0] = 0.094, c_hat[0] = 0.706 + c_state[0] = 0.579, h_state[0] = 0.049 +Time Step 2: + i_gate[0] = 0.408, f_gate[0] = 0.704, o_gate[0] = 0.086, c_hat[0] = 0.681 + c_state[0] = 0.685, h_state[0] = 0.051 +Time Step 3: + i_gate[0] = 0.372, f_gate[0] = 0.705, o_gate[0] = 0.083, c_hat[0] = 0.725 + c_state[0] = 0.753, h_state[0] = 0.053 +Time Step 4: + i_gate[0] = 0.349, f_gate[0] = 0.719, o_gate[0] = 0.079, c_hat[0] = 0.677 + c_state[0] = 0.777, h_state[0] = 0.051 +Backward Time Step 4: + Gradient di[0] = 146083.219, df[0] = 138027.844, dc_hat[0] = 179603.078 + Gradient do_[0] = 13608191.000 +Backward Time Step 3: + Gradient di[0] = 214381.219, df[0] = 185719.750, dc_hat[0] = 223562.547 + Gradient do_[0] = 16385177.000 +Backward Time Step 2: + Gradient di[0] = 233747.078, df[0] = 208624.250, dc_hat[0] = 311168.469 + Gradient do_[0] = 15302966.000 +Backward Time Step 1: + Gradient di[0] = 251770.547, df[0] = 211850.641, dc_hat[0] = 326147.375 + Gradient do_[0] = 11002393.000 +Backward Time Step 0: + Gradient di[0] = 232949.734, df[0] = 197219.656, dc_hat[0] = 365494.250 + Gradient do_[0] = 4577073.000 +Time Step 0: + i_gate[0] = 0.543, f_gate[0] = 0.706, o_gate[0] = 0.124, c_hat[0] = 0.724 + c_state[0] = 0.393, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.460, f_gate[0] = 0.710, o_gate[0] = 0.100, c_hat[0] = 0.740 + c_state[0] = 0.620, h_state[0] = 0.055 +Time Step 2: + i_gate[0] = 0.417, f_gate[0] = 0.712, o_gate[0] = 0.092, c_hat[0] = 0.727 + c_state[0] = 0.745, h_state[0] = 0.058 +Time Step 3: + i_gate[0] = 0.380, f_gate[0] = 0.710, o_gate[0] = 0.089, c_hat[0] = 0.776 + c_state[0] = 0.823, h_state[0] = 0.060 +Time Step 4: + i_gate[0] = 0.354, f_gate[0] = 0.722, o_gate[0] = 0.085, c_hat[0] = 0.746 + c_state[0] = 0.859, h_state[0] = 0.059 +Backward Time Step 4: + Gradient di[0] = -23692.381, df[0] = -20102.199, dc_hat[0] = -21781.342 + Gradient do_[0] = -2212562.750 +Backward Time Step 3: + Gradient di[0] = -32930.461, df[0] = -26356.469, dc_hat[0] = -27275.008 + Gradient do_[0] = -2536505.500 +Backward Time Step 2: + Gradient di[0] = -37235.426, df[0] = -30745.373, dc_hat[0] = -41349.879 + Gradient do_[0] = -2411869.250 +Backward Time Step 1: + Gradient di[0] = -40695.391, df[0] = -32348.102, dc_hat[0] = -46010.855 + Gradient do_[0] = -1748439.625 +Backward Time Step 0: + Gradient di[0] = -35967.871, df[0] = -29355.129, dc_hat[0] = -51856.578 + Gradient do_[0] = -701117.938 +Time Step 0: + i_gate[0] = 0.531, f_gate[0] = 0.696, o_gate[0] = 0.119, c_hat[0] = 0.699 + c_state[0] = 0.371, h_state[0] = 0.042 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.700, o_gate[0] = 0.094, c_hat[0] = 0.712 + c_state[0] = 0.577, h_state[0] = 0.049 +Time Step 2: + i_gate[0] = 0.402, f_gate[0] = 0.704, o_gate[0] = 0.086, c_hat[0] = 0.690 + c_state[0] = 0.683, h_state[0] = 0.051 +Time Step 3: + i_gate[0] = 0.366, f_gate[0] = 0.703, o_gate[0] = 0.083, c_hat[0] = 0.733 + c_state[0] = 0.749, h_state[0] = 0.053 +Time Step 4: + i_gate[0] = 0.343, f_gate[0] = 0.717, o_gate[0] = 0.079, c_hat[0] = 0.688 + c_state[0] = 0.773, h_state[0] = 0.051 +Backward Time Step 4: + Gradient di[0] = 16726.080, df[0] = 15708.189, dc_hat[0] = 19520.246 + Gradient do_[0] = 1530749.875 +Backward Time Step 3: + Gradient di[0] = 22060.404, df[0] = 19020.312, dc_hat[0] = 21953.279 + Gradient do_[0] = 1664023.250 +Backward Time Step 2: + Gradient di[0] = 23266.469, df[0] = 20592.277, dc_hat[0] = 29578.615 + Gradient do_[0] = 1502980.625 +Backward Time Step 1: + Gradient di[0] = 24542.098, df[0] = 20508.000, dc_hat[0] = 30567.152 + Gradient do_[0] = 1060830.625 +Backward Time Step 0: + Gradient di[0] = 22313.322, df[0] = 18877.002, dc_hat[0] = 34798.410 + Gradient do_[0] = 437646.750 +Time Step 0: + i_gate[0] = 0.519, f_gate[0] = 0.688, o_gate[0] = 0.115, c_hat[0] = 0.710 + c_state[0] = 0.368, h_state[0] = 0.040 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.687, o_gate[0] = 0.090, c_hat[0] = 0.728 + c_state[0] = 0.568, h_state[0] = 0.046 +Time Step 2: + i_gate[0] = 0.391, f_gate[0] = 0.689, o_gate[0] = 0.082, c_hat[0] = 0.711 + c_state[0] = 0.669, h_state[0] = 0.048 +Time Step 3: + i_gate[0] = 0.356, f_gate[0] = 0.687, o_gate[0] = 0.079, c_hat[0] = 0.755 + c_state[0] = 0.728, h_state[0] = 0.049 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.700, o_gate[0] = 0.075, c_hat[0] = 0.714 + c_state[0] = 0.746, h_state[0] = 0.048 +Backward Time Step 4: + Gradient di[0] = 56733.000, df[0] = 52717.938, dc_hat[0] = 58204.023 + Gradient do_[0] = 5016909.500 +Backward Time Step 3: + Gradient di[0] = 73999.547, df[0] = 63219.883, dc_hat[0] = 65488.531 + Gradient do_[0] = 5461911.000 +Backward Time Step 2: + Gradient di[0] = 76348.141, df[0] = 66592.289, dc_hat[0] = 87197.977 + Gradient do_[0] = 4871231.500 +Backward Time Step 1: + Gradient di[0] = 77240.484, df[0] = 63865.691, dc_hat[0] = 87976.352 + Gradient do_[0] = 3350873.750 +Backward Time Step 0: + Gradient di[0] = 67010.461, df[0] = 55816.094, dc_hat[0] = 97109.961 + Gradient do_[0] = 1328409.875 +Time Step 0: + i_gate[0] = 0.514, f_gate[0] = 0.697, o_gate[0] = 0.118, c_hat[0] = 0.734 + c_state[0] = 0.378, h_state[0] = 0.043 +Time Step 1: + i_gate[0] = 0.432, f_gate[0] = 0.695, o_gate[0] = 0.095, c_hat[0] = 0.756 + c_state[0] = 0.589, h_state[0] = 0.050 +Time Step 2: + i_gate[0] = 0.390, f_gate[0] = 0.694, o_gate[0] = 0.087, c_hat[0] = 0.748 + c_state[0] = 0.701, h_state[0] = 0.053 +Time Step 3: + i_gate[0] = 0.353, f_gate[0] = 0.689, o_gate[0] = 0.084, c_hat[0] = 0.796 + c_state[0] = 0.765, h_state[0] = 0.054 +Time Step 4: + i_gate[0] = 0.327, f_gate[0] = 0.700, o_gate[0] = 0.079, c_hat[0] = 0.772 + c_state[0] = 0.787, h_state[0] = 0.052 +Backward Time Step 4: + Gradient di[0] = 154.828, df[0] = 133.981, dc_hat[0] = 120.274 + Gradient do_[0] = 13343.745 +Backward Time Step 3: + Gradient di[0] = 173.806, df[0] = 140.975, dc_hat[0] = 123.466 + Gradient do_[0] = 12560.574 +Backward Time Step 2: + Gradient di[0] = 176.425, df[0] = 145.992, dc_hat[0] = 170.174 + Gradient do_[0] = 10878.720 +Backward Time Step 1: + Gradient di[0] = 174.288, df[0] = 138.468, dc_hat[0] = 174.060 + Gradient do_[0] = 7293.926 +Backward Time Step 0: + Gradient di[0] = 143.380, df[0] = 115.107, dc_hat[0] = 185.480 + Gradient do_[0] = 2741.481 +Time Step 0: + i_gate[0] = 0.508, f_gate[0] = 0.703, o_gate[0] = 0.122, c_hat[0] = 0.751 + c_state[0] = 0.381, h_state[0] = 0.044 +Time Step 1: + i_gate[0] = 0.428, f_gate[0] = 0.700, o_gate[0] = 0.099, c_hat[0] = 0.775 + c_state[0] = 0.599, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.388, f_gate[0] = 0.698, o_gate[0] = 0.091, c_hat[0] = 0.774 + c_state[0] = 0.718, h_state[0] = 0.056 +Time Step 3: + i_gate[0] = 0.351, f_gate[0] = 0.690, o_gate[0] = 0.088, c_hat[0] = 0.825 + c_state[0] = 0.785, h_state[0] = 0.057 +Time Step 4: + i_gate[0] = 0.323, f_gate[0] = 0.698, o_gate[0] = 0.082, c_hat[0] = 0.812 + c_state[0] = 0.810, h_state[0] = 0.055 +Backward Time Step 4: + Gradient di[0] = 0.060, df[0] = 0.050, dc_hat[0] = 0.037 + Gradient do_[0] = 5.055 +Backward Time Step 3: + Gradient di[0] = 0.077, df[0] = 0.061, dc_hat[0] = 0.046 + Gradient do_[0] = 5.456 +Backward Time Step 2: + Gradient di[0] = 0.092, df[0] = 0.074, dc_hat[0] = 0.078 + Gradient do_[0] = 5.301 +Backward Time Step 1: + Gradient di[0] = 0.097, df[0] = 0.075, dc_hat[0] = 0.088 + Gradient do_[0] = 3.749 +Backward Time Step 0: + Gradient di[0] = 0.087, df[0] = 0.068, dc_hat[0] = 0.103 + Gradient do_[0] = 1.474 +Time Step 0: + i_gate[0] = 0.508, f_gate[0] = 0.703, o_gate[0] = 0.125, c_hat[0] = 0.756 + c_state[0] = 0.384, h_state[0] = 0.046 +Time Step 1: + i_gate[0] = 0.430, f_gate[0] = 0.699, o_gate[0] = 0.103, c_hat[0] = 0.783 + c_state[0] = 0.605, h_state[0] = 0.056 +Time Step 2: + i_gate[0] = 0.390, f_gate[0] = 0.695, o_gate[0] = 0.095, c_hat[0] = 0.785 + c_state[0] = 0.727, h_state[0] = 0.059 +Time Step 3: + i_gate[0] = 0.353, f_gate[0] = 0.685, o_gate[0] = 0.091, c_hat[0] = 0.837 + c_state[0] = 0.794, h_state[0] = 0.060 +Time Step 4: + i_gate[0] = 0.325, f_gate[0] = 0.689, o_gate[0] = 0.085, c_hat[0] = 0.829 + c_state[0] = 0.816, h_state[0] = 0.057 +Backward Time Step 4: + Gradient di[0] = 0.011, df[0] = 0.009, dc_hat[0] = 0.006 + Gradient do_[0] = 0.949 +Backward Time Step 3: + Gradient di[0] = 0.017, df[0] = 0.013, dc_hat[0] = 0.009 + Gradient do_[0] = 1.188 +Backward Time Step 2: + Gradient di[0] = 0.026, df[0] = 0.021, dc_hat[0] = 0.021 + Gradient do_[0] = 1.362 +Backward Time Step 1: + Gradient di[0] = 0.031, df[0] = 0.024, dc_hat[0] = 0.027 + Gradient do_[0] = 1.057 +Backward Time Step 0: + Gradient di[0] = 0.034, df[0] = 0.027, dc_hat[0] = 0.040 + Gradient do_[0] = 0.490 +Time Step 0: + i_gate[0] = 0.508, f_gate[0] = 0.704, o_gate[0] = 0.130, c_hat[0] = 0.758 + c_state[0] = 0.385, h_state[0] = 0.048 +Time Step 1: + i_gate[0] = 0.427, f_gate[0] = 0.699, o_gate[0] = 0.108, c_hat[0] = 0.787 + c_state[0] = 0.606, h_state[0] = 0.058 +Time Step 2: + i_gate[0] = 0.385, f_gate[0] = 0.694, o_gate[0] = 0.100, c_hat[0] = 0.792 + c_state[0] = 0.726, h_state[0] = 0.062 +Time Step 3: + i_gate[0] = 0.346, f_gate[0] = 0.684, o_gate[0] = 0.096, c_hat[0] = 0.844 + c_state[0] = 0.788, h_state[0] = 0.063 +Time Step 4: + i_gate[0] = 0.316, f_gate[0] = 0.689, o_gate[0] = 0.091, c_hat[0] = 0.836 + c_state[0] = 0.807, h_state[0] = 0.061 +Backward Time Step 4: + Gradient di[0] = -0.016, df[0] = -0.013, dc_hat[0] = -0.008 + Gradient do_[0] = -1.133 +Backward Time Step 3: + Gradient di[0] = -0.022, df[0] = -0.017, dc_hat[0] = -0.011 + Gradient do_[0] = -1.317 +Backward Time Step 2: + Gradient di[0] = -0.018, df[0] = -0.014, dc_hat[0] = -0.014 + Gradient do_[0] = -1.112 +Backward Time Step 1: + Gradient di[0] = -0.011, df[0] = -0.008, dc_hat[0] = -0.009 + Gradient do_[0] = -0.534 +Backward Time Step 0: + Gradient di[0] = 0.005, df[0] = 0.004, dc_hat[0] = 0.005 + Gradient do_[0] = -0.041 +Time Step 0: + i_gate[0] = 0.509, f_gate[0] = 0.704, o_gate[0] = 0.136, c_hat[0] = 0.760 + c_state[0] = 0.387, h_state[0] = 0.050 +Time Step 1: + i_gate[0] = 0.425, f_gate[0] = 0.700, o_gate[0] = 0.113, c_hat[0] = 0.791 + c_state[0] = 0.606, h_state[0] = 0.061 +Time Step 2: + i_gate[0] = 0.380, f_gate[0] = 0.696, o_gate[0] = 0.106, c_hat[0] = 0.797 + c_state[0] = 0.725, h_state[0] = 0.066 +Time Step 3: + i_gate[0] = 0.338, f_gate[0] = 0.687, o_gate[0] = 0.103, c_hat[0] = 0.848 + c_state[0] = 0.784, h_state[0] = 0.068 +Time Step 4: + i_gate[0] = 0.307, f_gate[0] = 0.693, o_gate[0] = 0.099, c_hat[0] = 0.841 + c_state[0] = 0.802, h_state[0] = 0.066 +Backward Time Step 4: + Gradient di[0] = -0.170, df[0] = -0.140, dc_hat[0] = -0.085 + Gradient do_[0] = -11.423 +Backward Time Step 3: + Gradient di[0] = -0.344, df[0] = -0.268, dc_hat[0] = -0.172 + Gradient do_[0] = -20.123 +Backward Time Step 2: + Gradient di[0] = -0.466, df[0] = -0.365, dc_hat[0] = -0.343 + Gradient do_[0] = -23.660 +Backward Time Step 1: + Gradient di[0] = -0.480, df[0] = -0.365, dc_hat[0] = -0.394 + Gradient do_[0] = -16.983 +Backward Time Step 0: + Gradient di[0] = -0.362, df[0] = -0.279, dc_hat[0] = -0.410 + Gradient do_[0] = -6.113 +Time Step 0: + i_gate[0] = 0.508, f_gate[0] = 0.703, o_gate[0] = 0.132, c_hat[0] = 0.753 + c_state[0] = 0.383, h_state[0] = 0.048 +Time Step 1: + i_gate[0] = 0.419, f_gate[0] = 0.698, o_gate[0] = 0.109, c_hat[0] = 0.786 + c_state[0] = 0.596, h_state[0] = 0.058 +Time Step 2: + i_gate[0] = 0.371, f_gate[0] = 0.694, o_gate[0] = 0.102, c_hat[0] = 0.793 + c_state[0] = 0.708, h_state[0] = 0.062 +Time Step 3: + i_gate[0] = 0.327, f_gate[0] = 0.685, o_gate[0] = 0.100, c_hat[0] = 0.845 + c_state[0] = 0.761, h_state[0] = 0.064 +Time Step 4: + i_gate[0] = 0.295, f_gate[0] = 0.693, o_gate[0] = 0.097, c_hat[0] = 0.838 + c_state[0] = 0.775, h_state[0] = 0.063 +Backward Time Step 4: + Gradient di[0] = -0.461, df[0] = -0.390, dc_hat[0] = -0.232 + Gradient do_[0] = -30.819 +Backward Time Step 3: + Gradient di[0] = -1.192, df[0] = -0.948, dc_hat[0] = -0.600 + Gradient do_[0] = -70.159 +Backward Time Step 2: + Gradient di[0] = -1.779, df[0] = -1.417, dc_hat[0] = -1.322 + Gradient do_[0] = -91.300 +Backward Time Step 1: + Gradient di[0] = -1.916, df[0] = -1.474, dc_hat[0] = -1.605 + Gradient do_[0] = -69.009 +Backward Time Step 0: + Gradient di[0] = -1.529, df[0] = -1.193, dc_hat[0] = -1.789 + Gradient do_[0] = -26.138 +Time Step 0: + i_gate[0] = 0.508, f_gate[0] = 0.701, o_gate[0] = 0.133, c_hat[0] = 0.742 + c_state[0] = 0.377, h_state[0] = 0.048 +Time Step 1: + i_gate[0] = 0.412, f_gate[0] = 0.694, o_gate[0] = 0.110, c_hat[0] = 0.775 + c_state[0] = 0.581, h_state[0] = 0.057 +Time Step 2: + i_gate[0] = 0.360, f_gate[0] = 0.689, o_gate[0] = 0.103, c_hat[0] = 0.782 + c_state[0] = 0.682, h_state[0] = 0.061 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.681, o_gate[0] = 0.102, c_hat[0] = 0.836 + c_state[0] = 0.726, h_state[0] = 0.063 +Time Step 4: + i_gate[0] = 0.280, f_gate[0] = 0.690, o_gate[0] = 0.100, c_hat[0] = 0.830 + c_state[0] = 0.734, h_state[0] = 0.062 +Backward Time Step 4: + Gradient di[0] = -2.784, df[0] = -2.454, dc_hat[0] = -1.451 + Gradient do_[0] = -171.761 +Backward Time Step 3: + Gradient di[0] = -6.392, df[0] = -5.257, dc_hat[0] = -3.346 + Gradient do_[0] = -353.260 +Backward Time Step 2: + Gradient di[0] = -9.690, df[0] = -7.939, dc_hat[0] = -7.502 + Gradient do_[0] = -475.423 +Backward Time Step 1: + Gradient di[0] = -10.680, df[0] = -8.384, dc_hat[0] = -9.369 + Gradient do_[0] = -373.815 +Backward Time Step 0: + Gradient di[0] = -8.665, df[0] = -6.864, dc_hat[0] = -10.660 + Gradient do_[0] = -145.118 +Epoch 200, Train Loss=0.009384, Weight Norm=12.257568 +Sample Predictions at Epoch 200: +------------------------------------------------------------- +| Day | Date | Predicted Close | Actual Close | Error | +------------------------------------------------------------- +| 192 | 2024-10-11 | 56.99 | 63.87 | 6.88 | +| 193 | 2024-10-14 | 56.25 | 66.55 | 10.30 | +| 194 | 2024-10-15 | 56.34 | 66.00 | 9.66 | +| 195 | 2024-10-16 | 57.58 | 67.20 | 9.62 | +| 196 | 2024-10-17 | 57.06 | 66.76 | 9.70 | +------------------------------------------------------------- +Time Step 0: + i_gate[0] = 0.520, f_gate[0] = 0.711, o_gate[0] = 0.139, c_hat[0] = 0.763 + c_state[0] = 0.397, h_state[0] = 0.052 +Time Step 1: + i_gate[0] = 0.420, f_gate[0] = 0.705, o_gate[0] = 0.116, c_hat[0] = 0.793 + c_state[0] = 0.613, h_state[0] = 0.063 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.701, o_gate[0] = 0.110, c_hat[0] = 0.800 + c_state[0] = 0.721, h_state[0] = 0.068 +Time Step 3: + i_gate[0] = 0.316, f_gate[0] = 0.695, o_gate[0] = 0.110, c_hat[0] = 0.850 + c_state[0] = 0.770, h_state[0] = 0.071 +Time Step 4: + i_gate[0] = 0.283, f_gate[0] = 0.707, o_gate[0] = 0.109, c_hat[0] = 0.845 + c_state[0] = 0.783, h_state[0] = 0.072 +Backward Time Step 4: + Gradient di[0] = 1522.780, df[0] = 1300.379, dc_hat[0] = 718.487 + Gradient do_[0] = 93034.031 +Backward Time Step 3: + Gradient di[0] = 2982.618, df[0] = 2393.300, dc_hat[0] = 1423.484 + Gradient do_[0] = 164322.766 +Backward Time Step 2: + Gradient di[0] = 4506.055, df[0] = 3574.061, dc_hat[0] = 3189.600 + Gradient do_[0] = 220299.234 +Backward Time Step 1: + Gradient di[0] = 5144.803, df[0] = 3903.392, dc_hat[0] = 4150.191 + Gradient do_[0] = 179017.641 +Backward Time Step 0: + Gradient di[0] = 4264.341, df[0] = 3270.638, dc_hat[0] = 4866.328 + Gradient do_[0] = 70828.250 +Time Step 0: + i_gate[0] = 0.508, f_gate[0] = 0.701, o_gate[0] = 0.133, c_hat[0] = 0.741 + c_state[0] = 0.377, h_state[0] = 0.048 +Time Step 1: + i_gate[0] = 0.412, f_gate[0] = 0.693, o_gate[0] = 0.110, c_hat[0] = 0.770 + c_state[0] = 0.578, h_state[0] = 0.057 +Time Step 2: + i_gate[0] = 0.358, f_gate[0] = 0.687, o_gate[0] = 0.103, c_hat[0] = 0.773 + c_state[0] = 0.674, h_state[0] = 0.060 +Time Step 3: + i_gate[0] = 0.310, f_gate[0] = 0.677, o_gate[0] = 0.101, c_hat[0] = 0.826 + c_state[0] = 0.713, h_state[0] = 0.062 +Time Step 4: + i_gate[0] = 0.277, f_gate[0] = 0.685, o_gate[0] = 0.098, c_hat[0] = 0.817 + c_state[0] = 0.715, h_state[0] = 0.060 +Backward Time Step 4: + Gradient di[0] = -0.295, df[0] = -0.267, dc_hat[0] = -0.166 + Gradient do_[0] = -18.131 +Backward Time Step 3: + Gradient di[0] = -0.647, df[0] = -0.542, dc_hat[0] = -0.360 + Gradient do_[0] = -35.647 +Backward Time Step 2: + Gradient di[0] = -0.958, df[0] = -0.796, dc_hat[0] = -0.776 + Gradient do_[0] = -47.114 +Backward Time Step 1: + Gradient di[0] = -1.029, df[0] = -0.813, dc_hat[0] = -0.926 + Gradient do_[0] = -36.118 +Backward Time Step 0: + Gradient di[0] = -0.780, df[0] = -0.618, dc_hat[0] = -0.962 + Gradient do_[0] = -13.164 +Time Step 0: + i_gate[0] = 0.515, f_gate[0] = 0.705, o_gate[0] = 0.139, c_hat[0] = 0.750 + c_state[0] = 0.386, h_state[0] = 0.051 +Time Step 1: + i_gate[0] = 0.414, f_gate[0] = 0.697, o_gate[0] = 0.116, c_hat[0] = 0.777 + c_state[0] = 0.590, h_state[0] = 0.061 +Time Step 2: + i_gate[0] = 0.356, f_gate[0] = 0.692, o_gate[0] = 0.110, c_hat[0] = 0.782 + c_state[0] = 0.687, h_state[0] = 0.066 +Time Step 3: + i_gate[0] = 0.307, f_gate[0] = 0.685, o_gate[0] = 0.109, c_hat[0] = 0.834 + c_state[0] = 0.726, h_state[0] = 0.068 +Time Step 4: + i_gate[0] = 0.273, f_gate[0] = 0.695, o_gate[0] = 0.109, c_hat[0] = 0.826 + c_state[0] = 0.731, h_state[0] = 0.068 +Backward Time Step 4: + Gradient di[0] = -1635.764, df[0] = -1467.936, dc_hat[0] = -865.277 + Gradient do_[0] = -93575.961 +Backward Time Step 3: + Gradient di[0] = -3132.884, df[0] = -2612.606, dc_hat[0] = -1653.787 + Gradient do_[0] = -162956.734 +Backward Time Step 2: + Gradient di[0] = -4653.047, df[0] = -3829.213, dc_hat[0] = -3597.654 + Gradient do_[0] = -218080.344 +Backward Time Step 1: + Gradient di[0] = -5223.136, df[0] = -4079.051, dc_hat[0] = -4536.766 + Gradient do_[0] = -176568.062 +Backward Time Step 0: + Gradient di[0] = -4193.728, df[0] = -3282.793, dc_hat[0] = -5045.143 + Gradient do_[0] = -68450.875 +Time Step 0: + i_gate[0] = 0.527, f_gate[0] = 0.715, o_gate[0] = 0.145, c_hat[0] = 0.771 + c_state[0] = 0.406, h_state[0] = 0.056 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.709, o_gate[0] = 0.122, c_hat[0] = 0.800 + c_state[0] = 0.626, h_state[0] = 0.068 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.707, o_gate[0] = 0.118, c_hat[0] = 0.808 + c_state[0] = 0.735, h_state[0] = 0.074 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.704, o_gate[0] = 0.120, c_hat[0] = 0.856 + c_state[0] = 0.786, h_state[0] = 0.079 +Time Step 4: + i_gate[0] = 0.282, f_gate[0] = 0.719, o_gate[0] = 0.121, c_hat[0] = 0.852 + c_state[0] = 0.805, h_state[0] = 0.081 +Backward Time Step 4: + Gradient di[0] = 76061.062, df[0] = 63983.848, dc_hat[0] = 33997.895 + Gradient do_[0] = 4362782.000 +Backward Time Step 3: + Gradient di[0] = 132061.312, df[0] = 105149.242, dc_hat[0] = 59839.422 + Gradient do_[0] = 6877376.000 +Backward Time Step 2: + Gradient di[0] = 193161.328, df[0] = 151514.859, dc_hat[0] = 130319.930 + Gradient do_[0] = 9016668.000 +Backward Time Step 1: + Gradient di[0] = 226880.391, df[0] = 170017.641, dc_hat[0] = 176419.766 + Gradient do_[0] = 7626913.500 +Backward Time Step 0: + Gradient di[0] = 198442.781, df[0] = 150441.016, dc_hat[0] = 220976.109 + Gradient do_[0] = 3218233.250 +Time Step 0: + i_gate[0] = 0.515, f_gate[0] = 0.705, o_gate[0] = 0.139, c_hat[0] = 0.750 + c_state[0] = 0.386, h_state[0] = 0.051 +Time Step 1: + i_gate[0] = 0.414, f_gate[0] = 0.697, o_gate[0] = 0.116, c_hat[0] = 0.779 + c_state[0] = 0.591, h_state[0] = 0.062 +Time Step 2: + i_gate[0] = 0.356, f_gate[0] = 0.692, o_gate[0] = 0.111, c_hat[0] = 0.784 + c_state[0] = 0.689, h_state[0] = 0.066 +Time Step 3: + i_gate[0] = 0.306, f_gate[0] = 0.685, o_gate[0] = 0.110, c_hat[0] = 0.836 + c_state[0] = 0.728, h_state[0] = 0.069 +Time Step 4: + i_gate[0] = 0.273, f_gate[0] = 0.696, o_gate[0] = 0.110, c_hat[0] = 0.829 + c_state[0] = 0.733, h_state[0] = 0.068 +Backward Time Step 4: + Gradient di[0] = -1534.954, df[0] = -1372.850, dc_hat[0] = -794.810 + Gradient do_[0] = -87191.062 +Backward Time Step 3: + Gradient di[0] = -2954.472, df[0] = -2457.093, dc_hat[0] = -1531.078 + Gradient do_[0] = -152687.891 +Backward Time Step 2: + Gradient di[0] = -4402.364, df[0] = -3610.604, dc_hat[0] = -3353.604 + Gradient do_[0] = -205079.594 +Backward Time Step 1: + Gradient di[0] = -4900.543, df[0] = -3817.786, dc_hat[0] = -4219.986 + Gradient do_[0] = -164928.219 +Backward Time Step 0: + Gradient di[0] = -3810.322, df[0] = -2982.668, dc_hat[0] = -4583.897 + Gradient do_[0] = -62192.859 +Time Step 0: + i_gate[0] = 0.527, f_gate[0] = 0.715, o_gate[0] = 0.145, c_hat[0] = 0.771 + c_state[0] = 0.406, h_state[0] = 0.056 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.709, o_gate[0] = 0.123, c_hat[0] = 0.802 + c_state[0] = 0.627, h_state[0] = 0.068 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.707, o_gate[0] = 0.119, c_hat[0] = 0.810 + c_state[0] = 0.737, h_state[0] = 0.075 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.704, o_gate[0] = 0.121, c_hat[0] = 0.859 + c_state[0] = 0.788, h_state[0] = 0.079 +Time Step 4: + i_gate[0] = 0.282, f_gate[0] = 0.719, o_gate[0] = 0.122, c_hat[0] = 0.855 + c_state[0] = 0.807, h_state[0] = 0.082 +Backward Time Step 4: + Gradient di[0] = 145734.156, df[0] = 122238.758, dc_hat[0] = 63949.379 + Gradient do_[0] = 8315011.500 +Backward Time Step 3: + Gradient di[0] = 252169.734, df[0] = 200293.594, dc_hat[0] = 112308.781 + Gradient do_[0] = 13058593.000 +Backward Time Step 2: + Gradient di[0] = 368026.375, df[0] = 287816.188, dc_hat[0] = 244754.734 + Gradient do_[0] = 17082716.000 +Backward Time Step 1: + Gradient di[0] = 428825.781, df[0] = 320646.031, dc_hat[0] = 330641.000 + Gradient do_[0] = 14355376.000 +Backward Time Step 0: + Gradient di[0] = 365769.125, df[0] = 277292.438, dc_hat[0] = 407302.500 + Gradient do_[0] = 5931837.500 +Time Step 0: + i_gate[0] = 0.515, f_gate[0] = 0.705, o_gate[0] = 0.139, c_hat[0] = 0.750 + c_state[0] = 0.386, h_state[0] = 0.051 +Time Step 1: + i_gate[0] = 0.414, f_gate[0] = 0.698, o_gate[0] = 0.117, c_hat[0] = 0.781 + c_state[0] = 0.593, h_state[0] = 0.062 +Time Step 2: + i_gate[0] = 0.357, f_gate[0] = 0.693, o_gate[0] = 0.111, c_hat[0] = 0.788 + c_state[0] = 0.692, h_state[0] = 0.067 +Time Step 3: + i_gate[0] = 0.307, f_gate[0] = 0.686, o_gate[0] = 0.111, c_hat[0] = 0.839 + c_state[0] = 0.732, h_state[0] = 0.069 +Time Step 4: + i_gate[0] = 0.274, f_gate[0] = 0.697, o_gate[0] = 0.110, c_hat[0] = 0.832 + c_state[0] = 0.738, h_state[0] = 0.069 +Backward Time Step 4: + Gradient di[0] = -878.506, df[0] = -781.224, dc_hat[0] = -446.047 + Gradient do_[0] = -49924.977 +Backward Time Step 3: + Gradient di[0] = -1681.915, df[0] = -1391.690, dc_hat[0] = -855.288 + Gradient do_[0] = -86817.898 +Backward Time Step 2: + Gradient di[0] = -2499.135, df[0] = -2038.982, dc_hat[0] = -1873.171 + Gradient do_[0] = -116011.289 +Backward Time Step 1: + Gradient di[0] = -2761.230, df[0] = -2143.854, dc_hat[0] = -2352.998 + Gradient do_[0] = -92560.633 +Backward Time Step 0: + Gradient di[0] = -2100.726, df[0] = -1644.420, dc_hat[0] = -2527.218 + Gradient do_[0] = -34288.543 +Time Step 0: + i_gate[0] = 0.527, f_gate[0] = 0.715, o_gate[0] = 0.145, c_hat[0] = 0.771 + c_state[0] = 0.406, h_state[0] = 0.056 +Time Step 1: + i_gate[0] = 0.423, f_gate[0] = 0.710, o_gate[0] = 0.123, c_hat[0] = 0.804 + c_state[0] = 0.628, h_state[0] = 0.069 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.708, o_gate[0] = 0.120, c_hat[0] = 0.813 + c_state[0] = 0.740, h_state[0] = 0.075 +Time Step 3: + i_gate[0] = 0.314, f_gate[0] = 0.705, o_gate[0] = 0.122, c_hat[0] = 0.861 + c_state[0] = 0.792, h_state[0] = 0.080 +Time Step 4: + i_gate[0] = 0.283, f_gate[0] = 0.720, o_gate[0] = 0.123, c_hat[0] = 0.857 + c_state[0] = 0.812, h_state[0] = 0.082 +Backward Time Step 4: + Gradient di[0] = 138715.172, df[0] = 115804.641, dc_hat[0] = 59869.789 + Gradient do_[0] = 7925267.500 +Backward Time Step 3: + Gradient di[0] = 238509.000, df[0] = 188616.172, dc_hat[0] = 104409.516 + Gradient do_[0] = 12342997.000 +Backward Time Step 2: + Gradient di[0] = 346294.406, df[0] = 269584.312, dc_hat[0] = 226781.000 + Gradient do_[0] = 16024729.000 +Backward Time Step 1: + Gradient di[0] = 400095.531, df[0] = 298260.906, dc_hat[0] = 305391.750 + Gradient do_[0] = 13344412.000 +Backward Time Step 0: + Gradient di[0] = 334596.219, df[0] = 253660.000, dc_hat[0] = 372589.875 + Gradient do_[0] = 5426293.000 +Time Step 0: + i_gate[0] = 0.515, f_gate[0] = 0.705, o_gate[0] = 0.139, c_hat[0] = 0.750 + c_state[0] = 0.386, h_state[0] = 0.051 +Time Step 1: + i_gate[0] = 0.415, f_gate[0] = 0.699, o_gate[0] = 0.118, c_hat[0] = 0.785 + c_state[0] = 0.595, h_state[0] = 0.063 +Time Step 2: + i_gate[0] = 0.358, f_gate[0] = 0.695, o_gate[0] = 0.113, c_hat[0] = 0.793 + c_state[0] = 0.698, h_state[0] = 0.068 +Time Step 3: + i_gate[0] = 0.308, f_gate[0] = 0.688, o_gate[0] = 0.112, c_hat[0] = 0.844 + c_state[0] = 0.740, h_state[0] = 0.071 +Time Step 4: + i_gate[0] = 0.275, f_gate[0] = 0.699, o_gate[0] = 0.112, c_hat[0] = 0.838 + c_state[0] = 0.748, h_state[0] = 0.071 +Backward Time Step 4: + Gradient di[0] = 246.630, df[0] = 217.044, dc_hat[0] = 120.772 + Gradient do_[0] = 14023.690 +Backward Time Step 3: + Gradient di[0] = 465.916, df[0] = 382.035, dc_hat[0] = 228.934 + Gradient do_[0] = 23992.588 +Backward Time Step 2: + Gradient di[0] = 688.049, df[0] = 556.141, dc_hat[0] = 500.825 + Gradient do_[0] = 31724.008 +Backward Time Step 1: + Gradient di[0] = 752.028, df[0] = 580.509, dc_hat[0] = 629.454 + Gradient do_[0] = 25030.568 +Backward Time Step 0: + Gradient di[0] = 556.902, df[0] = 435.935, dc_hat[0] = 669.965 + Gradient do_[0] = 9089.733 +Time Step 0: + i_gate[0] = 0.502, f_gate[0] = 0.695, o_gate[0] = 0.133, c_hat[0] = 0.727 + c_state[0] = 0.365, h_state[0] = 0.047 +Time Step 1: + i_gate[0] = 0.407, f_gate[0] = 0.687, o_gate[0] = 0.112, c_hat[0] = 0.761 + c_state[0] = 0.560, h_state[0] = 0.057 +Time Step 2: + i_gate[0] = 0.352, f_gate[0] = 0.681, o_gate[0] = 0.105, c_hat[0] = 0.766 + c_state[0] = 0.652, h_state[0] = 0.060 +Time Step 3: + i_gate[0] = 0.304, f_gate[0] = 0.670, o_gate[0] = 0.103, c_hat[0] = 0.821 + c_state[0] = 0.686, h_state[0] = 0.061 +Time Step 4: + i_gate[0] = 0.269, f_gate[0] = 0.678, o_gate[0] = 0.100, c_hat[0] = 0.811 + c_state[0] = 0.683, h_state[0] = 0.060 +Backward Time Step 4: + Gradient di[0] = 10.519, df[0] = 9.771, dc_hat[0] = 6.085 + Gradient do_[0] = 602.033 +Backward Time Step 3: + Gradient di[0] = 22.955, df[0] = 19.585, dc_hat[0] = 13.083 + Gradient do_[0] = 1183.318 +Backward Time Step 2: + Gradient di[0] = 33.960, df[0] = 28.730, dc_hat[0] = 28.228 + Gradient do_[0] = 1571.613 +Backward Time Step 1: + Gradient di[0] = 35.046, df[0] = 28.199, dc_hat[0] = 32.717 + Gradient do_[0] = 1172.555 +Backward Time Step 0: + Gradient di[0] = 23.757, df[0] = 19.254, dc_hat[0] = 30.921 + Gradient do_[0] = 390.645 +Time Step 0: + i_gate[0] = 0.490, f_gate[0] = 0.684, o_gate[0] = 0.128, c_hat[0] = 0.703 + c_state[0] = 0.344, h_state[0] = 0.042 +Time Step 1: + i_gate[0] = 0.399, f_gate[0] = 0.675, o_gate[0] = 0.106, c_hat[0] = 0.738 + c_state[0] = 0.527, h_state[0] = 0.051 +Time Step 2: + i_gate[0] = 0.348, f_gate[0] = 0.669, o_gate[0] = 0.099, c_hat[0] = 0.743 + c_state[0] = 0.611, h_state[0] = 0.054 +Time Step 3: + i_gate[0] = 0.302, f_gate[0] = 0.657, o_gate[0] = 0.095, c_hat[0] = 0.803 + c_state[0] = 0.644, h_state[0] = 0.054 +Time Step 4: + i_gate[0] = 0.268, f_gate[0] = 0.661, o_gate[0] = 0.091, c_hat[0] = 0.790 + c_state[0] = 0.637, h_state[0] = 0.051 +Backward Time Step 4: + Gradient di[0] = -0.014, df[0] = -0.013, dc_hat[0] = -0.009 + Gradient do_[0] = -0.823 +Backward Time Step 3: + Gradient di[0] = -0.072, df[0] = -0.063, dc_hat[0] = -0.046 + Gradient do_[0] = -3.736 +Backward Time Step 2: + Gradient di[0] = -0.106, df[0] = -0.093, dc_hat[0] = -0.098 + Gradient do_[0] = -5.060 +Backward Time Step 1: + Gradient di[0] = -0.098, df[0] = -0.082, dc_hat[0] = -0.101 + Gradient do_[0] = -3.422 +Backward Time Step 0: + Gradient di[0] = -0.054, df[0] = -0.045, dc_hat[0] = -0.076 + Gradient do_[0] = -0.993 +Time Step 0: + i_gate[0] = 0.489, f_gate[0] = 0.683, o_gate[0] = 0.123, c_hat[0] = 0.698 + c_state[0] = 0.341, h_state[0] = 0.040 +Time Step 1: + i_gate[0] = 0.394, f_gate[0] = 0.674, o_gate[0] = 0.101, c_hat[0] = 0.734 + c_state[0] = 0.519, h_state[0] = 0.048 +Time Step 2: + i_gate[0] = 0.340, f_gate[0] = 0.668, o_gate[0] = 0.094, c_hat[0] = 0.740 + c_state[0] = 0.598, h_state[0] = 0.050 +Time Step 3: + i_gate[0] = 0.292, f_gate[0] = 0.657, o_gate[0] = 0.090, c_hat[0] = 0.800 + c_state[0] = 0.627, h_state[0] = 0.050 +Time Step 4: + i_gate[0] = 0.258, f_gate[0] = 0.665, o_gate[0] = 0.087, c_hat[0] = 0.788 + c_state[0] = 0.620, h_state[0] = 0.048 +Backward Time Step 4: + Gradient di[0] = 26.020, df[0] = 25.578, dc_hat[0] = 16.904 + Gradient do_[0] = 1574.091 +Backward Time Step 3: + Gradient di[0] = 48.911, df[0] = 43.744, dc_hat[0] = 31.055 + Gradient do_[0] = 2629.933 +Backward Time Step 2: + Gradient di[0] = 67.793, df[0] = 60.447, dc_hat[0] = 62.708 + Gradient do_[0] = 3273.653 +Backward Time Step 1: + Gradient di[0] = 70.116, df[0] = 59.298, dc_hat[0] = 72.830 + Gradient do_[0] = 2455.016 +Backward Time Step 0: + Gradient di[0] = 50.062, df[0] = 42.457, dc_hat[0] = 71.964 + Gradient do_[0] = 859.918 +Time Step 0: + i_gate[0] = 0.476, f_gate[0] = 0.672, o_gate[0] = 0.118, c_hat[0] = 0.671 + c_state[0] = 0.320, h_state[0] = 0.036 +Time Step 1: + i_gate[0] = 0.386, f_gate[0] = 0.662, o_gate[0] = 0.095, c_hat[0] = 0.705 + c_state[0] = 0.484, h_state[0] = 0.043 +Time Step 2: + i_gate[0] = 0.335, f_gate[0] = 0.654, o_gate[0] = 0.088, c_hat[0] = 0.708 + c_state[0] = 0.554, h_state[0] = 0.044 +Time Step 3: + i_gate[0] = 0.289, f_gate[0] = 0.641, o_gate[0] = 0.083, c_hat[0] = 0.771 + c_state[0] = 0.578, h_state[0] = 0.044 +Time Step 4: + i_gate[0] = 0.254, f_gate[0] = 0.645, o_gate[0] = 0.079, c_hat[0] = 0.752 + c_state[0] = 0.564, h_state[0] = 0.040 +Backward Time Step 4: + Gradient di[0] = -0.034, df[0] = -0.035, dc_hat[0] = -0.026 + Gradient do_[0] = -2.070 +Backward Time Step 3: + Gradient di[0] = -0.074, df[0] = -0.069, dc_hat[0] = -0.055 + Gradient do_[0] = -3.993 +Backward Time Step 2: + Gradient di[0] = -0.096, df[0] = -0.090, dc_hat[0] = -0.101 + Gradient do_[0] = -4.752 +Backward Time Step 1: + Gradient di[0] = -0.088, df[0] = -0.078, dc_hat[0] = -0.102 + Gradient do_[0] = -3.214 +Backward Time Step 0: + Gradient di[0] = -0.049, df[0] = -0.043, dc_hat[0] = -0.076 + Gradient do_[0] = -0.938 +Time Step 0: + i_gate[0] = 0.477, f_gate[0] = 0.673, o_gate[0] = 0.123, c_hat[0] = 0.674 + c_state[0] = 0.321, h_state[0] = 0.038 +Time Step 1: + i_gate[0] = 0.382, f_gate[0] = 0.663, o_gate[0] = 0.100, c_hat[0] = 0.708 + c_state[0] = 0.484, h_state[0] = 0.045 +Time Step 2: + i_gate[0] = 0.329, f_gate[0] = 0.656, o_gate[0] = 0.093, c_hat[0] = 0.712 + c_state[0] = 0.551, h_state[0] = 0.047 +Time Step 3: + i_gate[0] = 0.281, f_gate[0] = 0.644, o_gate[0] = 0.090, c_hat[0] = 0.775 + c_state[0] = 0.573, h_state[0] = 0.046 +Time Step 4: + i_gate[0] = 0.246, f_gate[0] = 0.650, o_gate[0] = 0.086, c_hat[0] = 0.757 + c_state[0] = 0.559, h_state[0] = 0.044 +Backward Time Step 4: + Gradient di[0] = 10.092, df[0] = 10.635, dc_hat[0] = 7.543 + Gradient do_[0] = 571.171 +Backward Time Step 3: + Gradient di[0] = 18.426, df[0] = 17.387, dc_hat[0] = 13.205 + Gradient do_[0] = 928.326 +Backward Time Step 2: + Gradient di[0] = 24.810, df[0] = 23.368, dc_hat[0] = 25.564 + Gradient do_[0] = 1137.920 +Backward Time Step 1: + Gradient di[0] = 25.377, df[0] = 22.479, dc_hat[0] = 28.946 + Gradient do_[0] = 853.199 +Backward Time Step 0: + Gradient di[0] = 17.881, df[0] = 15.756, dc_hat[0] = 27.708 + Gradient do_[0] = 297.790 +Time Step 0: + i_gate[0] = 0.465, f_gate[0] = 0.662, o_gate[0] = 0.118, c_hat[0] = 0.646 + c_state[0] = 0.300, h_state[0] = 0.034 +Time Step 1: + i_gate[0] = 0.375, f_gate[0] = 0.651, o_gate[0] = 0.095, c_hat[0] = 0.677 + c_state[0] = 0.449, h_state[0] = 0.040 +Time Step 2: + i_gate[0] = 0.325, f_gate[0] = 0.642, o_gate[0] = 0.087, c_hat[0] = 0.677 + c_state[0] = 0.508, h_state[0] = 0.041 +Time Step 3: + i_gate[0] = 0.278, f_gate[0] = 0.628, o_gate[0] = 0.083, c_hat[0] = 0.743 + c_state[0] = 0.526, h_state[0] = 0.040 +Time Step 4: + i_gate[0] = 0.243, f_gate[0] = 0.631, o_gate[0] = 0.078, c_hat[0] = 0.717 + c_state[0] = 0.506, h_state[0] = 0.036 +Backward Time Step 4: + Gradient di[0] = -0.106, df[0] = -0.118, dc_hat[0] = -0.095 + Gradient do_[0] = -6.168 +Backward Time Step 3: + Gradient di[0] = -0.205, df[0] = -0.202, dc_hat[0] = -0.172 + Gradient do_[0] = -10.427 +Backward Time Step 2: + Gradient di[0] = -0.259, df[0] = -0.258, dc_hat[0] = -0.307 + Gradient do_[0] = -12.114 +Backward Time Step 1: + Gradient di[0] = -0.244, df[0] = -0.227, dc_hat[0] = -0.311 + Gradient do_[0] = -8.372 +Backward Time Step 0: + Gradient di[0] = -0.149, df[0] = -0.137, dc_hat[0] = -0.251 + Gradient do_[0] = -2.600 +Time Step 0: + i_gate[0] = 0.467, f_gate[0] = 0.665, o_gate[0] = 0.123, c_hat[0] = 0.656 + c_state[0] = 0.307, h_state[0] = 0.037 +Time Step 1: + i_gate[0] = 0.373, f_gate[0] = 0.654, o_gate[0] = 0.100, c_hat[0] = 0.689 + c_state[0] = 0.457, h_state[0] = 0.043 +Time Step 2: + i_gate[0] = 0.319, f_gate[0] = 0.646, o_gate[0] = 0.092, c_hat[0] = 0.691 + c_state[0] = 0.516, h_state[0] = 0.044 +Time Step 3: + i_gate[0] = 0.271, f_gate[0] = 0.633, o_gate[0] = 0.089, c_hat[0] = 0.755 + c_state[0] = 0.531, h_state[0] = 0.043 +Time Step 4: + i_gate[0] = 0.235, f_gate[0] = 0.638, o_gate[0] = 0.085, c_hat[0] = 0.733 + c_state[0] = 0.512, h_state[0] = 0.040 +Backward Time Step 4: + Gradient di[0] = -13.589, df[0] = -15.173, dc_hat[0] = -11.209 + Gradient do_[0] = -730.753 +Backward Time Step 3: + Gradient di[0] = -24.522, df[0] = -24.182, dc_hat[0] = -19.131 + Gradient do_[0] = -1175.594 +Backward Time Step 2: + Gradient di[0] = -32.433, df[0] = -31.928, dc_hat[0] = -36.049 + Gradient do_[0] = -1431.292 +Backward Time Step 1: + Gradient di[0] = -32.940, df[0] = -30.262, dc_hat[0] = -40.094 + Gradient do_[0] = -1074.297 +Backward Time Step 0: + Gradient di[0] = -22.819, df[0] = -20.691, dc_hat[0] = -37.163 + Gradient do_[0] = -371.204 +Time Step 0: + i_gate[0] = 0.480, f_gate[0] = 0.676, o_gate[0] = 0.128, c_hat[0] = 0.684 + c_state[0] = 0.328, h_state[0] = 0.041 +Time Step 1: + i_gate[0] = 0.381, f_gate[0] = 0.666, o_gate[0] = 0.105, c_hat[0] = 0.719 + c_state[0] = 0.492, h_state[0] = 0.048 +Time Step 2: + i_gate[0] = 0.323, f_gate[0] = 0.660, o_gate[0] = 0.099, c_hat[0] = 0.725 + c_state[0] = 0.559, h_state[0] = 0.050 +Time Step 3: + i_gate[0] = 0.274, f_gate[0] = 0.651, o_gate[0] = 0.097, c_hat[0] = 0.787 + c_state[0] = 0.580, h_state[0] = 0.051 +Time Step 4: + i_gate[0] = 0.240, f_gate[0] = 0.660, o_gate[0] = 0.095, c_hat[0] = 0.772 + c_state[0] = 0.568, h_state[0] = 0.049 +Backward Time Step 4: + Gradient di[0] = -59007.633, df[0] = -62085.996, dc_hat[0] = -40699.586 + Gradient do_[0] = -3087282.750 +Backward Time Step 3: + Gradient di[0] = -101361.938, df[0] = -95751.867, dc_hat[0] = -67624.703 + Gradient do_[0] = -4799500.000 +Backward Time Step 2: + Gradient di[0] = -138141.906, df[0] = -128878.289, dc_hat[0] = -133345.609 + Gradient do_[0] = -6019691.500 +Backward Time Step 1: + Gradient di[0] = -149881.547, df[0] = -130996.055, dc_hat[0] = -162582.484 + Gradient do_[0] = -4835544.000 +Backward Time Step 0: + Gradient di[0] = -114608.344, df[0] = -99444.844, dc_hat[0] = -171621.484 + Gradient do_[0] = -1843199.750 +Time Step 0: + i_gate[0] = 0.492, f_gate[0] = 0.686, o_gate[0] = 0.134, c_hat[0] = 0.709 + c_state[0] = 0.349, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.388, f_gate[0] = 0.677, o_gate[0] = 0.110, c_hat[0] = 0.742 + c_state[0] = 0.524, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.327, f_gate[0] = 0.673, o_gate[0] = 0.104, c_hat[0] = 0.748 + c_state[0] = 0.597, h_state[0] = 0.056 +Time Step 3: + i_gate[0] = 0.278, f_gate[0] = 0.667, o_gate[0] = 0.104, c_hat[0] = 0.805 + c_state[0] = 0.622, h_state[0] = 0.057 +Time Step 4: + i_gate[0] = 0.246, f_gate[0] = 0.680, o_gate[0] = 0.103, c_hat[0] = 0.793 + c_state[0] = 0.618, h_state[0] = 0.057 +Backward Time Step 4: + Gradient di[0] = -538430848.000, df[0] = -542115904.000, dc_hat[0] = -334196576.000 + Gradient do_[0] = -28003743744.000 +Backward Time Step 3: + Gradient di[0] = -894501568.000, df[0] = -820606720.000, dc_hat[0] = -541224512.000 + Gradient do_[0] = -42478477312.000 +Backward Time Step 2: + Gradient di[0] = -1237090816.000, df[0] = -1113104640.000, dc_hat[0] = -1084785920.000 + Gradient do_[0] = -54138720256.000 +Backward Time Step 1: + Gradient di[0] = -1443009920.000, df[0] = -1213306496.000, dc_hat[0] = -1428586496.000 + Gradient do_[0] = -46599806976.000 +Backward Time Step 0: + Gradient di[0] = -1261675520.000, df[0] = -1051560896.000, dc_hat[0] = -1740872704.000 + Gradient do_[0] = -20093313024.000 +Time Step 0: + i_gate[0] = 0.505, f_gate[0] = 0.697, o_gate[0] = 0.140, c_hat[0] = 0.733 + c_state[0] = 0.370, h_state[0] = 0.049 +Time Step 1: + i_gate[0] = 0.396, f_gate[0] = 0.689, o_gate[0] = 0.115, c_hat[0] = 0.764 + c_state[0] = 0.557, h_state[0] = 0.058 +Time Step 2: + i_gate[0] = 0.333, f_gate[0] = 0.688, o_gate[0] = 0.110, c_hat[0] = 0.770 + c_state[0] = 0.640, h_state[0] = 0.062 +Time Step 3: + i_gate[0] = 0.285, f_gate[0] = 0.686, o_gate[0] = 0.111, c_hat[0] = 0.822 + c_state[0] = 0.673, h_state[0] = 0.065 +Time Step 4: + i_gate[0] = 0.255, f_gate[0] = 0.702, o_gate[0] = 0.111, c_hat[0] = 0.811 + c_state[0] = 0.679, h_state[0] = 0.066 +Backward Time Step 4: + Gradient di[0] = 230659325952.000, df[0] = 219999502336.000, dc_hat[0] = 130854019072.000 + Gradient do_[0] = 12234542546944.000 +Backward Time Step 3: + Gradient di[0] = 373047263232.000, df[0] = 329246375936.000, dc_hat[0] = 205306396672.000 + Gradient do_[0] = 18020896342016.000 +Backward Time Step 2: + Gradient di[0] = 511328616448.000, df[0] = 441360842752.000, dc_hat[0] = 405705424896.000 + Gradient do_[0] = 22602133274624.000 +Backward Time Step 1: + Gradient di[0] = 621603061760.000, df[0] = 502193520640.000, dc_hat[0] = 559379709952.000 + Gradient do_[0] = 20122297171968.000 +Backward Time Step 0: + Gradient di[0] = 605582721024.000, df[0] = 486429229056.000, dc_hat[0] = 771354132480.000 + Gradient do_[0] = 9563190329344.000 +Time Step 0: + i_gate[0] = 0.492, f_gate[0] = 0.686, o_gate[0] = 0.134, c_hat[0] = 0.709 + c_state[0] = 0.349, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.388, f_gate[0] = 0.676, o_gate[0] = 0.110, c_hat[0] = 0.741 + c_state[0] = 0.524, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.327, f_gate[0] = 0.672, o_gate[0] = 0.104, c_hat[0] = 0.748 + c_state[0] = 0.596, h_state[0] = 0.056 +Time Step 3: + i_gate[0] = 0.277, f_gate[0] = 0.665, o_gate[0] = 0.104, c_hat[0] = 0.805 + c_state[0] = 0.620, h_state[0] = 0.057 +Time Step 4: + i_gate[0] = 0.245, f_gate[0] = 0.678, o_gate[0] = 0.103, c_hat[0] = 0.794 + c_state[0] = 0.615, h_state[0] = 0.056 +Backward Time Step 4: + Gradient di[0] = -73820720.000, df[0] = -74399184.000, dc_hat[0] = -45636792.000 + Gradient do_[0] = -3809626112.000 +Backward Time Step 3: + Gradient di[0] = -124529768.000, df[0] = -114297888.000, dc_hat[0] = -75171720.000 + Gradient do_[0] = -5880279040.000 +Backward Time Step 2: + Gradient di[0] = -173315488.000, df[0] = -156012448.000, dc_hat[0] = -151893232.000 + Gradient do_[0] = -7560016896.000 +Backward Time Step 1: + Gradient di[0] = -200900496.000, df[0] = -168995280.000, dc_hat[0] = -199160464.000 + Gradient do_[0] = -6482587136.000 +Backward Time Step 0: + Gradient di[0] = -172885424.000, df[0] = -144093744.000, dc_hat[0] = -238549088.000 + Gradient do_[0] = -2753355520.000 +Time Step 0: + i_gate[0] = 0.505, f_gate[0] = 0.697, o_gate[0] = 0.140, c_hat[0] = 0.733 + c_state[0] = 0.370, h_state[0] = 0.049 +Time Step 1: + i_gate[0] = 0.396, f_gate[0] = 0.688, o_gate[0] = 0.115, c_hat[0] = 0.764 + c_state[0] = 0.557, h_state[0] = 0.058 +Time Step 2: + i_gate[0] = 0.333, f_gate[0] = 0.686, o_gate[0] = 0.110, c_hat[0] = 0.770 + c_state[0] = 0.639, h_state[0] = 0.062 +Time Step 3: + i_gate[0] = 0.284, f_gate[0] = 0.684, o_gate[0] = 0.111, c_hat[0] = 0.822 + c_state[0] = 0.671, h_state[0] = 0.065 +Time Step 4: + i_gate[0] = 0.254, f_gate[0] = 0.699, o_gate[0] = 0.111, c_hat[0] = 0.811 + c_state[0] = 0.675, h_state[0] = 0.066 +Backward Time Step 4: + Gradient di[0] = 5734780928.000, df[0] = 5480751104.000, dc_hat[0] = 3245264640.000 + Gradient do_[0] = 301448134656.000 +Backward Time Step 3: + Gradient di[0] = 9375862784.000, df[0] = 8284593664.000, dc_hat[0] = 5152923136.000 + Gradient do_[0] = 449973518336.000 +Backward Time Step 2: + Gradient di[0] = 12900345856.000, df[0] = 11145098240.000, dc_hat[0] = 10236901376.000 + Gradient do_[0] = 568093966336.000 +Backward Time Step 1: + Gradient di[0] = 15583946752.000, df[0] = 12598241280.000, dc_hat[0] = 14047940608.000 + Gradient do_[0] = 504000905216.000 +Backward Time Step 0: + Gradient di[0] = 14996421632.000, df[0] = 12045751296.000, dc_hat[0] = 19101521920.000 + Gradient do_[0] = 236819267584.000 +Time Step 0: + i_gate[0] = 0.492, f_gate[0] = 0.686, o_gate[0] = 0.134, c_hat[0] = 0.709 + c_state[0] = 0.349, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.388, f_gate[0] = 0.676, o_gate[0] = 0.110, c_hat[0] = 0.741 + c_state[0] = 0.524, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.327, f_gate[0] = 0.672, o_gate[0] = 0.104, c_hat[0] = 0.748 + c_state[0] = 0.596, h_state[0] = 0.056 +Time Step 3: + i_gate[0] = 0.277, f_gate[0] = 0.665, o_gate[0] = 0.104, c_hat[0] = 0.805 + c_state[0] = 0.620, h_state[0] = 0.057 +Time Step 4: + i_gate[0] = 0.245, f_gate[0] = 0.678, o_gate[0] = 0.103, c_hat[0] = 0.793 + c_state[0] = 0.615, h_state[0] = 0.056 +Backward Time Step 4: + Gradient di[0] = -52794228.000, df[0] = -53213832.000, dc_hat[0] = -32655412.000 + Gradient do_[0] = -2722256128.000 +Backward Time Step 3: + Gradient di[0] = -89103728.000, df[0] = -81785184.000, dc_hat[0] = -53805124.000 + Gradient do_[0] = -4205185792.000 +Backward Time Step 2: + Gradient di[0] = -123994856.000, df[0] = -111624192.000, dc_hat[0] = -108704840.000 + Gradient do_[0] = -5406868480.000 +Backward Time Step 1: + Gradient di[0] = -143592656.000, df[0] = -120796936.000, dc_hat[0] = -142381280.000 + Gradient do_[0] = -4632870400.000 +Backward Time Step 0: + Gradient di[0] = -123510528.000, df[0] = -102941560.000, dc_hat[0] = -170421088.000 + Gradient do_[0] = -1967015808.000 +Time Step 0: + i_gate[0] = 0.505, f_gate[0] = 0.697, o_gate[0] = 0.140, c_hat[0] = 0.733 + c_state[0] = 0.370, h_state[0] = 0.049 +Time Step 1: + i_gate[0] = 0.396, f_gate[0] = 0.688, o_gate[0] = 0.115, c_hat[0] = 0.764 + c_state[0] = 0.557, h_state[0] = 0.058 +Time Step 2: + i_gate[0] = 0.333, f_gate[0] = 0.686, o_gate[0] = 0.110, c_hat[0] = 0.770 + c_state[0] = 0.639, h_state[0] = 0.062 +Time Step 3: + i_gate[0] = 0.284, f_gate[0] = 0.684, o_gate[0] = 0.111, c_hat[0] = 0.822 + c_state[0] = 0.670, h_state[0] = 0.065 +Time Step 4: + i_gate[0] = 0.254, f_gate[0] = 0.699, o_gate[0] = 0.111, c_hat[0] = 0.811 + c_state[0] = 0.675, h_state[0] = 0.066 +Backward Time Step 4: + Gradient di[0] = 1483947520.000, df[0] = 1418497024.000, dc_hat[0] = 840254656.000 + Gradient do_[0] = 77940498432.000 +Backward Time Step 3: + Gradient di[0] = 2426511360.000, df[0] = 2144266240.000, dc_hat[0] = 1334088320.000 + Gradient do_[0] = 116393222144.000 +Backward Time Step 2: + Gradient di[0] = 3337781760.000, df[0] = 2883934720.000, dc_hat[0] = 2649557248.000 + Gradient do_[0] = 146938953728.000 +Backward Time Step 1: + Gradient di[0] = 4028459520.000, df[0] = 3256907008.000, dc_hat[0] = 3632219904.000 + Gradient do_[0] = 130270339072.000 +Backward Time Step 0: + Gradient di[0] = 3875390208.000, df[0] = 3112874752.000, dc_hat[0] = 4936235008.000 + Gradient do_[0] = 61199065088.000 +Time Step 0: + i_gate[0] = 0.492, f_gate[0] = 0.686, o_gate[0] = 0.134, c_hat[0] = 0.709 + c_state[0] = 0.349, h_state[0] = 0.045 +Time Step 1: + i_gate[0] = 0.388, f_gate[0] = 0.676, o_gate[0] = 0.110, c_hat[0] = 0.741 + c_state[0] = 0.523, h_state[0] = 0.053 +Time Step 2: + i_gate[0] = 0.327, f_gate[0] = 0.671, o_gate[0] = 0.104, c_hat[0] = 0.748 + c_state[0] = 0.596, h_state[0] = 0.056 +Time Step 3: + i_gate[0] = 0.277, f_gate[0] = 0.665, o_gate[0] = 0.104, c_hat[0] = 0.805 + c_state[0] = 0.620, h_state[0] = 0.057 +Time Step 4: + i_gate[0] = 0.245, f_gate[0] = 0.677, o_gate[0] = 0.103, c_hat[0] = 0.793 + c_state[0] = 0.614, h_state[0] = 0.056 +Backward Time Step 4: + Gradient di[0] = -31030956.000, df[0] = -31281024.000, dc_hat[0] = -19204178.000 + Gradient do_[0] = -1598732928.000 +Backward Time Step 3: + Gradient di[0] = -52398980.000, df[0] = -48096676.000, dc_hat[0] = -31651684.000 + Gradient do_[0] = -2471589632.000 +Backward Time Step 2: + Gradient di[0] = -72907688.000, df[0] = -65638796.000, dc_hat[0] = -63938548.000 + Gradient do_[0] = -3178128384.000 +Backward Time Step 1: + Gradient di[0] = -84349528.000, df[0] = -70963744.000, dc_hat[0] = -83657000.000 + Gradient do_[0] = -2721141248.000 +Backward Time Step 0: + Gradient di[0] = -72518024.000, df[0] = -60441156.000, dc_hat[0] = -100061128.000 + Gradient do_[0] = -1154914560.000 +Time Step 0: + i_gate[0] = 0.505, f_gate[0] = 0.697, o_gate[0] = 0.140, c_hat[0] = 0.733 + c_state[0] = 0.370, h_state[0] = 0.049 +Time Step 1: + i_gate[0] = 0.395, f_gate[0] = 0.688, o_gate[0] = 0.115, c_hat[0] = 0.764 + c_state[0] = 0.557, h_state[0] = 0.058 +Time Step 2: + i_gate[0] = 0.333, f_gate[0] = 0.686, o_gate[0] = 0.110, c_hat[0] = 0.770 + c_state[0] = 0.638, h_state[0] = 0.062 +Time Step 3: + i_gate[0] = 0.284, f_gate[0] = 0.684, o_gate[0] = 0.111, c_hat[0] = 0.822 + c_state[0] = 0.670, h_state[0] = 0.065 +Time Step 4: + i_gate[0] = 0.254, f_gate[0] = 0.699, o_gate[0] = 0.111, c_hat[0] = 0.811 + c_state[0] = 0.674, h_state[0] = 0.065 +Backward Time Step 4: + Gradient di[0] = -43258628.000, df[0] = -41372240.000, dc_hat[0] = -24508188.000 + Gradient do_[0] = -2268284160.000 +Backward Time Step 3: + Gradient di[0] = -70760968.000, df[0] = -62556228.000, dc_hat[0] = -38924360.000 + Gradient do_[0] = -3390198016.000 +Backward Time Step 2: + Gradient di[0] = -97374600.000, df[0] = -84165688.000, dc_hat[0] = -77342368.000 + Gradient do_[0] = -4283812352.000 +Backward Time Step 1: + Gradient di[0] = -117517400.000, df[0] = -95030048.000, dc_hat[0] = -106004696.000 + Gradient do_[0] = -3799421952.000 +Backward Time Step 0: + Gradient di[0] = -113085320.000, df[0] = -90834840.000, dc_hat[0] = -144041152.000 + Gradient do_[0] = -1785811328.000 +Time Step 0: + i_gate[0] = 0.517, f_gate[0] = 0.707, o_gate[0] = 0.146, c_hat[0] = 0.755 + c_state[0] = 0.391, h_state[0] = 0.054 +Time Step 1: + i_gate[0] = 0.404, f_gate[0] = 0.700, o_gate[0] = 0.120, c_hat[0] = 0.785 + c_state[0] = 0.590, h_state[0] = 0.064 +Time Step 2: + i_gate[0] = 0.340, f_gate[0] = 0.701, o_gate[0] = 0.116, c_hat[0] = 0.790 + c_state[0] = 0.683, h_state[0] = 0.069 +Time Step 3: + i_gate[0] = 0.293, f_gate[0] = 0.702, o_gate[0] = 0.119, c_hat[0] = 0.838 + c_state[0] = 0.725, h_state[0] = 0.074 +Time Step 4: + i_gate[0] = 0.265, f_gate[0] = 0.719, o_gate[0] = 0.120, c_hat[0] = 0.826 + c_state[0] = 0.741, h_state[0] = 0.076 +Backward Time Step 4: + Gradient di[0] = -2951044136960.000, df[0] = -2659334225920.000, dc_hat[0] = -1541281808384.000 + Gradient do_[0] = -159024939008000.000 +Backward Time Step 3: + Gradient di[0] = -4712402780160.000, df[0] = -3989636644864.000, dc_hat[0] = -2368184516608.000 + Gradient do_[0] = -230211908009984.000 +Backward Time Step 2: + Gradient di[0] = -6374583959552.000, df[0] = -5279795642368.000, dc_hat[0] = -4584784265216.000 + Gradient do_[0] = -283295539527680.000 +Backward Time Step 1: + Gradient di[0] = -7880765865984.000, df[0] = -6128523542528.000, dc_hat[0] = -6458225721344.000 + Gradient do_[0] = -255434237673472.000 +Backward Time Step 0: + Gradient di[0] = -8310984540160.000, df[0] = -6452621082624.000, dc_hat[0] = -9787926380544.000 + Gradient do_[0] = -130307789422592.000 +Time Step 0: + i_gate[0] = 0.529, f_gate[0] = 0.718, o_gate[0] = 0.152, c_hat[0] = 0.776 + c_state[0] = 0.411, h_state[0] = 0.059 +Time Step 1: + i_gate[0] = 0.412, f_gate[0] = 0.712, o_gate[0] = 0.126, c_hat[0] = 0.805 + c_state[0] = 0.625, h_state[0] = 0.070 +Time Step 2: + i_gate[0] = 0.348, f_gate[0] = 0.717, o_gate[0] = 0.123, c_hat[0] = 0.810 + c_state[0] = 0.730, h_state[0] = 0.077 +Time Step 3: + i_gate[0] = 0.303, f_gate[0] = 0.721, o_gate[0] = 0.127, c_hat[0] = 0.852 + c_state[0] = 0.784, h_state[0] = 0.083 +Time Step 4: + i_gate[0] = 0.279, f_gate[0] = 0.740, o_gate[0] = 0.129, c_hat[0] = 0.840 + c_state[0] = 0.814, h_state[0] = 0.087 +Backward Time Step 4: + Gradient di[0] = -18106463280955392.000, df[0] = -15275344319741952.000, dc_hat[0] = -8816178973638656.000 + Gradient do_[0] = -1015781755736555520.000 +Backward Time Step 3: + Gradient di[0] = -28348923047313408.000, df[0] = -22848725651030016.000, dc_hat[0] = -13108312128094208.000 + Gradient do_[0] = -1421152451444080640.000 +Backward Time Step 2: + Gradient di[0] = -37491860448477184.000, df[0] = -29688579181510656.000, dc_hat[0] = -24458421411512320.000 + Gradient do_[0] = -1686239619659595776.000 +Backward Time Step 1: + Gradient di[0] = -46854257793564672.000, df[0] = -35054387051102208.000, dc_hat[0] = -34852270185119744.000 + Gradient do_[0] = -1522876930783707136.000 +Backward Time Step 0: + Gradient di[0] = -53166889056075776.000, df[0] = -40003879445725184.000, dc_hat[0] = -57976685491912704.000 + Gradient do_[0] = -828696210138202112.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.724, o_gate[0] = 0.132, c_hat[0] = 0.823 + c_state[0] = 0.660, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.359, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.826 + c_state[0] = 0.779, h_state[0] = 0.085 +Time Step 3: + i_gate[0] = 0.316, f_gate[0] = 0.739, o_gate[0] = 0.136, c_hat[0] = 0.863 + c_state[0] = 0.849, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.295, f_gate[0] = 0.758, o_gate[0] = 0.139, c_hat[0] = 0.850 + c_state[0] = 0.894, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -101980545289814016.000, df[0] = -80244609157955584.000, dc_hat[0] = -47186275945414656.000 + Gradient do_[0] = -6031987310432616448.000 +Backward Time Step 3: + Gradient di[0] = -158601734808141824.000, df[0] = -121093673743024128.000, dc_hat[0] = -68639444315406336.000 + Gradient do_[0] = -8224412946462146560.000 +Backward Time Step 2: + Gradient di[0] = -205309693130702848.000, df[0] = -155138238820909056.000, dc_hat[0] = -122958531363078144.000 + Gradient do_[0] = -9384368026951876608.000 +Backward Time Step 1: + Gradient di[0] = -257790310837387264.000, df[0] = -185687258865598464.000, dc_hat[0] = -175027648253132800.000 + Gradient do_[0] = -8418781863241318400.000 +Backward Time Step 0: + Gradient di[0] = -310038708252311552.000, df[0] = -226607387979546624.000, dc_hat[0] = -313438741802778624.000 + Gradient do_[0] = -4809923017113075712.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -2139848269889536.000, df[0] = -1573007612968960.000, dc_hat[0] = -947072903675904.000 + Gradient do_[0] = -134049184893370368.000 +Backward Time Step 3: + Gradient di[0] = -3328388932567040.000, df[0] = -2403894537748480.000, dc_hat[0] = -1354085747916800.000 + Gradient do_[0] = -179012604809707520.000 +Backward Time Step 2: + Gradient di[0] = -4193047722065920.000, df[0] = -3019229401972736.000, dc_hat[0] = -2304750854864896.000 + Gradient do_[0] = -194846139185364992.000 +Backward Time Step 1: + Gradient di[0] = -5244740794580992.000, df[0] = -3637416124481536.000, dc_hat[0] = -3249862166446080.000 + Gradient do_[0] = -172178504387919872.000 +Backward Time Step 0: + Gradient di[0] = -6547313644273664.000, df[0] = -4658376290074624.000, dc_hat[0] = -6143588865933312.000 + Gradient do_[0] = -101220877244301312.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 12787719864320.000, df[0] = 8799122358272.000, dc_hat[0] = 5441820557312.000 + Gradient do_[0] = 857433714982912.000 +Backward Time Step 3: + Gradient di[0] = 19931185807360.000, df[0] = 13601958002688.000, dc_hat[0] = 7692902989824.000 + Gradient do_[0] = 1121597691789312.000 +Backward Time Step 2: + Gradient di[0] = 24235414126592.000, df[0] = 16606877974528.000, dc_hat[0] = 12365238108160.000 + Gradient do_[0] = 1151787553783808.000 +Backward Time Step 1: + Gradient di[0] = 30172784885760.000, df[0] = 20160184320000.000, dc_hat[0] = 17275903016960.000 + Gradient do_[0] = 1000161920679936.000 +Backward Time Step 0: + Gradient di[0] = 38972403220480.000, df[0] = 27043716661248.000, dc_hat[0] = 33977505153024.000 + Gradient do_[0] = 601104026959872.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -1907859369492480.000, df[0] = -1402466675458048.000, dc_hat[0] = -844360237187072.000 + Gradient do_[0] = -119515032743575552.000 +Backward Time Step 3: + Gradient di[0] = -2967483904425984.000, df[0] = -2143231361941504.000, dc_hat[0] = -1207197597958144.000 + Gradient do_[0] = -159599799308386304.000 +Backward Time Step 2: + Gradient di[0] = -3737943323705344.000, df[0] = -2691525007900672.000, dc_hat[0] = -2054534650134528.000 + Gradient do_[0] = -173697101744570368.000 +Backward Time Step 1: + Gradient di[0] = -4675552401162240.000, df[0] = -3242663801257984.000, dc_hat[0] = -2897169115774976.000 + Gradient do_[0] = -153492750950465536.000 +Backward Time Step 0: + Gradient di[0] = -5838666052141056.000, df[0] = -4154177462730752.000, dc_hat[0] = -5478638271070208.000 + Gradient do_[0] = -90265257485795328.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 14036002406400.000, df[0] = 9658014105600.000, dc_hat[0] = 5972785889280.000 + Gradient do_[0] = 941123166011392.000 +Backward Time Step 3: + Gradient di[0] = 21876579500032.000, df[0] = 14929554833408.000, dc_hat[0] = 8443375124480.000 + Gradient do_[0] = 1231058288771072.000 +Backward Time Step 2: + Gradient di[0] = 26597925060608.000, df[0] = 18225731469312.000, dc_hat[0] = 13570188247040.000 + Gradient do_[0] = 1264058938425344.000 +Backward Time Step 1: + Gradient di[0] = 33114329972736.000, df[0] = 22125601619968.000, dc_hat[0] = 18960131031040.000 + Gradient do_[0] = 1097667811737600.000 +Backward Time Step 0: + Gradient di[0] = 42785822474240.000, df[0] = 29689915637760.000, dc_hat[0] = 37302172647424.000 + Gradient do_[0] = 659921523703808.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -1682640008642560.000, df[0] = -1236903605043200.000, dc_hat[0] = -744653041172480.000 + Gradient do_[0] = -105405300672036864.000 +Backward Time Step 3: + Gradient di[0] = -2617116846981120.000, df[0] = -1890179807707136.000, dc_hat[0] = -1064612132814848.000 + Gradient do_[0] = -140754573735231488.000 +Backward Time Step 2: + Gradient di[0] = -3296226875277312.000, df[0] = -2373461473230848.000, dc_hat[0] = -1811692635815936.000 + Gradient do_[0] = -153170336345489408.000 +Backward Time Step 1: + Gradient di[0] = -4123089147265024.000, df[0] = -2859510574088192.000, dc_hat[0] = -2554839552753664.000 + Gradient do_[0] = -135356048952262656.000 +Backward Time Step 0: + Gradient di[0] = -5150450156306432.000, df[0] = -3664515757506560.000, dc_hat[0] = -4832859336474624.000 + Gradient do_[0] = -79625489622237184.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 15324808216576.000, df[0] = 10544774905856.000, dc_hat[0] = 6520949440512.000 + Gradient do_[0] = 1027527640350720.000 +Backward Time Step 3: + Gradient di[0] = 23885059719168.000, df[0] = 16300200951808.000, dc_hat[0] = 9218120744960.000 + Gradient do_[0] = 1344066528739328.000 +Backward Time Step 2: + Gradient di[0] = 29036600360960.000, df[0] = 19896765251584.000, dc_hat[0] = 14813919969280.000 + Gradient do_[0] = 1379948967231488.000 +Backward Time Step 1: + Gradient di[0] = 36150781673472.000, df[0] = 24154428407808.000, dc_hat[0] = 20698697302016.000 + Gradient do_[0] = 1198319497904128.000 +Backward Time Step 0: + Gradient di[0] = 46724408147968.000, df[0] = 32422980747264.000, dc_hat[0] = 40735973834752.000 + Gradient do_[0] = 720669675356160.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -1464533918941184.000, df[0] = -1076573079863296.000, dc_hat[0] = -648092110028800.000 + Gradient do_[0] = -91741145687654400.000 +Backward Time Step 3: + Gradient di[0] = -2277830536724480.000, df[0] = -1645135146254336.000, dc_hat[0] = -926534906937344.000 + Gradient do_[0] = -122505420903284736.000 +Backward Time Step 2: + Gradient di[0] = -2868630668705792.000, df[0] = -2065566810505216.000, dc_hat[0] = -1576621559513088.000 + Gradient do_[0] = -133299867719041024.000 +Backward Time Step 1: + Gradient di[0] = -3588270961197056.000, df[0] = -2488594782486528.000, dc_hat[0] = -2223443634290688.000 + Gradient do_[0] = -117798600603336704.000 +Backward Time Step 0: + Gradient di[0] = -4483704265113600.000, df[0] = -3190129640341504.000, dc_hat[0] = -4207226751287296.000 + Gradient do_[0] = -69317662601117696.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 16724554416128.000, df[0] = 11507913981952.000, dc_hat[0] = 7116144246784.000 + Gradient do_[0] = 1121362810765312.000 +Backward Time Step 3: + Gradient di[0] = 26066057953280.000, df[0] = 17788634660864.000, dc_hat[0] = 10059216060416.000 + Gradient do_[0] = 1466779113095168.000 +Backward Time Step 2: + Gradient di[0] = 31685013929984.000, df[0] = 21711531540480.000, dc_hat[0] = 16164497391616.000 + Gradient do_[0] = 1505804393906176.000 +Backward Time Step 1: + Gradient di[0] = 39448033099776.000, df[0] = 26357513846784.000, dc_hat[0] = 22586591281152.000 + Gradient do_[0] = 1307616215040000.000 +Backward Time Step 0: + Gradient di[0] = 51000404606976.000, df[0] = 35390178197504.000, dc_hat[0] = 44463934144512.000 + Gradient do_[0] = 786621985193984.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -1259865708167168.000, df[0] = -926120912355328.000, dc_hat[0] = -557488936058880.000 + Gradient do_[0] = -78919173660475392.000 +Backward Time Step 3: + Gradient di[0] = -1959452932571136.000, df[0] = -1415191724032000.000, dc_hat[0] = -796981043658752.000 + Gradient do_[0] = -105381377704198144.000 +Backward Time Step 2: + Gradient di[0] = -2467449316311040.000, df[0] = -1776693081538560.000, dc_hat[0] = -1356083713015808.000 + Gradient do_[0] = -114657089724350464.000 +Backward Time Step 1: + Gradient di[0] = -3086476241797120.000, df[0] = -2140582172426240.000, dc_hat[0] = -1912510416420864.000 + Gradient do_[0] = -101325287899267072.000 +Backward Time Step 0: + Gradient di[0] = -3857843778224128.000, df[0] = -2744833605107712.000, dc_hat[0] = -3619958693036032.000 + Gradient do_[0] = -59641917327015936.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 18136527011840.000, df[0] = 12479464734720.000, dc_hat[0] = 7716471832576.000 + Gradient do_[0] = 1216015971123200.000 +Backward Time Step 3: + Gradient di[0] = 28265978789888.000, df[0] = 19289985777664.000, dc_hat[0] = 10907500412928.000 + Gradient do_[0] = 1590553493897216.000 +Backward Time Step 2: + Gradient di[0] = 34355816693760.000, df[0] = 23541640593408.000, dc_hat[0] = 17526407823360.000 + Gradient do_[0] = 1632723764510720.000 +Backward Time Step 1: + Gradient di[0] = 42773214396416.000, df[0] = 28579257647104.000, dc_hat[0] = 24490476044288.000 + Gradient do_[0] = 1417838631845888.000 +Backward Time Step 0: + Gradient di[0] = 55314976079872.000, df[0] = 38384147890176.000, dc_hat[0] = 48225532772352.000 + Gradient do_[0] = 853169215111168.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -1067033957498880.000, df[0] = -784370415697920.000, dc_hat[0] = -472133977243648.000 + Gradient do_[0] = -66838998319890432.000 +Backward Time Step 3: + Gradient di[0] = -1659505167302656.000, df[0] = -1198558808113152.000, dc_hat[0] = -674938339983360.000 + Gradient do_[0] = -89248784755720192.000 +Backward Time Step 2: + Gradient di[0] = -2089538096726016.000, df[0] = -1504575630606336.000, dc_hat[0] = -1148349566681088.000 + Gradient do_[0] = -97095878984073216.000 +Backward Time Step 1: + Gradient di[0] = -2613783147053056.000, df[0] = -1812752553213952.000, dc_hat[0] = -1619610021396480.000 + Gradient do_[0] = -85807347720519680.000 +Backward Time Step 0: + Gradient di[0] = -3267989881225216.000, df[0] = -2325155976052736.000, dc_hat[0] = -3066476995018752.000 + Gradient do_[0] = -50522829879246848.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 19426460041216.000, df[0] = 13367039229952.000, dc_hat[0] = 8264806825984.000 + Gradient do_[0] = 1302483997556736.000 +Backward Time Step 3: + Gradient di[0] = 30275662774272.000, df[0] = 20661518991360.000, dc_hat[0] = 11682291122176.000 + Gradient do_[0] = 1703621326536704.000 +Backward Time Step 2: + Gradient di[0] = 36794972241920.000, df[0] = 25213014114304.000, dc_hat[0] = 18770051465216.000 + Gradient do_[0] = 1748633254887424.000 +Backward Time Step 1: + Gradient di[0] = 45809936629760.000, df[0] = 30608266887168.000, dc_hat[0] = 26229197504512.000 + Gradient do_[0] = 1518499511926784.000 +Backward Time Step 0: + Gradient di[0] = 59258825605120.000, df[0] = 41120859947008.000, dc_hat[0] = 51663918333952.000 + Gradient do_[0] = 913998501380096.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -876562224250880.000, df[0] = -644354750283776.000, dc_hat[0] = -387832896028672.000 + Gradient do_[0] = -54907016530886656.000 +Backward Time Step 3: + Gradient di[0] = -1363240739143680.000, df[0] = -984585684910080.000, dc_hat[0] = -554409712943104.000 + Gradient do_[0] = -73314679425859584.000 +Backward Time Step 2: + Gradient di[0] = -1716341677490176.000, df[0] = -1235853619757056.000, dc_hat[0] = -943220183793664.000 + Gradient do_[0] = -79753857604780032.000 +Backward Time Step 1: + Gradient di[0] = -2146979123560448.000, df[0] = -1489007045246976.000, dc_hat[0] = -1330358603743232.000 + Gradient do_[0] = -70482732609699840.000 +Backward Time Step 0: + Gradient di[0] = -2685149665820672.000, df[0] = -1910468427907072.000, dc_hat[0] = -2519575992205312.000 + Gradient do_[0] = -41512168880865280.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 20528391782400.000, df[0] = 14125249855488.000, dc_hat[0] = 8733094576128.000 + Gradient do_[0] = 1376343207968768.000 +Backward Time Step 3: + Gradient di[0] = 31992202657792.000, df[0] = 21832992292864.000, dc_hat[0] = 12343847157760.000 + Gradient do_[0] = 1800188431695872.000 +Backward Time Step 2: + Gradient di[0] = 38877486120960.000, df[0] = 26640006512640.000, dc_hat[0] = 19831680139264.000 + Gradient do_[0] = 1847591717306368.000 +Backward Time Step 1: + Gradient di[0] = 48402654035968.000, df[0] = 32340617199616.000, dc_hat[0] = 27713702199296.000 + Gradient do_[0] = 1604442478608384.000 +Backward Time Step 0: + Gradient di[0] = 62630307823616.000, df[0] = 43460396580864.000, dc_hat[0] = 54603299160064.000 + Gradient do_[0] = 965999683698688.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -703317604827136.000, df[0] = -517003198595072.000, dc_hat[0] = -311163199946752.000 + Gradient do_[0] = -44054471692517376.000 +Backward Time Step 3: + Gradient di[0] = -1093780665008128.000, df[0] = -789971522813952.000, dc_hat[0] = -444795973533696.000 + Gradient do_[0] = -58822523466285056.000 +Backward Time Step 2: + Gradient di[0] = -1376959267340288.000, df[0] = -991479778508800.000, dc_hat[0] = -756685861814272.000 + Gradient do_[0] = -63983338989289472.000 +Backward Time Step 1: + Gradient di[0] = -1722462811193344.000, df[0] = -1194589855678464.000, dc_hat[0] = -1067310446018560.000 + Gradient do_[0] = -56546375482998784.000 +Backward Time Step 0: + Gradient di[0] = -2154866562564096.000, df[0] = -1533175146741760.000, dc_hat[0] = -2021991682932736.000 + Gradient do_[0] = -33314042680442880.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 21468396126208.000, df[0] = 14772044038144.000, dc_hat[0] = 9132448940032.000 + Gradient do_[0] = 1439345143709696.000 +Backward Time Step 3: + Gradient di[0] = 33456362881024.000, df[0] = 22832239083520.000, dc_hat[0] = 12907967414272.000 + Gradient do_[0] = 1882554898579456.000 +Backward Time Step 2: + Gradient di[0] = 40652876283904.000, df[0] = 27856541319168.000, dc_hat[0] = 20736565575680.000 + Gradient do_[0] = 1931953599152128.000 +Backward Time Step 1: + Gradient di[0] = 50613039661056.000, df[0] = 33817498746880.000, dc_hat[0] = 28979291488256.000 + Gradient do_[0] = 1677711667888128.000 +Backward Time Step 0: + Gradient di[0] = 65508825104384.000, df[0] = 45457854169088.000, dc_hat[0] = 57112885460992.000 + Gradient do_[0] = 1010397431726080.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -552834298806272.000, df[0] = -406383530868736.000, dc_hat[0] = -244571896283136.000 + Gradient do_[0] = -34627974108020736.000 +Backward Time Step 3: + Gradient di[0] = -859732596228096.000, df[0] = -620933152768000.000, dc_hat[0] = -349596043706368.000 + Gradient do_[0] = -46235091013206016.000 +Backward Time Step 2: + Gradient di[0] = -1082215190495232.000, df[0] = -779248331653120.000, dc_hat[0] = -594694392250368.000 + Gradient do_[0] = -50287156433780736.000 +Backward Time Step 1: + Gradient di[0] = -1353777047142400.000, df[0] = -938893004242944.000, dc_hat[0] = -838857243230208.000 + Gradient do_[0] = -44442869880061952.000 +Backward Time Step 0: + Gradient di[0] = -1694133206908928.000, df[0] = -1205365928624128.000, dc_hat[0] = -1589668327981056.000 + Gradient do_[0] = -26191144362704896.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 22265764773888.000, df[0] = 15320690458624.000, dc_hat[0] = 9471071879168.000 + Gradient do_[0] = 1492781247758336.000 +Backward Time Step 3: + Gradient di[0] = 34698109648896.000, df[0] = 23679702401024.000, dc_hat[0] = 13386208247808.000 + Gradient do_[0] = 1952403414843392.000 +Backward Time Step 2: + Gradient di[0] = 42157687701504.000, df[0] = 28887677403136.000, dc_hat[0] = 21503368232960.000 + Gradient do_[0] = 2003456214695936.000 +Backward Time Step 1: + Gradient di[0] = 52486526205952.000, df[0] = 35069286678528.000, dc_hat[0] = 30051984736256.000 + Gradient do_[0] = 1739813807980544.000 +Backward Time Step 0: + Gradient di[0] = 67952825073664.000, df[0] = 47153795825664.000, dc_hat[0] = 59243654807552.000 + Gradient do_[0] = 1048093420158976.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -425947677327360.000, df[0] = -313109927428096.000, dc_hat[0] = -188426657726464.000 + Gradient do_[0] = -26679757022167040.000 +Backward Time Step 3: + Gradient di[0] = -662390257483776.000, df[0] = -478405032148992.000, dc_hat[0] = -269333037056000.000 + Gradient do_[0] = -35621898259791872.000 +Backward Time Step 2: + Gradient di[0] = -833725763944448.000, df[0] = -600323148218368.000, dc_hat[0] = -458129934385152.000 + Gradient do_[0] = -38740411736391680.000 +Backward Time Step 1: + Gradient di[0] = -1042945566310400.000, df[0] = -723320207048704.000, dc_hat[0] = -646252924502016.000 + Gradient do_[0] = -34238641765089280.000 +Backward Time Step 0: + Gradient di[0] = -1305544832843776.000, df[0] = -928887609491456.000, dc_hat[0] = -1225041442242560.000 + Gradient do_[0] = -20183606102065152.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 22935249092608.000, df[0] = 15781346672640.000, dc_hat[0] = 9755283161088.000 + Gradient do_[0] = 1537642718035968.000 +Backward Time Step 3: + Gradient di[0] = 35740524216320.000, df[0] = 24391136051200.000, dc_hat[0] = 13787495137280.000 + Gradient do_[0] = 2011033610747904.000 +Backward Time Step 2: + Gradient di[0] = 43420064153600.000, df[0] = 29752685494272.000, dc_hat[0] = 22146470379520.000 + Gradient do_[0] = 2063437580468224.000 +Backward Time Step 1: + Gradient di[0] = 54058215800832.000, df[0] = 36119420862464.000, dc_hat[0] = 30951881048064.000 + Gradient do_[0] = 1791911895498752.000 +Backward Time Step 0: + Gradient di[0] = 70007392305152.000, df[0] = 48579498475520.000, dc_hat[0] = 61034895245312.000 + Gradient do_[0] = 1079782628392960.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -321297208836096.000, df[0] = -236182029991936.000, dc_hat[0] = -142124192366592.000 + Gradient do_[0] = -20124541711810560.000 +Backward Time Step 3: + Gradient di[0] = -499636229898240.000, df[0] = -360857649283072.000, dc_hat[0] = -203143027621888.000 + Gradient do_[0] = -26869014756065280.000 +Backward Time Step 2: + Gradient di[0] = -628814820409344.000, df[0] = -452776828076032.000, dc_hat[0] = -345520354623488.000 + Gradient do_[0] = -29218752709001216.000 +Backward Time Step 1: + Gradient di[0] = -786620777234432.000, df[0] = -545549765836800.000, dc_hat[0] = -487423322619904.000 + Gradient do_[0] = -25823808694779904.000 +Backward Time Step 0: + Gradient di[0] = -984973976797184.000, df[0] = -700803371237376.000, dc_hat[0] = -924237837631488.000 + Gradient do_[0] = -15227608979472384.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 23487601180672.000, df[0] = 16161392558080.000, dc_hat[0] = 9989622071296.000 + Gradient do_[0] = 1574649632653312.000 +Backward Time Step 3: + Gradient di[0] = 36600385896448.000, df[0] = 24977986289664.000, dc_hat[0] = 14118300942336.000 + Gradient do_[0] = 2059392392364032.000 +Backward Time Step 2: + Gradient di[0] = 44460444483584.000, df[0] = 30465568276480.000, dc_hat[0] = 22676294860800.000 + Gradient do_[0] = 2112866748465152.000 +Backward Time Step 1: + Gradient di[0] = 55353433653248.000, df[0] = 36984831606784.000, dc_hat[0] = 31693478035456.000 + Gradient do_[0] = 1834845596549120.000 +Backward Time Step 0: + Gradient di[0] = 71705032654848.000, df[0] = 49757527474176.000, dc_hat[0] = 62514956075008.000 + Gradient do_[0] = 1105966829404160.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -236550558318592.000, df[0] = -173885509074944.000, dc_hat[0] = -104630830759936.000 + Gradient do_[0] = -14816185472253952.000 +Backward Time Step 3: + Gradient di[0] = -367841098334208.000, df[0] = -265670000377856.000, dc_hat[0] = -149548076892160.000 + Gradient do_[0] = -19781208468619264.000 +Backward Time Step 2: + Gradient di[0] = -462900703526912.000, df[0] = -333310366580736.000, dc_hat[0] = -254345547350016.000 + Gradient do_[0] = -21509204808302592.000 +Backward Time Step 1: + Gradient di[0] = -579076146331648.000, df[0] = -401610110926848.000, dc_hat[0] = -358819955736576.000 + Gradient do_[0] = -19010371360653312.000 +Backward Time Step 0: + Gradient di[0] = -725313407418368.000, df[0] = -516056292524032.000, dc_hat[0] = -680588570787840.000 + Gradient do_[0] = -11213279141560320.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 23930437894144.000, df[0] = 16466093015040.000, dc_hat[0] = 10177367506944.000 + Gradient do_[0] = 1604314032242688.000 +Backward Time Step 3: + Gradient di[0] = 37289610706944.000, df[0] = 25448383774720.000, dc_hat[0] = 14383248834560.000 + Gradient do_[0] = 2098146956017664.000 +Backward Time Step 2: + Gradient di[0] = 45293332594688.000, df[0] = 31036278833152.000, dc_hat[0] = 23100267692032.000 + Gradient do_[0] = 2152437087469568.000 +Backward Time Step 1: + Gradient di[0] = 56390374653952.000, df[0] = 37677676101632.000, dc_hat[0] = 32287198543872.000 + Gradient do_[0] = 1869217951383552.000 +Backward Time Step 0: + Gradient di[0] = 73068760268800.000, df[0] = 50703842148352.000, dc_hat[0] = 63703902846976.000 + Gradient do_[0] = 1127000626429952.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -169105864785920.000, df[0] = -124307543949312.000, dc_hat[0] = -74794464706560.000 + Gradient do_[0] = -10591673893519360.000 +Backward Time Step 3: + Gradient di[0] = -262956604456960.000, df[0] = -189918185783296.000, dc_hat[0] = -106899856949248.000 + Gradient do_[0] = -14140721334321152.000 +Backward Time Step 2: + Gradient di[0] = -330881327693824.000, df[0] = -238249888972800.000, dc_hat[0] = -181800060059648.000 + Gradient do_[0] = -15374687987040256.000 +Backward Time Step 1: + Gradient di[0] = -413928144240640.000, df[0] = -287074003451904.000, dc_hat[0] = -256487276412928.000 + Gradient do_[0] = -13588761128468480.000 +Backward Time Step 0: + Gradient di[0] = -518613576450048.000, df[0] = -368990572511232.000, dc_hat[0] = -486634424369152.000 + Gradient do_[0] = -8017718246637568.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 24270430273536.000, df[0] = 16700024029184.000, dc_hat[0] = 10321342234624.000 + Gradient do_[0] = 1627082996056064.000 +Backward Time Step 3: + Gradient di[0] = 37818449526784.000, df[0] = 25809330896896.000, dc_hat[0] = 14586305576960.000 + Gradient do_[0] = 2127877390729216.000 +Backward Time Step 2: + Gradient di[0] = 45931277844480.000, df[0] = 31473402904576.000, dc_hat[0] = 23424764215296.000 + Gradient do_[0] = 2182741302968320.000 +Backward Time Step 1: + Gradient di[0] = 57184629030912.000, df[0] = 38208360415232.000, dc_hat[0] = 32741959663616.000 + Gradient do_[0] = 1895545698254848.000 +Backward Time Step 0: + Gradient di[0] = 74118946881536.000, df[0] = 51432589885440.000, dc_hat[0] = 64619490050048.000 + Gradient do_[0] = 1143198558715904.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -116380284223488.000, df[0] = -85549532577792.000, dc_hat[0] = -51471232008192.000 + Gradient do_[0] = -7289182271569920.000 +Backward Time Step 3: + Gradient di[0] = -180964789583872.000, df[0] = -130700393054208.000, dc_hat[0] = -73563000274944.000 + Gradient do_[0] = -9731426303868928.000 +Backward Time Step 2: + Gradient di[0] = -227688430174208.000, df[0] = -163946065559552.000, dc_hat[0] = -125097364946944.000 + Gradient do_[0] = -10579685566054400.000 +Backward Time Step 1: + Gradient di[0] = -284838137430016.000, df[0] = -197545477275648.000, dc_hat[0] = -176497688051712.000 + Gradient do_[0] = -9350892201443328.000 +Backward Time Step 0: + Gradient di[0] = -356982447931392.000, df[0] = -253990960889856.000, dc_hat[0] = -334969935233024.000 + Gradient do_[0] = -5518915937501184.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 24511982338048.000, df[0] = 16866221228032.000, dc_hat[0] = 10423453614080.000 + Gradient do_[0] = 1643250997788672.000 +Backward Time Step 3: + Gradient di[0] = 38193873289216.000, df[0] = 26065577705472.000, dc_hat[0] = 14730166009856.000 + Gradient do_[0] = 2148976014917632.000 +Backward Time Step 2: + Gradient di[0] = 46382899527680.000, df[0] = 31782858653696.000, dc_hat[0] = 23654232489984.000 + Gradient do_[0] = 2204190772297728.000 +Backward Time Step 1: + Gradient di[0] = 57746883870720.000, df[0] = 38584035835904.000, dc_hat[0] = 33063885078528.000 + Gradient do_[0] = 1914183171964928.000 +Backward Time Step 0: + Gradient di[0] = 74868754219008.000, df[0] = 51952893296640.000, dc_hat[0] = 65273201688576.000 + Gradient do_[0] = 1154763429249024.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -75988130070528.000, df[0] = -55857765154816.000, dc_hat[0] = -33605178884096.000 + Gradient do_[0] = -4759250576343040.000 +Backward Time Step 3: + Gradient di[0] = -118154416095232.000, df[0] = -85336151556096.000, dc_hat[0] = -48027238662144.000 + Gradient do_[0] = -6353705108504576.000 +Backward Time Step 2: + Gradient di[0] = -148647056506880.000, df[0] = -107032531173376.000, dc_hat[0] = -81667410624512.000 + Gradient do_[0] = -6906943067127808.000 +Backward Time Step 1: + Gradient di[0] = -185959568113664.000, df[0] = -128969655451648.000, dc_hat[0] = -115228377350144.000 + Gradient do_[0] = -6104828933570560.000 +Backward Time Step 0: + Gradient di[0] = -233129281323008.000, df[0] = -165870143799296.000, dc_hat[0] = -218753908342784.000 + Gradient do_[0] = -3604157508354048.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 24660034977792.000, df[0] = 16968085143552.000, dc_hat[0] = 10485792505856.000 + Gradient do_[0] = 1653150494752768.000 +Backward Time Step 3: + Gradient di[0] = 38423649845248.000, df[0] = 26222427897856.000, dc_hat[0] = 14817853177856.000 + Gradient do_[0] = 2161877828239360.000 +Backward Time Step 2: + Gradient di[0] = 46657437696000.000, df[0] = 31970968993792.000, dc_hat[0] = 23793384816640.000 + Gradient do_[0] = 2217225058516992.000 +Backward Time Step 1: + Gradient di[0] = 58088711258112.000, df[0] = 38812424077312.000, dc_hat[0] = 33259603886080.000 + Gradient do_[0] = 1925513832562688.000 +Backward Time Step 0: + Gradient di[0] = 75333164335104.000, df[0] = 52275154255872.000, dc_hat[0] = 65678086242304.000 + Gradient do_[0] = 1161926495174656.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -45791095816192.000, df[0] = -33660310913024.000, dc_hat[0] = -20249562841088.000 + Gradient do_[0] = -2867921999101952.000 +Backward Time Step 3: + Gradient di[0] = -71199090540544.000, df[0] = -51423035260928.000, dc_hat[0] = -28939045044224.000 + Gradient do_[0] = -3828657059528704.000 +Backward Time Step 2: + Gradient di[0] = -89565343776768.000, df[0] = -64490989158400.000, dc_hat[0] = -49205993275392.000 + Gradient do_[0] = -4161666275082240.000 +Backward Time Step 1: + Gradient di[0] = -112048734208000.000, df[0] = -77709824753664.000, dc_hat[0] = -69430113468416.000 + Gradient do_[0] = -3678424740659200.000 +Backward Time Step 0: + Gradient di[0] = -140512447168512.000, df[0] = -99973794365440.000, dc_hat[0] = -131848055291904.000 + Gradient do_[0] = -2172309498494976.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 24718216265728.000, df[0] = 17008106143744.000, dc_hat[0] = 10509900316672.000 + Gradient do_[0] = 1657026165866496.000 +Backward Time Step 3: + Gradient di[0] = 38513361813504.000, df[0] = 26283689902080.000, dc_hat[0] = 14851503030272.000 + Gradient do_[0] = 2166899047661568.000 +Backward Time Step 2: + Gradient di[0] = 46761817145344.000, df[0] = 32042486071296.000, dc_hat[0] = 23845748604928.000 + Gradient do_[0] = 2222174068801536.000 +Backward Time Step 1: + Gradient di[0] = 58218713710592.000, df[0] = 38899292307456.000, dc_hat[0] = 33334040199168.000 + Gradient do_[0] = 1929823295373312.000 +Backward Time Step 0: + Gradient di[0] = 75523099197440.000, df[0] = 52406956064768.000, dc_hat[0] = 65843677364224.000 + Gradient do_[0] = 1164856065523712.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -23907960619008.000, df[0] = -17574345572352.000, dc_hat[0] = -10571873255424.000 + Gradient do_[0] = -1497346126905344.000 +Backward Time Step 3: + Gradient di[0] = -37172774174720.000, df[0] = -26847792332800.000, dc_hat[0] = -15108011982848.000 + Gradient do_[0] = -1998903146708992.000 +Backward Time Step 2: + Gradient di[0] = -46757308268544.000, df[0] = -33667281846272.000, dc_hat[0] = -25686964174848.000 + Gradient do_[0] = -2172572028370944.000 +Backward Time Step 1: + Gradient di[0] = -58495340642304.000, df[0] = -40568621105152.000, dc_hat[0] = -36246176923648.000 + Gradient do_[0] = -1920331551866880.000 +Backward Time Step 0: + Gradient di[0] = -73376789954560.000, df[0] = -52207164588032.000, dc_hat[0] = -68852180320256.000 + Gradient do_[0] = -1134398439161856.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 24690481430528.000, df[0] = 16989010526208.000, dc_hat[0] = 10497478885376.000 + Gradient do_[0] = 1655140943659008.000 +Backward Time Step 3: + Gradient di[0] = 38469246124032.000, df[0] = 26253623033856.000, dc_hat[0] = 14833548263424.000 + Gradient do_[0] = 2164391726284800.000 +Backward Time Step 2: + Gradient di[0] = 46703843475456.000, df[0] = 32002745040896.000, dc_hat[0] = 23815310540800.000 + Gradient do_[0] = 2219407036121088.000 +Backward Time Step 1: + Gradient di[0] = 58146450046976.000, df[0] = 38851007479808.000, dc_hat[0] = 33292665487360.000 + Gradient do_[0] = 1927427911581696.000 +Backward Time Step 0: + Gradient di[0] = 75450537738240.000, df[0] = 52356607639552.000, dc_hat[0] = 65780423065600.000 + Gradient do_[0] = 1163736823889920.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -8712473804800.000, df[0] = -6404386062336.000, dc_hat[0] = -3852349734912.000 + Gradient do_[0] = -545650362023936.000 +Backward Time Step 3: + Gradient di[0] = -13546077290496.000, df[0] = -9783570595840.000, dc_hat[0] = -5505140916224.000 + Gradient do_[0] = -728408266899456.000 +Backward Time Step 2: + Gradient di[0] = -17037168476160.000, df[0] = -12267484610560.000, dc_hat[0] = -9359354494976.000 + Gradient do_[0] = -791625689202688.000 +Backward Time Step 1: + Gradient di[0] = -21314458877952.000, df[0] = -14782342103040.000, dc_hat[0] = -13207334813696.000 + Gradient do_[0] = -699727884582912.000 +Backward Time Step 0: + Gradient di[0] = -26744937512960.000, df[0] = -19028869382144.000, dc_hat[0] = -25095770734592.000 + Gradient do_[0] = -413474320547840.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.445, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.854 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.383, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.855 + c_state[0] = 0.884, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.118 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.063, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 24580613734400.000, df[0] = 16913407148032.000, dc_hat[0] = 10450155601920.000 + Gradient do_[0] = 1647750915555328.000 +Backward Time Step 3: + Gradient di[0] = 38297099304960.000, df[0] = 26136178327552.000, dc_hat[0] = 14766224441344.000 + Gradient do_[0] = 2154681610534912.000 +Backward Time Step 2: + Gradient di[0] = 46490399539200.000, df[0] = 31856474980352.000, dc_hat[0] = 23705616908288.000 + Gradient do_[0] = 2209251854385152.000 +Backward Time Step 1: + Gradient di[0] = 57880711528448.000, df[0] = 38673450008576.000, dc_hat[0] = 33140508721152.000 + Gradient do_[0] = 1918619067875328.000 +Backward Time Step 0: + Gradient di[0] = 75126972350464.000, df[0] = 52132082352128.000, dc_hat[0] = 65498326761472.000 + Gradient do_[0] = 1158746340327424.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.737, o_gate[0] = 0.139, c_hat[0] = 0.840 + c_state[0] = 0.695, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.370, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.830, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.331, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.873 + c_state[0] = 0.916, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1187031154688.000, df[0] = 872564260864.000, dc_hat[0] = 524832538624.000 + Gradient do_[0] = 74341052055552.000 +Backward Time Step 3: + Gradient di[0] = 1845539766272.000, df[0] = 1332930805760.000, dc_hat[0] = 749981728768.000 + Gradient do_[0] = 99238398656512.000 +Backward Time Step 2: + Gradient di[0] = 2320955604992.000, df[0] = 1671184777216.000, dc_hat[0] = 1274972340224.000 + Gradient do_[0] = 107841721466880.000 +Backward Time Step 1: + Gradient di[0] = 2903676551168.000, df[0] = 2013804232704.000, dc_hat[0] = 1799240286208.000 + Gradient do_[0] = 95324198666240.000 +Backward Time Step 0: + Gradient di[0] = 3644559982592.000, df[0] = 2593083621376.000, dc_hat[0] = 3419826814976.000 + Gradient do_[0] = 56344568659968.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.724, o_gate[0] = 0.133, c_hat[0] = 0.825 + c_state[0] = 0.659, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.358, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.828 + c_state[0] = 0.778, h_state[0] = 0.085 +Time Step 3: + i_gate[0] = 0.316, f_gate[0] = 0.739, o_gate[0] = 0.137, c_hat[0] = 0.864 + c_state[0] = 0.848, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.295, f_gate[0] = 0.758, o_gate[0] = 0.139, c_hat[0] = 0.851 + c_state[0] = 0.893, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -4890987423858688.000, df[0] = -3847528877391872.000, dc_hat[0] = -2254875010269184.000 + Gradient do_[0] = -288815694294810624.000 +Backward Time Step 3: + Gradient di[0] = -7608407326457856.000, df[0] = -5808903539392512.000, dc_hat[0] = -3270609005969408.000 + Gradient do_[0] = -393598946263433216.000 +Backward Time Step 2: + Gradient di[0] = -9843585079836672.000, df[0] = -7434920175599616.000, dc_hat[0] = -5831557881266176.000 + Gradient do_[0] = -448408432676962304.000 +Backward Time Step 1: + Gradient di[0] = -12363124564819968.000, df[0] = -8897320032665600.000, dc_hat[0] = -8286589104947200.000 + Gradient do_[0] = -402262548134494208.000 +Backward Time Step 0: + Gradient di[0] = -14895893622816768.000, df[0] = -10887413530361856.000, dc_hat[0] = -15059250556436480.000 + Gradient do_[0] = -231094065435770880.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.431, f_gate[0] = 0.736, o_gate[0] = 0.139, c_hat[0] = 0.841 + c_state[0] = 0.694, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.369, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.843 + c_state[0] = 0.829, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.330, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.874 + c_state[0] = 0.915, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.860 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -888198791168.000, df[0] = -652712673280.000, dc_hat[0] = -391856029696.000 + Gradient do_[0] = -55559172653056.000 +Backward Time Step 3: + Gradient di[0] = -1381722488832.000, df[0] = -997821054976.000, dc_hat[0] = -558874361856.000 + Gradient do_[0] = -74154179035136.000 +Backward Time Step 2: + Gradient di[0] = -1739546034176.000, df[0] = -1252109189120.000, dc_hat[0] = -946914459648.000 + Gradient do_[0] = -80582159630336.000 +Backward Time Step 1: + Gradient di[0] = -2175829147648.000, df[0] = -1507879157760.000, dc_hat[0] = -1331884457984.000 + Gradient do_[0] = -71178060300288.000 +Backward Time Step 0: + Gradient di[0] = -2719982551040.000, df[0] = -1935251865600.000, dc_hat[0] = -2552261246976.000 + Gradient do_[0] = -42050682617856.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.443, f_gate[0] = 0.749, o_gate[0] = 0.146, c_hat[0] = 0.855 + c_state[0] = 0.731, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.382, f_gate[0] = 0.761, o_gate[0] = 0.148, c_hat[0] = 0.856 + c_state[0] = 0.883, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.347, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.882 + c_state[0] = 0.986, h_state[0] = 0.119 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.868 + c_state[0] = 1.062, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 24050648743936.000, df[0] = 16542823612416.000, dc_hat[0] = 10203755970560.000 + Gradient do_[0] = 1610281352429568.000 +Backward Time Step 3: + Gradient di[0] = 37493109948416.000, df[0] = 25581783613440.000, dc_hat[0] = 14390737764352.000 + Gradient do_[0] = 2105229625524224.000 +Backward Time Step 2: + Gradient di[0] = 45563605155840.000, df[0] = 31209293873152.000, dc_hat[0] = 23017562308608.000 + Gradient do_[0] = 2158281732653056.000 +Backward Time Step 1: + Gradient di[0] = 56707912826880.000, df[0] = 37860912660480.000, dc_hat[0] = 32049341661184.000 + Gradient do_[0] = 1872576078938112.000 +Backward Time Step 0: + Gradient di[0] = 73285211521024.000, df[0] = 50854044368896.000, dc_hat[0] = 63892608778240.000 + Gradient do_[0] = 1130339158196224.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.431, f_gate[0] = 0.736, o_gate[0] = 0.139, c_hat[0] = 0.841 + c_state[0] = 0.694, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.369, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.843 + c_state[0] = 0.829, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.330, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.874 + c_state[0] = 0.915, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.860 + c_state[0] = 0.976, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 5571085336576.000, df[0] = 4094030249984.000, dc_hat[0] = 2457711673344.000 + Gradient do_[0] = 348480727941120.000 +Backward Time Step 3: + Gradient di[0] = 8666417725440.000, df[0] = 6258521276416.000, dc_hat[0] = 3505138696192.000 + Gradient do_[0] = 465102847344640.000 +Backward Time Step 2: + Gradient di[0] = 10909720248320.000, df[0] = 7852707545088.000, dc_hat[0] = 5938456559616.000 + Gradient do_[0] = 505375715491840.000 +Backward Time Step 1: + Gradient di[0] = 13646051672064.000, df[0] = 9456899325952.000, dc_hat[0] = 8353122091008.000 + Gradient do_[0] = 446404304568320.000 +Backward Time Step 0: + Gradient di[0] = 17063897726976.000, df[0] = 12140864864256.000, dc_hat[0] = 16011691556864.000 + Gradient do_[0] = 263806320115712.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.420, f_gate[0] = 0.724, o_gate[0] = 0.133, c_hat[0] = 0.826 + c_state[0] = 0.658, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.357, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.829 + c_state[0] = 0.778, h_state[0] = 0.085 +Time Step 3: + i_gate[0] = 0.315, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.864 + c_state[0] = 0.848, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.295, f_gate[0] = 0.758, o_gate[0] = 0.139, c_hat[0] = 0.851 + c_state[0] = 0.893, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -3851269525471232.000, df[0] = -3029021122101248.000, dc_hat[0] = -1771289878462464.000 + Gradient do_[0] = -227136974627012608.000 +Backward Time Step 3: + Gradient di[0] = -5994380628328448.000, df[0] = -4576522803347456.000, dc_hat[0] = -2564058700054528.000 + Gradient do_[0] = -309508984165892096.000 +Backward Time Step 2: + Gradient di[0] = -7762617590349824.000, df[0] = -5861173828255744.000, dc_hat[0] = -4557597566828544.000 + Gradient do_[0] = -352591732392591360.000 +Backward Time Step 1: + Gradient di[0] = -9749418995613696.000, df[0] = -7011011735322624.000, dc_hat[0] = -6460340556529664.000 + Gradient do_[0] = -316185218769747968.000 +Backward Time Step 0: + Gradient di[0] = -11710066197528576.000, df[0] = -8558892178997248.000, dc_hat[0] = -11838486793420800.000 + Gradient do_[0] = -181669316960190464.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.430, f_gate[0] = 0.736, o_gate[0] = 0.139, c_hat[0] = 0.842 + c_state[0] = 0.694, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.368, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.844 + c_state[0] = 0.829, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.330, f_gate[0] = 0.755, o_gate[0] = 0.146, c_hat[0] = 0.874 + c_state[0] = 0.914, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.860 + c_state[0] = 0.975, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 4127742492672.000, df[0] = 3032606572544.000, dc_hat[0] = 1817677135872.000 + Gradient do_[0] = 257935368257536.000 +Backward Time Step 3: + Gradient di[0] = 6424655560704.000, df[0] = 4639119376384.000, dc_hat[0] = 2588271443968.000 + Gradient do_[0] = 344224549568512.000 +Backward Time Step 2: + Gradient di[0] = 8095635341312.000, df[0] = 5825458339840.000, dc_hat[0] = 4372755382272.000 + Gradient do_[0] = 374045077929984.000 +Backward Time Step 1: + Gradient di[0] = 10125283688448.000, df[0] = 7012514529280.000, dc_hat[0] = 6132332494848.000 + Gradient do_[0] = 330213057626112.000 +Backward Time Step 0: + Gradient di[0] = 12615629668352.000, df[0] = 8975947923456.000, dc_hat[0] = 11837716299776.000 + Gradient do_[0] = 195036494954496.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.419, f_gate[0] = 0.724, o_gate[0] = 0.133, c_hat[0] = 0.827 + c_state[0] = 0.658, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.356, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.830 + c_state[0] = 0.777, h_state[0] = 0.085 +Time Step 3: + i_gate[0] = 0.315, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.865 + c_state[0] = 0.847, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.758, o_gate[0] = 0.139, c_hat[0] = 0.851 + c_state[0] = 0.893, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -4045101131104256.000, df[0] = -3180911298347008.000, dc_hat[0] = -1856776236433408.000 + Gradient do_[0] = -238319403018485760.000 +Backward Time Step 3: + Gradient di[0] = -6299549328998400.000, df[0] = -4809410425651200.000, dc_hat[0] = -2683494224363520.000 + Gradient do_[0] = -324736910972944384.000 +Backward Time Step 2: + Gradient di[0] = -8165443646783488.000, df[0] = -6163601802919936.000, dc_hat[0] = -4757687745118208.000 + Gradient do_[0] = -369970716260433920.000 +Backward Time Step 1: + Gradient di[0] = -10256010388176896.000, df[0] = -7370597134761984.000, dc_hat[0] = -6728436173242368.000 + Gradient do_[0] = -331670122060054528.000 +Backward Time Step 0: + Gradient di[0] = -12280851245039616.000, df[0] = -8976077921714176.000, dc_hat[0] = -12415529608282112.000 + Gradient do_[0] = -190524422833045504.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.429, f_gate[0] = 0.736, o_gate[0] = 0.140, c_hat[0] = 0.843 + c_state[0] = 0.693, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.367, f_gate[0] = 0.747, o_gate[0] = 0.139, c_hat[0] = 0.845 + c_state[0] = 0.828, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.330, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.874 + c_state[0] = 0.914, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.860 + c_state[0] = 0.975, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 2645138866176.000, df[0] = 1942922461184.000, dc_hat[0] = 1162985472000.000 + Gradient do_[0] = 165143656792064.000 +Backward Time Step 3: + Gradient di[0] = 4119195549696.000, df[0] = 2974102847488.000, dc_hat[0] = 1653856141312.000 + Gradient do_[0] = 220381684170752.000 +Backward Time Step 2: + Gradient di[0] = 5195264688128.000, df[0] = 3737468534784.000, dc_hat[0] = 2787297984512.000 + Gradient do_[0] = 239490362769408.000 +Backward Time Step 1: + Gradient di[0] = 6497768046592.000, df[0] = 4497723097088.000, dc_hat[0] = 3898063978496.000 + Gradient do_[0] = 211330158034944.000 +Backward Time Step 0: + Gradient di[0] = 8069355405312.000, df[0] = 5741300154368.000, dc_hat[0] = 7571777257472.000 + Gradient do_[0] = 124751502639104.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.418, f_gate[0] = 0.724, o_gate[0] = 0.133, c_hat[0] = 0.828 + c_state[0] = 0.658, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.355, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.831 + c_state[0] = 0.777, h_state[0] = 0.085 +Time Step 3: + i_gate[0] = 0.315, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.865 + c_state[0] = 0.847, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.758, o_gate[0] = 0.139, c_hat[0] = 0.851 + c_state[0] = 0.892, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -4245605773737984.000, df[0] = -3338048045580288.000, dc_hat[0] = -1945491470286848.000 + Gradient do_[0] = -249904699342323712.000 +Backward Time Step 3: + Gradient di[0] = -6615324988276736.000, df[0] = -5050383323889664.000, dc_hat[0] = -2807932848701440.000 + Gradient do_[0] = -340528096971063296.000 +Backward Time Step 2: + Gradient di[0] = -8582279886536704.000, df[0] = -6476685054574592.000, dc_hat[0] = -4967296107806720.000 + Gradient do_[0] = -388008513751744512.000 +Backward Time Step 1: + Gradient di[0] = -10780964913414144.000, df[0] = -7743565115424768.000, dc_hat[0] = -7009881622052864.000 + Gradient do_[0] = -347763398877904896.000 +Backward Time Step 0: + Gradient di[0] = -12873553510662144.000, df[0] = -9409284429316096.000, dc_hat[0] = -13014732306907136.000 + Gradient do_[0] = -199719569856659456.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.428, f_gate[0] = 0.736, o_gate[0] = 0.140, c_hat[0] = 0.844 + c_state[0] = 0.693, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.367, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.845 + c_state[0] = 0.828, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.330, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.875 + c_state[0] = 0.914, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.860 + c_state[0] = 0.975, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1099228053504.000, df[0] = 807249903616.000, dc_hat[0] = 482642886656.000 + Gradient do_[0] = 68574987157504.000 +Backward Time Step 3: + Gradient di[0] = 1712662642688.000, df[0] = 1236451196928.000, dc_hat[0] = 685610041344.000 + Gradient do_[0] = 91512792678400.000 +Backward Time Step 2: + Gradient di[0] = 2161880334336.000, df[0] = 1554915262464.000, dc_hat[0] = 1153043398656.000 + Gradient do_[0] = 99457257439232.000 +Backward Time Step 1: + Gradient di[0] = 2704064905216.000, df[0] = 1870850031616.000, dc_hat[0] = 1608437989376.000 + Gradient do_[0] = 87730922979328.000 +Backward Time Step 0: + Gradient di[0] = 3348076167168.000, df[0] = 2382136868864.000, dc_hat[0] = 3141624922112.000 + Gradient do_[0] = 51760957751296.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.417, f_gate[0] = 0.724, o_gate[0] = 0.133, c_hat[0] = 0.829 + c_state[0] = 0.657, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.355, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.831 + c_state[0] = 0.776, h_state[0] = 0.085 +Time Step 3: + i_gate[0] = 0.314, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.865 + c_state[0] = 0.846, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.758, o_gate[0] = 0.139, c_hat[0] = 0.851 + c_state[0] = 0.892, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -4451681794260992.000, df[0] = -3499564854149120.000, dc_hat[0] = -2036900990812160.000 + Gradient do_[0] = -261826188526223360.000 +Backward Time Step 3: + Gradient di[0] = -6940004114759680.000, df[0] = -5298142807326720.000, dc_hat[0] = -2936568461393920.000 + Gradient do_[0] = -356791969889910784.000 +Backward Time Step 2: + Gradient di[0] = -9010912253968384.000, df[0] = -6798739956039680.000, dc_hat[0] = -5184930153758720.000 + Gradient do_[0] = -406601255377436672.000 +Backward Time Step 1: + Gradient di[0] = -11321400648269824.000, df[0] = -8127821880754176.000, dc_hat[0] = -7302640551591936.000 + Gradient do_[0] = -364370766101217280.000 +Backward Time Step 0: + Gradient di[0] = -13484724135657472.000, df[0] = -9855990019129344.000, dc_hat[0] = -13632606302109696.000 + Gradient do_[0] = -209201259918393344.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.427, f_gate[0] = 0.736, o_gate[0] = 0.140, c_hat[0] = 0.845 + c_state[0] = 0.693, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.366, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.846 + c_state[0] = 0.827, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.875 + c_state[0] = 0.913, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.860 + c_state[0] = 0.974, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -510535401472.000, df[0] = -374863298560.000, dc_hat[0] = -223917162496.000 + Gradient do_[0] = -31829734195200.000 +Backward Time Step 3: + Gradient di[0] = -795820687360.000, df[0] = -574497357824.000, dc_hat[0] = -317823516672.000 + Gradient do_[0] = -42479063662592.000 +Backward Time Step 2: + Gradient di[0] = -1005301530624.000, df[0] = -722927288320.000, dc_hat[0] = -533623799808.000 + Gradient do_[0] = -46172798451712.000 +Backward Time Step 1: + Gradient di[0] = -1257585901568.000, df[0] = -869745885184.000, dc_hat[0] = -742772113408.000 + Gradient do_[0] = -40718743633920.000 +Backward Time Step 0: + Gradient di[0] = -1553133207552.000, df[0] = -1105045487616.000, dc_hat[0] = -1457362829312.000 + Gradient do_[0] = -24011297783808.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.439, f_gate[0] = 0.748, o_gate[0] = 0.147, c_hat[0] = 0.859 + c_state[0] = 0.729, h_state[0] = 0.091 +Time Step 2: + i_gate[0] = 0.380, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.859 + c_state[0] = 0.881, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.346, f_gate[0] = 0.770, o_gate[0] = 0.157, c_hat[0] = 0.884 + c_state[0] = 0.984, h_state[0] = 0.119 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.869 + c_state[0] = 1.061, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 22810523074560.000, df[0] = 15673729220608.000, dc_hat[0] = 9625830162432.000 + Gradient do_[0] = 1522248347287552.000 +Backward Time Step 3: + Gradient di[0] = 35631883354112.000, df[0] = 24295919058944.000, dc_hat[0] = 13512977940480.000 + Gradient do_[0] = 1989686952198144.000 +Backward Time Step 2: + Gradient di[0] = 43452276408320.000, df[0] = 29731766403072.000, dc_hat[0] = 21400555356160.000 + Gradient do_[0] = 2039898609549312.000 +Backward Time Step 1: + Gradient di[0] = 54068290519040.000, df[0] = 36024335990784.000, dc_hat[0] = 29417019539456.000 + Gradient do_[0] = 1765696354648064.000 +Backward Time Step 0: + Gradient di[0] = 68901035573248.000, df[0] = 47811777265664.000, dc_hat[0] = 60070331154432.000 + Gradient do_[0] = 1062718320672768.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.427, f_gate[0] = 0.736, o_gate[0] = 0.140, c_hat[0] = 0.845 + c_state[0] = 0.693, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.366, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.846 + c_state[0] = 0.827, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.875 + c_state[0] = 0.913, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.860 + c_state[0] = 0.974, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 4962075017216.000, df[0] = 3643424636928.000, dc_hat[0] = 2176203751424.000 + Gradient do_[0] = 309359481454592.000 +Backward Time Step 3: + Gradient di[0] = 7734678257664.000, df[0] = 5583615295488.000, dc_hat[0] = 3088766992384.000 + Gradient do_[0] = 412853899100160.000 +Backward Time Step 2: + Gradient di[0] = 9769721004032.000, df[0] = 7025545707520.000, dc_hat[0] = 5185685946368.000 + Gradient do_[0] = 448713990340608.000 +Backward Time Step 1: + Gradient di[0] = 12221570613248.000, df[0] = 8452432723968.000, dc_hat[0] = 7218466914304.000 + Gradient do_[0] = 395716073619456.000 +Backward Time Step 0: + Gradient di[0] = 15098293780480.000, df[0] = 10742349692928.000, dc_hat[0] = 14167293558784.000 + Gradient do_[0] = 233418268868608.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.416, f_gate[0] = 0.723, o_gate[0] = 0.133, c_hat[0] = 0.830 + c_state[0] = 0.657, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.354, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.832 + c_state[0] = 0.776, h_state[0] = 0.085 +Time Step 3: + i_gate[0] = 0.314, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.866 + c_state[0] = 0.846, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.758, o_gate[0] = 0.139, c_hat[0] = 0.852 + c_state[0] = 0.892, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -3519205202722816.000, df[0] = -2766172613246976.000, dc_hat[0] = -1608211446628352.000 + Gradient do_[0] = -206846621407772672.000 +Backward Time Step 3: + Gradient di[0] = -5488868883169280.000, df[0] = -4190234786922496.000, dc_hat[0] = -2316519870562304.000 + Gradient do_[0] = -281894285777829888.000 +Backward Time Step 2: + Gradient di[0] = -7131400205500416.000, df[0] = -5379731616694272.000, dc_hat[0] = -4083791807119360.000 + Gradient do_[0] = -321280596171030528.000 +Backward Time Step 1: + Gradient di[0] = -8962138068484096.000, df[0] = -6431542800809984.000, dc_hat[0] = -5742452342259712.000 + Gradient do_[0] = -287897636445356032.000 +Backward Time Step 0: + Gradient di[0] = -10654938761789440.000, df[0] = -7787698588745728.000, dc_hat[0] = -10771787642044416.000 + Gradient do_[0] = -165300131443245056.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.427, f_gate[0] = 0.736, o_gate[0] = 0.140, c_hat[0] = 0.846 + c_state[0] = 0.692, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.366, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.846 + c_state[0] = 0.827, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.875 + c_state[0] = 0.913, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.861 + c_state[0] = 0.974, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 3820313116672.000, df[0] = 2804691238912.000, dc_hat[0] = 1674017898496.000 + Gradient do_[0] = 238061430505472.000 +Backward Time Step 3: + Gradient di[0] = 5957576294400.000, df[0] = 4300474417152.000, dc_hat[0] = 2374679658496.000 + Gradient do_[0] = 317738727768064.000 +Backward Time Step 2: + Gradient di[0] = 7529908666368.000, df[0] = 5414112460800.000, dc_hat[0] = 3981889241088.000 + Gradient do_[0] = 345392713564160.000 +Backward Time Step 1: + Gradient di[0] = 9421282344960.000, df[0] = 6513787666432.000, dc_hat[0] = 5533144186880.000 + Gradient do_[0] = 304554386128896.000 +Backward Time Step 0: + Gradient di[0] = 11614657970176.000, df[0] = 8263762968576.000, dc_hat[0] = 10898466930688.000 + Gradient do_[0] = 179561576792064.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.415, f_gate[0] = 0.723, o_gate[0] = 0.133, c_hat[0] = 0.831 + c_state[0] = 0.657, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.354, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.833 + c_state[0] = 0.775, h_state[0] = 0.085 +Time Step 3: + i_gate[0] = 0.314, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.866 + c_state[0] = 0.846, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.758, o_gate[0] = 0.139, c_hat[0] = 0.852 + c_state[0] = 0.892, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -3691189651898368.000, df[0] = -2901048008114176.000, dc_hat[0] = -1685188937515008.000 + Gradient do_[0] = -216846903000694784.000 +Backward Time Step 3: + Gradient di[0] = -5759739585626112.000, df[0] = -4396941093896192.000, dc_hat[0] = -2426003150340096.000 + Gradient do_[0] = -295565905415372800.000 +Backward Time Step 2: + Gradient di[0] = -7488289976090624.000, df[0] = -5648210559238144.000, dc_hat[0] = -4272091494875136.000 + Gradient do_[0] = -336936301720764416.000 +Backward Time Step 1: + Gradient di[0] = -9413011387187200.000, df[0] = -6753020029173760.000, dc_hat[0] = -5999181864894464.000 + Gradient do_[0] = -301926580182056960.000 +Backward Time Step 0: + Gradient di[0] = -11171262550245376.000, df[0] = -8165079648305152.000, dc_hat[0] = -11293773271138304.000 + Gradient do_[0] = -173310331349630976.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.426, f_gate[0] = 0.736, o_gate[0] = 0.140, c_hat[0] = 0.846 + c_state[0] = 0.692, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.366, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.847 + c_state[0] = 0.826, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.875 + c_state[0] = 0.913, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.861 + c_state[0] = 0.974, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 2607822143488.000, df[0] = 1914296336384.000, dc_hat[0] = 1141849980928.000 + Gradient do_[0] = 162436988534784.000 +Backward Time Step 3: + Gradient di[0] = 4068512104448.000, df[0] = 2936692015104.000, dc_hat[0] = 1619062161408.000 + Gradient do_[0] = 216832531234816.000 +Backward Time Step 2: + Gradient di[0] = 5145399656448.000, df[0] = 3699164577792.000, dc_hat[0] = 2712007081984.000 + Gradient do_[0] = 235746023702528.000 +Backward Time Step 1: + Gradient di[0] = 6439066664960.000, df[0] = 4450733260800.000, dc_hat[0] = 3762652971008.000 + Gradient do_[0] = 207851486183424.000 +Backward Time Step 0: + Gradient di[0] = 7923264651264.000, df[0] = 5637357436928.000, dc_hat[0] = 7434694295552.000 + Gradient do_[0] = 122492962209792.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.415, f_gate[0] = 0.723, o_gate[0] = 0.133, c_hat[0] = 0.831 + c_state[0] = 0.656, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.354, f_gate[0] = 0.732, o_gate[0] = 0.131, c_hat[0] = 0.833 + c_state[0] = 0.775, h_state[0] = 0.085 +Time Step 3: + i_gate[0] = 0.314, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.866 + c_state[0] = 0.845, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852 + c_state[0] = 0.892, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -3869897872375808.000, df[0] = -3041204602142720.000, dc_hat[0] = -1765281756086272.000 + Gradient do_[0] = -227246771170967552.000 +Backward Time Step 3: + Gradient di[0] = -6041278449975296.000, df[0] = -4611788242944000.000, dc_hat[0] = -2540140295618560.000 + Gradient do_[0] = -309791902251614208.000 +Backward Time Step 2: + Gradient di[0] = -7859268313153536.000, df[0] = -5927339410063360.000, dc_hat[0] = -4468923672035328.000 + Gradient do_[0] = -353236630322020352.000 +Backward Time Step 1: + Gradient di[0] = -9881971350044672.000, df[0] = -7087532483280896.000, dc_hat[0] = -6268060776267776.000 + Gradient do_[0] = -316543831359094784.000 +Backward Time Step 0: + Gradient di[0] = -11709146000785408.000, df[0] = -8558219479744512.000, dc_hat[0] = -11837556933001216.000 + Gradient do_[0] = -181655040488898560.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.426, f_gate[0] = 0.736, o_gate[0] = 0.140, c_hat[0] = 0.847 + c_state[0] = 0.692, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.365, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.847 + c_state[0] = 0.826, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876 + c_state[0] = 0.912, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.773, o_gate[0] = 0.149, c_hat[0] = 0.861 + c_state[0] = 0.974, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1317845860352.000, df[0] = 967262732288.000, dc_hat[0] = 576625967104.000 + Gradient do_[0] = 82054981091328.000 +Backward Time Step 3: + Gradient di[0] = 2056865316864.000, df[0] = 1484588711936.000, dc_hat[0] = 817315643392.000 + Gradient do_[0] = 109549323288576.000 +Backward Time Step 2: + Gradient di[0] = 2602805493760.000, df[0] = 1871017934848.000, dc_hat[0] = 1367761354752.000 + Gradient do_[0] = 119126949363712.000 +Backward Time Step 1: + Gradient di[0] = 3257881591808.000, df[0] = 2251332517888.000, dc_hat[0] = 1894889553920.000 + Gradient do_[0] = 105024306806784.000 +Backward Time Step 0: + Gradient di[0] = 4001863041024.000, df[0] = 2847302483968.000, dc_hat[0] = 3755097456640.000 + Gradient do_[0] = 61868437667840.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.414, f_gate[0] = 0.723, o_gate[0] = 0.133, c_hat[0] = 0.832 + c_state[0] = 0.656, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.353, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.833 + c_state[0] = 0.775, h_state[0] = 0.085 +Time Step 3: + i_gate[0] = 0.314, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.866 + c_state[0] = 0.845, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -4054508988530688.000, df[0] = -3185987949690880.000, dc_hat[0] = -1848063626838016.000 + Gradient do_[0] = -237993483720196096.000 +Backward Time Step 3: + Gradient di[0] = -6332232956379136.000, df[0] = -4833817114181632.000, dc_hat[0] = -2658239548227584.000 + Gradient do_[0] = -324499656979513344.000 +Backward Time Step 2: + Gradient di[0] = -8242806208331776.000, df[0] = -6215942421872640.000, dc_hat[0] = -4672864825376768.000 + Gradient do_[0] = -370097950371610624.000 +Backward Time Step 1: + Gradient di[0] = -10367044956454912.000, df[0] = -7433605378736128.000, dc_hat[0] = -6546834755420160.000 + Gradient do_[0] = -331671908766449664.000 +Backward Time Step 0: + Gradient di[0] = -12265779131056128.000, df[0] = -8965061867470848.000, dc_hat[0] = -12400293211799552.000 + Gradient do_[0] = -190290604813451264.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.425, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.847 + c_state[0] = 0.692, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.365, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.847 + c_state[0] = 0.826, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876 + c_state[0] = 0.912, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861 + c_state[0] = 0.974, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -53605457920.000, df[0] = -39340789760.000, dc_hat[0] = -23441201152.000 + Gradient do_[0] = -3336640135168.000 +Backward Time Step 3: + Gradient di[0] = -83700424704.000, df[0] = -60409909248.000, dc_hat[0] = -33217296384.000 + Gradient do_[0] = -4455423016960.000 +Backward Time Step 2: + Gradient di[0] = -105973432320.000, df[0] = -76171427840.000, dc_hat[0] = -55546757120.000 + Gradient do_[0] = -4845913767936.000 +Backward Time Step 1: + Gradient di[0] = -132673929216.000, df[0] = -91664515072.000, dc_hat[0] = -76858621952.000 + Gradient do_[0] = -4272139010048.000 +Backward Time Step 0: + Gradient di[0] = -162724757504.000, df[0] = -115777724416.000, dc_hat[0] = -152690704384.000 + Gradient do_[0] = -2515709984768.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.437, f_gate[0] = 0.748, o_gate[0] = 0.147, c_hat[0] = 0.861 + c_state[0] = 0.728, h_state[0] = 0.092 +Time Step 2: + i_gate[0] = 0.378, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.860 + c_state[0] = 0.879, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.345, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.884 + c_state[0] = 0.983, h_state[0] = 0.119 +Time Step 4: + i_gate[0] = 0.331, f_gate[0] = 0.786, o_gate[0] = 0.160, c_hat[0] = 0.869 + c_state[0] = 1.060, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 21692873179136.000, df[0] = 14897366695936.000, dc_hat[0] = 9129976397824.000 + Gradient do_[0] = 1445326019887104.000 +Backward Time Step 3: + Gradient di[0] = 33938867224576.000, df[0] = 23133291544576.000, dc_hat[0] = 12795915534336.000 + Gradient do_[0] = 1889871106932736.000 +Backward Time Step 2: + Gradient di[0] = 41473923874816.000, df[0] = 28363133550592.000, dc_hat[0] = 20172322635776.000 + Gradient do_[0] = 1938159692677120.000 +Backward Time Step 1: + Gradient di[0] = 51634054889472.000, df[0] = 34367837569024.000, dc_hat[0] = 27523704422400.000 + Gradient do_[0] = 1676247620911104.000 +Backward Time Step 0: + Gradient di[0] = 65273977634816.000, df[0] = 45294892875776.000, dc_hat[0] = 56908140511232.000 + Gradient do_[0] = 1006775163682816.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.425, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.847 + c_state[0] = 0.692, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.365, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.847 + c_state[0] = 0.826, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876 + c_state[0] = 0.912, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861 + c_state[0] = 0.974, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 4537703202816.000, df[0] = 3330194276352.000, dc_hat[0] = 1984182091776.000 + Gradient do_[0] = 282442216767488.000 +Backward Time Step 3: + Gradient di[0] = 7085068648448.000, df[0] = 5113578520576.000, dc_hat[0] = 2811593752576.000 + Gradient do_[0] = 377137689264128.000 +Backward Time Step 2: + Gradient di[0] = 8969579921408.000, df[0] = 6447136505856.000, dc_hat[0] = 4701308846080.000 + Gradient do_[0] = 410155351015424.000 +Backward Time Step 1: + Gradient di[0] = 11229576822784.000, df[0] = 7758522351616.000, dc_hat[0] = 6505345056768.000 + Gradient do_[0] = 361595611906048.000 +Backward Time Step 0: + Gradient di[0] = 13777175052288.000, df[0] = 9802381000704.000, dc_hat[0] = 12927637651456.000 + Gradient do_[0] = 212993887436800.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.414, f_gate[0] = 0.723, o_gate[0] = 0.133, c_hat[0] = 0.833 + c_state[0] = 0.656, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.353, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.834 + c_state[0] = 0.775, h_state[0] = 0.085 +Time Step 3: + i_gate[0] = 0.314, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.866 + c_state[0] = 0.845, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -3200392733130752.000, df[0] = -2514617184026624.000, dc_hat[0] = -1457710289649664.000 + Gradient do_[0] = -187793476467818496.000 +Backward Time Step 3: + Gradient di[0] = -5000319574474752.000, df[0] = -3817019308769280.000, dc_hat[0] = -2096133958533120.000 + Gradient do_[0] = -256099210414260224.000 +Backward Time Step 2: + Gradient di[0] = -6512163631923200.000, df[0] = -4910406581616640.000, dc_hat[0] = -3682146329821184.000 + Gradient do_[0] = -292137112763891712.000 +Backward Time Step 1: + Gradient di[0] = -8192842853777408.000, df[0] = -5873376367214592.000, dc_hat[0] = -5154170134855680.000 + Gradient do_[0] = -261834005366702080.000 +Backward Time Step 0: + Gradient di[0] = -9683824107585536.000, df[0] = -7077910682796032.000, dc_hat[0] = -9790022542688256.000 + Gradient do_[0] = -150234296701943808.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.425, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.848 + c_state[0] = 0.691, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.365, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.848 + c_state[0] = 0.826, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876 + c_state[0] = 0.912, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861 + c_state[0] = 0.974, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 3568919379968.000, df[0] = 2618950156288.000, dc_hat[0] = 1559714988032.000 + Gradient do_[0] = 222077105733632.000 +Backward Time Step 3: + Gradient di[0] = 5574676709376.000, df[0] = 4023296720896.000, dc_hat[0] = 2209678622720.000 + Gradient do_[0] = 296588127764480.000 +Backward Time Step 2: + Gradient di[0] = 7061104492544.000, df[0] = 5074929057792.000, dc_hat[0] = 3692393660416.000 + Gradient do_[0] = 322620125872128.000 +Backward Time Step 1: + Gradient di[0] = 8842242949120.000, df[0] = 6107975647232.000, dc_hat[0] = 5103439839232.000 + Gradient do_[0] = 284424209956864.000 +Backward Time Step 0: + Gradient di[0] = 10833075634176.000, df[0] = 7707671134208.000, dc_hat[0] = 10165078196224.000 + Gradient do_[0] = 167478357393408.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.414, f_gate[0] = 0.723, o_gate[0] = 0.133, c_hat[0] = 0.833 + c_state[0] = 0.656, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.353, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.834 + c_state[0] = 0.774, h_state[0] = 0.085 +Time Step 3: + i_gate[0] = 0.314, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867 + c_state[0] = 0.845, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -3357549344587776.000, df[0] = -2637887308824576.000, dc_hat[0] = -1528354851258368.000 + Gradient do_[0] = -196956084819197952.000 +Backward Time Step 3: + Gradient di[0] = -5248048456269824.000, df[0] = -4006064173350912.000, dc_hat[0] = -2197252386848768.000 + Gradient do_[0] = -268650135645323264.000 +Backward Time Step 2: + Gradient di[0] = -6838593494450176.000, df[0] = -5156123808104448.000, dc_hat[0] = -3857612923731968.000 + Gradient do_[0] = -306535973443862528.000 +Backward Time Step 1: + Gradient di[0] = -8605956430626816.000, df[0] = -6168352036749312.000, dc_hat[0] = -5395110283968512.000 + Gradient do_[0] = -274767251406585856.000 +Backward Time Step 0: + Gradient di[0] = -10159966329503744.000, df[0] = -7425922755985408.000, dc_hat[0] = -10271386371096576.000 + Gradient do_[0] = -157621142234857472.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.425, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.848 + c_state[0] = 0.691, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.365, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.848 + c_state[0] = 0.826, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876 + c_state[0] = 0.912, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861 + c_state[0] = 0.974, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 2532843192320.000, df[0] = 1858481815552.000, dc_hat[0] = 1106365513728.000 + Gradient do_[0] = 157565237329920.000 +Backward Time Step 3: + Gradient di[0] = 3957893103616.000, df[0] = 2856335704064.000, dc_hat[0] = 1567178620928.000 + Gradient do_[0] = 210472523530240.000 +Backward Time Step 2: + Gradient di[0] = 5015737466880.000, df[0] = 3604605566976.000, dc_hat[0] = 2617249366016.000 + Gradient do_[0] = 228995039756288.000 +Backward Time Step 1: + Gradient di[0] = 6282385817600.000, df[0] = 4338962399232.000, dc_hat[0] = 3613634330624.000 + Gradient do_[0] = 201887387222016.000 +Backward Time Step 0: + Gradient di[0] = 7686917718016.000, df[0] = 5469197828096.000, dc_hat[0] = 7212922044416.000 + Gradient do_[0] = 118839060725760.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.413, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.833 + c_state[0] = 0.656, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.353, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.834 + c_state[0] = 0.774, h_state[0] = 0.085 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867 + c_state[0] = 0.845, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -3521024121372672.000, df[0] = -2766112752140288.000, dc_hat[0] = -1601865632448512.000 + Gradient do_[0] = -206489984503382016.000 +Backward Time Step 3: + Gradient di[0] = -5505831856504832.000, df[0] = -4202780386394112.000, dc_hat[0] = -2302573306445824.000 + Gradient do_[0] = -281715082562371584.000 +Backward Time Step 2: + Gradient di[0] = -7178386073976832.000, df[0] = -5411914343514112.000, dc_hat[0] = -4040566182510592.000 + Gradient do_[0] = -321531800218238976.000 +Backward Time Step 1: + Gradient di[0] = -9036138744381440.000, df[0] = -6475557088788480.000, dc_hat[0] = -5646516194639872.000 + Gradient do_[0] = -288242195901710336.000 +Backward Time Step 0: + Gradient di[0] = -10656046863351808.000, df[0] = -7788508190081024.000, dc_hat[0] = -10772907554766848.000 + Gradient do_[0] = -165317294132559872.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.424, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.848 + c_state[0] = 0.691, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.365, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.848 + c_state[0] = 0.825, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.329, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876 + c_state[0] = 0.912, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861 + c_state[0] = 0.974, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1423825698816.000, df[0] = 1044644298752.000, dc_hat[0] = 621651099648.000 + Gradient do_[0] = 88553375662080.000 +Backward Time Step 3: + Gradient di[0] = 2225780817920.000, df[0] = 1606243450880.000, dc_hat[0] = 880483172352.000 + Gradient do_[0] = 118311870267392.000 +Backward Time Step 2: + Gradient di[0] = 2822038093824.000, df[0] = 2027936940032.000, dc_hat[0] = 1469696311296.000 + Gradient do_[0] = 128751870410752.000 +Backward Time Step 1: + Gradient di[0] = 3535531933696.000, df[0] = 2441456123904.000, dc_hat[0] = 2027262181376.000 + Gradient do_[0] = 113515096440832.000 +Backward Time Step 0: + Gradient di[0] = 4320826228736.000, df[0] = 3074242904064.000, dc_hat[0] = 4054392504320.000 + Gradient do_[0] = 66799584411648.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.413, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.834 + c_state[0] = 0.656, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.353, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.835 + c_state[0] = 0.774, h_state[0] = 0.085 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867 + c_state[0] = 0.845, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -3690022226100224.000, df[0] = -2898666717184000.000, dc_hat[0] = -1677881453314048.000 + Gradient do_[0] = -216348171398283264.000 +Backward Time Step 3: + Gradient di[0] = -5772436582694912.000, df[0] = -4406225202577408.000, dc_hat[0] = -2411571523354624.000 + Gradient do_[0] = -295230966685761536.000 +Backward Time Step 2: + Gradient di[0] = -7529948474507264.000, df[0] = -5676576133873664.000, dc_hat[0] = -4230100136493056.000 + Gradient do_[0] = -337052506355924992.000 +Backward Time Step 1: + Gradient di[0] = -9481401225183232.000, df[0] = -6793562372964352.000, dc_hat[0] = -5907145547579392.000 + Gradient do_[0] = -302195067177664512.000 +Backward Time Step 0: + Gradient di[0] = -11169719583244288.000, df[0] = -8163952219389952.000, dc_hat[0] = -11292214198009856.000 + Gradient do_[0] = -173286399791857664.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.424, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.849 + c_state[0] = 0.691, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.848 + c_state[0] = 0.825, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876 + c_state[0] = 0.912, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.312, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861 + c_state[0] = 0.974, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 236308938752.000, df[0] = 173362692096.000, dc_hat[0] = 103130161152.000 + Gradient do_[0] = 14693817122816.000 +Backward Time Step 3: + Gradient di[0] = 369550032896.000, df[0] = 266677895168.000, dc_hat[0] = 146060115968.000 + Gradient do_[0] = 19635833405440.000 +Backward Time Step 2: + Gradient di[0] = 468766621696.000, df[0] = 336836853760.000, dc_hat[0] = 243695566848.000 + Gradient do_[0] = 21373311254528.000 +Backward Time Step 1: + Gradient di[0] = 587424399360.000, df[0] = 405587820544.000, dc_hat[0] = 335853748224.000 + Gradient do_[0] = 18844984803328.000 +Backward Time Step 0: + Gradient di[0] = 717116669952.000, df[0] = 510224367616.000, dc_hat[0] = 672897302528.000 + Gradient do_[0] = 11086558396416.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.413, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.834 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.835 + c_state[0] = 0.774, h_state[0] = 0.085 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -3863815661813760.000, df[0] = -3034979047047168.000, dc_hat[0] = -1756066501099520.000 + Gradient do_[0] = -226488142507540480.000 +Backward Time Step 3: + Gradient di[0] = -6046736279666688.000, df[0] = -4615540433747968.000, dc_hat[0] = -2523775933349888.000 + Gradient do_[0] = -309139857496604672.000 +Backward Time Step 2: + Gradient di[0] = -7891806985388032.000, df[0] = -5949000003878912.000, dc_hat[0] = -4425382904201216.000 + Gradient do_[0] = -353032842713759744.000 +Backward Time Step 1: + Gradient di[0] = -9939873951645696.000, df[0] = -7121029470093312.000, dc_hat[0] = -6175839775358976.000 + Gradient do_[0] = -316566474426679296.000 +Backward Time Step 0: + Gradient di[0] = -11698816604438528.000, df[0] = -8550669464109056.000, dc_hat[0] = -11827113720020992.000 + Gradient do_[0] = -181494786669150208.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.424, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.849 + c_state[0] = 0.691, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849 + c_state[0] = 0.825, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876 + c_state[0] = 0.912, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -1030644039680.000, df[0] = -756050165760.000, dc_hat[0] = -449622441984.000 + Gradient do_[0] = -64073886597120.000 +Backward Time Step 3: + Gradient di[0] = -1612381028352.000, df[0] = -1163503337472.000, dc_hat[0] = -636778446848.000 + Gradient do_[0] = -85643342381056.000 +Backward Time Step 2: + Gradient di[0] = -2046189109248.000, df[0] = -1470223089664.000, dc_hat[0] = -1062064029696.000 + Gradient do_[0] = -93242968244224.000 +Backward Time Step 1: + Gradient di[0] = -2564748476416.000, df[0] = -1770611933184.000, dc_hat[0] = -1462580674560.000 + Gradient do_[0] = -82218986766336.000 +Backward Time Step 0: + Gradient di[0] = -3127927111680.000, df[0] = -2225502158848.000, dc_hat[0] = -2935050731520.000 + Gradient do_[0] = -48357468798976.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.435, f_gate[0] = 0.747, o_gate[0] = 0.147, c_hat[0] = 0.863 + c_state[0] = 0.727, h_state[0] = 0.092 +Time Step 2: + i_gate[0] = 0.378, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.861 + c_state[0] = 0.878, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.345, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885 + c_state[0] = 0.982, h_state[0] = 0.119 +Time Step 4: + i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.869 + c_state[0] = 1.060, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 20361691267072.000, df[0] = 13976444338176.000, dc_hat[0] = 8551847690240.000 + Gradient do_[0] = 1354971182268416.000 +Backward Time Step 3: + Gradient di[0] = 31911395196928.000, df[0] = 21744909811712.000, dc_hat[0] = 11977612066816.000 + Gradient do_[0] = 1773109468200960.000 +Backward Time Step 2: + Gradient di[0] = 39072747749376.000, df[0] = 26710005252096.000, dc_hat[0] = 18821840633856.000 + Gradient do_[0] = 1819401028370432.000 +Backward Time Step 1: + Gradient di[0] = 48686931378176.000, df[0] = 32381260005376.000, dc_hat[0] = 25525384904704.000 + Gradient do_[0] = 1573046804545536.000 +Backward Time Step 0: + Gradient di[0] = 61148825124864.000, df[0] = 42432368476160.000, dc_hat[0] = 53311684214784.000 + Gradient do_[0] = 943149518159872.000 +Epoch 300, Train Loss=0.011598, Weight Norm=12.423228 +Sample Predictions at Epoch 300: +------------------------------------------------------------- +| Day | Date | Predicted Close | Actual Close | Error | +------------------------------------------------------------- +| 192 | 2024-10-11 | 57.63 | 63.87 | 6.24 | +| 193 | 2024-10-14 | 56.97 | 66.55 | 9.58 | +| 194 | 2024-10-15 | 57.12 | 66.00 | 8.88 | +| 195 | 2024-10-16 | 58.01 | 67.20 | 9.19 | +| 196 | 2024-10-17 | 57.48 | 66.76 | 9.28 | +------------------------------------------------------------- +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.424, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.849 + c_state[0] = 0.691, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849 + c_state[0] = 0.825, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876 + c_state[0] = 0.912, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 3265897168896.000, df[0] = 2395762851840.000, dc_hat[0] = 1424677535744.000 + Gradient do_[0] = 203033673728000.000 +Backward Time Step 3: + Gradient di[0] = 5109171879936.000, df[0] = 3686811566080.000, dc_hat[0] = 2017637171200.000 + Gradient do_[0] = 271375579217920.000 +Backward Time Step 2: + Gradient di[0] = 6483168198656.000, df[0] = 4658268471296.000, dc_hat[0] = 3364937531392.000 + Gradient do_[0] = 295430332088320.000 +Backward Time Step 1: + Gradient di[0] = 8126215487488.000, df[0] = 5610051993600.000, dc_hat[0] = 4634078347264.000 + Gradient do_[0] = 260504765333504.000 +Backward Time Step 0: + Gradient di[0] = 9913542639616.000, df[0] = 7053428391936.000, dc_hat[0] = 9302246948864.000 + Gradient do_[0] = 153262468628480.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.412, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.834 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.835 + c_state[0] = 0.774, h_state[0] = 0.085 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -3062017745223680.000, df[0] = -2405014450470912.000, dc_hat[0] = -1390990254407680.000 + Gradient do_[0] = -179451962784219136.000 +Backward Time Step 3: + Gradient di[0] = -4793750974889984.000, df[0] = -3659075912990720.000, dc_hat[0] = -1999004749529088.000 + Gradient do_[0] = -244993799376338944.000 +Backward Time Step 2: + Gradient di[0] = -6259028996915200.000, df[0] = -4717913126731776.000, dc_hat[0] = -3504062758649856.000 + Gradient do_[0] = -279837408759906304.000 +Backward Time Step 1: + Gradient di[0] = -7885750544629760.000, df[0] = -5648704480477184.000, dc_hat[0] = -4887579904180224.000 + Gradient do_[0] = -250975812646338560.000 +Backward Time Step 0: + Gradient di[0] = -9276178024103936.000, df[0] = -6779961822150656.000, dc_hat[0] = -9377906471993344.000 + Gradient do_[0] = -143910111977406464.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.423, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.849 + c_state[0] = 0.691, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849 + c_state[0] = 0.825, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 2370713419776.000, df[0] = 1738956210176.000, dc_hat[0] = 1033807265792.000 + Gradient do_[0] = 147357173809152.000 +Backward Time Step 3: + Gradient di[0] = 3710146838528.000, df[0] = 2677185970176.000, dc_hat[0] = 1464112775168.000 + Gradient do_[0] = 197003606753280.000 +Backward Time Step 2: + Gradient di[0] = 4709952782336.000, df[0] = 3384001167360.000, dc_hat[0] = 2441062121472.000 + Gradient do_[0] = 214516101021696.000 +Backward Time Step 1: + Gradient di[0] = 5905026383872.000, df[0] = 4076154912768.000, dc_hat[0] = 3359419662336.000 + Gradient do_[0] = 189172790853632.000 +Backward Time Step 0: + Gradient di[0] = 7197335486464.000, df[0] = 5120862978048.000, dc_hat[0] = 6753528315904.000 + Gradient do_[0] = 111270162333696.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.412, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.835 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.835 + c_state[0] = 0.774, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -3209887563644928.000, df[0] = -2520998968557568.000, dc_hat[0] = -1457580635324416.000 + Gradient do_[0] = -188085912201068544.000 +Backward Time Step 3: + Gradient di[0] = -5027208217231360.000, df[0] = -3837224982413312.000, dc_hat[0] = -2094741315387392.000 + Gradient do_[0] = -256844455959592960.000 +Backward Time Step 2: + Gradient di[0] = -6567025933549568.000, df[0] = -4949824180846592.000, dc_hat[0] = -3671099405500416.000 + Gradient do_[0] = -293460752965042176.000 +Backward Time Step 1: + Gradient di[0] = -8276148576321536.000, df[0] = -5927654553288704.000, dc_hat[0] = -5118007583965184.000 + Gradient do_[0] = -263236535527145472.000 +Backward Time Step 0: + Gradient di[0] = -9727818531340288.000, df[0] = -7110066029199360.000, dc_hat[0] = -9834500150263808.000 + Gradient do_[0] = -150916835724754944.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.423, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.691, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849 + c_state[0] = 0.825, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.149, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1411668115456.000, df[0] = 1035408834560.000, dc_hat[0] = 615390445568.000 + Gradient do_[0] = 87732080607232.000 +Backward Time Step 3: + Gradient di[0] = 2210073149440.000, df[0] = 1594710949888.000, dc_hat[0] = 871581351936.000 + Gradient do_[0] = 117317878939648.000 +Backward Time Step 2: + Gradient di[0] = 2806829547520.000, df[0] = 2016548618240.000, dc_hat[0] = 1452795625472.000 + Gradient do_[0] = 127777407762432.000 +Backward Time Step 1: + Gradient di[0] = 3519867256832.000, df[0] = 2429459890176.000, dc_hat[0] = 1998110064640.000 + Gradient do_[0] = 112692878639104.000 +Backward Time Step 0: + Gradient di[0] = 4286650253312.000, df[0] = 3049926950912.000, dc_hat[0] = 4022323904512.000 + Gradient do_[0] = 66271232131072.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.412, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.835 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.835 + c_state[0] = 0.774, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -3363653029986304.000, df[0] = -2641605139890176.000, dc_hat[0] = -1526838996238336.000 + Gradient do_[0] = -197065932902760448.000 +Backward Time Step 3: + Gradient di[0] = -5270069827338240.000, df[0] = -4022550942187520.000, dc_hat[0] = -2194387241009152.000 + Gradient do_[0] = -269175289886539776.000 +Backward Time Step 2: + Gradient di[0] = -6887542699851776.000, df[0] = -5191171814981632.000, dc_hat[0] = -3845111616110592.000 + Gradient do_[0] = -307642597537480704.000 +Backward Time Step 1: + Gradient di[0] = -8682550193029120.000, df[0] = -6218060914491392.000, dc_hat[0] = -5358225104830464.000 + Gradient do_[0] = -276004940722208768.000 +Backward Time Step 0: + Gradient di[0] = -10198181874761728.000, df[0] = -7453854538924032.000, dc_hat[0] = -10310020675665920.000 + Gradient do_[0] = -158214002340528128.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.423, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849 + c_state[0] = 0.825, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 383367249920.000, df[0] = 281169068032.000, dc_hat[0] = 167075856384.000 + Gradient do_[0] = 23822675738624.000 +Backward Time Step 3: + Gradient di[0] = 600409047040.000, df[0] = 433223827456.000, dc_hat[0] = 236657360896.000 + Gradient do_[0] = 31864358174720.000 +Backward Time Step 2: + Gradient di[0] = 762829733888.000, df[0] = 548028645376.000, dc_hat[0] = 394418323456.000 + Gradient do_[0] = 34713733431296.000 +Backward Time Step 1: + Gradient di[0] = 956845654016.000, df[0] = 660371931136.000, dc_hat[0] = 542212554752.000 + Gradient do_[0] = 30619480358912.000 +Backward Time Step 0: + Gradient di[0] = 1164518490112.000, df[0] = 828548186112.000, dc_hat[0] = 1092711153664.000 + Gradient do_[0] = 18003349471232.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.412, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.835 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.835 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.852 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -3522162019270656.000, df[0] = -2765937732222976.000, dc_hat[0] = -1598293159182336.000 + Gradient do_[0] = -206328974769389568.000 +Backward Time Step 3: + Gradient di[0] = -5520500243562496.000, df[0] = -4213652559233024.000, dc_hat[0] = -2297348747165696.000 + Gradient do_[0] = -281902102618308608.000 +Backward Time Step 2: + Gradient di[0] = -7218067813695488.000, df[0] = -5440088792104960.000, dc_hat[0] = -4025295694725120.000 + Gradient do_[0] = -322287714462334976.000 +Backward Time Step 1: + Gradient di[0] = -9101767992147968.000, df[0] = -6517716487766016.000, dc_hat[0] = -5607547419492352.000 + Gradient do_[0] = -289198152542584832.000 +Backward Time Step 0: + Gradient di[0] = -10684363582734336.000, df[0] = -7809204563738624.000, dc_hat[0] = -10801534585536512.000 + Gradient do_[0] = -165756600567463936.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.423, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849 + c_state[0] = 0.825, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -718535720960.000, df[0] = -526958886912.000, dc_hat[0] = -313072648192.000 + Gradient do_[0] = -44646289899520.000 +Backward Time Step 3: + Gradient di[0] = -1125734023168.000, df[0] = -812254167040.000, dc_hat[0] = -443526709248.000 + Gradient do_[0] = -59732786151424.000 +Backward Time Step 2: + Gradient di[0] = -1430800433152.000, df[0] = -1027875995648.000, dc_hat[0] = -739149152256.000 + Gradient do_[0] = -65090602663936.000 +Backward Time Step 1: + Gradient di[0] = -1795133145088.000, df[0] = -1238833299456.000, dc_hat[0] = -1015766646784.000 + Gradient do_[0] = -57421812727808.000 +Backward Time Step 0: + Gradient di[0] = -2183578386432.000, df[0] = -1553603493888.000, dc_hat[0] = -2048933101568.000 + Gradient do_[0] = -33757927047168.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.435, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.864 + c_state[0] = 0.726, h_state[0] = 0.092 +Time Step 2: + i_gate[0] = 0.377, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.862 + c_state[0] = 0.878, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.345, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885 + c_state[0] = 0.982, h_state[0] = 0.119 +Time Step 4: + i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.869 + c_state[0] = 1.060, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 19276096339968.000, df[0] = 13227745345536.000, dc_hat[0] = 8087126147072.000 + Gradient do_[0] = 1282003043352576.000 +Backward Time Step 3: + Gradient di[0] = 30248162820096.000, df[0] = 20608431685632.000, dc_hat[0] = 11328484802560.000 + Gradient do_[0] = 1678932109688832.000 +Backward Time Step 2: + Gradient di[0] = 37079778066432.000, df[0] = 25342578262016.000, dc_hat[0] = 17779222642688.000 + Gradient do_[0] = 1723599366914048.000 +Backward Time Step 1: + Gradient di[0] = 46236686090240.000, df[0] = 30740234371072.000, dc_hat[0] = 24042711023616.000 + Gradient do_[0] = 1490389085192192.000 +Backward Time Step 0: + Gradient di[0] = 57897715236864.000, df[0] = 40176361406464.000, dc_hat[0] = 50477253263360.000 + Gradient do_[0] = 893004969672704.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.423, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849 + c_state[0] = 0.825, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 2905319931904.000, df[0] = 2130697256960.000, dc_hat[0] = 1265802412032.000 + Gradient do_[0] = 180519555825664.000 +Backward Time Step 3: + Gradient di[0] = 4551673380864.000, df[0] = 3284186169344.000, dc_hat[0] = 1793194852352.000 + Gradient do_[0] = 241514114449408.000 +Backward Time Step 2: + Gradient di[0] = 5784588517376.000, df[0] = 4155601059840.000, dc_hat[0] = 2988203048960.000 + Gradient do_[0] = 263153501863936.000 +Backward Time Step 1: + Gradient di[0] = 7257576701952.000, df[0] = 5008501768192.000, dc_hat[0] = 4106661134336.000 + Gradient do_[0] = 232151672946688.000 +Backward Time Step 0: + Gradient di[0] = 8830624727040.000, df[0] = 6282938941440.000, dc_hat[0] = 8286104453120.000 + Gradient do_[0] = 136520660942848.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.412, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.835 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2788333033881600.000, df[0] = -2189549497221120.000, dc_hat[0] = -1264897396572160.000 + Gradient do_[0] = -163324961883160576.000 +Backward Time Step 3: + Gradient di[0] = -4371864390467584.000, df[0] = -3336895652167680.000, dc_hat[0] = -1818381913161728.000 + Gradient do_[0] = -223203884095504384.000 +Backward Time Step 2: + Gradient di[0] = -5718183092682752.000, df[0] = -4309525087649792.000, dc_hat[0] = -3185945536888832.000 + Gradient do_[0] = -255239546940162048.000 +Backward Time Step 1: + Gradient di[0] = -7212571866169344.000, df[0] = -5164500604944384.000, dc_hat[0] = -4437506254700544.000 + Gradient do_[0] = -229084604856795136.000 +Backward Time Step 0: + Gradient di[0] = -8465179750694912.000, df[0] = -6187203185082368.000, dc_hat[0] = -8558014395056128.000 + Gradient do_[0] = -131328323111354368.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.423, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849 + c_state[0] = 0.825, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 2137365282816.000, df[0] = 1567412322304.000, dc_hat[0] = 931009724416.000 + Gradient do_[0] = 132792679661568.000 +Backward Time Step 3: + Gradient di[0] = 3349729771520.000, df[0] = 2416895852544.000, dc_hat[0] = 1319138754560.000 + Gradient do_[0] = 177707711201280.000 +Backward Time Step 2: + Gradient di[0] = 4258649079808.000, df[0] = 3059286016000.000, dc_hat[0] = 2198162767872.000 + Gradient do_[0] = 193679453061120.000 +Backward Time Step 1: + Gradient di[0] = 5344339689472.000, df[0] = 3687923056640.000, dc_hat[0] = 3019980406784.000 + Gradient do_[0] = 170887789674496.000 +Backward Time Step 0: + Gradient di[0] = 6499475652608.000, df[0] = 4624339697664.000, dc_hat[0] = 6098699943936.000 + Gradient do_[0] = 100481305149440.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.835 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2924400819044352.000, df[0] = -2296283595276288.000, dc_hat[0] = -1326287780052992.000 + Gradient do_[0] = -171282041634029568.000 +Backward Time Step 3: + Gradient di[0] = -4586895686238208.000, df[0] = -3500986488324096.000, dc_hat[0] = -1906981585551360.000 + Gradient do_[0] = -234142512963387392.000 +Backward Time Step 2: + Gradient di[0] = -6002001510924288.000, df[0] = -4523296884260864.000, dc_hat[0] = -3341335574609920.000 + Gradient do_[0] = -267833593642876928.000 +Backward Time Step 1: + Gradient di[0] = -7572647965622272.000, df[0] = -5421964566986752.000, dc_hat[0] = -4653069321109504.000 + Gradient do_[0] = -240437010873974784.000 +Backward Time Step 0: + Gradient di[0] = -8883825782292480.000, df[0] = -6493191150764032.000, dc_hat[0] = -8981251209822208.000 + Gradient do_[0] = -137823181246300160.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.423, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.876 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1310817910784.000, df[0] = 961223000064.000, dc_hat[0] = 570855456768.000 + Gradient do_[0] = 81434131824640.000 +Backward Time Step 3: + Gradient di[0] = 2055077232640.000, df[0] = 1482751475712.000, dc_hat[0] = 808994734080.000 + Gradient do_[0] = 109007251439616.000 +Backward Time Step 2: + Gradient di[0] = 2613649866752.000, df[0] = 1877515960320.000, dc_hat[0] = 1348065296384.000 + Gradient do_[0] = 118834715426816.000 +Backward Time Step 1: + Gradient di[0] = 3280743956480.000, df[0] = 2263776755712.000, dc_hat[0] = 1851550990336.000 + Gradient do_[0] = 104866684862464.000 +Backward Time Step 0: + Gradient di[0] = 3988013973504.000, df[0] = 2837449015296.000, dc_hat[0] = 3742102192128.000 + Gradient do_[0] = 61654339420160.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -3065858519728128.000, df[0] = -2407242464755712.000, dc_hat[0] = -1390109517676544.000 + Gradient do_[0] = -179555145078538240.000 +Backward Time Step 3: + Gradient di[0] = -4810542149533696.000, df[0] = -3671649966620672.000, dc_hat[0] = -1999142456918016.000 + Gradient do_[0] = -245520259287613440.000 +Backward Time Step 2: + Gradient di[0] = -6297297692393472.000, df[0] = -4745717671264256.000, dc_hat[0] = -3503057467867136.000 + Gradient do_[0] = -280938226057740288.000 +Backward Time Step 1: + Gradient di[0] = -7947405873905664.000, df[0] = -5689931334680576.000, dc_hat[0] = -4877514648322048.000 + Gradient do_[0] = -252253720035721216.000 +Backward Time Step 0: + Gradient di[0] = -9319588332306432.000, df[0] = -6811690356178944.000, dc_hat[0] = -9421793521565696.000 + Gradient do_[0] = -144583562849419264.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 419913170944.000, df[0] = 307906805760.000, dc_hat[0] = 182833905664.000 + Gradient do_[0] = 26085280448512.000 +Backward Time Step 3: + Gradient di[0] = 658565300224.000, df[0] = 475150614528.000, dc_hat[0] = 259157508096.000 + Gradient do_[0] = 34927034761216.000 +Backward Time Step 2: + Gradient di[0] = 837865111552.000, df[0] = 601864929280.000, dc_hat[0] = 431852847104.000 + Gradient do_[0] = 38085723160576.000 +Backward Time Step 1: + Gradient di[0] = 1051966767104.000, df[0] = 725836365824.000, dc_hat[0] = 593002692608.000 + Gradient do_[0] = 33614473461760.000 +Backward Time Step 0: + Gradient di[0] = 1278209032192.000, df[0] = 909438418944.000, dc_hat[0] = 1199391244288.000 + Gradient do_[0] = 19760997728256.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -3212015988375552.000, df[0] = -2521882658078720.000, dc_hat[0] = -1456048539959296.000 + Gradient do_[0] = -188103744905281536.000 +Backward Time Step 3: + Gradient di[0] = -5041713932402688.000, df[0] = -3848055547756544.000, dc_hat[0] = -2094412213518336.000 + Gradient do_[0] = -257281477471895552.000 +Backward Time Step 2: + Gradient di[0] = -6602676846460928.000, df[0] = -4975733571059712.000, dc_hat[0] = -3670333559144448.000 + Gradient do_[0] = -294491235878436864.000 +Backward Time Step 1: + Gradient di[0] = -8335076064493568.000, df[0] = -5967133355802624.000, dc_hat[0] = -5109753730564096.000 + Gradient do_[0] = -264478519810064384.000 +Backward Time Step 0: + Gradient di[0] = -9770407427047424.000, df[0] = -7141193804677120.000, dc_hat[0] = -9877555049922560.000 + Gradient do_[0] = -151577539133833216.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -537996034048.000, df[0] = -394475012096.000, dc_hat[0] = -234210557952.000 + Gradient do_[0] = -33419373314048.000 +Backward Time Step 3: + Gradient di[0] = -844050595840.000, df[0] = -608968376320.000, dc_hat[0] = -332062588928.000 + Gradient do_[0] = -44759569661952.000 +Backward Time Step 2: + Gradient di[0] = -1074213355520.000, df[0] = -771626893312.000, dc_hat[0] = -553391030272.000 + Gradient do_[0] = -48820301856768.000 +Backward Time Step 1: + Gradient di[0] = -1349022121984.000, df[0] = -930758721536.000, dc_hat[0] = -759802036224.000 + Gradient do_[0] = -43096389713920.000 +Backward Time Step 0: + Gradient di[0] = -1638663192576.000, df[0] = -1165899333632.000, dc_hat[0] = -1537618739200.000 + Gradient do_[0] = -25333585674240.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.434, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.864 + c_state[0] = 0.726, h_state[0] = 0.092 +Time Step 2: + i_gate[0] = 0.377, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.862 + c_state[0] = 0.878, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.345, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885 + c_state[0] = 0.981, h_state[0] = 0.119 +Time Step 4: + i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.869 + c_state[0] = 1.060, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 18203587641344.000, df[0] = 12489363292160.000, dc_hat[0] = 7631617916928.000 + Gradient do_[0] = 1210303261966336.000 +Backward Time Step 3: + Gradient di[0] = 28599113482240.000, df[0] = 19483041202176.000, dc_hat[0] = 10696618147840.000 + Gradient do_[0] = 1586427238285312.000 +Backward Time Step 2: + Gradient di[0] = 35091046727680.000, df[0] = 23980557729792.000, dc_hat[0] = 16779778719744.000 + Gradient do_[0] = 1629490459443200.000 +Backward Time Step 1: + Gradient di[0] = 43787313217536.000, df[0] = 29105267408896.000, dc_hat[0] = 22657972043776.000 + Gradient do_[0] = 1409489886511104.000 +Backward Time Step 0: + Gradient di[0] = 54746131988480.000, df[0] = 37989413552128.000, dc_hat[0] = 47729589878784.000 + Gradient do_[0] = 844395469340672.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.849 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 2521632866304.000, df[0] = 1848935186432.000, dc_hat[0] = 1097701720064.000 + Gradient do_[0] = 156637021077504.000 +Backward Time Step 3: + Gradient di[0] = 3956043677696.000, df[0] = 2854222299136.000, dc_hat[0] = 1556268318720.000 + Gradient do_[0] = 209784271798272.000 +Backward Time Step 2: + Gradient di[0] = 5034330292224.000, df[0] = 3616247906304.000, dc_hat[0] = 2593389805568.000 + Gradient do_[0] = 228796414296064.000 +Backward Time Step 1: + Gradient di[0] = 6322250579968.000, df[0] = 4362041556992.000, dc_hat[0] = 3560845344768.000 + Gradient do_[0] = 201973118795776.000 +Backward Time Step 0: + Gradient di[0] = 7681907097600.000, df[0] = 5465632145408.000, dc_hat[0] = 7208219705344.000 + Gradient do_[0] = 118761583542272.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.352, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2542371262693376.000, df[0] = -1996027666104320.000, dc_hat[0] = -1152223090311168.000 + Gradient do_[0] = -148880952506974208.000 +Backward Time Step 3: + Gradient di[0] = -3991960842928128.000, df[0] = -3046811413512192.000, dc_hat[0] = -1657757115613184.000 + Gradient do_[0] = -203689167610183680.000 +Backward Time Step 2: + Gradient di[0] = -5229526946676736.000, df[0] = -3940861469523968.000, dc_hat[0] = -2905429680062464.000 + Gradient do_[0] = -233204663904632832.000 +Backward Time Step 1: + Gradient di[0] = -6603519733792768.000, df[0] = -4727292798435328.000, dc_hat[0] = -4044936580169728.000 + Gradient do_[0] = -209488919648010240.000 +Backward Time Step 0: + Gradient di[0] = -7740841032417280.000, df[0] = -5657783504470016.000, dc_hat[0] = -7825732134764544.000 + Gradient do_[0] = -120090970678099968.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1863454949376.000, df[0] = 1366276440064.000, dc_hat[0] = 811048173568.000 + Gradient do_[0] = 115747590242304.000 +Backward Time Step 3: + Gradient di[0] = 2924486328320.000, df[0] = 2109936631808.000, dc_hat[0] = 1150133338112.000 + Gradient do_[0] = 155063938973696.000 +Backward Time Step 2: + Gradient di[0] = 3722880745472.000, df[0] = 2674154799104.000, dc_hat[0] = 1916736372736.000 + Gradient do_[0] = 169160977940480.000 +Backward Time Step 1: + Gradient di[0] = 4676391534592.000, df[0] = 3226329153536.000, dc_hat[0] = 2631365558272.000 + Gradient do_[0] = 149354954358784.000 +Backward Time Step 0: + Gradient di[0] = 5680200155136.000, df[0] = 4041429483520.000, dc_hat[0] = 5329943265280.000 + Gradient do_[0] = 87815371096064.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.867 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2668273665572864.000, df[0] = -2094780641181696.000, dc_hat[0] = -1209045373419520.000 + Gradient do_[0] = -156247697592942592.000 +Backward Time Step 3: + Gradient di[0] = -4191167600132096.000, df[0] = -3198824533196800.000, dc_hat[0] = -1739941046386688.000 + Gradient do_[0] = -213829344877871104.000 +Backward Time Step 2: + Gradient di[0] = -5492708047060992.000, df[0] = -4139109106843648.000, dc_hat[0] = -3049908890238976.000 + Gradient do_[0] = -244893984336379904.000 +Backward Time Step 1: + Gradient di[0] = -6937716507803648.000, df[0] = -4966297527910400.000, dc_hat[0] = -4245832601698304.000 + Gradient do_[0] = -220037496765939712.000 +Backward Time Step 0: + Gradient di[0] = -8130054726877184.000, df[0] = -5942259589578752.000, dc_hat[0] = -8219213416103936.000 + Gradient do_[0] = -126129196480069632.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1150981505024.000, df[0] = 843859689472.000, dc_hat[0] = 500883980288.000 + Gradient do_[0] = 71491190259712.000 +Backward Time Step 3: + Gradient di[0] = 1806956232704.000, df[0] = 1303653777408.000, dc_hat[0] = 710493011968.000 + Gradient do_[0] = 95802324156416.000 +Backward Time Step 2: + Gradient di[0] = 2301007233024.000, df[0] = 1652796424192.000, dc_hat[0] = 1184233291776.000 + Gradient do_[0] = 104539797585920.000 +Backward Time Step 1: + Gradient di[0] = 2891016306688.000, df[0] = 1994499293184.000, dc_hat[0] = 1625703317504.000 + Gradient do_[0] = 92317377626112.000 +Backward Time Step 0: + Gradient di[0] = 3510844260352.000, df[0] = 2497945534464.000, dc_hat[0] = 3294356307968.000 + Gradient do_[0] = 54277338824704.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.294, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2798873688932352.000, df[0] = -2197222724730880.000, dc_hat[0] = -1268026682900480.000 + Gradient do_[0] = -163893392214851584.000 +Backward Time Step 3: + Gradient di[0] = -4397862364381184.000, df[0] = -3356553180610560.000, dc_hat[0] = -1825351973994496.000 + Gradient do_[0] = -224358629002706944.000 +Backward Time Step 2: + Gradient di[0] = -5765837365444608.000, df[0] = -4344871057883136.000, dc_hat[0] = -3200332335153152.000 + Gradient do_[0] = -257038640020979712.000 +Backward Time Step 1: + Gradient di[0] = -7284636585558016.000, df[0] = -5214458590789632.000, dc_hat[0] = -4455418684243968.000 + Gradient do_[0] = -231002994949226496.000 +Backward Time Step 0: + Gradient di[0] = -8534794728112128.000, df[0] = -6238085125767168.000, dc_hat[0] = -8628392802910208.000 + Gradient do_[0] = -132408326997671936.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.140, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 379642249216.000, df[0] = 278329393152.000, dc_hat[0] = 165191122944.000 + Gradient do_[0] = 23580393865216.000 +Backward Time Step 3: + Gradient di[0] = 596215660544.000, df[0] = 430142750720.000, dc_hat[0] = 234386161664.000 + Gradient do_[0] = 31608258166784.000 +Backward Time Step 2: + Gradient di[0] = 759475601408.000, df[0] = 545518354432.000, dc_hat[0] = 390729531392.000 + Gradient do_[0] = 34500272717824.000 +Backward Time Step 1: + Gradient di[0] = 954434453504.000, df[0] = 658439143424.000, dc_hat[0] = 536376639488.000 + Gradient do_[0] = 30472430157824.000 +Backward Time Step 0: + Gradient di[0] = 1158832848896.000, df[0] = 824502910976.000, dc_hat[0] = 1087376064512.000 + Gradient do_[0] = 17915451539456.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2933844378386432.000, df[0] = -2303086555037696.000, dc_hat[0] = -1328972671483904.000 + Gradient do_[0] = -171795152786948096.000 +Backward Time Step 3: + Gradient di[0] = -4611568662740992.000, df[0] = -3519630136049664.000, dc_hat[0] = -1913652609286144.000 + Gradient do_[0] = -235244945168924672.000 +Backward Time Step 2: + Gradient di[0] = -6048353334853632.000, df[0] = -4557704404140032.000, dc_hat[0] = -3355911351435264.000 + Gradient do_[0] = -269600405749497856.000 +Backward Time Step 1: + Gradient di[0] = -7643586866708480.000, df[0] = -5471222473162752.000, dc_hat[0] = -4672232391442432.000 + Gradient do_[0] = -242348305680433152.000 +Backward Time Step 0: + Gradient di[0] = -8953557260697600.000, df[0] = -6544157917052928.000, dc_hat[0] = -9051746655535104.000 + Gradient do_[0] = -138904971839012864.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -191882510336.000, df[0] = -140675743744.000, dc_hat[0] = -83486957568.000 + Gradient do_[0] = -11917436387328.000 +Backward Time Step 3: + Gradient di[0] = -301358481408.000, df[0] = -217417580544.000, dc_hat[0] = -118452436992.000 + Gradient do_[0] = -15974998736896.000 +Backward Time Step 2: + Gradient di[0] = -383905431552.000, df[0] = -275750715392.000, dc_hat[0] = -197442863104.000 + Gradient do_[0] = -17437188685824.000 +Backward Time Step 1: + Gradient di[0] = -482473082880.000, df[0] = -332837191680.000, dc_hat[0] = -270986346496.000 + Gradient do_[0] = -15401496870912.000 +Backward Time Step 0: + Gradient di[0] = -585679831040.000, df[0] = -416707837952.000, dc_hat[0] = -549565202432.000 + Gradient do_[0] = -9054557569024.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.434, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.865 + c_state[0] = 0.726, h_state[0] = 0.092 +Time Step 2: + i_gate[0] = 0.377, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.862 + c_state[0] = 0.877, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.345, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885 + c_state[0] = 0.981, h_state[0] = 0.119 +Time Step 4: + i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.869 + c_state[0] = 1.059, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 17184394838016.000, df[0] = 11788703760384.000, dc_hat[0] = 7200873906176.000 + Gradient do_[0] = 1142316479807488.000 +Backward Time Step 3: + Gradient di[0] = 27022548008960.000, df[0] = 18408101904384.000, dc_hat[0] = 10098611060736.000 + Gradient do_[0] = 1498402554642432.000 +Backward Time Step 2: + Gradient di[0] = 33178934837248.000, df[0] = 22672408838144.000, dc_hat[0] = 15839551029248.000 + Gradient do_[0] = 1539751144325120.000 +Backward Time Step 1: + Gradient di[0] = 41424347201536.000, df[0] = 27530889265152.000, dc_hat[0] = 21372533211136.000 + Gradient do_[0] = 1332332409651200.000 +Backward Time Step 0: + Gradient di[0] = 51755215749120.000, df[0] = 35913958686720.000, dc_hat[0] = 45122007859200.000 + Gradient do_[0] = 798264030920704.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.364, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 2290124324864.000, df[0] = 1678967439360.000, dc_hat[0] = 996362354688.000 + Gradient do_[0] = 142232774508544.000 +Backward Time Step 3: + Gradient di[0] = 3596633505792.000, df[0] = 2594823995392.000, dc_hat[0] = 1413607194624.000 + Gradient do_[0] = 190655007555584.000 +Backward Time Step 2: + Gradient di[0] = 4581369053184.000, df[0] = 3290693369856.000, dc_hat[0] = 2356117766144.000 + Gradient do_[0] = 208087038296064.000 +Backward Time Step 1: + Gradient di[0] = 5757643259904.000, df[0] = 3971947429888.000, dc_hat[0] = 3233844035584.000 + Gradient do_[0] = 183795407912960.000 +Backward Time Step 0: + Gradient di[0] = 6991312846848.000, df[0] = 4974278868992.000, dc_hat[0] = 6560209174528.000 + Gradient do_[0] = 108085058207744.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2274592936689664.000, df[0] = -1785568497238016.000, dc_hat[0] = -1030222195458048.000 + Gradient do_[0] = -133181060333699072.000 +Backward Time Step 3: + Gradient di[0] = -3575385254002688.000, df[0] = -2728818208931840.000, dc_hat[0] = -1483330038005760.000 + Gradient do_[0] = -182368675074932736.000 +Backward Time Step 2: + Gradient di[0] = -4689131477139456.000, df[0] = -3533439261212672.000, dc_hat[0] = -2600782817918976.000 + Gradient do_[0] = -208986906690584576.000 +Backward Time Step 1: + Gradient di[0] = -5926108365062144.000, df[0] = -4241754261815296.000, dc_hat[0] = -3620430334132224.000 + Gradient do_[0] = -187865133702184960.000 +Backward Time Step 0: + Gradient di[0] = -6942549419753472.000, df[0] = -5074312197308416.000, dc_hat[0] = -7018685768138752.000 + Gradient do_[0] = -107706320810934272.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1908495220736.000, df[0] = 1399180886016.000, dc_hat[0] = 830275715072.000 + Gradient do_[0] = 118523296743424.000 +Backward Time Step 3: + Gradient di[0] = 2997418459136.000, df[0] = 2162524553216.000, dc_hat[0] = 1177918767104.000 + Gradient do_[0] = 158877064626176.000 +Backward Time Step 2: + Gradient di[0] = 3818344677376.000, df[0] = 2742611083264.000, dc_hat[0] = 1963081203712.000 + Gradient do_[0] = 173408885145600.000 +Backward Time Step 1: + Gradient di[0] = 4798899814400.000, df[0] = 3310473707520.000, dc_hat[0] = 2693878513664.000 + Gradient do_[0] = 153166637170688.000 +Backward Time Step 0: + Gradient di[0] = 5826056028160.000, df[0] = 4145205215232.000, dc_hat[0] = 5466806026240.000 + Gradient do_[0] = 90070296035328.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2340991788908544.000, df[0] = -1837693562519552.000, dc_hat[0] = -1060221401169920.000 + Gradient do_[0] = -137059793759109120.000 +Backward Time Step 3: + Gradient di[0] = -3679883150491648.000, df[0] = -2808595615842304.000, dc_hat[0] = -1526435940401152.000 + Gradient do_[0] = -187682322714198016.000 +Backward Time Step 2: + Gradient di[0] = -4826409906208768.000, df[0] = -3636859926216704.000, dc_hat[0] = -2676053193523200.000 + Gradient do_[0] = -215079334979829760.000 +Backward Time Step 1: + Gradient di[0] = -6099753892839424.000, df[0] = -4365938409340928.000, dc_hat[0] = -3724584633237504.000 + Gradient do_[0] = -193341801120137216.000 +Backward Time Step 0: + Gradient di[0] = -7144711382892544.000, df[0] = -5222072494063616.000, dc_hat[0] = -7223065008144384.000 + Gradient do_[0] = -110842643369426944.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1506172338176.000, df[0] = 1104223797248.000, dc_hat[0] = 655209070592.000 + Gradient do_[0] = 93531989344256.000 +Backward Time Step 3: + Gradient di[0] = 2365645389824.000, df[0] = 1706732290048.000, dc_hat[0] = 929512947712.000 + Gradient do_[0] = 125379524165632.000 +Backward Time Step 2: + Gradient di[0] = 3013743476736.000, df[0] = 2164674658304.000, dc_hat[0] = 1548949520384.000 + Gradient do_[0] = 136851910295552.000 +Backward Time Step 1: + Gradient di[0] = 3787811192832.000, df[0] = 2612926087168.000, dc_hat[0] = 2125191315456.000 + Gradient do_[0] = 120877769293824.000 +Backward Time Step 0: + Gradient di[0] = 4597739421696.000, df[0] = 3271265353728.000, dc_hat[0] = 4314230685696.000 + Gradient do_[0] = 71080635006976.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2409957152522240.000, df[0] = -1891834175422464.000, dc_hat[0] = -1091382328426496.000 + Gradient do_[0] = -141088653471383552.000 +Backward Time Step 3: + Gradient di[0] = -3788422342770688.000, df[0] = -2891458956754944.000, dc_hat[0] = -1571216477388800.000 + Gradient do_[0] = -193201956984979456.000 +Backward Time Step 2: + Gradient di[0] = -4968996914855936.000, df[0] = -3744281084821504.000, dc_hat[0] = -2754258642403328.000 + Gradient do_[0] = -221408020830093312.000 +Backward Time Step 1: + Gradient di[0] = -6280103864565760.000, df[0] = -4494919766900736.000, dc_hat[0] = -3832811366645760.000 + Gradient do_[0] = -199030811721203712.000 +Backward Time Step 0: + Gradient di[0] = -7354723472506880.000, df[0] = -5375570330255360.000, dc_hat[0] = -7435379737100288.000 + Gradient do_[0] = -114100762610499584.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1080610193408.000, df[0] = 792230297600.000, dc_hat[0] = 470056009728.000 + Gradient do_[0] = 67100831907840.000 +Backward Time Step 3: + Gradient di[0] = 1697315815424.000, df[0] = 1224561917952.000, dc_hat[0] = 666821787648.000 + Gradient do_[0] = 89950699651072.000 +Backward Time Step 2: + Gradient di[0] = 2162456264704.000, df[0] = 1553213423616.000, dc_hat[0] = 1111098654720.000 + Gradient do_[0] = 98184487501824.000 +Backward Time Step 1: + Gradient di[0] = 2717972168704.000, df[0] = 1874884427776.000, dc_hat[0] = 1524188315648.000 + Gradient do_[0] = 86724466180096.000 +Backward Time Step 0: + Gradient di[0] = 3298591768576.000, df[0] = 2346928832512.000, dc_hat[0] = 3095191355392.000 + Gradient do_[0] = 50995925090304.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.844, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2480827602567168.000, df[0] = -1947471181774848.000, dc_hat[0] = -1123405873020928.000 + Gradient do_[0] = -145228770016493568.000 +Backward Time Step 3: + Gradient di[0] = -3899955127255040.000, df[0] = -2976608830881792.000, dc_hat[0] = -1617236380876800.000 + Gradient do_[0] = -198874011055161344.000 +Backward Time Step 2: + Gradient di[0] = -5115513281708032.000, df[0] = -3854663354941440.000, dc_hat[0] = -2834634391945216.000 + Gradient do_[0] = -227911597748649984.000 +Backward Time Step 1: + Gradient di[0] = -6465438682710016.000, df[0] = -4627468094799872.000, dc_hat[0] = -3944063367643136.000 + Gradient do_[0] = -204877447622033408.000 +Backward Time Step 0: + Gradient di[0] = -7570571885805568.000, df[0] = -5533333605842944.000, dc_hat[0] = -7653595214249984.000 + Gradient do_[0] = -117449428352106496.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 630448586752.000, df[0] = 462203092992.000, dc_hat[0] = 274227478528.000 + Gradient do_[0] = 39145871245312.000 +Backward Time Step 3: + Gradient di[0] = 990286970880.000, df[0] = 714466525184.000, dc_hat[0] = 389011406848.000 + Gradient do_[0] = 52477583949824.000 +Backward Time Step 2: + Gradient di[0] = 1261741408256.000, df[0] = 906259922944.000, dc_hat[0] = 648153464832.000 + Gradient do_[0] = 57283144843264.000 +Backward Time Step 1: + Gradient di[0] = 1585926635520.000, df[0] = 1093969707008.000, dc_hat[0] = 889012682752.000 + Gradient do_[0] = 50597839503360.000 +Backward Time Step 0: + Gradient di[0] = 1924478664704.000, df[0] = 1369255444480.000, dc_hat[0] = 1805810139136.000 + Gradient do_[0] = 29752266063872.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2553104117530624.000, df[0] = -2004215215947776.000, dc_hat[0] = -1156079299854336.000 + Gradient do_[0] = -149452217517080576.000 +Backward Time Step 3: + Gradient di[0] = -4013709215137792.000, df[0] = -3063455753961472.000, dc_hat[0] = -1664219028127744.000 + Gradient do_[0] = -204661221788483584.000 +Backward Time Step 2: + Gradient di[0] = -5264914255970304.000, df[0] = -3967226394705920.000, dc_hat[0] = -2916752824467456.000 + Gradient do_[0] = -234547597098876928.000 +Backward Time Step 1: + Gradient di[0] = -6654407043186688.000, df[0] = -4762635010572288.000, dc_hat[0] = -4057847520296960.000 + Gradient do_[0] = -210843810031206400.000 +Backward Time Step 0: + Gradient di[0] = -7790877938286592.000, df[0] = -5694355687866368.000, dc_hat[0] = -7876317185835008.000 + Gradient do_[0] = -120867234477244416.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 154669252608.000, df[0] = 113393475584.000, dc_hat[0] = 67273957376.000 + Gradient do_[0] = 9603261661184.000 +Backward Time Step 3: + Gradient di[0] = 242958942208.000, df[0] = 175289712640.000, dc_hat[0] = 95431245824.000 + Gradient do_[0] = 12874124623872.000 +Backward Time Step 2: + Gradient di[0] = 309575122944.000, df[0] = 222355161088.000, dc_hat[0] = 158993727488.000 + Gradient do_[0] = 14053528305664.000 +Backward Time Step 1: + Gradient di[0] = 389129568256.000, df[0] = 268416696320.000, dc_hat[0] = 218050576384.000 + Gradient do_[0] = 12413581656064.000 +Backward Time Step 0: + Gradient di[0] = 472143495168.000, df[0] = 335927377920.000, dc_hat[0] = 443029848064.000 + Gradient do_[0] = 7299295870976.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.411, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2626609798447104.000, df[0] = -2061924544020480.000, dc_hat[0] = -1189306911686656.000 + Gradient do_[0] = -153747356611772416.000 +Backward Time Step 3: + Gradient di[0] = -4129391474900992.000, df[0] = -3151776656130048.000, dc_hat[0] = -1711997989158912.000 + Gradient do_[0] = -210546718553407488.000 +Backward Time Step 2: + Gradient di[0] = -5416870198902784.000, df[0] = -4081715995738112.000, dc_hat[0] = -3000274905989120.000 + Gradient do_[0] = -241296983585718272.000 +Backward Time Step 1: + Gradient di[0] = -6846600924102656.000, df[0] = -4900108323782656.000, dc_hat[0] = -4173570582249472.000 + Gradient do_[0] = -216911980345163776.000 +Backward Time Step 0: + Gradient di[0] = -8014945845248000.000, df[0] = -5858127085830144.000, dc_hat[0] = -8102842887831552.000 + Gradient do_[0] = -124343417797869568.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -347497431040.000, df[0] = -254763171840.000, dc_hat[0] = -151139188736.000 + Gradient do_[0] = -21574704955392.000 +Backward Time Step 3: + Gradient di[0] = -545880506368.000, df[0] = -393843736576.000, dc_hat[0] = -214394290176.000 + Gradient do_[0] = -28923800846336.000 +Backward Time Step 2: + Gradient di[0] = -695591895040.000, df[0] = -499613892608.000, dc_hat[0] = -357172477952.000 + Gradient do_[0] = -31574584197120.000 +Backward Time Step 1: + Gradient di[0] = -874374299648.000, df[0] = -603123220480.000, dc_hat[0] = -489783951360.000 + Gradient do_[0] = -27890481627136.000 +Backward Time Step 0: + Gradient di[0] = -1060789026816.000, df[0] = -754745278464.000, dc_hat[0] = -995377872896.000 + Gradient do_[0] = -16399702425600.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.434, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.865 + c_state[0] = 0.726, h_state[0] = 0.092 +Time Step 2: + i_gate[0] = 0.377, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.862 + c_state[0] = 0.877, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.345, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885 + c_state[0] = 0.981, h_state[0] = 0.119 +Time Step 4: + i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.870 + c_state[0] = 1.059, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 15934027726848.000, df[0] = 10930823888896.000, dc_hat[0] = 6674673827840.000 + Gradient do_[0] = 1058828288262144.000 +Backward Time Step 3: + Gradient di[0] = 25063889829888.000, df[0] = 17074111905792.000, dc_hat[0] = 9359733030912.000 + Gradient do_[0] = 1389152411058176.000 +Backward Time Step 2: + Gradient di[0] = 30786046656512.000, df[0] = 21036533809152.000, dc_hat[0] = 14674194071552.000 + Gradient do_[0] = 1427788997328896.000 +Backward Time Step 1: + Gradient di[0] = 38447423160320.000, df[0] = 25549502152704.000, dc_hat[0] = 19780421550080.000 + Gradient do_[0] = 1235575117971456.000 +Backward Time Step 0: + Gradient di[0] = 48002995585024.000, df[0] = 33310218649600.000, dc_hat[0] = 41850694008832.000 + Gradient do_[0] = 740390420348928.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1781651865600.000, df[0] = 1306192773120.000, dc_hat[0] = 774860439552.000 + Gradient do_[0] = 110613871198208.000 +Backward Time Step 3: + Gradient di[0] = 2798714355712.000, df[0] = 2019228254208.000, dc_hat[0] = 1099122999296.000 + Gradient do_[0] = 148289634697216.000 +Backward Time Step 2: + Gradient di[0] = 3565936181248.000, df[0] = 2561257504768.000, dc_hat[0] = 1830971375616.000 + Gradient do_[0] = 161865422340096.000 +Backward Time Step 1: + Gradient di[0] = 4482458451968.000, df[0] = 3091896729600.000, dc_hat[0] = 2510865563648.000 + Gradient do_[0] = 142979880714240.000 +Backward Time Step 0: + Gradient di[0] = 5439683035136.000, df[0] = 3870302928896.000, dc_hat[0] = 5104257728512.000 + Gradient do_[0] = 84097003159552.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2040316563554304.000, df[0] = -1601678667153408.000, dc_hat[0] = -923754586701824.000 + Gradient do_[0] = -119421548485410816.000 +Backward Time Step 3: + Gradient di[0] = -3207693640663040.000, df[0] = -2448307720814592.000, dc_hat[0] = -1329655973937152.000 + Gradient do_[0] = -163539933586259968.000 +Backward Time Step 2: + Gradient di[0] = -4207556926898176.000, df[0] = -3170462985093120.000, dc_hat[0] = -2329901378043904.000 + Gradient do_[0] = -187411636695334912.000 +Backward Time Step 1: + Gradient di[0] = -5318287814557696.000, df[0] = -3806234914324480.000, dc_hat[0] = -3240834413625344.000 + Gradient do_[0] = -168476397197590528.000 +Backward Time Step 0: + Gradient di[0] = -6227011626336256.000, df[0] = -4551324767092736.000, dc_hat[0] = -6295300532600832.000 + Gradient do_[0] = -96605496798085120.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1450424008704.000, df[0] = 1063360659456.000, dc_hat[0] = 630781509632.000 + Gradient do_[0] = 90045230874624.000 +Backward Time Step 3: + Gradient di[0] = 2278490898432.000, df[0] = 1643906334720.000, dc_hat[0] = 894735089664.000 + Gradient do_[0] = 120718310244352.000 +Backward Time Step 2: + Gradient di[0] = 2903256334336.000, df[0] = 2085278973952.000, dc_hat[0] = 1490413551616.000 + Gradient do_[0] = 131774445256704.000 +Backward Time Step 1: + Gradient di[0] = 3649581875200.000, df[0] = 2517360181248.000, dc_hat[0] = 2043620753408.000 + Gradient do_[0] = 116401675173888.000 +Backward Time Step 0: + Gradient di[0] = 4428479332352.000, df[0] = 3150837448704.000, dc_hat[0] = 4155407073280.000 + Gradient do_[0] = 68463884238848.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836 + c_state[0] = 0.655, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2101419184226304.000, df[0] = -1649650767495168.000, dc_hat[0] = -951377199104000.000 + Gradient do_[0] = -122992100827463680.000 +Backward Time Step 3: + Gradient di[0] = -3303857958420480.000, df[0] = -2521726965514240.000, dc_hat[0] = -1369380394893312.000 + Gradient do_[0] = -168432725970124800.000 +Backward Time Step 2: + Gradient di[0] = -4333856614252544.000, df[0] = -3265623354245120.000, dc_hat[0] = -2399350730784768.000 + Gradient do_[0] = -193022238373445632.000 +Backward Time Step 1: + Gradient di[0] = -5478040196874240.000, df[0] = -3920507619508224.000, dc_hat[0] = -3337092952227840.000 + Gradient do_[0] = -173521248603602944.000 +Backward Time Step 0: + Gradient di[0] = -6413362204246016.000, df[0] = -4687528917467136.000, dc_hat[0] = -6483695514943488.000 + Gradient do_[0] = -99496550954172416.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1099196334080.000, df[0] = 805864734720.000, dc_hat[0] = 478016798720.000 + Gradient do_[0] = 68237106610176.000 +Backward Time Step 3: + Gradient di[0] = 1726809899008.000, df[0] = 1245883006976.000, dc_hat[0] = 678036832256.000 + Gradient do_[0] = 91483944255488.000 +Backward Time Step 2: + Gradient di[0] = 2200420745216.000, df[0] = 1580460146688.000, dc_hat[0] = 1129388572672.000 + Gradient do_[0] = 99866025918464.000 +Backward Time Step 1: + Gradient di[0] = 2766164721664.000, df[0] = 1907981549568.000, dc_hat[0] = 1548428115968.000 + Gradient do_[0] = 88217185419264.000 +Backward Time Step 0: + Gradient di[0] = 3356186640384.000, df[0] = 2387907444736.000, dc_hat[0] = 3149234962432.000 + Gradient do_[0] = 51886333886464.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2164653450854400.000, df[0] = -1699296965558272.000, dc_hat[0] = -979964904079360.000 + Gradient do_[0] = -126687232990904320.000 +Backward Time Step 3: + Gradient di[0] = -3403382014345216.000, df[0] = -2597712453173248.000, dc_hat[0] = -1410494774640640.000 + Gradient do_[0] = -173496372153024512.000 +Backward Time Step 2: + Gradient di[0] = -4464560522133504.000, df[0] = -3364103464681472.000, dc_hat[0] = -2471228887531520.000 + Gradient do_[0] = -198828707740123136.000 +Backward Time Step 1: + Gradient di[0] = -5643357783064576.000, df[0] = -4038762296246272.000, dc_hat[0] = -3436721161109504.000 + Gradient do_[0] = -178742142129143808.000 +Backward Time Step 0: + Gradient di[0] = -6606216436383744.000, df[0] = -4828485986025472.000, dc_hat[0] = -6678663944732672.000 + Gradient do_[0] = -102488459532304384.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 725991751680.000, df[0] = 532255113216.000, dc_hat[0] = 315706769408.000 + Gradient do_[0] = 45066831790080.000 +Backward Time Step 3: + Gradient di[0] = 1140559577088.000, df[0] = 822912614400.000, dc_hat[0] = 447805325312.000 + Gradient do_[0] = 60421746720768.000 +Backward Time Step 2: + Gradient di[0] = 1453454786560.000, df[0] = 1043947257856.000, dc_hat[0] = 745862070272.000 + Gradient do_[0] = 65960065105920.000 +Backward Time Step 1: + Gradient di[0] = 1827206070272.000, df[0] = 1260311019520.000, dc_hat[0] = 1022496473088.000 + Gradient do_[0] = 58267095007232.000 +Backward Time Step 0: + Gradient di[0] = 2216738947072.000, df[0] = 1577196978176.000, dc_hat[0] = 2080048807936.000 + Gradient do_[0] = 34270582145024.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.836 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.891, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2229486250622976.000, df[0] = -1750199173120000.000, dc_hat[0] = -1009275975499776.000 + Gradient do_[0] = -130475703383621632.000 +Backward Time Step 3: + Gradient di[0] = -3505417015525376.000, df[0] = -2675615643729920.000, dc_hat[0] = -1452646791643136.000 + Gradient do_[0] = -178688008361345024.000 +Backward Time Step 2: + Gradient di[0] = -4598570481090560.000, df[0] = -3465074924584960.000, dc_hat[0] = -2544926231363584.000 + Gradient do_[0] = -204782082168193024.000 +Backward Time Step 1: + Gradient di[0] = -5812876384141312.000, df[0] = -4160022913220608.000, dc_hat[0] = -3538885548179456.000 + Gradient do_[0] = -184095681424654336.000 +Backward Time Step 0: + Gradient di[0] = -6803976058044416.000, df[0] = -4973028815405056.000, dc_hat[0] = -6878593061748736.000 + Gradient do_[0] = -105556500700725248.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 329653846016.000, df[0] = 241683873792.000, dc_hat[0] = 143349268480.000 + Gradient do_[0] = 20462753021952.000 +Backward Time Step 3: + Gradient di[0] = 517918687232.000, df[0] = 373680472064.000, dc_hat[0] = 203328045056.000 + Gradient do_[0] = 27435512889344.000 +Backward Time Step 2: + Gradient di[0] = 660035731456.000, df[0] = 474071400448.000, dc_hat[0] = 338647089152.000 + Gradient do_[0] = 29951289982976.000 +Backward Time Step 1: + Gradient di[0] = 829789044736.000, df[0] = 572337618944.000, dc_hat[0] = 464204038144.000 + Gradient do_[0] = 26458529464320.000 +Backward Time Step 0: + Gradient di[0] = 1006597505024.000, df[0] = 716188352512.000, dc_hat[0] = 944527966208.000 + Gradient do_[0] = 15561908027392.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2295570093834240.000, df[0] = -1802083720232960.000, dc_hat[0] = -1039151432466432.000 + Gradient do_[0] = -134337162450567168.000 +Backward Time Step 3: + Gradient di[0] = -3609430386016256.000, df[0] = -2755030931210240.000, dc_hat[0] = -1495616865697792.000 + Gradient do_[0] = -183980129624522752.000 +Backward Time Step 2: + Gradient di[0] = -4735173526552576.000, df[0] = -3568000594608128.000, dc_hat[0] = -2620048162160640.000 + Gradient do_[0] = -210850613259403264.000 +Backward Time Step 1: + Gradient di[0] = -5985658086621184.000, df[0] = -4283616502743040.000, dc_hat[0] = -3643017030270976.000 + Gradient do_[0] = -189552368654745600.000 +Backward Time Step 0: + Gradient di[0] = -7005554442502144.000, df[0] = -5120362299785216.000, dc_hat[0] = -7082381743751168.000 + Gradient do_[0] = -108683778058092544.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -90431922176.000, df[0] = -66299969536.000, dc_hat[0] = -39323152384.000 + Gradient do_[0] = -5613209255936.000 +Backward Time Step 3: + Gradient di[0] = -142082686976.000, df[0] = -102514057216.000, dc_hat[0] = -55776428032.000 + Gradient do_[0] = -7526154240000.000 +Backward Time Step 2: + Gradient di[0] = -181078278144.000, df[0] = -130059722752.000, dc_hat[0] = -92894494720.000 + Gradient do_[0] = -8216579145728.000 +Backward Time Step 1: + Gradient di[0] = -227656974336.000, df[0] = -157022339072.000, dc_hat[0] = -127328116736.000 + Gradient do_[0] = -7258575470592.000 +Backward Time Step 0: + Gradient di[0] = -276150157312.000, df[0] = -196479254528.000, dc_hat[0] = -259121987584.000 + Gradient do_[0] = -4269256474624.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.865 + c_state[0] = 0.726, h_state[0] = 0.092 +Time Step 2: + i_gate[0] = 0.376, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.862 + c_state[0] = 0.877, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.344, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885 + c_state[0] = 0.981, h_state[0] = 0.119 +Time Step 4: + i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.870 + c_state[0] = 1.059, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 14902563438592.000, df[0] = 10223314010112.000, dc_hat[0] = 6241249656832.000 + Gradient do_[0] = 990050426814464.000 +Backward Time Step 3: + Gradient di[0] = 23446775922688.000, df[0] = 15972911022080.000, dc_hat[0] = 8751943254016.000 + Gradient do_[0] = 1299144794701824.000 +Backward Time Step 2: + Gradient di[0] = 28806968508416.000, df[0] = 19683998695424.000, dc_hat[0] = 13718453747712.000 + Gradient do_[0] = 1335499813814272.000 +Backward Time Step 1: + Gradient di[0] = 35983617687552.000, df[0] = 23910695305216.000, dc_hat[0] = 18482987008000.000 + Gradient do_[0] = 1155858914344960.000 +Backward Time Step 0: + Gradient di[0] = 44918336651264.000, df[0] = 31169712226304.000, dc_hat[0] = 39161373392896.000 + Gradient do_[0] = 692813121454080.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1584051781632.000, df[0] = 1161342877696.000, dc_hat[0] = 688765992960.000 + Gradient do_[0] = 98322337497088.000 +Backward Time Step 3: + Gradient di[0] = 2488726454272.000, df[0] = 1795642621952.000, dc_hat[0] = 976919592960.000 + Gradient do_[0] = 131826832113664.000 +Backward Time Step 2: + Gradient di[0] = 3171469754368.000, df[0] = 2277911822336.000, dc_hat[0] = 1626930413568.000 + Gradient do_[0] = 143907341991936.000 +Backward Time Step 1: + Gradient di[0] = 3987265552384.000, df[0] = 2750145363968.000, dc_hat[0] = 2230070280192.000 + Gradient do_[0] = 127129278742528.000 +Backward Time Step 0: + Gradient di[0] = 4837988106240.000, df[0] = 3442201067520.000, dc_hat[0] = 4539665088512.000 + Gradient do_[0] = 74794858971136.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.836 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1779691102928896.000, df[0] = -1397108837974016.000, dc_hat[0] = -805565743759360.000 + Gradient do_[0] = -104142674776293376.000 +Backward Time Step 3: + Gradient di[0] = -2798314001006592.000, df[0] = -2135935286247424.000, dc_hat[0] = -1159377834737664.000 + Gradient do_[0] = -142627995520073728.000 +Backward Time Step 2: + Gradient di[0] = -3670845197123584.000, df[0] = -2766015310069760.000, dc_hat[0] = -2030809921880064.000 + Gradient do_[0] = -163448210264686592.000 +Backward Time Step 1: + Gradient di[0] = -4640398630715392.000, df[0] = -3320853412446208.000, dc_hat[0] = -2823655918665728.000 + Gradient do_[0] = -146942272989036544.000 +Backward Time Step 0: + Gradient di[0] = -5432328323072000.000, df[0] = -3970490838286336.000, dc_hat[0] = -5491902740692992.000 + Gradient do_[0] = -84276836124590080.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1314857156608.000, df[0] = 963986718720.000, dc_hat[0] = 571697922048.000 + Gradient do_[0] = 81609898328064.000 +Backward Time Step 3: + Gradient di[0] = 2065876910080.000, df[0] = 1490562580480.000, dc_hat[0] = 810872537088.000 + Gradient do_[0] = 109422797914112.000 +Backward Time Step 2: + Gradient di[0] = 2632746795008.000, df[0] = 1890971156480.000, dc_hat[0] = 1350346080256.000 + Gradient do_[0] = 119454331568128.000 +Backward Time Step 1: + Gradient di[0] = 3310073413632.000, df[0] = 2283036475392.000, dc_hat[0] = 1850787627008.000 + Gradient do_[0] = 105529217122304.000 +Backward Time Step 0: + Gradient di[0] = 4015973203968.000, df[0] = 2857341812736.000, dc_hat[0] = 3768337563648.000 + Gradient do_[0] = 62086579224576.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1835130675003392.000, df[0] = -1440636452470784.000, dc_hat[0] = -830630703136768.000 + Gradient do_[0] = -107382342837862400.000 +Backward Time Step 3: + Gradient di[0] = -2885571361898496.000, df[0] = -2202556939894784.000, dc_hat[0] = -1195430461308928.000 + Gradient do_[0] = -147067909372379136.000 +Backward Time Step 2: + Gradient di[0] = -3785431971790848.000, df[0] = -2852353279524864.000, dc_hat[0] = -2093848230625280.000 + Gradient do_[0] = -168539327058411520.000 +Backward Time Step 1: + Gradient di[0] = -4785336597086208.000, df[0] = -3424533386100736.000, dc_hat[0] = -2911060382187520.000 + Gradient do_[0] = -151520347349319680.000 +Backward Time Step 0: + Gradient di[0] = -5601504400506880.000, df[0] = -4094141336125440.000, dc_hat[0] = -5662933707128832.000 + Gradient do_[0] = -86901413329764352.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.911, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1026715615232.000, df[0] = 752739614720.000, dc_hat[0] = 446403805184.000 + Gradient do_[0] = 63723309891584.000 +Backward Time Step 3: + Gradient di[0] = 1613210845184.000, df[0] = 1163966545920.000, dc_hat[0] = 633162629120.000 + Gradient do_[0] = 85442921758720.000 +Backward Time Step 2: + Gradient di[0] = 2055962099712.000, df[0] = 1476696211456.000, dc_hat[0] = 1054383144960.000 + Gradient do_[0] = 93279358025728.000 +Backward Time Step 1: + Gradient di[0] = 2584976556032.000, df[0] = 1782904258560.000, dc_hat[0] = 1445054644224.000 + Gradient do_[0] = 82407302627328.000 +Backward Time Step 0: + Gradient di[0] = 3136088965120.000, df[0] = 2231309434880.000, dc_hat[0] = 2942709268480.000 + Gradient do_[0] = 48483654434816.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1861412787847168.000, df[0] = -1461266388353024.000, dc_hat[0] = -842499979476992.000 + Gradient do_[0] = -108917836595855360.000 +Backward Time Step 3: + Gradient di[0] = -2926869083062272.000, df[0] = -2234080389234688.000, dc_hat[0] = -1212455174799360.000 + Gradient do_[0] = -149167959401562112.000 +Backward Time Step 2: + Gradient di[0] = -3839605199601664.000, df[0] = -2893161911287808.000, dc_hat[0] = -2123539339542528.000 + Gradient do_[0] = -170943392052674560.000 +Backward Time Step 1: + Gradient di[0] = -4853743614951424.000, df[0] = -3473455479521280.000, dc_hat[0] = -2952087251976192.000 + Gradient do_[0] = -153677795321446400.000 +Backward Time Step 0: + Gradient di[0] = -5681064374697984.000, df[0] = -4152291972087808.000, dc_hat[0] = -5743366096551936.000 + Gradient do_[0] = -88135709621223424.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.422, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 975921610752.000, df[0] = 715498192896.000, dc_hat[0] = 424307490816.000 + Gradient do_[0] = 60569537216512.000 +Backward Time Step 3: + Gradient di[0] = 1533385637888.000, df[0] = 1106370101248.000, dc_hat[0] = 601793953792.000 + Gradient do_[0] = 81212496412672.000 +Backward Time Step 2: + Gradient di[0] = 1954226110464.000, df[0] = 1403619115008.000, dc_hat[0] = 1002085351424.000 + Gradient do_[0] = 88659474776064.000 +Backward Time Step 1: + Gradient di[0] = 2457007816704.000, df[0] = 1694627004416.000, dc_hat[0] = 1373237280768.000 + Gradient do_[0] = 78323233325056.000 +Backward Time Step 0: + Gradient di[0] = 2980543201280.000, df[0] = 2120639447040.000, dc_hat[0] = 2796755091456.000 + Gradient do_[0] = 46078929928192.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1888675495411712.000, df[0] = -1482666062905344.000, dc_hat[0] = -854813583605760.000 + Gradient do_[0] = -110510788426334208.000 +Backward Time Step 3: + Gradient di[0] = -2969706013130752.000, df[0] = -2266779049000960.000, dc_hat[0] = -1230117959368704.000 + Gradient do_[0] = -151346572972523520.000 +Backward Time Step 2: + Gradient di[0] = -3895798672654336.000, df[0] = -2935492572086272.000, dc_hat[0] = -2154348616351744.000 + Gradient do_[0] = -173437496741330944.000 +Backward Time Step 1: + Gradient di[0] = -4924703990874112.000, df[0] = -3524204813090816.000, dc_hat[0] = -2994667826184192.000 + Gradient do_[0] = -155916143297560576.000 +Backward Time Step 0: + Gradient di[0] = -5763609888030720.000, df[0] = -4212624451436544.000, dc_hat[0] = -5826817311113216.000 + Gradient do_[0] = -89416322840002560.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 923648524288.000, df[0] = 677172346880.000, dc_hat[0] = 401568890880.000 + Gradient do_[0] = 57324144164864.000 +Backward Time Step 3: + Gradient di[0] = 1451238883328.000, df[0] = 1047098884096.000, dc_hat[0] = 569519374336.000 + Gradient do_[0] = 76859479949312.000 +Backward Time Step 2: + Gradient di[0] = 1849530515456.000, df[0] = 1328416948224.000, dc_hat[0] = 948286783488.000 + Gradient do_[0] = 83905902280704.000 +Backward Time Step 1: + Gradient di[0] = 2325328953344.000, df[0] = 1603793190912.000, dc_hat[0] = 1299384238080.000 + Gradient do_[0] = 74121480241152.000 +Backward Time Step 0: + Gradient di[0] = 2820538368000.000, df[0] = 2006796861440.000, dc_hat[0] = 2646616309760.000 + Gradient do_[0] = 43605267841024.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1916727805870080.000, df[0] = -1504685689143296.000, dc_hat[0] = -867484743761920.000 + Gradient do_[0] = -112149988464656384.000 +Backward Time Step 3: + Gradient di[0] = -3013787946844160.000, df[0] = -2300427970281472.000, dc_hat[0] = -1248295468924928.000 + Gradient do_[0] = -153588528721166336.000 +Backward Time Step 2: + Gradient di[0] = -3953627186069504.000, df[0] = -2979055888498688.000, dc_hat[0] = -2186061849559040.000 + Gradient do_[0] = -176004358175981568.000 +Backward Time Step 1: + Gradient di[0] = -4997725951098880.000, df[0] = -3576429467926528.000, dc_hat[0] = -3038502799278080.000 + Gradient do_[0] = -158219723236966400.000 +Backward Time Step 0: + Gradient di[0] = -5848553603727360.000, df[0] = -4274709814312960.000, dc_hat[0] = -5912692497842176.000 + Gradient do_[0] = -90734130475565056.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 869448941568.000, df[0] = 637434396672.000, dc_hat[0] = 377994444800.000 + Gradient do_[0] = 53959276363776.000 +Backward Time Step 3: + Gradient di[0] = 1366064496640.000, df[0] = 985643220992.000, dc_hat[0] = 536061575168.000 + Gradient do_[0] = 72346408845312.000 +Backward Time Step 2: + Gradient di[0] = 1740979175424.000, df[0] = 1250446016512.000, dc_hat[0] = 892525805568.000 + Gradient do_[0] = 78977880293376.000 +Backward Time Step 1: + Gradient di[0] = 2188803702784.000, df[0] = 1509618745344.000, dc_hat[0] = 1222859423744.000 + Gradient do_[0] = 69765850726400.000 +Backward Time Step 0: + Gradient di[0] = 2654689558528.000, df[0] = 1888796278784.000, dc_hat[0] = 2490994262016.000 + Gradient do_[0] = 41041260445696.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1945364500316160.000, df[0] = -1527164205793280.000, dc_hat[0] = -880419574644736.000 + Gradient do_[0] = -113823333492981760.000 +Backward Time Step 3: + Gradient di[0] = -3058783903285248.000, df[0] = -2334774286876672.000, dc_hat[0] = -1266851606691840.000 + Gradient do_[0] = -155877093454905344.000 +Backward Time Step 2: + Gradient di[0] = -4012650774134784.000, df[0] = -3023519537430528.000, dc_hat[0] = -2218434628681728.000 + Gradient do_[0] = -178624339766149120.000 +Backward Time Step 1: + Gradient di[0] = -5072260276682752.000, df[0] = -3629735917649920.000, dc_hat[0] = -3083255553196032.000 + Gradient do_[0] = -160571200651657216.000 +Backward Time Step 0: + Gradient di[0] = -5935275435884544.000, df[0] = -4338094941667328.000, dc_hat[0] = -6000365128384512.000 + Gradient do_[0] = -92079528981037056.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 813608599552.000, df[0] = 596493795328.000, dc_hat[0] = 353708146688.000 + Gradient do_[0] = 50492772188160.000 +Backward Time Step 3: + Gradient di[0] = 1278316249088.000, df[0] = 922330923008.000, dc_hat[0] = 501598945280.000 + Gradient do_[0] = 67697366794240.000 +Backward Time Step 2: + Gradient di[0] = 1629144743936.000, df[0] = 1170117885952.000, dc_hat[0] = 835098443776.000 + Gradient do_[0] = 73901455441920.000 +Backward Time Step 1: + Gradient di[0] = 2048157024256.000, df[0] = 1412603707392.000, dc_hat[0] = 1144068767744.000 + Gradient do_[0] = 65279438618624.000 +Backward Time Step 0: + Gradient di[0] = 2483877314560.000, df[0] = 1767264354304.000, dc_hat[0] = 2330714701824.000 + Gradient do_[0] = 38400518258688.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1974529878392832.000, df[0] = -1550057321005056.000, dc_hat[0] = -893592977539072.000 + Gradient do_[0] = -115527533566361600.000 +Backward Time Step 3: + Gradient di[0] = -3104612009639936.000, df[0] = -2369755990196224.000, dc_hat[0] = -1285751878713344.000 + Gradient do_[0] = -158208023746052096.000 +Backward Time Step 2: + Gradient di[0] = -4072768505118720.000, df[0] = -3068807283212288.000, dc_hat[0] = -2251413132410880.000 + Gradient do_[0] = -181292957565976576.000 +Backward Time Step 1: + Gradient di[0] = -5148162012479488.000, df[0] = -3684021351481344.000, dc_hat[0] = -3128842336075776.000 + Gradient do_[0] = -162965885437345792.000 +Backward Time Step 0: + Gradient di[0] = -6023589090295808.000, df[0] = -4402643468288000.000, dc_hat[0] = -6089647834791936.000 + Gradient do_[0] = -93449623548461056.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 755736379392.000, df[0] = 554063822848.000, dc_hat[0] = 328540061696.000 + Gradient do_[0] = 46900325646336.000 +Backward Time Step 3: + Gradient di[0] = 1187376660480.000, df[0] = 856715821056.000, dc_hat[0] = 465888215040.000 + Gradient do_[0] = 62879592087552.000 +Backward Time Step 2: + Gradient di[0] = 1513242361856.000, df[0] = 1086868357120.000, dc_hat[0] = 775601258496.000 + Gradient do_[0] = 68641039056896.000 +Backward Time Step 1: + Gradient di[0] = 1902402338816.000, df[0] = 1312067420160.000, dc_hat[0] = 1062460391424.000 + Gradient do_[0] = 60630790832128.000 +Backward Time Step 0: + Gradient di[0] = 2306911764480.000, df[0] = 1641354493952.000, dc_hat[0] = 2164661420032.000 + Gradient do_[0] = 35664645062656.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2004161663074304.000, df[0] = -1573317119049728.000, dc_hat[0] = -906978041790464.000 + Gradient do_[0] = -117259006682071040.000 +Backward Time Step 3: + Gradient di[0] = -3151170260434944.000, df[0] = -2405295502393344.000, dc_hat[0] = -1304954677493760.000 + Gradient do_[0] = -160576131274113024.000 +Backward Time Step 2: + Gradient di[0] = -4133834618568704.000, df[0] = -3114810141048832.000, dc_hat[0] = -2284915353714688.000 + Gradient do_[0] = -184003820664127488.000 +Backward Time Step 1: + Gradient di[0] = -5225267412860928.000, df[0] = -3739168731561984.000, dc_hat[0] = -3175158994960384.000 + Gradient do_[0] = -165398829791707136.000 +Backward Time Step 0: + Gradient di[0] = -6113306662141952.000, df[0] = -4468218223656960.000, dc_hat[0] = -6180348954148864.000 + Gradient do_[0] = -94841493600075776.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 696167694336.000, df[0] = 510390272000.000, dc_hat[0] = 302635974656.000 + Gradient do_[0] = 43202782429184.000 +Backward Time Step 3: + Gradient di[0] = 1093773426688.000, df[0] = 789179072512.000, dc_hat[0] = 429137756160.000 + Gradient do_[0] = 57921081704448.000 +Backward Time Step 2: + Gradient di[0] = 1393946787840.000, df[0] = 1001182920704.000, dc_hat[0] = 714381197312.000 + Gradient do_[0] = 63227228585984.000 +Backward Time Step 1: + Gradient di[0] = 1752388468736.000, df[0] = 1208595251200.000, dc_hat[0] = 978507595776.000 + Gradient do_[0] = 55846973210624.000 +Backward Time Step 0: + Gradient di[0] = 2124814876672.000, df[0] = 1511793491968.000, dc_hat[0] = 1993793077248.000 + Gradient do_[0] = 32849447092224.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2034129931599872.000, df[0] = -1596840654929920.000, dc_hat[0] = -920515443163136.000 + Gradient do_[0] = -119010279597015040.000 +Backward Time Step 3: + Gradient di[0] = -3198254913159168.000, df[0] = -2441236594032640.000, dc_hat[0] = -1324374640558080.000 + Gradient do_[0] = -162971005038362624.000 +Backward Time Step 2: + Gradient di[0] = -4195599201075200.000, df[0] = -3161338863943680.000, dc_hat[0] = -2318801840373760.000 + Gradient do_[0] = -186745779325501440.000 +Backward Time Step 1: + Gradient di[0] = -5303254355279872.000, df[0] = -3794946398093312.000, dc_hat[0] = -3222011987886080.000 + Gradient do_[0] = -167859519634800640.000 +Backward Time Step 0: + Gradient di[0] = -6204058247364608.000, df[0] = -4534548624834560.000, dc_hat[0] = -6272095898042368.000 + Gradient do_[0] = -96249418239442944.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 634406895616.000, df[0] = 465110335488.000, dc_hat[0] = 275782467584.000 + Gradient do_[0] = 39369503145984.000 +Backward Time Step 3: + Gradient di[0] = 996725686272.000, df[0] = 719157264384.000, dc_hat[0] = 391045578752.000 + Gradient do_[0] = 52780890849280.000 +Backward Time Step 2: + Gradient di[0] = 1270258860032.000, df[0] = 912343826432.000, dc_hat[0] = 650943856640.000 + Gradient do_[0] = 57615312748544.000 +Backward Time Step 1: + Gradient di[0] = 1596860792832.000, df[0] = 1101324615680.000, dc_hat[0] = 891557576704.000 + Gradient do_[0] = 50888722874368.000 +Backward Time Step 0: + Gradient di[0] = 1936113401856.000, df[0] = 1377533558784.000, dc_hat[0] = 1816727519232.000 + Gradient do_[0] = 29932138790912.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.723, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2064487565754368.000, df[0] = -1620671415189504.000, dc_hat[0] = -934235716190208.000 + Gradient do_[0] = -120784865594441728.000 +Backward Time Step 3: + Gradient di[0] = -3245948645933056.000, df[0] = -2477642884317184.000, dc_hat[0] = -1344067870916608.000 + Gradient do_[0] = -165398159776808960.000 +Backward Time Step 2: + Gradient di[0] = -4258144662323200.000, df[0] = -3208460023889920.000, dc_hat[0] = -2353194327867392.000 + Gradient do_[0] = -189524605986144256.000 +Backward Time Step 1: + Gradient di[0] = -5382220550242304.000, df[0] = -3851433539534848.000, dc_hat[0] = -3269621163491328.000 + Gradient do_[0] = -170353641503326208.000 +Backward Time Step 0: + Gradient di[0] = -6296050541264896.000, df[0] = -4601785800982528.000, dc_hat[0] = -6365096972386304.000 + Gradient do_[0] = -97676575742361600.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 570433798144.000, df[0] = 418208448512.000, dc_hat[0] = 247967989760.000 + Gradient do_[0] = 35399044956160.000 +Backward Time Step 3: + Gradient di[0] = 896206176256.000, df[0] = 646630604800.000, dc_hat[0] = 351595036672.000 + Gradient do_[0] = 47457027227648.000 +Backward Time Step 2: + Gradient di[0] = 1142145417216.000, df[0] = 820326694912.000, dc_hat[0] = 585249456128.000 + Gradient do_[0] = 51803009843200.000 +Backward Time Step 1: + Gradient di[0] = 1435774353408.000, df[0] = 990221631488.000, dc_hat[0] = 801526775808.000 + Gradient do_[0] = 45753703596032.000 +Backward Time Step 0: + Gradient di[0] = 1740694224896.000, df[0] = 1238493822976.000, dc_hat[0] = 1633358184448.000 + Gradient do_[0] = 26910975328256.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2095196850356224.000, df[0] = -1644777992880128.000, dc_hat[0] = -948113426612224.000 + Gradient do_[0] = -122579869866393600.000 +Backward Time Step 3: + Gradient di[0] = -3294192134520832.000, df[0] = -2514469007654912.000, dc_hat[0] = -1363989002977280.000 + Gradient do_[0] = -167853163083202560.000 +Backward Time Step 2: + Gradient di[0] = -4321411409641472.000, df[0] = -3256124497199104.000, dc_hat[0] = -2387983026094080.000 + Gradient do_[0] = -192335335663861760.000 +Backward Time Step 1: + Gradient di[0] = -5462094057046016.000, df[0] = -3908570026344448.000, dc_hat[0] = -3317778752733184.000 + Gradient do_[0] = -172876436573519872.000 +Backward Time Step 0: + Gradient di[0] = -6389105302700032.000, df[0] = -4669799829340160.000, dc_hat[0] = -6459172325425152.000 + Gradient do_[0] = -99120217329762304.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 504443764736.000, df[0] = 369828233216.000, dc_hat[0] = 219278180352.000 + Gradient do_[0] = 31303531495424.000 +Backward Time Step 3: + Gradient di[0] = 792519901184.000, df[0] = 571819098112.000, dc_hat[0] = 310905405440.000 + Gradient do_[0] = 41965697630208.000 +Backward Time Step 2: + Gradient di[0] = 1009997643776.000, df[0] = 725412610048.000, dc_hat[0] = 517498470400.000 + Gradient do_[0] = 45808099524608.000 +Backward Time Step 1: + Gradient di[0] = 1269623291904.000, df[0] = 875626758144.000, dc_hat[0] = 708691558400.000 + Gradient do_[0] = 40457681764352.000 +Backward Time Step 0: + Gradient di[0] = 1539161849856.000, df[0] = 1095104724992.000, dc_hat[0] = 1444252876800.000 + Gradient do_[0] = 23795301613568.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2126226244239360.000, df[0] = -1669136631463936.000, dc_hat[0] = -962137501466624.000 + Gradient do_[0] = -124393643145428992.000 +Backward Time Step 3: + Gradient di[0] = -3342940013330432.000, df[0] = -2551680872742912.000, dc_hat[0] = -1384116427685888.000 + Gradient do_[0] = -170334004912848896.000 +Backward Time Step 2: + Gradient di[0] = -4385334213214208.000, df[0] = -3304282891747328.000, dc_hat[0] = -2423132770009088.000 + Gradient do_[0] = -195175442917883904.000 +Backward Time Step 1: + Gradient di[0] = -5542798103150592.000, df[0] = -3966299755511808.000, dc_hat[0] = -3366435363487744.000 + Gradient do_[0] = -175425482483826688.000 +Backward Time Step 0: + Gradient di[0] = -6483124284293120.000, df[0] = -4738517695463424.000, dc_hat[0] = -6554222636040192.000 + Gradient do_[0] = -100578831173156864.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 436169572352.000, df[0] = 319773278208.000, dc_hat[0] = 189596303360.000 + Gradient do_[0] = 27066395262976.000 +Backward Time Step 3: + Gradient di[0] = 685247234048.000, df[0] = 494419869696.000, dc_hat[0] = 268812058624.000 + Gradient do_[0] = 36284697411584.000 +Backward Time Step 2: + Gradient di[0] = 873281093632.000, df[0] = 627217399808.000, dc_hat[0] = 447417090048.000 + Gradient do_[0] = 39606321938432.000 +Backward Time Step 1: + Gradient di[0] = 1097736847360.000, df[0] = 757077573632.000, dc_hat[0] = 612677844992.000 + Gradient do_[0] = 34979262234624.000 +Backward Time Step 0: + Gradient di[0] = 1330705858560.000, df[0] = 946789548032.000, dc_hat[0] = 1248650854400.000 + Gradient do_[0] = 20572591357952.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2157605812174848.000, df[0] = -1693769208430592.000, dc_hat[0] = -976318141300736.000 + Gradient do_[0] = -126227946368139264.000 +Backward Time Step 3: + Gradient di[0] = -3392235768905728.000, df[0] = -2589310423400448.000, dc_hat[0] = -1404472022532096.000 + Gradient do_[0] = -172842660950704128.000 +Backward Time Step 2: + Gradient di[0] = -4449973739454464.000, df[0] = -3352981646868480.000, dc_hat[0] = -2458676845608960.000 + Gradient do_[0] = -198047229850681344.000 +Backward Time Step 1: + Gradient di[0] = -5624396576194560.000, df[0] = -4024671045419008.000, dc_hat[0] = -3415635287605248.000 + Gradient do_[0] = -178002720559464448.000 +Backward Time Step 0: + Gradient di[0] = -6578191237906432.000, df[0] = -4808002750119936.000, dc_hat[0] = -6650331656093696.000 + Gradient do_[0] = -102053697172799488.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.851 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 365640876032.000, df[0] = 268065669120.000, dc_hat[0] = 158935695360.000 + Gradient do_[0] = 22689462878208.000 +Backward Time Step 3: + Gradient di[0] = 574435229696.000, df[0] = 414466932736.000, dc_hat[0] = 225333968896.000 + Gradient do_[0] = 30416507502592.000 +Backward Time Step 2: + Gradient di[0] = 732056977408.000, df[0] = 525784940544.000, dc_hat[0] = 375036477440.000 + Gradient do_[0] = 33200451616768.000 +Backward Time Step 1: + Gradient di[0] = 920193662976.000, df[0] = 634628210688.000, dc_hat[0] = 513529708544.000 + Gradient do_[0] = 29320955297792.000 +Backward Time Step 0: + Gradient di[0] = 1115418001408.000, df[0] = 793613500416.000, dc_hat[0] = 1046638362624.000 + Gradient do_[0] = 17244261187584.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2189320387559424.000, df[0] = -1718665523232768.000, dc_hat[0] = -990651386691584.000 + Gradient do_[0] = -128081748742373376.000 +Backward Time Step 3: + Gradient di[0] = -3442061952942080.000, df[0] = -2627344237854720.000, dc_hat[0] = -1425045318533120.000 + Gradient do_[0] = -175378169124093952.000 +Backward Time Step 2: + Gradient di[0] = -4515309855703040.000, df[0] = -3402204119564288.000, dc_hat[0] = -2494601025814528.000 + Gradient do_[0] = -200949906188271616.000 +Backward Time Step 1: + Gradient di[0] = -5706874443792384.000, df[0] = -4083670474293248.000, dc_hat[0] = -3465363492700160.000 + Gradient do_[0] = -180607772843311104.000 +Backward Time Step 0: + Gradient di[0] = -6674270193188864.000, df[0] = -4878226539151360.000, dc_hat[0] = -6747464488976384.000 + Gradient do_[0] = -103544248393007104.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 292985503744.000, df[0] = 214798974976.000, dc_hat[0] = 127351799808.000 + Gradient do_[0] = 18180684644352.000 +Backward Time Step 3: + Gradient di[0] = 460285870080.000, df[0] = 332105908224.000, dc_hat[0] = 180549648384.000 + Gradient do_[0] = 24371817086976.000 +Backward Time Step 2: + Gradient di[0] = 586581868544.000, df[0] = 421299781632.000, dc_hat[0] = 300488491008.000 + Gradient do_[0] = 26602146627584.000 +Backward Time Step 1: + Gradient di[0] = 737315127296.000, df[0] = 508500410368.000, dc_hat[0] = 411427241984.000 + Gradient do_[0] = 23493030707200.000 +Backward Time Step 0: + Gradient di[0] = 893687103488.000, df[0] = 635853144064.000, dc_hat[0] = 838579912704.000 + Gradient do_[0] = 13816321540096.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2221360440934400.000, df[0] = -1743816717500416.000, dc_hat[0] = -1005131466276864.000 + Gradient do_[0] = -129954646541205504.000 +Backward Time Step 3: + Gradient di[0] = -3492391453458432.000, df[0] = -2665763257188352.000, dc_hat[0] = -1445826249359360.000 + Gradient do_[0] = -177939464281128960.000 +Backward Time Step 2: + Gradient di[0] = -4581313302495232.000, df[0] = -3451930445611008.000, dc_hat[0] = -2530895646949376.000 + Gradient do_[0] = -203882423958634496.000 +Backward Time Step 1: + Gradient di[0] = -5790192514367488.000, df[0] = -4143270393282560.000, dc_hat[0] = -3515599040806912.000 + Gradient do_[0] = -183239402384785408.000 +Backward Time Step 0: + Gradient di[0] = -6771329474756608.000, df[0] = -4949167050850304.000, dc_hat[0] = -6845587311820800.000 + Gradient do_[0] = -105050020977311744.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 217997656064.000, df[0] = 159822381056.000, dc_hat[0] = 94755258368.000 + Gradient do_[0] = 13527281565696.000 +Backward Time Step 3: + Gradient di[0] = 342473572352.000, df[0] = 247101931520.000, dc_hat[0] = 134332383232.000 + Gradient do_[0] = 18133419032576.000 +Backward Time Step 2: + Gradient di[0] = 436440104960.000, df[0] = 313463177216.000, dc_hat[0] = 223560613888.000 + Gradient do_[0] = 19792547282944.000 +Backward Time Step 1: + Gradient di[0] = 548578721792.000, df[0] = 378333822976.000, dc_hat[0] = 306078842880.000 + Gradient do_[0] = 17478806667264.000 +Backward Time Step 0: + Gradient di[0] = 664885067776.000, df[0] = 473061916672.000, dc_hat[0] = 623886467072.000 + Gradient do_[0] = 10279063650304.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2253760466255872.000, df[0] = -1769250976956416.000, dc_hat[0] = -1019772875571200.000 + Gradient do_[0] = -131848460830769152.000 +Backward Time Step 3: + Gradient di[0] = -3543285205303296.000, df[0] = -2704612846993408.000, dc_hat[0] = -1466841524338688.000 + Gradient do_[0] = -180529346740486144.000 +Backward Time Step 2: + Gradient di[0] = -4648047967469568.000, df[0] = -3502207601213440.000, dc_hat[0] = -2567591579090944.000 + Gradient do_[0] = -206847342962278400.000 +Backward Time Step 1: + Gradient di[0] = -5874432929169408.000, df[0] = -4203531468800000.000, dc_hat[0] = -3566394276839424.000 + Gradient do_[0] = -185900134624657408.000 +Backward Time Step 0: + Gradient di[0] = -6869462498148352.000, df[0] = -5020892467822592.000, dc_hat[0] = -6944797298262016.000 + Gradient do_[0] = -106572449444790272.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 140631359488.000, df[0] = 103102103552.000, dc_hat[0] = 61126000640.000 + Gradient do_[0] = 8726406758400.000 +Backward Time Step 3: + Gradient di[0] = 220928671744.000, df[0] = 159404818432.000, dc_hat[0] = 86654337024.000 + Gradient do_[0] = 11697616060416.000 +Backward Time Step 2: + Gradient di[0] = 281544032256.000, df[0] = 202212245504.000, dc_hat[0] = 144207790080.000 + Gradient do_[0] = 12767704645632.000 +Backward Time Step 1: + Gradient di[0] = 353875755008.000, df[0] = 244053590016.000, dc_hat[0] = 197424316416.000 + Gradient do_[0] = 11274853285888.000 +Backward Time Step 0: + Gradient di[0] = 428877807616.000, df[0] = 305144102912.000, dc_hat[0] = 402432032768.000 + Gradient do_[0] = 6630412386304.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2286473353101312.000, df[0] = -1794930854854656.000, dc_hat[0] = -1034557159636992.000 + Gradient do_[0] = -133760674760228864.000 +Backward Time Step 3: + Gradient di[0] = -3594673683693568.000, df[0] = -2743839588614144.000, dc_hat[0] = -1488058394345472.000 + Gradient do_[0] = -183144466427674624.000 +Backward Time Step 2: + Gradient di[0] = -4715434393731072.000, df[0] = -3552975725264896.000, dc_hat[0] = -2604645604130816.000 + Gradient do_[0] = -209841175685758976.000 +Backward Time Step 1: + Gradient di[0] = -5959502809530368.000, df[0] = -4264385518239744.000, dc_hat[0] = -3617688802820096.000 + Gradient do_[0] = -188587117704642560.000 +Backward Time Step 0: + Gradient di[0] = -6968560815439872.000, df[0] = -5093323870044160.000, dc_hat[0] = -7044982779150336.000 + Gradient do_[0] = -108109858758197248.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 61110956032.000, df[0] = 44802711552.000, dc_hat[0] = 26561681408.000 + Gradient do_[0] = 3791989506048.000 +Backward Time Step 3: + Gradient di[0] = 96002744320.000, df[0] = 69268078592.000, dc_hat[0] = 37653594112.000 + Gradient do_[0] = 5083011481600.000 +Backward Time Step 2: + Gradient di[0] = 122341736448.000, df[0] = 87868899328.000, dc_hat[0] = 62659858432.000 + Gradient do_[0] = 5547922817024.000 +Backward Time Step 1: + Gradient di[0] = 153769033728.000, df[0] = 106047758336.000, dc_hat[0] = 85777801216.000 + Gradient do_[0] = 4899103834112.000 +Backward Time Step 0: + Gradient di[0] = 186349125632.000, df[0] = 132586332160.000, dc_hat[0] = 174858338304.000 + Gradient do_[0] = 2880940539904.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2319541111619584.000, df[0] = -1820889234538496.000, dc_hat[0] = -1049501431234560.000 + Gradient do_[0] = -135693495942774784.000 +Backward Time Step 3: + Gradient di[0] = -3646617286606848.000, df[0] = -2783491531997184.000, dc_hat[0] = -1509506655715328.000 + Gradient do_[0] = -185787743920455680.000 +Backward Time Step 2: + Gradient di[0] = -4783540227014656.000, df[0] = -3604285820502016.000, dc_hat[0] = -2642093960855552.000 + Gradient do_[0] = -212866911426314240.000 +Backward Time Step 1: + Gradient di[0] = -6045463358734336.000, df[0] = -4325875759710208.000, dc_hat[0] = -3669520736583680.000 + Gradient do_[0] = -191302207050612736.000 +Backward Time Step 0: + Gradient di[0] = -7068695830462464.000, df[0] = -5166512260251648.000, dc_hat[0] = -7146215695187968.000 + Gradient do_[0] = -109663357019095040.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -20936667136.000, df[0] = -15349449728.000, dc_hat[0] = -9099941888.000 + Gradient do_[0] = -1299127730176.000 +Backward Time Step 3: + Gradient di[0] = -32890167296.000, df[0] = -23730995200.000, dc_hat[0] = -12899672064.000 + Gradient do_[0] = -1741399785472.000 +Backward Time Step 2: + Gradient di[0] = -41913380864.000, df[0] = -30103224320.000, dc_hat[0] = -21465958400.000 + Gradient do_[0] = -1900648726528.000 +Backward Time Step 1: + Gradient di[0] = -52678877184.000, df[0] = -36330217472.000, dc_hat[0] = -29384400896.000 + Gradient do_[0] = -1678328201216.000 +Backward Time Step 0: + Gradient di[0] = -63837835264.000, df[0] = -45420249088.000, dc_hat[0] = -59901419520.000 + Gradient do_[0] = -986927202304.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.865 + c_state[0] = 0.725, h_state[0] = 0.092 +Time Step 2: + i_gate[0] = 0.376, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.863 + c_state[0] = 0.877, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.344, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885 + c_state[0] = 0.981, h_state[0] = 0.119 +Time Step 4: + i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.870 + c_state[0] = 1.059, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 15143181221888.000, df[0] = 10388175323136.000, dc_hat[0] = 6339325591552.000 + Gradient do_[0] = 1005682765594624.000 +Backward Time Step 3: + Gradient di[0] = 23823973875712.000, df[0] = 16230065897472.000, dc_hat[0] = 8885159067648.000 + Gradient do_[0] = 1319416369250304.000 +Backward Time Step 2: + Gradient di[0] = 29269482799104.000, df[0] = 19999343247360.000, dc_hat[0] = 13915838742528.000 + Gradient do_[0] = 1356046534705152.000 +Backward Time Step 1: + Gradient di[0] = 36549555126272.000, df[0] = 24284053372928.000, dc_hat[0] = 18720644661248.000 + Gradient do_[0] = 1173082538508288.000 +Backward Time Step 0: + Gradient di[0] = 45576443920384.000, df[0] = 31626388045824.000, dc_hat[0] = 39735137402880.000 + Gradient do_[0] = 702963672678400.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1347854925824.000, df[0] = 988168585216.000, dc_hat[0] = 585804611584.000 + Gradient do_[0] = 83632341385216.000 +Backward Time Step 3: + Gradient di[0] = 2117441159168.000, df[0] = 1527796465664.000, dc_hat[0] = 830424612864.000 + Gradient do_[0] = 112107261526016.000 +Backward Time Step 2: + Gradient di[0] = 2698207035392.000, df[0] = 1937926258688.000, dc_hat[0] = 1381846351872.000 + Gradient do_[0] = 122354592120832.000 +Backward Time Step 1: + Gradient di[0] = 3391417745408.000, df[0] = 2338906177536.000, dc_hat[0] = 1891740942336.000 + Gradient do_[0] = 108049238851584.000 +Backward Time Step 0: + Gradient di[0] = 4111209857024.000, df[0] = 2925102366720.000, dc_hat[0] = 3857701666816.000 + Gradient do_[0] = 63558935117824.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1798014507155456.000, df[0] = -1411486777868288.000, dc_hat[0] = -813488884678656.000 + Gradient do_[0] = -105180484903895040.000 +Backward Time Step 3: + Gradient di[0] = -2826725746540544.000, df[0] = -2157681510973440.000, dc_hat[0] = -1170028548325376.000 + Gradient do_[0] = -144011370126376960.000 +Backward Time Step 2: + Gradient di[0] = -3707752924839936.000, df[0] = -2793711138242560.000, dc_hat[0] = -2047773968957440.000 + Gradient do_[0] = -164990807898587136.000 +Backward Time Step 1: + Gradient di[0] = -4686005277818880.000, df[0] = -3353096805679104.000, dc_hat[0] = -2844185862340608.000 + Gradient do_[0] = -148281289173106688.000 +Backward Time Step 0: + Gradient di[0] = -5480716498370560.000, df[0] = -4005857746485248.000, dc_hat[0] = -5540821344452608.000 + Gradient do_[0] = -85027519098519552.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1306311393280.000, df[0] = 957710860288.000, dc_hat[0] = 567739351040.000 + Gradient do_[0] = 81053616177152.000 +Backward Time Step 3: + Gradient di[0] = 2052148559872.000, df[0] = 1480686567424.000, dc_hat[0] = 804790140928.000 + Gradient do_[0] = 108648554561536.000 +Backward Time Step 2: + Gradient di[0] = 2614984966144.000, df[0] = 1878151135232.000, dc_hat[0] = 1339142307840.000 + Gradient do_[0] = 118577973690368.000 +Backward Time Step 1: + Gradient di[0] = 3286740238336.000, df[0] = 2266705690624.000, dc_hat[0] = 1833173123072.000 + Gradient do_[0] = 104711420116992.000 +Backward Time Step 0: + Gradient di[0] = 3984099115008.000, df[0] = 2834663473152.000, dc_hat[0] = 3738428768256.000 + Gradient do_[0] = 61593811419136.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1821990088343552.000, df[0] = -1430307995648000.000, dc_hat[0] = -824322402811904.000 + Gradient do_[0] = -106581769523822592.000 +Backward Time Step 3: + Gradient di[0] = -2864386704146432.000, df[0] = -2186430679875584.000, dc_hat[0] = -1185574853541888.000 + Gradient do_[0] = -145927732994244608.000 +Backward Time Step 2: + Gradient di[0] = -3757134042890240.000, df[0] = -2830914144960512.000, dc_hat[0] = -2074918699139072.000 + Gradient do_[0] = -167184453855084544.000 +Backward Time Step 1: + Gradient di[0] = -4748323105800192.000, df[0] = -3397674808115200.000, dc_hat[0] = -2881744814473216.000 + Gradient do_[0] = -150249346267348992.000 +Backward Time Step 0: + Gradient di[0] = -5553305203769344.000, df[0] = -4058912940621824.000, dc_hat[0] = -5614206229413888.000 + Gradient do_[0] = -86153659523530752.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1264228630528.000, df[0] = 926858280960.000, dc_hat[0] = 549443338240.000 + Gradient do_[0] = 78441890185216.000 +Backward Time Step 3: + Gradient di[0] = 1986015526912.000, df[0] = 1432970723328.000, dc_hat[0] = 778836377600.000 + Gradient do_[0] = 105146008731648.000 +Backward Time Step 2: + Gradient di[0] = 2530686795776.000, df[0] = 1817605308416.000, dc_hat[0] = 1295923019776.000 + Gradient do_[0] = 114753707966464.000 +Backward Time Step 1: + Gradient di[0] = 3180709806080.000, df[0] = 2193575903232.000, dc_hat[0] = 1773932380160.000 + Gradient do_[0] = 101331733839872.000 +Backward Time Step 0: + Gradient di[0] = 3855426256896.000, df[0] = 2743113613312.000, dc_hat[0] = 3617690222592.000 + Gradient do_[0] = 59604545306624.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1847368781660160.000, df[0] = -1450231275192320.000, dc_hat[0] = -835794528894976.000 + Gradient do_[0] = -108065526105767936.000 +Backward Time Step 3: + Gradient di[0] = -2904245074395136.000, df[0] = -2216857167724544.000, dc_hat[0] = -1202044442509312.000 + Gradient do_[0] = -147956658365005824.000 +Backward Time Step 2: + Gradient di[0] = -3809379098820608.000, df[0] = -2870277251792896.000, dc_hat[0] = -2103689074442240.000 + Gradient do_[0] = -169506845751246848.000 +Backward Time Step 1: + Gradient di[0] = -4814266423050240.000, df[0] = -3444852071071744.000, dc_hat[0] = -2921606942818304.000 + Gradient do_[0] = -152333642356490240.000 +Backward Time Step 0: + Gradient di[0] = -5630206056333312.000, df[0] = -4115119835447296.000, dc_hat[0] = -5691950506180608.000 + Gradient do_[0] = -87346698359144448.000 +Epoch 400, Train Loss=0.011534, Weight Norm=12.551555 +Sample Predictions at Epoch 400: +------------------------------------------------------------- +| Day | Date | Predicted Close | Actual Close | Error | +------------------------------------------------------------- +| 192 | 2024-10-11 | 57.08 | 63.87 | 6.79 | +| 193 | 2024-10-14 | 56.50 | 66.55 | 10.05 | +| 194 | 2024-10-15 | 56.70 | 66.00 | 9.30 | +| 195 | 2024-10-16 | 57.64 | 67.20 | 9.56 | +| 196 | 2024-10-17 | 57.20 | 66.76 | 9.56 | +------------------------------------------------------------- +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1221786861568.000, df[0] = 895742443520.000, dc_hat[0] = 530991120384.000 + Gradient do_[0] = 75807825330176.000 +Backward Time Step 3: + Gradient di[0] = 1919313248256.000, df[0] = 1384844492800.000, dc_hat[0] = 752661430272.000 + Gradient do_[0] = 101613381353472.000 +Backward Time Step 2: + Gradient di[0] = 2445662486528.000, df[0] = 1756537159680.000, dc_hat[0] = 1252335157248.000 + Gradient do_[0] = 110896684728320.000 +Backward Time Step 1: + Gradient di[0] = 3073773928448.000, df[0] = 2119822999552.000, dc_hat[0] = 1714195267584.000 + Gradient do_[0] = 97923375300608.000 +Backward Time Step 0: + Gradient di[0] = 3725666287616.000, df[0] = 2650790166528.000, dc_hat[0] = 3495931412480.000 + Gradient do_[0] = 57598464229376.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1873784575361024.000, df[0] = -1470968451039232.000, dc_hat[0] = -847735947264000.000 + Gradient do_[0] = -109609867496390656.000 +Backward Time Step 3: + Gradient di[0] = -2945735263780864.000, df[0] = -2248529196089344.000, dc_hat[0] = -1219186999164928.000 + Gradient do_[0] = -150068699942879232.000 +Backward Time Step 2: + Gradient di[0] = -3863764390641664.000, df[0] = -2911252850409472.000, dc_hat[0] = -2133642176364544.000 + Gradient do_[0] = -171924465662296064.000 +Backward Time Step 1: + Gradient di[0] = -4882906979762176.000, df[0] = -3493958848086016.000, dc_hat[0] = -2963102769348608.000 + Gradient do_[0] = -154503305215606784.000 +Backward Time Step 0: + Gradient di[0] = -5710258341150720.000, df[0] = -4173630174920704.000, dc_hat[0] = -5772880574939136.000 + Gradient do_[0] = -88588622512521216.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1177911296000.000, df[0] = 863575605248.000, dc_hat[0] = 511916572672.000 + Gradient do_[0] = 73084883173376.000 +Backward Time Step 3: + Gradient di[0] = 1850363084800.000, df[0] = 1335096115200.000, dc_hat[0] = 725605679104.000 + Gradient do_[0] = 97961895788544.000 +Backward Time Step 2: + Gradient di[0] = 2357782380544.000, df[0] = 1693418913792.000, dc_hat[0] = 1207289511936.000 + Gradient do_[0] = 106910300766208.000 +Backward Time Step 1: + Gradient di[0] = 2963254542336.000, df[0] = 2043598471168.000, dc_hat[0] = 1652467564544.000 + Gradient do_[0] = 94400948469760.000 +Backward Time Step 0: + Gradient di[0] = 3591572029440.000, df[0] = 2555382595584.000, dc_hat[0] = 3370105700352.000 + Gradient do_[0] = 55525370757120.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1900930245066752.000, df[0] = -1492279139237888.000, dc_hat[0] = -860007407026176.000 + Gradient do_[0] = -111196918041804800.000 +Backward Time Step 3: + Gradient di[0] = -2988370867257344.000, df[0] = -2281075652952064.000, dc_hat[0] = -1236804418142208.000 + Gradient do_[0] = -152239118716239872.000 +Backward Time Step 2: + Gradient di[0] = -3919655605370880.000, df[0] = -2953363125698560.000, dc_hat[0] = -2164424475410432.000 + Gradient do_[0] = -174408983983947776.000 +Backward Time Step 1: + Gradient di[0] = -4953444301406208.000, df[0] = -3544423371636736.000, dc_hat[0] = -3005747499630592.000 + Gradient do_[0] = -156732857098698752.000 +Backward Time Step 0: + Gradient di[0] = -5792513407320064.000, df[0] = -4233750053388288.000, dc_hat[0] = -5856037584240640.000 + Gradient do_[0] = -89864726015639552.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1133031063552.000, df[0] = 830672207872.000, dc_hat[0] = 492406243328.000 + Gradient do_[0] = 70299680768000.000 +Backward Time Step 3: + Gradient di[0] = 1779837173760.000, df[0] = 1284210425856.000, dc_hat[0] = 697933889536.000 + Gradient do_[0] = 94227044237312.000 +Backward Time Step 2: + Gradient di[0] = 2267890319360.000, df[0] = 1628855468032.000, dc_hat[0] = 1161217441792.000 + Gradient do_[0] = 102832724246528.000 +Backward Time Step 1: + Gradient di[0] = 2850206253056.000, df[0] = 1965630423040.000, dc_hat[0] = 1589337653248.000 + Gradient do_[0] = 90798133608448.000 +Backward Time Step 0: + Gradient di[0] = 3454425890816.000, df[0] = 2457804210176.000, dc_hat[0] = 3241416851456.000 + Gradient do_[0] = 53405108142080.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1928686739652608.000, df[0] = -1514069253160960.000, dc_hat[0] = -872554482892800.000 + Gradient do_[0] = -112819642585579520.000 +Backward Time Step 3: + Gradient di[0] = -3031964785311744.000, df[0] = -2314353864867840.000, dc_hat[0] = -1254817779417088.000 + Gradient do_[0] = -154458276778475520.000 +Backward Time Step 2: + Gradient di[0] = -3976798266195968.000, df[0] = -2996416951615488.000, dc_hat[0] = -2195898264190976.000 + Gradient do_[0] = -176949216621363200.000 +Backward Time Step 1: + Gradient di[0] = -5025573445304320.000, df[0] = -3596026329956352.000, dc_hat[0] = -3049355107893248.000 + Gradient do_[0] = -159012745998499840.000 +Backward Time Step 0: + Gradient di[0] = -5876630878683136.000, df[0] = -4295231436488704.000, dc_hat[0] = -5941077399830528.000 + Gradient do_[0] = -91169708878856192.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1085995352064.000, df[0] = 796188475392.000, dc_hat[0] = 471959142400.000 + Gradient do_[0] = 67380738785280.000 +Backward Time Step 3: + Gradient di[0] = 1705926721536.000, df[0] = 1230882865152.000, dc_hat[0] = 668935716864.000 + Gradient do_[0] = 90313179791360.000 +Backward Time Step 2: + Gradient di[0] = 2173689135104.000, df[0] = 1561196756992.000, dc_hat[0] = 1112942313472.000 + Gradient do_[0] = 98559961595904.000 +Backward Time Step 1: + Gradient di[0] = 2731754389504.000, df[0] = 1883936522240.000, dc_hat[0] = 1523203178496.000 + Gradient do_[0] = 87023335505920.000 +Backward Time Step 0: + Gradient di[0] = 3310738210816.000, df[0] = 2355571458048.000, dc_hat[0] = 3106589376512.000 + Gradient do_[0] = 51183716663296.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1956948698202112.000, df[0] = -1536256517341184.000, dc_hat[0] = -885330534203392.000 + Gradient do_[0] = -114472019583565824.000 +Backward Time Step 3: + Gradient di[0] = -3076352735444992.000, df[0] = -2348238472478720.000, dc_hat[0] = -1273159437254656.000 + Gradient do_[0] = -156717910612508672.000 +Backward Time Step 2: + Gradient di[0] = -4034983261896704.000, df[0] = -3040255145934848.000, dc_hat[0] = -2227944894234624.000 + Gradient do_[0] = -179535731826360320.000 +Backward Time Step 1: + Gradient di[0] = -5099009869873152.000, df[0] = -3648564785840128.000, dc_hat[0] = -3093754332315648.000 + Gradient do_[0] = -161334055562903552.000 +Backward Time Step 0: + Gradient di[0] = -5962280579629056.000, df[0] = -4357832732311552.000, dc_hat[0] = -6027666624872448.000 + Gradient do_[0] = -92498477270958080.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1037819183104.000, df[0] = 760868634624.000, dc_hat[0] = 451017015296.000 + Gradient do_[0] = 64391110197248.000 +Backward Time Step 3: + Gradient di[0] = 1630229233664.000, df[0] = 1176265687040.000, dc_hat[0] = 639238602752.000 + Gradient do_[0] = 86304708624384.000 +Backward Time Step 2: + Gradient di[0] = 2077214900224.000, df[0] = 1491905806336.000, dc_hat[0] = 1063508049920.000 + Gradient do_[0] = 94184245559296.000 +Backward Time Step 1: + Gradient di[0] = 2610447777792.000, df[0] = 1800274051072.000, dc_hat[0] = 1455484698624.000 + Gradient do_[0] = 83157655224320.000 +Backward Time Step 0: + Gradient di[0] = 3163602550784.000, df[0] = 2250885038080.000, dc_hat[0] = 2968526258176.000 + Gradient do_[0] = 48909007192064.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1985599082856448.000, df[0] = -1558748053110784.000, dc_hat[0] = -898281806757888.000 + Gradient do_[0] = -116147013879332864.000 +Backward Time Step 3: + Gradient di[0] = -3121350839369728.000, df[0] = -2382588547170304.000, dc_hat[0] = -1291753021767680.000 + Gradient do_[0] = -159008605650026496.000 +Backward Time Step 2: + Gradient di[0] = -4093972490223616.000, df[0] = -3084699735949312.000, dc_hat[0] = -2260432429514752.000 + Gradient do_[0] = -182157963979390976.000 +Backward Time Step 1: + Gradient di[0] = -5173457759240192.000, df[0] = -3701826943713280.000, dc_hat[0] = -3138763710529536.000 + Gradient do_[0] = -163687285324251136.000 +Backward Time Step 0: + Gradient di[0] = -6049099048550400.000, df[0] = -4421288726626304.000, dc_hat[0] = -6115436965920768.000 + Gradient do_[0] = -93845379014983680.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 987559559168.000, df[0] = 724021149696.000, dc_hat[0] = 429169672192.000 + Gradient do_[0] = 61272280268800.000 +Backward Time Step 3: + Gradient di[0] = 1551257567232.000, df[0] = 1119285936128.000, dc_hat[0] = 608258883584.000 + Gradient do_[0] = 82122979147776.000 +Backward Time Step 2: + Gradient di[0] = 1976568250368.000, df[0] = 1419618549760.000, dc_hat[0] = 1011941834752.000 + Gradient do_[0] = 89619492241408.000 +Backward Time Step 1: + Gradient di[0] = 2483906412544.000, df[0] = 1713002250240.000, dc_hat[0] = 1384856682496.000 + Gradient do_[0] = 79125427519488.000 +Backward Time Step 0: + Gradient di[0] = 3010140045312.000, df[0] = 2141697474560.000, dc_hat[0] = 2824526888960.000 + Gradient do_[0] = 46536494940160.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2014644604502016.000, df[0] = -1581549900267520.000, dc_hat[0] = -911411655999488.000 + Gradient do_[0] = -117845140868956160.000 +Backward Time Step 3: + Gradient di[0] = -3166971445116928.000, df[0] = -2417413484183552.000, dc_hat[0] = -1310602425270272.000 + Gradient do_[0] = -161330877287104512.000 +Backward Time Step 2: + Gradient di[0] = -4153769977708544.000, df[0] = -3129753137577984.000, dc_hat[0] = -2293367044046848.000 + Gradient do_[0] = -184816119238885376.000 +Backward Time Step 1: + Gradient di[0] = -5248924629598208.000, df[0] = -3755817903849472.000, dc_hat[0] = -3184389148114944.000 + Gradient do_[0] = -166072658620841984.000 +Backward Time Step 0: + Gradient di[0] = -6137097022865408.000, df[0] = -4485606398754816.000, dc_hat[0] = -6204400234135552.000 + Gradient do_[0] = -95210577319690240.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 935386415104.000, df[0] = 685771128832.000, dc_hat[0] = 406492184576.000 + Gradient do_[0] = 58034759925760.000 +Backward Time Step 3: + Gradient di[0] = 1469285793792.000, df[0] = 1060141531136.000, dc_hat[0] = 576104497152.000 + Gradient do_[0] = 77782604316672.000 +Backward Time Step 2: + Gradient di[0] = 1872098754560.000, df[0] = 1344585203712.000, dc_hat[0] = 958421794816.000 + Gradient do_[0] = 84881606443008.000 +Backward Time Step 1: + Gradient di[0] = 2352564928512.000, df[0] = 1622420357120.000, dc_hat[0] = 1311560695808.000 + Gradient do_[0] = 74940376154112.000 +Backward Time Step 0: + Gradient di[0] = 2850869477376.000, df[0] = 2028377341952.000, dc_hat[0] = 2675077021696.000 + Gradient do_[0] = 44074186833920.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2044089289670656.000, df[0] = -1604665548472320.000, dc_hat[0] = -924722028085248.000 + Gradient do_[0] = -119566632480669696.000 +Backward Time Step 3: + Gradient di[0] = -3213218579218432.000, df[0] = -2452717310050304.000, dc_hat[0] = -1329711942729728.000 + Gradient do_[0] = -163685103480864768.000 +Backward Time Step 2: + Gradient di[0] = -4214393709527040.000, df[0] = -3175429041029120.000, dc_hat[0] = -2326755851370496.000 + Gradient do_[0] = -187510936339218432.000 +Backward Time Step 1: + Gradient di[0] = -5325426050203648.000, df[0] = -3810549208973312.000, dc_hat[0] = -3230639503441920.000 + Gradient do_[0] = -168490759568228352.000 +Backward Time Step 0: + Gradient di[0] = -6226311546667008.000, df[0] = -4550813129113600.000, dc_hat[0] = -6294592936738816.000 + Gradient do_[0] = -96594630530826240.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 881278844928.000, df[0] = 646102777856.000, dc_hat[0] = 382974099456.000 + Gradient do_[0] = 54677290876928.000 +Backward Time Step 3: + Gradient di[0] = 1384273674240.000, df[0] = 998803177472.000, dc_hat[0] = 542759485440.000 + Gradient do_[0] = 73281319206912.000 +Backward Time Step 2: + Gradient di[0] = 1763761192960.000, df[0] = 1266774310912.000, dc_hat[0] = 902926696448.000 + Gradient do_[0] = 79968423903232.000 +Backward Time Step 1: + Gradient di[0] = 2216371159040.000, df[0] = 1528492457984.000, dc_hat[0] = 1235567378432.000 + Gradient do_[0] = 70600907292672.000 +Backward Time Step 0: + Gradient di[0] = 2685730553856.000, df[0] = 1910881779712.000, dc_hat[0] = 2520121344000.000 + Gradient do_[0] = 41521151737856.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2073838380646400.000, df[0] = -1628019567362048.000, dc_hat[0] = -938169369362432.000 + Gradient do_[0] = -121305845127446528.000 +Backward Time Step 3: + Gradient di[0] = -3259933864755200.000, df[0] = -2488378960379904.000, dc_hat[0] = -1349013928411136.000 + Gradient do_[0] = -166063106613575680.000 +Backward Time Step 2: + Gradient di[0] = -4275623031734272.000, df[0] = -3221561553190912.000, dc_hat[0] = -2360476713353216.000 + Gradient do_[0] = -190232914812731392.000 +Backward Time Step 1: + Gradient di[0] = -5402703249276928.000, df[0] = -3865835101749248.000, dc_hat[0] = -3277358278639616.000 + Gradient do_[0] = -170933410548678656.000 +Backward Time Step 0: + Gradient di[0] = -6316421571149824.000, df[0] = -4616674841985024.000, dc_hat[0] = -6385691340570624.000 + Gradient do_[0] = -97992608025935872.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 825474154496.000, df[0] = 605190160384.000, dc_hat[0] = 358719094784.000 + Gradient do_[0] = 51214595129344.000 +Backward Time Step 3: + Gradient di[0] = 1296600399872.000, df[0] = 935544815616.000, dc_hat[0] = 508372746240.000 + Gradient do_[0] = 68639348752384.000 +Backward Time Step 2: + Gradient di[0] = 1652036337664.000, df[0] = 1186530328576.000, dc_hat[0] = 845701382144.000 + Gradient do_[0] = 74901855666176.000 +Backward Time Step 1: + Gradient di[0] = 2075924627456.000, df[0] = 1431632216064.000, dc_hat[0] = 1157213323264.000 + Gradient do_[0] = 66126121795584.000 +Backward Time Step 0: + Gradient di[0] = 2515452035072.000, df[0] = 1789729701888.000, dc_hat[0] = 2360342740992.000 + Gradient do_[0] = 38888663941120.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2103900333146112.000, df[0] = -1651620010000384.000, dc_hat[0] = -951758981431296.000 + Gradient do_[0] = -123063311385231360.000 +Backward Time Step 3: + Gradient di[0] = -3307149782417408.000, df[0] = -2524422057492480.000, dc_hat[0] = -1368522609393664.000 + Gradient do_[0] = -168466535952678912.000 +Backward Time Step 2: + Gradient di[0] = -4337509483937792.000, df[0] = -3268189060333568.000, dc_hat[0] = -2394558352588800.000 + Gradient do_[0] = -192983789826211840.000 +Backward Time Step 1: + Gradient di[0] = -5480804545200128.000, df[0] = -3921710747222016.000, dc_hat[0] = -3324574464737280.000 + Gradient do_[0] = -173402054671204352.000 +Backward Time Step 0: + Gradient di[0] = -6407495278919680.000, df[0] = -4683240929492992.000, dc_hat[0] = -6477764165107712.000 + Gradient do_[0] = -99405532007235584.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 767638044672.000, df[0] = 562788040704.000, dc_hat[0] = 333581877248.000 + Gradient do_[0] = 47625919266816.000 +Backward Time Step 3: + Gradient di[0] = 1205739061248.000, df[0] = 869985878016.000, dc_hat[0] = 472737087488.000 + Gradient do_[0] = 63828658225152.000 +Backward Time Step 2: + Gradient di[0] = 1536249298944.000, df[0] = 1103369076736.000, dc_hat[0] = 786400935936.000 + Gradient do_[0] = 69651203620864.000 +Backward Time Step 1: + Gradient di[0] = 1930382278656.000, df[0] = 1331258064896.000, dc_hat[0] = 1076026408960.000 + Gradient do_[0] = 61489180311552.000 +Backward Time Step 0: + Gradient di[0] = 2339013132288.000, df[0] = 1664194314240.000, dc_hat[0] = 2194783207424.000 + Gradient do_[0] = 36160931889152.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2134311654391808.000, df[0] = -1675494256803840.000, dc_hat[0] = -965505762459648.000 + Gradient do_[0] = -124841264637018112.000 +Backward Time Step 3: + Gradient di[0] = -3354906865958912.000, df[0] = -2560878276771840.000, dc_hat[0] = -1388254360240128.000 + Gradient do_[0] = -170897676420775936.000 +Backward Time Step 2: + Gradient di[0] = -4400104874180608.000, df[0] = -3315350485598208.000, dc_hat[0] = -2429030565412864.000 + Gradient do_[0] = -195766344518467584.000 +Backward Time Step 1: + Gradient di[0] = -5559797583708160.000, df[0] = -3978224732209152.000, dc_hat[0] = -3372327790182400.000 + Gradient do_[0] = -175898856599322624.000 +Backward Time Step 0: + Gradient di[0] = -6499599778840576.000, df[0] = -4750559710019584.000, dc_hat[0] = -6570877982343168.000 + Gradient do_[0] = -100834424676941824.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 707754983424.000, df[0] = 518885376000.000, dc_hat[0] = 307555926016.000 + Gradient do_[0] = 43910298599424.000 +Backward Time Step 3: + Gradient di[0] = 1111665278976.000, df[0] = 802108997632.000, dc_hat[0] = 435844251648.000 + Gradient do_[0] = 58847976751104.000 +Backward Time Step 2: + Gradient di[0] = 1416373075968.000, df[0] = 1017270763520.000, dc_hat[0] = 725011267584.000 + Gradient do_[0] = 64215326916608.000 +Backward Time Step 1: + Gradient di[0] = 1779710558208.000, df[0] = 1227347066880.000, dc_hat[0] = 991990120448.000 + Gradient do_[0] = 56688954572800.000 +Backward Time Step 0: + Gradient di[0] = 2156370067456.000, df[0] = 1534244945920.000, dc_hat[0] = 2023402504192.000 + Gradient do_[0] = 33337286590464.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2164964701765632.000, df[0] = -1699558421692416.000, dc_hat[0] = -979361461174272.000 + Gradient do_[0] = -126633356921143296.000 +Backward Time Step 3: + Gradient di[0] = -3403046201589760.000, df[0] = -2597626285391872.000, dc_hat[0] = -1408143011610624.000 + Gradient do_[0] = -173348058342359040.000 +Backward Time Step 2: + Gradient di[0] = -4463205191516160.000, df[0] = -3362892552339456.000, dc_hat[0] = -2463781951111168.000 + Gradient do_[0] = -198571267400400896.000 +Backward Time Step 1: + Gradient di[0] = -5639421445537792.000, df[0] = -4035189151891456.000, dc_hat[0] = -3420461220233216.000 + Gradient do_[0] = -178415604355563520.000 +Backward Time Step 0: + Gradient di[0] = -6592436033814528.000, df[0] = -4818413750845440.000, dc_hat[0] = -6664732681437184.000 + Gradient do_[0] = -102274681830113280.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 645849743360.000, df[0] = 473500123136.000, dc_hat[0] = 280651825152.000 + Gradient do_[0] = 40069289213952.000 +Backward Time Step 3: + Gradient di[0] = 1014417260544.000, df[0] = 731941437440.000, dc_hat[0] = 397708296192.000 + Gradient do_[0] = 53699443425280.000 +Backward Time Step 2: + Gradient di[0] = 1292455116800.000, df[0] = 928269664256.000, dc_hat[0] = 661557673984.000 + Gradient do_[0] = 58596394008576.000 +Backward Time Step 1: + Gradient di[0] = 1623963992064.000, df[0] = 1119936970752.000, dc_hat[0] = 905134014464.000 + Gradient do_[0] = 51727256518656.000 +Backward Time Step 0: + Gradient di[0] = 1967592177664.000, df[0] = 1399930617856.000, dc_hat[0] = 1846265118720.000 + Gradient do_[0] = 30418797592576.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2196039595458560.000, df[0] = -1723953701715968.000, dc_hat[0] = -993408017498112.000 + Gradient do_[0] = -128450076547743744.000 +Backward Time Step 3: + Gradient di[0] = -3451844814700544.000, df[0] = -2634878415798272.000, dc_hat[0] = -1428304259186688.000 + Gradient do_[0] = -175832129987411968.000 +Backward Time Step 2: + Gradient di[0] = -4527172555374592.000, df[0] = -3411087454109696.000, dc_hat[0] = -2499006856953856.000 + Gradient do_[0] = -201414604469829632.000 +Backward Time Step 1: + Gradient di[0] = -5720131397222400.000, df[0] = -4092930692218880.000, dc_hat[0] = -3469251780280320.000 + Gradient do_[0] = -180966729030041600.000 +Backward Time Step 0: + Gradient di[0] = -6686540914753536.000, df[0] = -4887194967736320.000, dc_hat[0] = -6759869965139968.000 + Gradient do_[0] = -103734609933500416.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 581724930048.000, df[0] = 426487676928.000, dc_hat[0] = 252783673344.000 + Gradient do_[0] = 36090626965504.000 +Backward Time Step 3: + Gradient di[0] = 913684889600.000, df[0] = 659259916288.000, dc_hat[0] = 358208176128.000 + Gradient do_[0] = 48366566244352.000 +Backward Time Step 2: + Gradient di[0] = 1164102729728.000, df[0] = 836083908608.000, dc_hat[0] = 595839156224.000 + Gradient do_[0] = 52776537161728.000 +Backward Time Step 1: + Gradient di[0] = 1462654599168.000, df[0] = 1008690724864.000, dc_hat[0] = 815186247680.000 + Gradient do_[0] = 46588500115456.000 +Backward Time Step 0: + Gradient di[0] = 1772088590336.000, df[0] = 1260830851072.000, dc_hat[0] = 1662816878592.000 + Gradient do_[0] = 27396329701376.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2227351517659136.000, df[0] = -1748535141728256.000, dc_hat[0] = -1007561276915712.000 + Gradient do_[0] = -130280548659625984.000 +Backward Time Step 3: + Gradient di[0] = -3501014674046976.000, df[0] = -2672412940304384.000, dc_hat[0] = -1448619051843584.000 + Gradient do_[0] = -178334858970398720.000 +Backward Time Step 2: + Gradient di[0] = -4591611023458304.000, df[0] = -3459638301294592.000, dc_hat[0] = -2534492950495232.000 + Gradient do_[0] = -204278918159532032.000 +Backward Time Step 1: + Gradient di[0] = -5801451234263040.000, df[0] = -4151108708597760.000, dc_hat[0] = -3518409560031232.000 + Gradient do_[0] = -183537026438529024.000 +Backward Time Step 0: + Gradient di[0] = -6781354465296384.000, df[0] = -4956494265057280.000, dc_hat[0] = -6855723434639360.000 + Gradient do_[0] = -105205541743099904.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 515513090048.000, df[0] = 377944965120.000, dc_hat[0] = 224009273344.000 + Gradient do_[0] = 31982547369984.000 +Backward Time Step 3: + Gradient di[0] = 809677946880.000, df[0] = 584215363584.000, dc_hat[0] = 317425778688.000 + Gradient do_[0] = 42860447531008.000 +Backward Time Step 2: + Gradient di[0] = 1031578058752.000, df[0] = 740901453824.000, dc_hat[0] = 527989080064.000 + Gradient do_[0] = 46767693365248.000 +Backward Time Step 1: + Gradient di[0] = 1296110452736.000, df[0] = 893835018240.000, dc_hat[0] = 722330779648.000 + Gradient do_[0] = 41283171123200.000 +Backward Time Step 0: + Gradient di[0] = 1570255273984.000, df[0] = 1117227581440.000, dc_hat[0] = 1473429110784.000 + Gradient do_[0] = 24276006600704.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2258989857374208.000, df[0] = -1773373071818752.000, dc_hat[0] = -1021862914031616.000 + Gradient do_[0] = -132130193505517568.000 +Backward Time Step 3: + Gradient di[0] = -3550703150694400.000, df[0] = -2710344749285376.000, dc_hat[0] = -1469149263953920.000 + Gradient do_[0] = -180864199570751488.000 +Backward Time Step 2: + Gradient di[0] = -4656735075696640.000, df[0] = -3508704544555008.000, dc_hat[0] = -2570355390545920.000 + Gradient do_[0] = -207173760476774400.000 +Backward Time Step 1: + Gradient di[0] = -5883618790473728.000, df[0] = -4209892852236288.000, dc_hat[0] = -3568078709325824.000 + Gradient do_[0] = -186134193162420224.000 +Backward Time Step 0: + Gradient di[0] = -6877153173962752.000, df[0] = -5026513506271232.000, dc_hat[0] = -6952572262809600.000 + Gradient do_[0] = -106691763636273152.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 447387828224.000, df[0] = 327999488000.000, dc_hat[0] = 194404319232.000 + Gradient do_[0] = 27755836080128.000 +Backward Time Step 3: + Gradient di[0] = 702669979648.000, df[0] = 507005272064.000, dc_hat[0] = 275468615680.000 + Gradient do_[0] = 37195561828352.000 +Backward Time Step 2: + Gradient di[0] = 895233425408.000, df[0] = 642975727616.000, dc_hat[0] = 458189373440.000 + Gradient do_[0] = 40585838723072.000 +Backward Time Step 1: + Gradient di[0] = 1124774969344.000, df[0] = 775675576320.000, dc_hat[0] = 626814746624.000 + Gradient do_[0] = 35825383374848.000 +Backward Time Step 0: + Gradient di[0] = 1362632245248.000, df[0] = 969504980992.000, dc_hat[0] = 1278608670720.000 + Gradient do_[0] = 21066170761216.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2290945219362816.000, df[0] = -1798459438923776.000, dc_hat[0] = -1036307157483520.000 + Gradient do_[0] = -133998298120847360.000 +Backward Time Step 3: + Gradient di[0] = -3600884474839040.000, df[0] = -2748651562598400.000, dc_hat[0] = -1489879728914432.000 + Gradient do_[0] = -183418450981421056.000 +Backward Time Step 2: + Gradient di[0] = -4722504446771200.000, df[0] = -3558257998168064.000, dc_hat[0] = -2606571628527616.000 + Gradient do_[0] = -210097138556731392.000 +Backward Time Step 1: + Gradient di[0] = -5966610980405248.000, df[0] = -4269267285442560.000, dc_hat[0] = -3618244732649472.000 + Gradient do_[0] = -188757267129040896.000 +Backward Time Step 0: + Gradient di[0] = -6973897849176064.000, df[0] = -5097224237219840.000, dc_hat[0] = -7050377794945024.000 + Gradient do_[0] = -108192648547794944.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 377028116480.000, df[0] = 276415807488.000, dc_hat[0] = 163828924416.000 + Gradient do_[0] = 23390534500352.000 +Backward Time Step 3: + Gradient di[0] = 592154066944.000, df[0] = 427264016384.000, dc_hat[0] = 232138178560.000 + Gradient do_[0] = 31345107533824.000 +Backward Time Step 2: + Gradient di[0] = 754420875264.000, df[0] = 541840998400.000, dc_hat[0] = 386107572224.000 + Gradient do_[0] = 34201596329984.000 +Backward Time Step 1: + Gradient di[0] = 947834191872.000, df[0] = 653651083264.000, dc_hat[0] = 528184016896.000 + Gradient do_[0] = 30189205585920.000 +Backward Time Step 0: + Gradient di[0] = 1148237250560.000, df[0] = 816964239360.000, dc_hat[0] = 1077433860096.000 + Gradient do_[0] = 17751642996736.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2323200960626688.000, df[0] = -1823781760794624.000, dc_hat[0] = -1050886558187520.000 + Gradient do_[0] = -135883986332286976.000 +Backward Time Step 3: + Gradient di[0] = -3651532339806208.000, df[0] = -2787315126632448.000, dc_hat[0] = -1510804004274176.000 + Gradient do_[0] = -185996548050518016.000 +Backward Time Step 2: + Gradient di[0] = -4788883703201792.000, df[0] = -3608270476410880.000, dc_hat[0] = -2643123142393856.000 + Gradient do_[0] = -213047712369606656.000 +Backward Time Step 1: + Gradient di[0] = -6050366600773632.000, df[0] = -4329188253237248.000, dc_hat[0] = -3668872196521984.000 + Gradient do_[0] = -191404530351472640.000 +Backward Time Step 0: + Gradient di[0] = -7071543393779712.000, df[0] = -5168593708777472.000, dc_hat[0] = -7149094397018112.000 + Gradient do_[0] = -109707517872832512.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 304252092416.000, df[0] = 223060541440.000, dc_hat[0] = 132204265472.000 + Gradient do_[0] = 18875429158912.000 +Backward Time Step 3: + Gradient di[0] = 477846110208.000, df[0] = 344786337792.000, dc_hat[0] = 187322957824.000 + Gradient do_[0] = 25294092107776.000 +Backward Time Step 2: + Gradient di[0] = 608782974976.000, df[0] = 437240561664.000, dc_hat[0] = 311560699904.000 + Gradient do_[0] = 27598761492480.000 +Backward Time Step 1: + Gradient di[0] = 764840574976.000, df[0] = 527452962816.000, dc_hat[0] = 426190503936.000 + Gradient do_[0] = 24360412774400.000 +Backward Time Step 0: + Gradient di[0] = 926521425920.000, df[0] = 659214696448.000, dc_hat[0] = 869389631488.000 + Gradient do_[0] = 14323937181696.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2355782045663232.000, df[0] = -1849359633219584.000, dc_hat[0] = -1065613262848000.000 + Gradient do_[0] = -137788787148193792.000 +Backward Time Step 3: + Gradient di[0] = -3702696406155264.000, df[0] = -2826372485480448.000, dc_hat[0] = -1531940880515072.000 + Gradient do_[0] = -188600964679204864.000 +Backward Time Step 2: + Gradient di[0] = -4855946396303360.000, df[0] = -3658796740116480.000, dc_hat[0] = -2680051271204864.000 + Gradient do_[0] = -216028608651591680.000 +Backward Time Step 1: + Gradient di[0] = -6134972087795712.000, df[0] = -4389715885162496.000, dc_hat[0] = -3720011566809088.000 + Gradient do_[0] = -194078680069177344.000 +Backward Time Step 0: + Gradient di[0] = -7170167654055936.000, df[0] = -5240677755518976.000, dc_hat[0] = -7248799915311104.000 + Gradient do_[0] = -111237574202228736.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 229311381504.000, df[0] = 168118321152.000, dc_hat[0] = 99639861248.000 + Gradient do_[0] = 14226089312256.000 +Backward Time Step 3: + Gradient di[0] = 360142438400.000, df[0] = 259858382848.000, dc_hat[0] = 141178388480.000 + Gradient do_[0] = 19063419961344.000 +Backward Time Step 2: + Gradient di[0] = 458821500928.000, df[0] = 329534996480.000, dc_hat[0] = 234806378496.000 + Gradient do_[0] = 20800105086976.000 +Backward Time Step 1: + Gradient di[0] = 576423591936.000, df[0] = 397515227136.000, dc_hat[0] = 321184563200.000 + Gradient do_[0] = 18359024353280.000 +Backward Time Step 0: + Gradient di[0] = 698250952704.000, df[0] = 496801546240.000, dc_hat[0] = 655194849280.000 + Gradient do_[0] = 10794895933440.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2388696795971584.000, df[0] = -1875199632867328.000, dc_hat[0] = -1080491029561344.000 + Gradient do_[0] = -139712932496801792.000 +Backward Time Step 3: + Gradient di[0] = -3754374526402560.000, df[0] = -2865823102271488.000, dc_hat[0] = -1553290760290304.000 + Gradient do_[0] = -191231477529182208.000 +Backward Time Step 2: + Gradient di[0] = -4923671588110336.000, df[0] = -3709824172818432.000, dc_hat[0] = -2717343398494208.000 + Gradient do_[0] = -219038985589096448.000 +Backward Time Step 1: + Gradient di[0] = -6220422609633280.000, df[0] = -4450849912782848.000, dc_hat[0] = -3771662038204416.000 + Gradient do_[0] = -196779527303593984.000 +Backward Time Step 0: + Gradient di[0] = -7269780293681152.000, df[0] = -5313484967378944.000, dc_hat[0] = -7349505087242240.000 + Gradient do_[0] = -112782946385002496.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 152166465536.000, df[0] = 111559999488.000, dc_hat[0] = 66118258688.000 + Gradient do_[0] = 9440084361216.000 +Backward Time Step 3: + Gradient di[0] = 238979956736.000, df[0] = 172434571264.000, dc_hat[0] = 93679968256.000 + Gradient do_[0] = 12649802760192.000 +Backward Time Step 2: + Gradient di[0] = 304457285632.000, df[0] = 218667417600.000, dc_hat[0] = 155804008448.000 + Gradient do_[0] = 13802020012032.000 +Backward Time Step 1: + Gradient di[0] = 382484709376.000, df[0] = 263769915392.000, dc_hat[0] = 213111685120.000 + Gradient do_[0] = 12181939683328.000 +Backward Time Step 0: + Gradient di[0] = 463308685312.000, df[0] = 329641459712.000, dc_hat[0] = 434739838976.000 + Gradient do_[0] = 7162710982656.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2421923736715264.000, df[0] = -1901284445650944.000, dc_hat[0] = -1095509255127040.000 + Gradient do_[0] = -141655331456417792.000 +Backward Time Step 3: + Gradient di[0] = -3806551668162560.000, df[0] = -2905653823668224.000, dc_hat[0] = -1574845724753920.000 + Gradient do_[0] = -193887227606990848.000 +Backward Time Step 2: + Gradient di[0] = -4992049614946304.000, df[0] = -3761342842404864.000, dc_hat[0] = -2754993618681856.000 + Gradient do_[0] = -222078241886699520.000 +Backward Time Step 1: + Gradient di[0] = -6306690248998912.000, df[0] = -4512567519084544.000, dc_hat[0] = -3823805088661504.000 + Gradient do_[0] = -199506041262571520.000 +Backward Time Step 0: + Gradient di[0] = -7370338362982400.000, df[0] = -5386982595231744.000, dc_hat[0] = -7451165889396736.000 + Gradient do_[0] = -114342998765993984.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 72700993536.000, df[0] = 53300350976.000, dc_hat[0] = 31589171200.000 + Gradient do_[0] = 4510180179968.000 +Backward Time Step 3: + Gradient di[0] = 114176606208.000, df[0] = 82383536128.000, dc_hat[0] = 44756267008.000 + Gradient do_[0] = 6043591507968.000 +Backward Time Step 2: + Gradient di[0] = 145457889280.000, df[0] = 104470781952.000, dc_hat[0] = 74434813952.000 + Gradient do_[0] = 6593989050368.000 +Backward Time Step 1: + Gradient di[0] = 182731898880.000, df[0] = 126015717376.000, dc_hat[0] = 101809463296.000 + Gradient do_[0] = 5819841642496.000 +Backward Time Step 0: + Gradient di[0] = 221338042368.000, df[0] = 157480747008.000, dc_hat[0] = 207689744384.000 + Gradient do_[0] = 3421865771008.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2455471726264320.000, df[0] = -1927621453545472.000, dc_hat[0] = -1110672435838976.000 + Gradient do_[0] = -143616550962724864.000 +Backward Time Step 3: + Gradient di[0] = -3859232931708928.000, df[0] = -2945869481508864.000, dc_hat[0] = -1596607652954112.000 + Gradient do_[0] = -196568627229491200.000 +Backward Time Step 2: + Gradient di[0] = -5061094435454976.000, df[0] = -3813363217858560.000, dc_hat[0] = -2793011595444224.000 + Gradient do_[0] = -225147270897598464.000 +Backward Time Step 1: + Gradient di[0] = -6393802923180032.000, df[0] = -4574889105162240.000, dc_hat[0] = -3876457092743168.000 + Gradient do_[0] = -202259441716822016.000 +Backward Time Step 0: + Gradient di[0] = -7471885348503552.000, df[0] = -5461203925073920.000, dc_hat[0] = -7553827418931200.000 + Gradient do_[0] = -115918418539970560.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -8865286144.000, df[0] = -6499538944.000, dc_hat[0] = -3851997184.000 + Gradient do_[0] = -549975031808.000 +Backward Time Step 3: + Gradient di[0] = -13922705408.000, df[0] = -10045867008.000, dc_hat[0] = -5457469952.000 + Gradient do_[0] = -736948912128.000 +Backward Time Step 2: + Gradient di[0] = -17736943616.000, df[0] = -12739026944.000, dc_hat[0] = -9076195328.000 + Gradient do_[0] = -804052729856.000 +Backward Time Step 1: + Gradient di[0] = -22281558016.000, df[0] = -15365799936.000, dc_hat[0] = -12413666304.000 + Gradient do_[0] = -709638160384.000 +Backward Time Step 0: + Gradient di[0] = -26988093440.000, df[0] = -19201871872.000, dc_hat[0] = -25323933696.000 + Gradient do_[0] = -417233469440.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.865 + c_state[0] = 0.725, h_state[0] = 0.092 +Time Step 2: + i_gate[0] = 0.376, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.863 + c_state[0] = 0.877, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.344, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885 + c_state[0] = 0.981, h_state[0] = 0.119 +Time Step 4: + i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.870 + c_state[0] = 1.059, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 16122168475648.000, df[0] = 11059813416960.000, dc_hat[0] = 6746993065984.000 + Gradient do_[0] = 1070457180651520.000 +Backward Time Step 3: + Gradient di[0] = 25356742426624.000, df[0] = 17274792574976.000, dc_hat[0] = 9451647008768.000 + Gradient do_[0] = 1403907502768128.000 +Backward Time Step 2: + Gradient di[0] = 31141193056256.000, df[0] = 21278228480000.000, dc_hat[0] = 14793055404032.000 + Gradient do_[0] = 1442264110858240.000 +Backward Time Step 1: + Gradient di[0] = 38861535182848.000, df[0] = 25818883424256.000, dc_hat[0] = 19879405027328.000 + Gradient do_[0] = 1246827663851520.000 +Backward Time Step 0: + Gradient di[0] = 48432311959552.000, df[0] = 33608131674112.000, dc_hat[0] = 42224985309184.000 + Gradient do_[0] = 747012186177536.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1371785396224.000, df[0] = 1005723779072.000, dc_hat[0] = 596016562176.000 + Gradient do_[0] = 85098745561088.000 +Backward Time Step 3: + Gradient di[0] = 2154399399936.000, df[0] = 1554512609280.000, dc_hat[0] = 844442959872.000 + Gradient do_[0] = 114032832937984.000 +Backward Time Step 2: + Gradient di[0] = 2744470732800.000, df[0] = 1971141476352.000, dc_hat[0] = 1404334899200.000 + Gradient do_[0] = 124411403304960.000 +Backward Time Step 1: + Gradient di[0] = 3447840309248.000, df[0] = 2377698508800.000, dc_hat[0] = 1920886243328.000 + Gradient do_[0] = 109809152032768.000 +Backward Time Step 0: + Gradient di[0] = 4177542512640.000, df[0] = 2972297461760.000, dc_hat[0] = 3919943827456.000 + Gradient do_[0] = 64584429862912.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1903799283220480.000, df[0] = -1494549297889280.000, dc_hat[0] = -861094167969792.000 + Gradient do_[0] = -111346331364098048.000 +Backward Time Step 3: + Gradient di[0] = -2992190166925312.000, df[0] = -2284050723110912.000, dc_hat[0] = -1237817090899968.000 + Gradient do_[0] = -152401485659897856.000 +Backward Time Step 2: + Gradient di[0] = -3923735555866624.000, df[0] = -2956409062817792.000, dc_hat[0] = -2165228976472064.000 + Gradient do_[0] = -174547402189963264.000 +Backward Time Step 1: + Gradient di[0] = -4957071401287680.000, df[0] = -3546873650479104.000, dc_hat[0] = -3005265389551616.000 + Gradient do_[0] = -156808517242585088.000 +Backward Time Step 0: + Gradient di[0] = -5794594855845888.000, df[0] = -4235271277117440.000, dc_hat[0] = -5858141581344768.000 + Gradient do_[0] = -89897015579770880.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1330531270656.000, df[0] = 975478849536.000, dc_hat[0] = 578085781504.000 + Gradient do_[0] = 82538894721024.000 +Backward Time Step 3: + Gradient di[0] = 2089578921984.000, df[0] = 1507742580736.000, dc_hat[0] = 819019317248.000 + Gradient do_[0] = 110600784969728.000 +Backward Time Step 2: + Gradient di[0] = 2661863129088.000, df[0] = 1911810031616.000, dc_hat[0] = 1362022760448.000 + Gradient do_[0] = 120665227132928.000 +Backward Time Step 1: + Gradient di[0] = 3343979380736.000, df[0] = 2306069233664.000, dc_hat[0] = 1862940884992.000 + Gradient do_[0] = 106499980394496.000 +Backward Time Step 0: + Gradient di[0] = 4051569737728.000, df[0] = 2882668855296.000, dc_hat[0] = 3801739165696.000 + Gradient do_[0] = 62636909658112.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1928301803208704.000, df[0] = -1513785785319424.000, dc_hat[0] = -872168338489344.000 + Gradient do_[0] = -112778634237837312.000 +Backward Time Step 3: + Gradient di[0] = -3030657504641024.000, df[0] = -2313417025126400.000, dc_hat[0] = -1253706590846976.000 + Gradient do_[0] = -154359406631321600.000 +Backward Time Step 2: + Gradient di[0] = -3974145587019776.000, df[0] = -2994389995487232.000, dc_hat[0] = -2192979968131072.000 + Gradient do_[0] = -176787846110117888.000 +Backward Time Step 1: + Gradient di[0] = -5020667518910464.000, df[0] = -3592370775916544.000, dc_hat[0] = -3043694340997120.000 + Gradient do_[0] = -158818476037767168.000 +Backward Time Step 0: + Gradient di[0] = -5868732433825792.000, df[0] = -4289458463571968.000, dc_hat[0] = -5933092518756352.000 + Gradient do_[0] = -91047182051835904.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.861 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1289409789952.000, df[0] = 945330716672.000, dc_hat[0] = 560213327872.000 + Gradient do_[0] = 79987356991488.000 +Backward Time Step 3: + Gradient di[0] = 2024969732096.000, df[0] = 1461125382144.000, dc_hat[0] = 793680347136.000 + Gradient do_[0] = 107179977736192.000 +Backward Time Step 2: + Gradient di[0] = 2579527368704.000, df[0] = 1852674539520.000, dc_hat[0] = 1319853096960.000 + Gradient do_[0] = 116931457712128.000 +Backward Time Step 1: + Gradient di[0] = 3240470249472.000, df[0] = 2234683490304.000, dc_hat[0] = 1805197508608.000 + Gradient do_[0] = 103202158542848.000 +Backward Time Step 0: + Gradient di[0] = 3926034481152.000, df[0] = 2793350889472.000, dc_hat[0] = 3683944497152.000 + Gradient do_[0] = 60696133894144.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1954169720143872.000, df[0] = -1534093598654464.000, dc_hat[0] = -883859642122240.000 + Gradient do_[0] = -114290771963674624.000 +Backward Time Step 3: + Gradient di[0] = -3071274205052928.000, df[0] = -2344423199342592.000, dc_hat[0] = -1270483269976064.000 + Gradient do_[0] = -156426694649970688.000 +Backward Time Step 2: + Gradient di[0] = -4027370163929088.000, df[0] = -3034491299823616.000, dc_hat[0] = -2222282382508032.000 + Gradient do_[0] = -179153582816231424.000 +Backward Time Step 1: + Gradient di[0] = -5087804300197888.000, df[0] = -3640401395187712.000, dc_hat[0] = -3084269165477888.000 + Gradient do_[0] = -160940310141075456.000 +Backward Time Step 0: + Gradient di[0] = -5947001770344448.000, df[0] = -4346665548906496.000, dc_hat[0] = -6012220311863296.000 + Gradient do_[0] = -92261446615826432.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1247468847104.000, df[0] = 914582339584.000, dc_hat[0] = 541985472512.000 + Gradient do_[0] = 77385084960768.000 +Backward Time Step 3: + Gradient di[0] = 1959076823040.000, df[0] = 1413581242368.000, dc_hat[0] = 767837929472.000 + Gradient do_[0] = 103691340218368.000 +Backward Time Step 2: + Gradient di[0] = 2495561072640.000, df[0] = 1792367788032.000, dc_hat[0] = 1276851781632.000 + Gradient do_[0] = 113123885318144.000 +Backward Time Step 1: + Gradient di[0] = 3134914822144.000, df[0] = 2161886756864.000, dc_hat[0] = 1746320228352.000 + Gradient do_[0] = 99839232704512.000 +Backward Time Step 0: + Gradient di[0] = 3798026158080.000, df[0] = 2702273937408.000, dc_hat[0] = 3563829854208.000 + Gradient do_[0] = 58717143826432.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -1981037961805824.000, df[0] = -1555186854133760.000, dc_hat[0] = -896003058171904.000 + Gradient do_[0] = -115861381374279680.000 +Backward Time Step 3: + Gradient di[0] = -3113456689479680.000, df[0] = -2376625253515264.000, dc_hat[0] = -1287907281207296.000 + Gradient do_[0] = -158573680081764352.000 +Backward Time Step 2: + Gradient di[0] = -4082645856157696.000, df[0] = -3076137718644736.000, dc_hat[0] = -2252715178590208.000 + Gradient do_[0] = -181610372829020160.000 +Backward Time Step 1: + Gradient di[0] = -5157547589763072.000, df[0] = -3690296835571712.000, dc_hat[0] = -3126422122004480.000 + Gradient do_[0] = -163144659156074496.000 +Backward Time Step 0: + Gradient di[0] = -6028315701805056.000, df[0] = -4406097964171264.000, dc_hat[0] = -6094425985908736.000 + Gradient do_[0] = -93522947230138368.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1204307361792.000, df[0] = 882938675200.000, dc_hat[0] = 523227561984.000 + Gradient do_[0] = 74707055411200.000 +Backward Time Step 3: + Gradient di[0] = 1891267510272.000, df[0] = 1364654948352.000, dc_hat[0] = 741246763008.000 + Gradient do_[0] = 100101326372864.000 +Backward Time Step 2: + Gradient di[0] = 2409154215936.000, df[0] = 1730307948544.000, dc_hat[0] = 1232604364800.000 + Gradient do_[0] = 109205784625152.000 +Backward Time Step 1: + Gradient di[0] = 3026296766464.000, df[0] = 2086978322432.000, dc_hat[0] = 1685742682112.000 + Gradient do_[0] = 96378856407040.000 +Backward Time Step 0: + Gradient di[0] = 3666310332416.000, df[0] = 2608558768128.000, dc_hat[0] = 3440235511808.000 + Gradient do_[0] = 56680830205952.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.773, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2008600645992448.000, df[0] = -1576825570459648.000, dc_hat[0] = -908460275269632.000 + Gradient do_[0] = -117472603995635712.000 +Backward Time Step 3: + Gradient di[0] = -3156739490840576.000, df[0] = -2409666705358848.000, dc_hat[0] = -1305785619447808.000 + Gradient do_[0] = -160776637527359488.000 +Backward Time Step 2: + Gradient di[0] = -4139358214946816.000, df[0] = -3118867274530816.000, dc_hat[0] = -2283940396138496.000 + Gradient do_[0] = -184131123494780928.000 +Backward Time Step 1: + Gradient di[0] = -5229089396883456.000, df[0] = -3741478887096320.000, dc_hat[0] = -3169661436821504.000 + Gradient do_[0] = -165405821998465024.000 +Backward Time Step 0: + Gradient di[0] = -6111727724789760.000, df[0] = -4467063682760704.000, dc_hat[0] = -6178752300056576.000 + Gradient do_[0] = -94816986516684800.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1159640252416.000, df[0] = 850191450112.000, dc_hat[0] = 503816257536.000 + Gradient do_[0] = 71935669043200.000 +Backward Time Step 3: + Gradient di[0] = 1821096148992.000, df[0] = 1314023800832.000, dc_hat[0] = 713731014656.000 + Gradient do_[0] = 96386389377024.000 +Backward Time Step 2: + Gradient di[0] = 2319741878272.000, df[0] = 1666089484288.000, dc_hat[0] = 1186823012352.000 + Gradient do_[0] = 105151536824320.000 +Backward Time Step 1: + Gradient di[0] = 2913908817920.000, df[0] = 2009470599168.000, dc_hat[0] = 1623071784960.000 + Gradient do_[0] = 92798531403776.000 +Backward Time Step 0: + Gradient di[0] = 3530045521920.000, df[0] = 2511606906880.000, dc_hat[0] = 3312373465088.000 + Gradient do_[0] = 54574182301696.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2036784087171072.000, df[0] = -1598951362920448.000, dc_hat[0] = -921197940310016.000 + Gradient do_[0] = -119120119090642944.000 +Backward Time Step 3: + Gradient di[0] = -3200984633311232.000, df[0] = -2443443401916416.000, dc_hat[0] = -1324062450122752.000 + Gradient do_[0] = -163028695039082496.000 +Backward Time Step 2: + Gradient di[0] = -4197335978475520.000, df[0] = -3162550313156608.000, dc_hat[0] = -2315863277436928.000 + Gradient do_[0] = -186708172591857664.000 +Backward Time Step 1: + Gradient di[0] = -5302240206127104.000, df[0] = -3793812258291712.000, dc_hat[0] = -3213878393569280.000 + Gradient do_[0] = -167717905973116928.000 +Backward Time Step 0: + Gradient di[0] = -6197006984806400.000, df[0] = -4529394664079360.000, dc_hat[0] = -6264967326072832.000 + Gradient do_[0] = -96140025422413824.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1113643679744.000, df[0] = 816469245952.000, dc_hat[0] = 483827286016.000 + Gradient do_[0] = 69081889767424.000 +Backward Time Step 3: + Gradient di[0] = 1748838121472.000, df[0] = 1261886898176.000, dc_hat[0] = 685397966848.000 + Gradient do_[0] = 92561142185984.000 +Backward Time Step 2: + Gradient di[0] = 2227672449024.000, df[0] = 1599962873856.000, dc_hat[0] = 1139684671488.000 + Gradient do_[0] = 100977021550592.000 +Backward Time Step 1: + Gradient di[0] = 2798190067712.000, df[0] = 1929665970176.000, dc_hat[0] = 1558551592960.000 + Gradient do_[0] = 89112233115648.000 +Backward Time Step 0: + Gradient di[0] = 3389752082432.000, df[0] = 2411789287424.000, dc_hat[0] = 3180731039744.000 + Gradient do_[0] = 52405261565952.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2065406017667072.000, df[0] = -1621421155418112.000, dc_hat[0] = -934133979152384.000 + Gradient do_[0] = -120793283730341888.000 +Backward Time Step 3: + Gradient di[0] = -3245925023612928.000, df[0] = -2477751063805952.000, dc_hat[0] = -1342626372517888.000 + Gradient do_[0] = -165316108721586176.000 +Backward Time Step 2: + Gradient di[0] = -4256224275070976.000, df[0] = -3206919741243392.000, dc_hat[0] = -2348287596167168.000 + Gradient do_[0] = -189325680280862720.000 +Backward Time Step 1: + Gradient di[0] = -5376524886736896.000, df[0] = -3846956841435136.000, dc_hat[0] = -3258776639504384.000 + Gradient do_[0] = -170065792795148288.000 +Backward Time Step 0: + Gradient di[0] = -6283612315975680.000, df[0] = -4592694428958720.000, dc_hat[0] = -6352522381885440.000 + Gradient do_[0] = -97483602861752320.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1065880387584.000, df[0] = 781451919360.000, dc_hat[0] = 463071903744.000 + Gradient do_[0] = 66118588825600.000 +Backward Time Step 3: + Gradient di[0] = 1673808838656.000, df[0] = 1207750492160.000, dc_hat[0] = 655980494848.000 + Gradient do_[0] = 88589195018240.000 +Backward Time Step 2: + Gradient di[0] = 2132075872256.000, df[0] = 1531303034880.000, dc_hat[0] = 1090745532416.000 + Gradient do_[0] = 96642619408384.000 +Backward Time Step 1: + Gradient di[0] = 2678041083904.000, df[0] = 1846806970368.000, dc_hat[0] = 1491569868800.000 + Gradient do_[0] = 85284955881472.000 +Backward Time Step 0: + Gradient di[0] = 3244099371008.000, df[0] = 2308157997056.000, dc_hat[0] = 3044059643904.000 + Gradient do_[0] = 50153486548992.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2094492341501952.000, df[0] = -1644255751700480.000, dc_hat[0] = -947279733194752.000 + Gradient do_[0] = -122493566793547776.000 +Backward Time Step 3: + Gradient di[0] = -3291595289919488.000, df[0] = -2512615460831232.000, dc_hat[0] = -1361490674188288.000 + Gradient do_[0] = -167640596561788928.000 +Backward Time Step 2: + Gradient di[0] = -4316071691550720.000, df[0] = -3252010723835904.000, dc_hat[0] = -2381237779955712.000 + Gradient do_[0] = -191985639426621440.000 +Backward Time Step 1: + Gradient di[0] = -5452016453156864.000, df[0] = -3900964981440512.000, dc_hat[0] = -3304406908928000.000 + Gradient do_[0] = -172451801746898944.000 +Backward Time Step 0: + Gradient di[0] = -6371629617643520.000, df[0] = -4657026059730944.000, dc_hat[0] = -6441504440582144.000 + Gradient do_[0] = -98849101814169600.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1016350638080.000, df[0] = 745139404800.000, dc_hat[0] = 441549029376.000 + Gradient do_[0] = 63045657165824.000 +Backward Time Step 3: + Gradient di[0] = 1596007383040.000, df[0] = 1151613403136.000, dc_hat[0] = 625477550080.000 + Gradient do_[0] = 84470589816832.000 +Backward Time Step 2: + Gradient di[0] = 2032950312960.000, df[0] = 1460108656640.000, dc_hat[0] = 1040003694592.000 + Gradient do_[0] = 92148397506560.000 +Backward Time Step 1: + Gradient di[0] = 2553472090112.000, df[0] = 1760899760128.000, dc_hat[0] = 1422132641792.000 + Gradient do_[0] = 81316984913920.000 +Backward Time Step 0: + Gradient di[0] = 3093105737728.000, df[0] = 2200726929408.000, dc_hat[0] = 2902376579072.000 + Gradient do_[0] = 47819134074880.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2123878776176640.000, df[0] = -1667326302748672.000, dc_hat[0] = -960560845815808.000 + Gradient do_[0] = -124211390503190528.000 +Backward Time Step 3: + Gradient di[0] = -3337727802081280.000, df[0] = -2547832850481152.000, dc_hat[0] = -1380546638774272.000 + Gradient do_[0] = -169988689542250496.000 +Backward Time Step 2: + Gradient di[0] = -4376524966854656.000, df[0] = -3297558583574528.000, dc_hat[0] = -2414521629016064.000 + Gradient do_[0] = -194672639686475776.000 +Backward Time Step 1: + Gradient di[0] = -5528278429335552.000, df[0] = -3955524756307968.000, dc_hat[0] = -3350501034819584.000 + Gradient do_[0] = -174862154573283328.000 +Backward Time Step 0: + Gradient di[0] = -6460539198767104.000, df[0] = -4722010525532160.000, dc_hat[0] = -6531389516152832.000 + Gradient do_[0] = -100228439151214592.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 965164990464.000, df[0] = 707612704768.000, dc_hat[0] = 419307192320.000 + Gradient do_[0] = 59870153801728.000 +Backward Time Step 3: + Gradient di[0] = 1515607687168.000, df[0] = 1093601329152.000, dc_hat[0] = 593957486592.000 + Gradient do_[0] = 80214595993600.000 +Backward Time Step 2: + Gradient di[0] = 1930513481728.000, df[0] = 1386536108032.000, dc_hat[0] = 987571683328.000 + Gradient do_[0] = 87504220848128.000 +Backward Time Step 1: + Gradient di[0] = 2424749686784.000, df[0] = 1672128757760.000, dc_hat[0] = 1350388416512.000 + Gradient do_[0] = 77216868204544.000 +Backward Time Step 0: + Gradient di[0] = 2937090998272.000, df[0] = 2089723232256.000, dc_hat[0] = 2755982000128.000 + Gradient do_[0] = 45407161810944.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2153623974838272.000, df[0] = -1690678174154752.000, dc_hat[0] = -974004428996608.000 + Gradient do_[0] = -125950199423041536.000 +Backward Time Step 3: + Gradient di[0] = -3384426981490688.000, df[0] = -2583482689650688.000, dc_hat[0] = -1399834397376512.000 + Gradient do_[0] = -172365507263987712.000 +Backward Time Step 2: + Gradient di[0] = -4437715097485312.000, df[0] = -3343661567836160.000, dc_hat[0] = -2448211084050432.000 + Gradient do_[0] = -197392436316602368.000 +Backward Time Step 1: + Gradient di[0] = -5605465434095616.000, df[0] = -4010745956139008.000, dc_hat[0] = -3397154311766016.000 + Gradient do_[0] = -177301833436364800.000 +Backward Time Step 0: + Gradient di[0] = -6550519837360128.000, df[0] = -4787777212252160.000, dc_hat[0] = -6622356923482112.000 + Gradient do_[0] = -101624398011695104.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 912043999232.000, df[0] = 668667412480.000, dc_hat[0] = 396225314816.000 + Gradient do_[0] = 56574588485632.000 +Backward Time Step 3: + Gradient di[0] = 1432170266624.000, df[0] = 1033397665792.000, dc_hat[0] = 561248862208.000 + Gradient do_[0] = 75797943549952.000 +Backward Time Step 2: + Gradient di[0] = 1824214876160.000, df[0] = 1310189944832.000, dc_hat[0] = 933166841856.000 + Gradient do_[0] = 82685133324288.000 +Backward Time Step 1: + Gradient di[0] = 2291180765184.000, df[0] = 1580016074752.000, dc_hat[0] = 1275951710208.000 + Gradient do_[0] = 72962526937088.000 +Backward Time Step 0: + Gradient di[0] = 2775216553984.000, df[0] = 1974550528000.000, dc_hat[0] = 2604089212928.000 + Gradient do_[0] = 42904596774912.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2183712502448128.000, df[0] = -1714299957411840.000, dc_hat[0] = -987602563891200.000 + Gradient do_[0] = -127709005710622720.000 +Backward Time Step 3: + Gradient di[0] = -3431671621746688.000, df[0] = -2619548872212480.000, dc_hat[0] = -1419348849721344.000 + Gradient do_[0] = -174770104834195456.000 +Backward Time Step 2: + Gradient di[0] = -4499618729558016.000, df[0] = -3390302228316160.000, dc_hat[0] = -2482294333898752.000 + Gradient do_[0] = -200143758006681600.000 +Backward Time Step 1: + Gradient di[0] = -5683546865795072.000, df[0] = -4066606569226240.000, dc_hat[0] = -3444347680849920.000 + Gradient do_[0] = -179769687284908032.000 +Backward Time Step 0: + Gradient di[0] = -6641546837360640.000, df[0] = -4854308940021760.000, dc_hat[0] = -6714382503378944.000 + Gradient do_[0] = -103036574668685312.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 856914788352.000, df[0] = 628249460736.000, dc_hat[0] = 372271251456.000 + Gradient do_[0] = 53154511060992.000 +Backward Time Step 3: + Gradient di[0] = 1345582399488.000, df[0] = 970920361984.000, dc_hat[0] = 527306293248.000 + Gradient do_[0] = 71214601076736.000 +Backward Time Step 2: + Gradient di[0] = 1713902583808.000, df[0] = 1230961115136.000, dc_hat[0] = 876712558592.000 + Gradient do_[0] = 77684222722048.000 +Backward Time Step 1: + Gradient di[0] = 2152580644864.000, df[0] = 1484433653760.000, dc_hat[0] = 1198718976000.000 + Gradient do_[0] = 68548055531520.000 +Backward Time Step 0: + Gradient di[0] = 2607253553152.000, df[0] = 1855045763072.000, dc_hat[0] = 2446483259392.000 + Gradient do_[0] = 40307903168512.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2214115904847872.000, df[0] = -1738168432852992.000, dc_hat[0] = -1001343640666112.000 + Gradient do_[0] = -129486245997838336.000 +Backward Time Step 3: + Gradient di[0] = -3479399445823488.000, df[0] = -2655984421961728.000, dc_hat[0] = -1439063689134080.000 + Gradient do_[0] = -177199286797205504.000 +Backward Time Step 2: + Gradient di[0] = -4562149158420480.000, df[0] = -3437415335198720.000, dc_hat[0] = -2516720375824384.000 + Gradient do_[0] = -202922996984184832.000 +Backward Time Step 1: + Gradient di[0] = -5762416960864256.000, df[0] = -4123031970512896.000, dc_hat[0] = -3492016717561856.000 + Gradient do_[0] = -182262486303506432.000 +Backward Time Step 0: + Gradient di[0] = -6733486517911552.000, df[0] = -4921507461464064.000, dc_hat[0] = -6807329890631680.000 + Gradient do_[0] = -104462933307686912.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 799988908032.000, df[0] = 586514563072.000, dc_hat[0] = 347538325504.000 + Gradient do_[0] = 49623137779712.000 +Backward Time Step 3: + Gradient di[0] = 1256177139712.000, df[0] = 906410196992.000, dc_hat[0] = 492263243776.000 + Gradient do_[0] = 66482419531776.000 +Backward Time Step 2: + Gradient di[0] = 1600004030464.000, df[0] = 1149156982784.000, dc_hat[0] = 818435391488.000 + Gradient do_[0] = 72521135161344.000 +Backward Time Step 1: + Gradient di[0] = 2009479118848.000, df[0] = 1385748758528.000, dc_hat[0] = 1119005179904.000 + Gradient do_[0] = 63990663217152.000 +Backward Time Step 0: + Gradient di[0] = 2433866530816.000, df[0] = 1731681976320.000, dc_hat[0] = 2283787517952.000 + Gradient do_[0] = 37627357036544.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2244764657254400.000, df[0] = -1762230316040192.000, dc_hat[0] = -1015196923461632.000 + Gradient do_[0] = -131277968914776064.000 +Backward Time Step 3: + Gradient di[0] = -3527515964440576.000, df[0] = -2692717129760768.000, dc_hat[0] = -1458944958529536.000 + Gradient do_[0] = -179648637926637568.000 +Backward Time Step 2: + Gradient di[0] = -4625194983358464.000, df[0] = -3484917941927936.000, dc_hat[0] = -2551454581653504.000 + Gradient do_[0] = -205725892641554432.000 +Backward Time Step 1: + Gradient di[0] = -5841943649058816.000, df[0] = -4179929281331200.000, dc_hat[0] = -3540136188968960.000 + Gradient do_[0] = -184776811698192384.000 +Backward Time Step 0: + Gradient di[0] = -6826221304283136.000, df[0] = -4989287414104064.000, dc_hat[0] = -6901081510510592.000 + Gradient do_[0] = -105901609912893440.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 741205475328.000, df[0] = 543417794560.000, dc_hat[0] = 321998225408.000 + Gradient do_[0] = 45976555356160.000 +Backward Time Step 3: + Gradient di[0] = 1163854348288.000, df[0] = 839794622464.000, dc_hat[0] = 456078131200.000 + Gradient do_[0] = 61595866628096.000 +Backward Time Step 2: + Gradient di[0] = 1482393255936.000, df[0] = 1064686845952.000, dc_hat[0] = 758261219328.000 + Gradient do_[0] = 67189864398848.000 +Backward Time Step 1: + Gradient di[0] = 1861723095040.000, df[0] = 1283854172160.000, dc_hat[0] = 1036702908416.000 + Gradient do_[0] = 59285115502592.000 +Backward Time Step 0: + Gradient di[0] = 2254852849664.000, df[0] = 1604314988544.000, dc_hat[0] = 2115812589568.000 + Gradient do_[0] = 34859823136768.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.313, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2275753114730496.000, df[0] = -1786558621417472.000, dc_hat[0] = -1029203885555712.000 + Gradient do_[0] = -133089586120228864.000 +Backward Time Step 3: + Gradient di[0] = -3576158348115968.000, df[0] = -2729850611695616.000, dc_hat[0] = -1479042855337984.000 + Gradient do_[0] = -182124789651996672.000 +Backward Time Step 2: + Gradient di[0] = -4688921560612864.000, df[0] = -3532932455071744.000, dc_hat[0] = -2586563523379200.000 + Gradient do_[0] = -208559076408295424.000 +Backward Time Step 1: + Gradient di[0] = -5922321814519808.000, df[0] = -4237436477505536.000, dc_hat[0] = -3588772398628864.000 + Gradient do_[0] = -187318006408282112.000 +Backward Time Step 0: + Gradient di[0] = -6919959502389248.000, df[0] = -5057801269280768.000, dc_hat[0] = -6995848353284096.000 + Gradient do_[0] = -107355860069515264.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 680337276928.000, df[0] = 498792366080.000, dc_hat[0] = 295552909312.000 + Gradient do_[0] = 42200679317504.000 +Backward Time Step 3: + Gradient di[0] = 1068263211008.000, df[0] = 770820407296.000, dc_hat[0] = 418612805632.000 + Gradient do_[0] = 56536411930624.000 +Backward Time Step 2: + Gradient di[0] = 1360621862912.000, df[0] = 977228070912.000, dc_hat[0] = 695961583616.000 + Gradient do_[0] = 61670063865856.000 +Backward Time Step 1: + Gradient di[0] = 1708750012416.000, df[0] = 1178362052608.000, dc_hat[0] = 951499358208.000 + Gradient do_[0] = 54413469155328.000 +Backward Time Step 0: + Gradient di[0] = 2069532114944.000, df[0] = 1472460095488.000, dc_hat[0] = 1941919236096.000 + Gradient do_[0] = 31994775863296.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2307031214063616.000, df[0] = -1811114962714624.000, dc_hat[0] = -1043341978370048.000 + Gradient do_[0] = -134918177036435456.000 +Backward Time Step 3: + Gradient di[0] = -3625261098598400.000, df[0] = -2767336012513280.000, dc_hat[0] = -1499331072884736.000 + Gradient do_[0] = -184624305999446016.000 +Backward Time Step 2: + Gradient di[0] = -4753250507030528.000, df[0] = -3581402234748928.000, dc_hat[0] = -2622005056634880.000 + Gradient do_[0] = -211418923332009984.000 +Backward Time Step 1: + Gradient di[0] = -6003457504837632.000, df[0] = -4295484571123712.000, dc_hat[0] = -3637864143257600.000 + Gradient do_[0] = -189883270115098624.000 +Backward Time Step 0: + Gradient di[0] = -7014567968243712.000, df[0] = -5126950242746368.000, dc_hat[0] = -7091494053740544.000 + Gradient do_[0] = -108823613603315712.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 617450438656.000, df[0] = 452686970880.000, dc_hat[0] = 268231507968.000 + Gradient do_[0] = 38299670413312.000 +Backward Time Step 3: + Gradient di[0] = 969504980992.000, df[0] = 699561017344.000, dc_hat[0] = 379908063232.000 + Gradient do_[0] = 51309398982656.000 +Backward Time Step 2: + Gradient di[0] = 1234818564096.000, df[0] = 886873653248.000, dc_hat[0] = 631601233920.000 + Gradient do_[0] = 55967634948096.000 +Backward Time Step 1: + Gradient di[0] = 1550718468096.000, df[0] = 1069381910528.000, dc_hat[0] = 863483002880.000 + Gradient do_[0] = 49380807671808.000 +Backward Time Step 0: + Gradient di[0] = 1878092021760.000, df[0] = 1336251645952.000, dc_hat[0] = 1762283880448.000 + Gradient do_[0] = 29035134451712.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2338555468709888.000, df[0] = -1835864443322368.000, dc_hat[0] = -1057591807442944.000 + Gradient do_[0] = -136761078783672320.000 +Backward Time Step 3: + Gradient di[0] = -3674752812056576.000, df[0] = -2805118302945280.000, dc_hat[0] = -1519779546398720.000 + Gradient do_[0] = -187143562016587776.000 +Backward Time Step 2: + Gradient di[0] = -4818093238910976.000, df[0] = -3630258293047296.000, dc_hat[0] = -2657727104942080.000 + Gradient do_[0] = -214301705381085184.000 +Backward Time Step 1: + Gradient di[0] = -6085241198346240.000, df[0] = -4353997058080768.000, dc_hat[0] = -3687347535216640.000 + Gradient do_[0] = -192468874787028992.000 +Backward Time Step 0: + Gradient di[0] = -7109933422084096.000, df[0] = -5196653266993152.000, dc_hat[0] = -7187905868988416.000 + Gradient do_[0] = -110303100987768832.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 552736718848.000, df[0] = 405242118144.000, dc_hat[0] = 240116711424.000 + Gradient do_[0] = 34285354483712.000 +Backward Time Step 3: + Gradient di[0] = 867879747584.000, df[0] = 626232524800.000, dc_hat[0] = 340080623616.000 + Gradient do_[0] = 45930778722304.000 +Backward Time Step 2: + Gradient di[0] = 1105369366528.000, df[0] = 793900613632.000, dc_hat[0] = 565378678784.000 + Gradient do_[0] = 50100063698944.000 +Backward Time Step 1: + Gradient di[0] = 1388119457792.000, df[0] = 957252042752.000, dc_hat[0] = 772927127552.000 + Gradient do_[0] = 44202759028736.000 +Backward Time Step 0: + Gradient di[0] = 1681123704832.000, df[0] = 1196109856768.000, dc_hat[0] = 1577460957184.000 + Gradient do_[0] = 25990021513216.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2370433521287168.000, df[0] = -1860891083538432.000, dc_hat[0] = -1071999476563968.000 + Gradient do_[0] = -138624656503472128.000 +Backward Time Step 3: + Gradient di[0] = -3724794281328640.000, df[0] = -2843320694865920.000, dc_hat[0] = -1540454445219840.000 + Gradient do_[0] = -189690821220499456.000 +Backward Time Step 2: + Gradient di[0] = -4883651082846208.000, df[0] = -3679653369741312.000, dc_hat[0] = -2693844021805056.000 + Gradient do_[0] = -217216081209589760.000 +Backward Time Step 1: + Gradient di[0] = -6167918781923328.000, df[0] = -4413149226729472.000, dc_hat[0] = -3737371824619520.000 + Gradient do_[0] = -195082757523636224.000 +Backward Time Step 0: + Gradient di[0] = -7206339331751936.000, df[0] = -5267116500451328.000, dc_hat[0] = -7285368877481984.000 + Gradient do_[0] = -111798746039189504.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 485604655104.000, df[0] = 356024156160.000, dc_hat[0] = 210951880704.000 + Gradient do_[0] = 30121096380416.000 +Backward Time Step 3: + Gradient di[0] = 762461421568.000, df[0] = 550167052288.000, dc_hat[0] = 298768367616.000 + Gradient do_[0] = 40351444238336.000 +Backward Time Step 2: + Gradient di[0] = 971091607552.000, df[0] = 697459474432.000, dc_hat[0] = 496688824320.000 + Gradient do_[0] = 44013704970240.000 +Backward Time Step 1: + Gradient di[0] = 1219462823936.000, df[0] = 840945106944.000, dc_hat[0] = 679002112000.000 + Gradient do_[0] = 38831902425088.000 +Backward Time Step 0: + Gradient di[0] = 1476834492416.000, df[0] = 1050759266304.000, dc_hat[0] = 1385768943616.000 + Gradient do_[0] = 22831729475584.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2402609537220608.000, df[0] = -1886151799472128.000, dc_hat[0] = -1086543108243456.000 + Gradient do_[0] = -140505637430755328.000 +Backward Time Step 3: + Gradient di[0] = -3775295580536832.000, df[0] = -2881873663492096.000, dc_hat[0] = -1561320335867904.000 + Gradient do_[0] = -192261548125716480.000 +Backward Time Step 2: + Gradient di[0] = -4949806464106496.000, df[0] = -3729498881130496.000, dc_hat[0] = -2730289772101632.000 + Gradient do_[0] = -220157257634021376.000 +Backward Time Step 1: + Gradient di[0] = -6251357648453632.000, df[0] = -4472845245612032.000, dc_hat[0] = -3787855138652160.000 + Gradient do_[0] = -197720760796577792.000 +Backward Time Step 0: + Gradient di[0] = -7303629467811840.000, df[0] = -5338225052745728.000, dc_hat[0] = -7383725776044032.000 + Gradient do_[0] = -113308092036284416.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 416843628544.000, df[0] = 305611800576.000, dc_hat[0] = 181079916544.000 + Gradient do_[0] = 25855852019712.000 +Backward Time Step 3: + Gradient di[0] = 654489157632.000, df[0] = 472258510848.000, dc_hat[0] = 256456228864.000 + Gradient do_[0] = 34637048971264.000 +Backward Time Step 2: + Gradient di[0] = 833563721728.000, df[0] = 598684139520.000, dc_hat[0] = 426339270656.000 + Gradient do_[0] = 37780130365440.000 +Backward Time Step 1: + Gradient di[0] = 1046735421440.000, df[0] = 721831133184.000, dc_hat[0] = 582814531584.000 + Gradient do_[0] = 33331462799360.000 +Backward Time Step 0: + Gradient di[0] = 1267622346752.000, df[0] = 901906104320.000, dc_hat[0] = 1189457428480.000 + Gradient do_[0] = 19597327597568.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2435089958961152.000, df[0] = -1911651959832576.000, dc_hat[0] = -1101223642005504.000 + Gradient do_[0] = -142404511191793664.000 +Backward Time Step 3: + Gradient di[0] = -3826281674178560.000, df[0] = -2920797073047552.000, dc_hat[0] = -1582385002971136.000 + Gradient do_[0] = -194856807884128256.000 +Backward Time Step 2: + Gradient di[0] = -5016609848557568.000, df[0] = -3779832408178688.000, dc_hat[0] = -2767092273119232.000 + Gradient do_[0] = -223126918281560064.000 +Backward Time Step 1: + Gradient di[0] = -6335596989513728.000, df[0] = -4533115179499520.000, dc_hat[0] = -3838823783989248.000 + Gradient do_[0] = -200383984117481472.000 +Backward Time Step 0: + Gradient di[0] = -7401856443613184.000, df[0] = -5410019726065664.000, dc_hat[0] = -7483029714894848.000 + Gradient do_[0] = -114831972202708992.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 345775865856.000, df[0] = 253508190208.000, dc_hat[0] = 150206349312.000 + Gradient do_[0] = 21447565115392.000 +Backward Time Step 3: + Gradient di[0] = 542896357376.000, df[0] = 391737212928.000, dc_hat[0] = 212726611968.000 + Gradient do_[0] = 28731106131968.000 +Backward Time Step 2: + Gradient di[0] = 691429507072.000, df[0] = 496600252416.000, dc_hat[0] = 353636286464.000 + Gradient do_[0] = 31337870262272.000 +Backward Time Step 1: + Gradient di[0] = 868231544832.000, df[0] = 598733946880.000, dc_hat[0] = 483414900736.000 + Gradient do_[0] = 27647155372032.000 +Backward Time Step 0: + Gradient di[0] = 1051426291712.000, df[0] = 748083740672.000, dc_hat[0] = 986592444416.000 + Gradient do_[0] = 16254953848832.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2467807946080256.000, df[0] = -1937338682834944.000, dc_hat[0] = -1116011952603136.000 + Gradient do_[0] = -144317068718637056.000 +Backward Time Step 3: + Gradient di[0] = -3877639014055936.000, df[0] = -2960003950444544.000, dc_hat[0] = -1603603752026112.000 + Gradient do_[0] = -197471120117465088.000 +Backward Time Step 2: + Gradient di[0] = -5083886753153024.000, df[0] = -3830523491254272.000, dc_hat[0] = -2804153545916416.000 + Gradient do_[0] = -226117847607148544.000 +Backward Time Step 1: + Gradient di[0] = -6420443531575296.000, df[0] = -4593818636648448.000, dc_hat[0] = -3890156696240128.000 + Gradient do_[0] = -203066448891871232.000 +Backward Time Step 0: + Gradient di[0] = -7500771151052800.000, df[0] = -5482315836817408.000, dc_hat[0] = -7583028364705792.000 + Gradient do_[0] = -116366529657831424.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 272426106880.000, df[0] = 199731363840.000, dc_hat[0] = 118341902336.000 + Gradient do_[0] = 16897786511360.000 +Backward Time Step 3: + Gradient di[0] = 427725520896.000, df[0] = 308633862144.000, dc_hat[0] = 167596097536.000 + Gradient do_[0] = 22635893227520.000 +Backward Time Step 2: + Gradient di[0] = 544740704256.000, df[0] = 391245135872.000, dc_hat[0] = 278606610432.000 + Gradient do_[0] = 24689273470976.000 +Backward Time Step 1: + Gradient di[0] = 684016533504.000, df[0] = 471698636800.000, dc_hat[0] = 380839886848.000 + Gradient do_[0] = 21781062615040.000 +Backward Time Step 0: + Gradient di[0] = 828323790848.000, df[0] = 589347618816.000, dc_hat[0] = 777247129600.000 + Gradient do_[0] = 12805809897472.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2500882147049472.000, df[0] = -1963304847147008.000, dc_hat[0] = -1130960854712320.000 + Gradient do_[0] = -146250594275819520.000 +Backward Time Step 3: + Gradient di[0] = -3929556578729984.000, df[0] = -2999638445522944.000, dc_hat[0] = -1625052281831424.000 + Gradient do_[0] = -200113830674563072.000 +Backward Time Step 2: + Gradient di[0] = -5151901318381568.000, df[0] = -3881768893546496.000, dc_hat[0] = -2841618277203968.000 + Gradient do_[0] = -229141401504317440.000 +Backward Time Step 1: + Gradient di[0] = -6506213491605504.000, df[0] = -4655182445019136.000, dc_hat[0] = -3942048759545856.000 + Gradient do_[0] = -205778067904266240.000 +Backward Time Step 0: + Gradient di[0] = -7600766579638272.000, df[0] = -5555402758422528.000, dc_hat[0] = -7684121694306304.000 + Gradient do_[0] = -117917863255212032.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 196988370944.000, df[0] = 144423747584.000, dc_hat[0] = 85571158016.000 + Gradient do_[0] = 12218533937152.000 +Backward Time Step 3: + Gradient di[0] = 309279195136.000, df[0] = 223166857216.000, dc_hat[0] = 121183559680.000 + Gradient do_[0] = 16367420964864.000 +Backward Time Step 2: + Gradient di[0] = 393885843456.000, df[0] = 282897842176.000, dc_hat[0] = 201448620032.000 + Gradient do_[0] = 17851953971200.000 +Backward Time Step 1: + Gradient di[0] = 494579646464.000, df[0] = 341062418432.000, dc_hat[0] = 275361431552.000 + Gradient do_[0] = 15748754833408.000 +Backward Time Step 0: + Gradient di[0] = 598906961920.000, df[0] = 426118873088.000, dc_hat[0] = 561976705024.000 + Gradient do_[0] = 9259047714816.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2534257264164864.000, df[0] = -1989507100442624.000, dc_hat[0] = -1146045585162240.000 + Gradient do_[0] = -148201660479438848.000 +Backward Time Step 3: + Gradient di[0] = -3981941221097472.000, df[0] = -3039629422886912.000, dc_hat[0] = -1646694756253696.000 + Gradient do_[0] = -202780300990742528.000 +Backward Time Step 2: + Gradient di[0] = -5220527379578880.000, df[0] = -3933476004823040.000, dc_hat[0] = -2879423653085184.000 + Gradient do_[0] = -232192185494142976.000 +Backward Time Step 1: + Gradient di[0] = -6592749029556224.000, df[0] = -4717095472332800.000, dc_hat[0] = -3994403068706816.000 + Gradient do_[0] = -208513876172472320.000 +Backward Time Step 0: + Gradient di[0] = -7701650529583104.000, df[0] = -5629139293831168.000, dc_hat[0] = -7786112135200768.000 + Gradient do_[0] = -119482975107678208.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 119240450048.000, df[0] = 87422246912.000, dc_hat[0] = 51797237760.000 + Gradient do_[0] = 7396046929920.000 +Backward Time Step 3: + Gradient di[0] = 187208941568.000, df[0] = 135084703744.000, dc_hat[0] = 73352298496.000 + Gradient do_[0] = 9907250135040.000 +Backward Time Step 2: + Gradient di[0] = 238418493440.000, df[0] = 171237687296.000, dc_hat[0] = 121934430208.000 + Gradient do_[0] = 10805681586176.000 +Backward Time Step 1: + Gradient di[0] = 299361140736.000, df[0] = 206439432192.000, dc_hat[0] = 166668484608.000 + Gradient do_[0] = 9532413575168.000 +Backward Time Step 0: + Gradient di[0] = 362500554752.000, df[0] = 257917059072.000, dc_hat[0] = 340147798016.000 + Gradient do_[0] = 5604226105344.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2567915043815424.000, df[0] = -2015931618295808.000, dc_hat[0] = -1161258359324672.000 + Gradient do_[0] = -150169270897082368.000 +Backward Time Step 3: + Gradient di[0] = -4034771734757376.000, df[0] = -3079961044844544.000, dc_hat[0] = -1668520303656960.000 + Gradient do_[0] = -205469397194637312.000 +Backward Time Step 2: + Gradient di[0] = -5289731113877504.000, df[0] = -3985617981538304.000, dc_hat[0] = -2917544440627200.000 + Gradient do_[0] = -235268704928006144.000 +Backward Time Step 1: + Gradient di[0] = -6680019006914560.000, df[0] = -4779531411914752.000, dc_hat[0] = -4047199222628352.000 + Gradient do_[0] = -211272877264076800.000 +Backward Time Step 0: + Gradient di[0] = -7803387030536192.000, df[0] = -5703498599497728.000, dc_hat[0] = -7888964253908992.000 + Gradient do_[0] = -121061306869481472.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 39789289472.000, df[0] = 29171914752.000, dc_hat[0] = 17284067328.000 + Gradient do_[0] = 2467970678784.000 +Backward Time Step 3: + Gradient di[0] = 62468841472.000, df[0] = 45075816448.000, dc_hat[0] = 24476250112.000 + Gradient do_[0] = 3305880158208.000 +Backward Time Step 2: + Gradient di[0] = 79555682304.000, df[0] = 57138749440.000, dc_hat[0] = 40686473216.000 + Gradient do_[0] = 3605623472128.000 +Backward Time Step 1: + Gradient di[0] = 99888693248.000, df[0] = 68883185664.000, dc_hat[0] = 55611645952.000 + Gradient do_[0] = 3180689620992.000 +Backward Time Step 0: + Gradient di[0] = 120953806848.000, df[0] = 86057943040.000, dc_hat[0] = 113495474176.000 + Gradient do_[0] = 1869935017984.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2601873471176704.000, df[0] = -2042592493568000.000, dc_hat[0] = -1176606424956928.000 + Gradient do_[0] = -152154370421555200.000 +Backward Time Step 3: + Gradient di[0] = -4088070399852544.000, df[0] = -3120649954394112.000, dc_hat[0] = -1690539124588544.000 + Gradient do_[0] = -208182287517351936.000 +Backward Time Step 2: + Gradient di[0] = -5359545270403072.000, df[0] = -4038220593496064.000, dc_hat[0] = -2956001846231040.000 + Gradient do_[0] = -238372162396749824.000 +Backward Time Step 1: + Gradient di[0] = -6768051340967936.000, df[0] = -4842514422956032.000, dc_hat[0] = -4100458696146944.000 + Gradient do_[0] = -214056050431623168.000 +Backward Time Step 0: + Gradient di[0] = -7906012052848640.000, df[0] = -5778506982096896.000, dc_hat[0] = -7992714557652992.000 + Gradient do_[0] = -122653425476304896.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -42455273472.000, df[0] = -31126528000.000, dc_hat[0] = -18441992192.000 + Gradient do_[0] = -2633315385344.000 +Backward Time Step 3: + Gradient di[0] = -66653335552.000, df[0] = -48095293440.000, dc_hat[0] = -26115430400.000 + Gradient do_[0] = -3527304282112.000 +Backward Time Step 2: + Gradient di[0] = -84883742720.000, df[0] = -60965490688.000, dc_hat[0] = -43410608128.000 + Gradient do_[0] = -3847075921920.000 +Backward Time Step 1: + Gradient di[0] = -106575855616.000, df[0] = -73494577152.000, dc_hat[0] = -59333414912.000 + Gradient do_[0] = -3393605599232.000 +Backward Time Step 0: + Gradient di[0] = -129048444928.000, df[0] = -91817230336.000, dc_hat[0] = -121090965504.000 + Gradient do_[0] = -1995077320704.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.865 + c_state[0] = 0.725, h_state[0] = 0.092 +Time Step 2: + i_gate[0] = 0.376, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.863 + c_state[0] = 0.877, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.344, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885 + c_state[0] = 0.981, h_state[0] = 0.119 +Time Step 4: + i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.870 + c_state[0] = 1.059, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 17143145955328.000, df[0] = 11760413179904.000, dc_hat[0] = 7172251451392.000 + Gradient do_[0] = 1138026478567424.000 +Backward Time Step 3: + Gradient di[0] = 26953801269248.000, df[0] = 18363577270272.000, dc_hat[0] = 10042543702016.000 + Gradient do_[0] = 1491999429492736.000 +Backward Time Step 2: + Gradient di[0] = 33088583237632.000, df[0] = 22609125179392.000, dc_hat[0] = 15708831350784.000 + Gradient do_[0] = 1532085030354944.000 +Backward Time Step 1: + Gradient di[0] = 41263004909568.000, df[0] = 27413561999360.000, dc_hat[0] = 21091753918464.000 + Gradient do_[0] = 1323585306099712.000 +Backward Time Step 0: + Gradient di[0] = 51407944155136.000, df[0] = 35672979144704.000, dc_hat[0] = 44819246219264.000 + Gradient do_[0] = 792907736940544.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1366039330816.000, df[0] = 1001532489728.000, dc_hat[0] = 593359601664.000 + Gradient do_[0] = 84726870179840.000 +Backward Time Step 3: + Gradient di[0] = 2144686571520.000, df[0] = 1547564613632.000, dc_hat[0] = 840263860224.000 + Gradient do_[0] = 113494435299328.000 +Backward Time Step 2: + Gradient di[0] = 2731131797504.000, df[0] = 1961570598912.000, dc_hat[0] = 1396693401600.000 + Gradient do_[0] = 123778398945280.000 +Backward Time Step 1: + Gradient di[0] = 3429249056768.000, df[0] = 2364806004736.000, dc_hat[0] = 1909147959296.000 + Gradient do_[0] = 109194711662592.000 +Backward Time Step 0: + Gradient di[0] = 4153747177472.000, df[0] = 2955367415808.000, dc_hat[0] = 3897615974400.000 + Gradient do_[0] = 64216555847680.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2018455851106304.000, df[0] = -1584591005548544.000, dc_hat[0] = -912734237491200.000 + Gradient do_[0] = -118033028508286976.000 +Backward Time Step 3: + Gradient di[0] = -3171415930961920.000, df[0] = -2420940189204480.000, dc_hat[0] = -1311394444083200.000 + Gradient do_[0] = -161498037414264832.000 +Backward Time Step 2: + Gradient di[0] = -4157474923872256.000, df[0] = -3132513459372032.000, dc_hat[0] = -2292916877787136.000 + Gradient do_[0] = -184905969954717696.000 +Backward Time Step 1: + Gradient di[0] = -5250202919239680.000, df[0] = -3756496240246784.000, dc_hat[0] = -3180800971374592.000 + Gradient do_[0] = -166049465797443584.000 +Backward Time Step 0: + Gradient di[0] = -6134809952780288.000, df[0] = -4483934851170304.000, dc_hat[0] = -6202088467988480.000 + Gradient do_[0] = -95175083709956096.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1325226000384.000, df[0] = 971608293376.000, dc_hat[0] = 575621300224.000 + Gradient do_[0] = 82194458476544.000 +Backward Time Step 3: + Gradient di[0] = 2080582664192.000, df[0] = 1501308649472.000, dc_hat[0] = 815117565952.000 + Gradient do_[0] = 110100085735424.000 +Backward Time Step 2: + Gradient di[0] = 2649488883712.000, df[0] = 1902929379328.000, dc_hat[0] = 1354843422720.000 + Gradient do_[0] = 120074945953792.000 +Backward Time Step 1: + Gradient di[0] = 3326656643072.000, df[0] = 2294048096256.000, dc_hat[0] = 1851828076544.000 + Gradient do_[0] = 105924605771776.000 +Backward Time Step 0: + Gradient di[0] = 4029252108288.000, df[0] = 2866789744640.000, dc_hat[0] = 3780797792256.000 + Gradient do_[0] = 62291869433856.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2043501147586560.000, df[0] = -1604252292087808.000, dc_hat[0] = -924043490361344.000 + Gradient do_[0] = -119496169247211520.000 +Backward Time Step 3: + Gradient di[0] = -3210728303493120.000, df[0] = -2450951004749824.000, dc_hat[0] = -1327599758344192.000 + Gradient do_[0] = -163497018273038336.000 +Backward Time Step 2: + Gradient di[0] = -4208988493185024.000, df[0] = -3171321441681408.000, dc_hat[0] = -2321166756741120.000 + Gradient do_[0] = -187192421564547072.000 +Backward Time Step 1: + Gradient di[0] = -5315161078366208.000, df[0] = -3802955975229440.000, dc_hat[0] = -3219832828854272.000 + Gradient do_[0] = -168099230349524992.000 +Backward Time Step 0: + Gradient di[0] = -6210387418546176.000, df[0] = -4539174304612352.000, dc_hat[0] = -6278494862442496.000 + Gradient do_[0] = -96347592601894912.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1283910402048.000, df[0] = 941318144000.000, dc_hat[0] = 557671383040.000 + Gradient do_[0] = 79631470297088.000 +Backward Time Step 3: + Gradient di[0] = 2015687999488.000, df[0] = 1454484094976.000, dc_hat[0] = 789683372032.000 + Gradient do_[0] = 106665353412608.000 +Backward Time Step 2: + Gradient di[0] = 2566811025408.000, df[0] = 1843548651520.000, dc_hat[0] = 1312543997952.000 + Gradient do_[0] = 116327209500672.000 +Backward Time Step 1: + Gradient di[0] = 3222769762304.000, df[0] = 2222406238208.000, dc_hat[0] = 1793963065344.000 + Gradient do_[0] = 102616155553792.000 +Backward Time Step 0: + Gradient di[0] = 3903339102208.000, df[0] = 2777203081216.000, dc_hat[0] = 3662648705024.000 + Gradient do_[0] = 60345267781632.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2069838692352000.000, df[0] = -1624929606828032.000, dc_hat[0] = -935946052698112.000 + Gradient do_[0] = -121035777583874048.000 +Backward Time Step 3: + Gradient di[0] = -3252072195555328.000, df[0] = -2482513645666304.000, dc_hat[0] = -1344676950966272.000 + Gradient do_[0] = -165601243010433024.000 +Backward Time Step 2: + Gradient di[0] = -4263140783030272.000, df[0] = -3212123094122496.000, dc_hat[0] = -2350992083386368.000 + Gradient do_[0] = -189599544575524864.000 +Backward Time Step 1: + Gradient di[0] = -5383434952245248.000, df[0] = -3851803175157760.000, dc_hat[0] = -3261131892195328.000 + Gradient do_[0] = -170257640394326016.000 +Backward Time Step 0: + Gradient di[0] = -6289980678733824.000, df[0] = -4597349099765760.000, dc_hat[0] = -6358960537862144.000 + Gradient do_[0] = -97582412879364096.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1241858572288.000, df[0] = 910487977984.000, dc_hat[0] = 539401453568.000 + Gradient do_[0] = 77022906810368.000 +Backward Time Step 3: + Gradient di[0] = 1949639376896.000, df[0] = 1406826315776.000, dc_hat[0] = 763796979712.000 + Gradient do_[0] = 103169560412160.000 +Backward Time Step 2: + Gradient di[0] = 2482672762880.000, df[0] = 1783118692352.000, dc_hat[0] = 1269497593856.000 + Gradient do_[0] = 112513295319040.000 +Backward Time Step 1: + Gradient di[0] = 3117049970688.000, df[0] = 2149500715008.000, dc_hat[0] = 1735079755776.000 + Gradient do_[0] = 99249412898816.000 +Backward Time Step 0: + Gradient di[0] = 3775215435776.000, df[0] = 2686044078080.000, dc_hat[0] = 3542425534464.000 + Gradient do_[0] = 58364486746112.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2097083012087808.000, df[0] = -1646319080833024.000, dc_hat[0] = -948258717302784.000 + Gradient do_[0] = -122628248378015744.000 +Backward Time Step 3: + Gradient di[0] = -3294829131857920.000, df[0] = -2515155933986816.000, dc_hat[0] = -1362339869753344.000 + Gradient do_[0] = -167777502939316224.000 +Backward Time Step 2: + Gradient di[0] = -4319148498747392.000, df[0] = -3254322758418432.000, dc_hat[0] = -2381839880683520.000 + Gradient do_[0] = -192089302757277696.000 +Backward Time Step 1: + Gradient di[0] = -5454051730784256.000, df[0] = -3902324607025152.000, dc_hat[0] = -3303847221002240.000 + Gradient do_[0] = -172490095675310080.000 +Backward Time Step 0: + Gradient di[0] = -6372329160441856.000, df[0] = -4657537160839168.000, dc_hat[0] = -6442211499573248.000 + Gradient do_[0] = -98859942311624704.000 +Epoch 500, Train Loss=0.011462, Weight Norm=12.689054 +Sample Predictions at Epoch 500: +------------------------------------------------------------- +| Day | Date | Predicted Close | Actual Close | Error | +------------------------------------------------------------- +| 192 | 2024-10-11 | 57.17 | 63.87 | 6.70 | +| 193 | 2024-10-14 | 56.58 | 66.55 | 9.97 | +| 194 | 2024-10-15 | 56.78 | 66.00 | 9.22 | +| 195 | 2024-10-16 | 57.72 | 67.20 | 9.48 | +| 196 | 2024-10-17 | 57.27 | 66.76 | 9.49 | +------------------------------------------------------------- +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1199029616640.000, df[0] = 879087976448.000, dc_hat[0] = 520794767360.000 + Gradient do_[0] = 74366159159296.000 +Backward Time Step 3: + Gradient di[0] = 1882376503296.000, df[0] = 1358292451328.000, dc_hat[0] = 737436434432.000 + Gradient do_[0] = 99609535840256.000 +Backward Time Step 2: + Gradient di[0] = 2396986540032.000, df[0] = 1721577111552.000, dc_hat[0] = 1225661874176.000 + Gradient do_[0] = 108629294317568.000 +Backward Time Step 1: + Gradient di[0] = 3009398964224.000, df[0] = 2075263238144.000, dc_hat[0] = 1675124277248.000 + Gradient do_[0] = 95821181747200.000 +Backward Time Step 0: + Gradient di[0] = 3644745318400.000, df[0] = 2593215217664.000, dc_hat[0] = 3420000354304.000 + Gradient do_[0] = 56347433369600.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2125064052932608.000, df[0] = -1668287435898880.000, dc_hat[0] = -960904980070400.000 + Gradient do_[0] = -124263943723024384.000 +Backward Time Step 3: + Gradient di[0] = -3338746246201344.000, df[0] = -2548683254005760.000, dc_hat[0] = -1380480335216640.000 + Gradient do_[0] = -170012844438323200.000 +Backward Time Step 2: + Gradient di[0] = -4376667237646336.000, df[0] = -3297661125918720.000, dc_hat[0] = -2413521170071552.000 + Gradient do_[0] = -194646062428848128.000 +Backward Time Step 1: + Gradient di[0] = -5526585675350016.000, df[0] = -3954218549379072.000, dc_hat[0] = -3347724875333632.000 + Gradient do_[0] = -174783161534775296.000 +Backward Time Step 0: + Gradient di[0] = -6456899213983744.000, df[0] = -4719349793292288.000, dc_hat[0] = -6527709802921984.000 + Gradient do_[0] = -100171960331272192.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1154357264384.000, df[0] = 846336360448.000, dc_hat[0] = 501387296768.000 + Gradient do_[0] = 71595125112832.000 +Backward Time Step 3: + Gradient di[0] = 1812217856000.000, df[0] = 1307668643840.000, dc_hat[0] = 709941854208.000 + Gradient do_[0] = 95896284954624.000 +Backward Time Step 2: + Gradient di[0] = 2307620077568.000, df[0] = 1657392594944.000, dc_hat[0] = 1179945664512.000 + Gradient do_[0] = 104578594897920.000 +Backward Time Step 1: + Gradient di[0] = 2897129242624.000, df[0] = 1997841104896.000, dc_hat[0] = 1612600311808.000 + Gradient do_[0] = 92245931851776.000 +Backward Time Step 0: + Gradient di[0] = 3508695990272.000, df[0] = 2496416972800.000, dc_hat[0] = 3292340420608.000 + Gradient do_[0] = 54244124131328.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2153544383725568.000, df[0] = -1690646901424128.000, dc_hat[0] = -973775789096960.000 + Gradient do_[0] = -125928733176496128.000 +Backward Time Step 3: + Gradient di[0] = -3383442360238080.000, df[0] = -2582805426995200.000, dc_hat[0] = -1398942654791680.000 + Gradient do_[0] = -172287802715668480.000 +Backward Time Step 2: + Gradient di[0] = -4435213815906304.000, df[0] = -3341773929709568.000, dc_hat[0] = -2445768052965376.000 + Gradient do_[0] = -197248709531009024.000 +Backward Time Step 1: + Gradient di[0] = -5600409720717312.000, df[0] = -4007035909701632.000, dc_hat[0] = -3392381797793792.000 + Gradient do_[0] = -177116943684206592.000 +Backward Time Step 0: + Gradient di[0] = -6542970358595584.000, df[0] = -4782259253018624.000, dc_hat[0] = -6614724766597120.000 + Gradient do_[0] = -101507274253533184.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1108524531712.000, df[0] = 812734021632.000, dc_hat[0] = 481476542464.000 + Gradient do_[0] = 68752137781248.000 +Backward Time Step 3: + Gradient di[0] = 1740239536128.000, df[0] = 1255731888128.000, dc_hat[0] = 681734897664.000 + Gradient do_[0] = 92086850289664.000 +Backward Time Step 2: + Gradient di[0] = 2215935475712.000, df[0] = 1591542546432.000, dc_hat[0] = 1133045481472.000 + Gradient do_[0] = 100422861717504.000 +Backward Time Step 1: + Gradient di[0] = 2781950771200.000, df[0] = 1918413832192.000, dc_hat[0] = 1548460490752.000 + Gradient do_[0] = 88578122055680.000 +Backward Time Step 0: + Gradient di[0] = 3369127903232.000, df[0] = 2397114990592.000, dc_hat[0] = 3161378258944.000 + Gradient do_[0] = 52086410575872.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2182527359909888.000, df[0] = -1713401235505152.000, dc_hat[0] = -986874097172480.000 + Gradient do_[0] = -127622917386141696.000 +Backward Time Step 3: + Gradient di[0] = -3428929821999104.000, df[0] = -2617531848196096.000, dc_hat[0] = -1417732734058496.000 + Gradient do_[0] = -174603047786250240.000 +Backward Time Step 2: + Gradient di[0] = -4494789575704576.000, df[0] = -3386661706661888.000, dc_hat[0] = -2478581334671360.000 + Gradient do_[0] = -199896969185853440.000 +Backward Time Step 1: + Gradient di[0] = -5675528698724352.000, df[0] = -4060779640782848.000, dc_hat[0] = -3437822551785472.000 + Gradient do_[0] = -179491734181380096.000 +Backward Time Step 0: + Gradient di[0] = -6630567290339328.000, df[0] = -4846283793629184.000, dc_hat[0] = -6703282160402432.000 + Gradient do_[0] = -102866244855660544.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1060792631296.000, df[0] = 777739108352.000, dc_hat[0] = 460741181440.000 + Gradient do_[0] = 65791403753472.000 +Backward Time Step 3: + Gradient di[0] = 1665281294336.000, df[0] = 1201644634112.000, dc_hat[0] = 652361269248.000 + Gradient do_[0] = 88119844012032.000 +Backward Time Step 2: + Gradient di[0] = 2120462106624.000, df[0] = 1522971443200.000, dc_hat[0] = 1084210872320.000 + Gradient do_[0] = 96095489228800.000 +Backward Time Step 1: + Gradient di[0] = 2662025920512.000, df[0] = 1835712643072.000, dc_hat[0] = 1481680748544.000 + Gradient do_[0] = 84759208263680.000 +Backward Time Step 0: + Gradient di[0] = 3223819911168.000, df[0] = 2293729067008.000, dc_hat[0] = 3025030086656.000 + Gradient do_[0] = 49839962324992.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2211894467231744.000, df[0] = -1736457559474176.000, dc_hat[0] = -1000145881661440.000 + Gradient do_[0] = -129339590044549120.000 +Backward Time Step 3: + Gradient di[0] = -3475027169116160.000, df[0] = -2652723736477696.000, dc_hat[0] = -1436774471565312.000 + Gradient do_[0] = -176949233801232384.000 +Backward Time Step 2: + Gradient di[0] = -4555171447177216.000, df[0] = -3432156953051136.000, dc_hat[0] = -2511836729573376.000 + Gradient do_[0] = -202581014508208128.000 +Backward Time Step 1: + Gradient di[0] = -5751650014724096.000, df[0] = -4115240362967040.000, dc_hat[0] = -3483869433036800.000 + Gradient do_[0] = -181898204357328896.000 +Backward Time Step 0: + Gradient di[0] = -6719309904609280.000, df[0] = -4911145852862464.000, dc_hat[0] = -6792998121635840.000 + Gradient do_[0] = -104242988032458752.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1011394215936.000, df[0] = 741522407424.000, dc_hat[0] = 439281647616.000 + Gradient do_[0] = 62727305297920.000 +Backward Time Step 3: + Gradient di[0] = 1587710918656.000, df[0] = 1145672695808.000, dc_hat[0] = 621966000128.000 + Gradient do_[0] = 84014568308736.000 +Backward Time Step 2: + Gradient di[0] = 2021659115520.000, df[0] = 1452009062400.000, dc_hat[0] = 1033674817536.000 + Gradient do_[0] = 91617289568256.000 +Backward Time Step 1: + Gradient di[0] = 2537927475200.000, df[0] = 1750133899264.000, dc_hat[0] = 1412580900864.000 + Gradient do_[0] = 80807444086784.000 +Backward Time Step 0: + Gradient di[0] = 3073464598528.000, df[0] = 2186752294912.000, dc_hat[0] = 2883946545152.000 + Gradient do_[0] = 47515487436800.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2241602756018176.000, df[0] = -1759781647810560.000, dc_hat[0] = -1013571882319872.000 + Gradient do_[0] = -131076114041798656.000 +Backward Time Step 3: + Gradient di[0] = -3521646623195136.000, df[0] = -2688314519846912.000, dc_hat[0] = -1456031091654656.000 + Gradient do_[0] = -179322100153057280.000 +Backward Time Step 2: + Gradient di[0] = -4616227628515328.000, df[0] = -3478160884629504.000, dc_hat[0] = -2545465518194688.000 + Gradient do_[0] = -205295210500980736.000 +Backward Time Step 1: + Gradient di[0] = -5828639450988544.000, df[0] = -4170322513231872.000, dc_hat[0] = -3530439763427328.000 + Gradient do_[0] = -184332162323972096.000 +Backward Time Step 0: + Gradient di[0] = -6809075794837504.000, df[0] = -4976755773276160.000, dc_hat[0] = -6883748096245760.000 + Gradient do_[0] = -105635622588252160.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 960371359744.000, df[0] = 704114720768.000, dc_hat[0] = 417117863936.000 + Gradient do_[0] = 59562539352064.000 +Backward Time Step 3: + Gradient di[0] = 1507593682944.000, df[0] = 1087862472704.000, dc_hat[0] = 590572814336.000 + Gradient do_[0] = 79774571560960.000 +Backward Time Step 2: + Gradient di[0] = 1919617990656.000, df[0] = 1378720940032.000, dc_hat[0] = 981485617152.000 + Gradient do_[0] = 86992448651264.000 +Backward Time Step 1: + Gradient di[0] = 2409766060032.000, df[0] = 1661753884672.000, dc_hat[0] = 1341223337984.000 + Gradient do_[0] = 76726428237824.000 +Backward Time Step 0: + Gradient di[0] = 2918193561600.000, df[0] = 2076277866496.000, dc_hat[0] = 2738249793536.000 + Gradient do_[0] = 45115011760128.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2271610081902592.000, df[0] = -1783340214517760.000, dc_hat[0] = -1027132906012672.000 + Gradient do_[0] = -132830092786139136.000 +Backward Time Step 3: + Gradient di[0] = -3568739328983040.000, df[0] = -2724266885775360.000, dc_hat[0] = -1475482193231872.000 + Gradient do_[0] = -181718966782132224.000 +Backward Time Step 2: + Gradient di[0] = -4677915170045952.000, df[0] = -3524639678529536.000, dc_hat[0] = -2579438709506048.000 + Gradient do_[0] = -208037186342223872.000 +Backward Time Step 1: + Gradient di[0] = -5906413255655424.000, df[0] = -4225964619857920.000, dc_hat[0] = -3577483882397696.000 + Gradient do_[0] = -186790859302240256.000 +Backward Time Step 0: + Gradient di[0] = -6899746849423360.000, df[0] = -5043027655524352.000, dc_hat[0] = -6975413435760640.000 + Gradient do_[0] = -107042284507234304.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 907439964160.000, df[0] = 665307578368.000, dc_hat[0] = 394124820480.000 + Gradient do_[0] = 56279372398592.000 +Backward Time Step 3: + Gradient di[0] = 1424479092736.000, df[0] = 1027889233920.000, dc_hat[0] = 558006992896.000 + Gradient do_[0] = 75376063676416.000 +Backward Time Step 2: + Gradient di[0] = 1813766209536.000, df[0] = 1302695641088.000, dc_hat[0] = 927348621312.000 + Gradient do_[0] = 82194928238592.000 +Backward Time Step 1: + Gradient di[0] = 2276832837632.000, df[0] = 1570082783232.000, dc_hat[0] = 1267211436032.000 + Gradient do_[0] = 72493461143552.000 +Backward Time Step 0: + Gradient di[0] = 2757146443776.000, df[0] = 1961693937664.000, dc_hat[0] = 2587133476864.000 + Gradient do_[0] = 42625239351296.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2301942483124224.000, df[0] = -1807154465996800.000, dc_hat[0] = -1040840898117632.000 + Gradient do_[0] = -134603166955077632.000 +Backward Time Step 3: + Gradient di[0] = -3616345014927360.000, df[0] = -2760610093727744.000, dc_hat[0] = -1495146164125696.000 + Gradient do_[0] = -184141860913020928.000 +Backward Time Step 2: + Gradient di[0] = -4740257157218304.000, df[0] = -3571611856797696.000, dc_hat[0] = -2613773751812096.000 + Gradient do_[0] = -210808402320818176.000 +Backward Time Step 1: + Gradient di[0] = -5985010620301312.000, df[0] = -4282196747616256.000, dc_hat[0] = -3625027559751680.000 + Gradient do_[0] = -189275566602452992.000 +Backward Time Step 0: + Gradient di[0] = -6991380513554432.000, df[0] = -5110002838667264.000, dc_hat[0] = -7068052659109888.000 + Gradient do_[0] = -108463884322471936.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 852902412288.000, df[0] = 625322885120.000, dc_hat[0] = 370434932736.000 + Gradient do_[0] = 52896678805504.000 +Backward Time Step 3: + Gradient di[0] = 1338848706560.000, df[0] = 966100647936.000, dc_hat[0] = 524456460288.000 + Gradient do_[0] = 70844537634816.000 +Backward Time Step 2: + Gradient di[0] = 1704711684096.000, df[0] = 1224370159616.000, dc_hat[0] = 871576764416.000 + Gradient do_[0] = 77252343627776.000 +Backward Time Step 1: + Gradient di[0] = 2139881603072.000, df[0] = 1475641475072.000, dc_hat[0] = 1190966853632.000 + Gradient do_[0] = 68132643274752.000 +Backward Time Step 0: + Gradient di[0] = 2591250710528.000, df[0] = 1843659931648.000, dc_hat[0] = 2431467126784.000 + Gradient do_[0] = 40060502147072.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2332552715042816.000, df[0] = -1831186821283840.000, dc_hat[0] = -1054674048253952.000 + Gradient do_[0] = -136392398790983680.000 +Backward Time Step 3: + Gradient di[0] = -3664381271343104.000, df[0] = -2797282403549184.000, dc_hat[0] = -1514987436638208.000 + Gradient do_[0] = -186586848355680256.000 +Backward Time Step 2: + Gradient di[0] = -4803176280621056.000, df[0] = -3619019168940032.000, dc_hat[0] = -2648425816391680.000 + Gradient do_[0] = -213605216304496640.000 +Backward Time Step 1: + Gradient di[0] = -6064333297549312.000, df[0] = -4338946687369216.000, dc_hat[0] = -3673005297238016.000 + Gradient do_[0] = -191783174668288000.000 +Backward Time Step 0: + Gradient di[0] = -7083855454404608.000, df[0] = -5177592202133504.000, dc_hat[0] = -7161541212241920.000 + Gradient do_[0] = -109898532248354816.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 796560523264.000, df[0] = 584015085568.000, dc_hat[0] = 345961463808.000 + Gradient do_[0] = 49402131513344.000 +Backward Time Step 3: + Gradient di[0] = 1250385723392.000, df[0] = 902267928576.000, dc_hat[0] = 489797451776.000 + Gradient do_[0] = 66163123945472.000 +Backward Time Step 2: + Gradient di[0] = 1592053858304.000, df[0] = 1143456661504.000, dc_hat[0] = 813964460032.000 + Gradient do_[0] = 72146567036928.000 +Backward Time Step 1: + Gradient di[0] = 1998418345984.000, df[0] = 1378088779776.000, dc_hat[0] = 1112214077440.000 + Gradient do_[0] = 63628220825600.000 +Backward Time Step 0: + Gradient di[0] = 2419897925632.000, df[0] = 1721743441920.000, dc_hat[0] = 2270680580096.000 + Gradient do_[0] = 37411404906496.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2363466547462144.000, df[0] = -1855457413038080.000, dc_hat[0] = -1068644234690560.000 + Gradient do_[0] = -138199394611625984.000 +Backward Time Step 3: + Gradient di[0] = -3712889974161408.000, df[0] = -2834315490623488.000, dc_hat[0] = -1535023056420864.000 + Gradient do_[0] = -189055750176243712.000 +Backward Time Step 2: + Gradient di[0] = -4866707973734400.000, df[0] = -3666888458502144.000, dc_hat[0] = -2683415035904000.000 + Gradient do_[0] = -216429226021093376.000 +Backward Time Step 1: + Gradient di[0] = -6144431216394240.000, df[0] = -4396252825387008.000, dc_hat[0] = -3721455212691456.000 + Gradient do_[0] = -194315349947056128.000 +Backward Time Step 0: + Gradient di[0] = -7177230727774208.000, df[0] = -5245840843079680.000, dc_hat[0] = -7255940835311616.000 + Gradient do_[0] = -111347155997818880.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 737958363136.000, df[0] = 541050404864.000, dc_hat[0] = 320507609088.000 + Gradient do_[0] = 45767465107456.000 +Backward Time Step 3: + Gradient di[0] = 1158379995136.000, df[0] = 835878649856.000, dc_hat[0] = 453753274368.000 + Gradient do_[0] = 61294468136960.000 +Backward Time Step 2: + Gradient di[0] = 1474885189632.000, df[0] = 1059303456768.000, dc_hat[0] = 754054856704.000 + Gradient do_[0] = 66836687224832.000 +Backward Time Step 1: + Gradient di[0] = 1851295137792.000, df[0] = 1276634071040.000, dc_hat[0] = 1030331891712.000 + Gradient do_[0] = 58943875317760.000 +Backward Time Step 0: + Gradient di[0] = 2241710522368.000, df[0] = 1594964049920.000, dc_hat[0] = 2103480418304.000 + Gradient do_[0] = 34656640565248.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2394675927318528.000, df[0] = -1879960872550400.000, dc_hat[0] = -1082751188992000.000 + Gradient do_[0] = -140023905308901376.000 +Backward Time Step 3: + Gradient di[0] = -3761867633721344.000, df[0] = -2871707207467008.000, dc_hat[0] = -1555261479190528.000 + Gradient do_[0] = -191548892792225792.000 +Backward Time Step 2: + Gradient di[0] = -4930841499140096.000, df[0] = -3715211135549440.000, dc_hat[0] = -2718762079879168.000 + Gradient do_[0] = -219280878147207168.000 +Backward Time Step 1: + Gradient di[0] = -6225288807579648.000, df[0] = -4454104961122304.000, dc_hat[0] = -3770424282316800.000 + Gradient do_[0] = -196872384496533504.000 +Backward Time Step 0: + Gradient di[0] = -7271528345370624.000, df[0] = -5314762720149504.000, dc_hat[0] = -7351273003155456.000 + Gradient do_[0] = -112810073398444032.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 677679595520.000, df[0] = 496856137728.000, dc_hat[0] = 294325583872.000 + Gradient do_[0] = 42028872237056.000 +Backward Time Step 3: + Gradient di[0] = 1063744503808.000, df[0] = 767591383040.000, dc_hat[0] = 416679919616.000 + Gradient do_[0] = 56286691459072.000 +Backward Time Step 2: + Gradient di[0] = 1354373267456.000, df[0] = 972749144064.000, dc_hat[0] = 692437057536.000 + Gradient do_[0] = 61375317540864.000 +Backward Time Step 1: + Gradient di[0] = 1699986276352.000, df[0] = 1172293025792.000, dc_hat[0] = 946120032256.000 + Gradient do_[0] = 54126289354752.000 +Backward Time Step 0: + Gradient di[0] = 2058458234880.000, df[0] = 1464581226496.000, dc_hat[0] = 1931528110080.000 + Gradient do_[0] = 31823579054080.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2426130120310784.000, df[0] = -1904655860760576.000, dc_hat[0] = -1096968268939264.000 + Gradient do_[0] = -141862744017076224.000 +Backward Time Step 3: + Gradient di[0] = -3811232645644288.000, df[0] = -2909394203312128.000, dc_hat[0] = -1575658413096960.000 + Gradient do_[0] = -194061878157115392.000 +Backward Time Step 2: + Gradient di[0] = -4995493641846784.000, df[0] = -3763925728362496.000, dc_hat[0] = -2754396618227712.000 + Gradient do_[0] = -222155448218812416.000 +Backward Time Step 1: + Gradient di[0] = -6306787422633984.000, df[0] = -4512415584616448.000, dc_hat[0] = -3819777751515136.000 + Gradient do_[0] = -199449605392302080.000 +Backward Time Step 0: + Gradient di[0] = -7366558791761920.000, df[0] = -5384220931260416.000, dc_hat[0] = -7447345515986944.000 + Gradient do_[0] = -114284381052338176.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 615523418112.000, df[0] = 451285581824.000, dc_hat[0] = 267328716800.000 + Gradient do_[0] = 38173866459136.000 +Backward Time Step 3: + Gradient di[0] = 966163496960.000, df[0] = 697178587136.000, dc_hat[0] = 378452836352.000 + Gradient do_[0] = 51123087998976.000 +Backward Time Step 2: + Gradient di[0] = 1230113603584.000, df[0] = 883503005696.000, dc_hat[0] = 628904230912.000 + Gradient do_[0] = 55744158236672.000 +Backward Time Step 1: + Gradient di[0] = 1543978745856.000, df[0] = 1064711684096.000, dc_hat[0] = 859293417472.000 + Gradient do_[0] = 49159117733888.000 +Backward Time Step 0: + Gradient di[0] = 1869526466560.000, df[0] = 1330157453312.000, dc_hat[0] = 1754246414336.000 + Gradient do_[0] = 28902709788672.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2457847380049920.000, df[0] = -1929557678489600.000, dc_hat[0] = -1111303393378304.000 + Gradient do_[0] = -143716829859151872.000 +Backward Time Step 3: + Gradient di[0] = -3860997089525760.000, df[0] = -2947386410270720.000, dc_hat[0] = -1596220166373376.000 + Gradient do_[0] = -196595084228034560.000 +Backward Time Step 2: + Gradient di[0] = -5060663328112640.000, df[0] = -3813029552586752.000, dc_hat[0] = -2790314087546880.000 + Gradient do_[0] = -225053039315124224.000 +Backward Time Step 1: + Gradient di[0] = -6388934040879104.000, df[0] = -4571190601449472.000, dc_hat[0] = -3869526894575616.000 + Gradient do_[0] = -202047390591483904.000 +Backward Time Step 0: + Gradient di[0] = -7462361795395584.000, df[0] = -5454242856828928.000, dc_hat[0] = -7544199175995392.000 + Gradient do_[0] = -115770663075053568.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 551187513344.000, df[0] = 404116635648.000, dc_hat[0] = 239385608192.000 + Gradient do_[0] = 34183736983552.000 +Backward Time Step 3: + Gradient di[0] = 865164853248.000, df[0] = 624299474944.000, dc_hat[0] = 338888359936.000 + Gradient do_[0] = 45778726813696.000 +Backward Time Step 2: + Gradient di[0] = 1101507461120.000, df[0] = 791134928896.000, dc_hat[0] = 563150061568.000 + Gradient do_[0] = 49916034416640.000 +Backward Time Step 1: + Gradient di[0] = 1382524518400.000, df[0] = 953374539776.000, dc_hat[0] = 769435893760.000 + Gradient do_[0] = 44018503254016.000 +Backward Time Step 0: + Gradient di[0] = 1674003873792.000, df[0] = 1191044055040.000, dc_hat[0] = 1570780217344.000 + Gradient do_[0] = 25879948296192.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2489882735804416.000, df[0] = -1954709141192704.000, dc_hat[0] = -1125781795241984.000 + Gradient do_[0] = -145589598808965120.000 +Backward Time Step 3: + Gradient di[0] = -3911269144854528.000, df[0] = -2985766506463232.000, dc_hat[0] = -1616991165087744.000 + Gradient do_[0] = -199154060102729728.000 +Backward Time Step 2: + Gradient di[0] = -5126492291858432.000, df[0] = -3862630519275520.000, dc_hat[0] = -2826595018473472.000 + Gradient do_[0] = -227979939268263936.000 +Backward Time Step 1: + Gradient di[0] = -6471913345908736.000, df[0] = -4630560471252992.000, dc_hat[0] = -3919778548809728.000 + Gradient do_[0] = -204671546889863168.000 +Backward Time Step 0: + Gradient di[0] = -7559134924767232.000, df[0] = -5524974525743104.000, dc_hat[0] = -7642033162289152.000 + Gradient do_[0] = -117271994663174144.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 485126307840.000, df[0] = 355682648064.000, dc_hat[0] = 210693144576.000 + Gradient do_[0] = 30086579355648.000 +Backward Time Step 3: + Gradient di[0] = 761461080064.000, df[0] = 549468143616.000, dc_hat[0] = 298264788992.000 + Gradient do_[0] = 40291239198720.000 +Backward Time Step 2: + Gradient di[0] = 969459630080.000, df[0] = 696294768640.000, dc_hat[0] = 495636938752.000 + Gradient do_[0] = 43932029288448.000 +Backward Time Step 1: + Gradient di[0] = 1216759070720.000, df[0] = 839064354816.000, dc_hat[0] = 677179424768.000 + Gradient do_[0] = 38740655341568.000 +Backward Time Step 0: + Gradient di[0] = 1473267630080.000, df[0] = 1048221450240.000, dc_hat[0] = 1382421889024.000 + Gradient do_[0] = 22776584863744.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2522201290964992.000, df[0] = -1980083405324288.000, dc_hat[0] = -1140389515886592.000 + Gradient do_[0] = -147478886202998784.000 +Backward Time Step 3: + Gradient di[0] = -3961979286847488.000, df[0] = -3024480536363008.000, dc_hat[0] = -1637944565694464.000 + Gradient do_[0] = -201735421346971648.000 +Backward Time Step 2: + Gradient di[0] = -5192896781221888.000, df[0] = -3912666082967552.000, dc_hat[0] = -2863194045415424.000 + Gradient do_[0] = -230932523125833728.000 +Backward Time Step 1: + Gradient di[0] = -6555629774700544.000, df[0] = -4690458085163008.000, dc_hat[0] = -3970478221819904.000 + Gradient do_[0] = -207318930371379200.000 +Backward Time Step 0: + Gradient di[0] = -7656749867728896.000, df[0] = -5596321448722432.000, dc_hat[0] = -7740718625849344.000 + Gradient do_[0] = -118786391541809152.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 416951894016.000, df[0] = 305699258368.000, dc_hat[0] = 181083717632.000 + Gradient do_[0] = 25858444099584.000 +Backward Time Step 3: + Gradient di[0] = 654443806720.000, df[0] = 472245567488.000, dc_hat[0] = 256343769088.000 + Gradient do_[0] = 34628494688256.000 +Backward Time Step 2: + Gradient di[0] = 833197703168.000, df[0] = 598427697152.000, dc_hat[0] = 425969909760.000 + Gradient do_[0] = 37757065887744.000 +Backward Time Step 1: + Gradient di[0] = 1045711945728.000, df[0] = 721111941120.000, dc_hat[0] = 581983010816.000 + Gradient do_[0] = 33294636810240.000 +Backward Time Step 0: + Gradient di[0] = 1266143854592.000, df[0] = 900854120448.000, dc_hat[0] = 1188070031360.000 + Gradient do_[0] = 19574470737920.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2554768417357824.000, df[0] = -2005652419379200.000, dc_hat[0] = -1155108368809984.000 + Gradient do_[0] = -149382673406623744.000 +Backward Time Step 3: + Gradient di[0] = -4013080002428928.000, df[0] = -3063492798054400.000, dc_hat[0] = -1659057819615232.000 + Gradient do_[0] = -204336642519990272.000 +Backward Time Step 2: + Gradient di[0] = -5259808076726272.000, df[0] = -3963081751265280.000, dc_hat[0] = -2900070366183424.000 + Gradient do_[0] = -233907509532819456.000 +Backward Time Step 1: + Gradient di[0] = -6639972731846656.000, df[0] = -4750804523155456.000, dc_hat[0] = -4021555583516672.000 + Gradient do_[0] = -209986173781671936.000 +Backward Time Step 0: + Gradient di[0] = -7755100860710912.000, df[0] = -5668205779484672.000, dc_hat[0] = -7840148192493568.000 + Gradient do_[0] = -120312204443516928.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 346640384000.000, df[0] = 254148771840.000, dc_hat[0] = 150546284544.000 + Gradient do_[0] = 21497789808640.000 +Backward Time Step 3: + Gradient di[0] = 544075317248.000, df[0] = 392604385280.000, dc_hat[0] = 213110964224.000 + Gradient do_[0] = 28788463239168.000 +Backward Time Step 2: + Gradient di[0] = 692673642496.000, df[0] = 497499439104.000, dc_hat[0] = 354125086720.000 + Gradient do_[0] = 31389011410944.000 +Backward Time Step 1: + Gradient di[0] = 869324488704.000, df[0] = 599476928512.000, dc_hat[0] = 483815424000.000 + Gradient do_[0] = 27678591680512.000 +Backward Time Step 0: + Gradient di[0] = 1052558295040.000, df[0] = 748889178112.000, dc_hat[0] = 987654717440.000 + Gradient do_[0] = 16272456679424.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2587629480574976.000, df[0] = -2031452690579456.000, dc_hat[0] = -1169961171025920.000 + Gradient do_[0] = -151303692019040256.000 +Backward Time Step 3: + Gradient di[0] = -4064643232301056.000, df[0] = -3102859126112256.000, dc_hat[0] = -1680362065362944.000 + Gradient do_[0] = -206961451653398528.000 +Backward Time Step 2: + Gradient di[0] = -5327324962619392.000, df[0] = -4013954833580032.000, dc_hat[0] = -2937280889094144.000 + Gradient do_[0] = -236909485514293248.000 +Backward Time Step 1: + Gradient di[0] = -6725060328947712.000, df[0] = -4811683000221696.000, dc_hat[0] = -4073084722085888.000 + Gradient do_[0] = -212677005152354304.000 +Backward Time Step 0: + Gradient di[0] = -7854317289603072.000, df[0] = -5740723081052160.000, dc_hat[0] = -7940452321853440.000 + Gradient do_[0] = -121851443412992000.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 274768936960.000, df[0] = 201454534656.000, dc_hat[0] = 119331733504.000 + Gradient do_[0] = 17040438984704.000 +Backward Time Step 3: + Gradient di[0] = 431261941760.000, df[0] = 311198744576.000, dc_hat[0] = 168921235456.000 + Gradient do_[0] = 22819117203456.000 +Backward Time Step 2: + Gradient di[0] = 549040553984.000, df[0] = 394338009088.000, dc_hat[0] = 280691769344.000 + Gradient do_[0] = 24880099622912.000 +Backward Time Step 1: + Gradient di[0] = 689044193280.000, df[0] = 475157626880.000, dc_hat[0] = 383481446400.000 + Gradient do_[0] = 21938600673280.000 +Backward Time Step 0: + Gradient di[0] = 834265088000.000, df[0] = 593574821888.000, dc_hat[0] = 782822080512.000 + Gradient do_[0] = 12897663057920.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2620828772466688.000, df[0] = -2057518041792512.000, dc_hat[0] = -1184965639274496.000 + Gradient do_[0] = -153244364401803264.000 +Backward Time Step 3: + Gradient di[0] = -4116729911312384.000, df[0] = -3142623812386816.000, dc_hat[0] = -1701881864781824.000 + Gradient do_[0] = -209612889584041984.000 +Backward Time Step 2: + Gradient di[0] = -5395536022601728.000, df[0] = -4065350828163072.000, dc_hat[0] = -2974873395658752.000 + Gradient do_[0] = -239942247821344768.000 +Backward Time Step 1: + Gradient di[0] = -6811036984279040.000, df[0] = -4873197669318656.000, dc_hat[0] = -4125150463131648.000 + Gradient do_[0] = -215395891249414144.000 +Backward Time Step 0: + Gradient di[0] = -7954566121259008.000, df[0] = -5813995223121920.000, dc_hat[0] = -8041801202008064.000 + Gradient do_[0] = -123406694020546560.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 200138260480.000, df[0] = 146737102848.000, dc_hat[0] = 86919176192.000 + Gradient do_[0] = 12411996209152.000 +Backward Time Step 3: + Gradient di[0] = 314121420800.000, df[0] = 226670460928.000, dc_hat[0] = 123037310976.000 + Gradient do_[0] = 16620859686912.000 +Backward Time Step 2: + Gradient di[0] = 399903358976.000, df[0] = 287223218176.000, dc_hat[0] = 204445548544.000 + Gradient do_[0] = 18121800810496.000 +Backward Time Step 1: + Gradient di[0] = 501864398848.000, df[0] = 346080346112.000, dc_hat[0] = 279307780096.000 + Gradient do_[0] = 15978943479808.000 +Backward Time Step 0: + Gradient di[0] = 607627182080.000, df[0] = 432323264512.000, dc_hat[0] = 570159267840.000 + Gradient do_[0] = 9393861033984.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2654188320325632.000, df[0] = -2083709691887616.000, dc_hat[0] = -1200043121967104.000 + Gradient do_[0] = -155194537252225024.000 +Backward Time Step 3: + Gradient di[0] = -4169071604006912.000, df[0] = -3182583651237888.000, dc_hat[0] = -1723507964641280.000 + Gradient do_[0] = -212277212416573440.000 +Backward Time Step 2: + Gradient di[0] = -5464070278873088.000, df[0] = -4116989756833792.000, dc_hat[0] = -3012643874930688.000 + Gradient do_[0] = -242989441218510848.000 +Backward Time Step 1: + Gradient di[0] = -6897421124632576.000, df[0] = -4935004396191744.000, dc_hat[0] = -4177464238538752.000 + Gradient do_[0] = -218127576349016064.000 +Backward Time Step 0: + Gradient di[0] = -8055286325575680.000, df[0] = -5887610962575360.000, dc_hat[0] = -8143625212919808.000 + Gradient do_[0] = -124969246072504320.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 123744026624.000, df[0] = 90726604800.000, dc_hat[0] = 53741150208.000 + Gradient do_[0] = 7674214219776.000 +Backward Time Step 3: + Gradient di[0] = 194215936000.000, df[0] = 140146737152.000, dc_hat[0] = 76071247872.000 + Gradient do_[0] = 10276350984192.000 +Backward Time Step 2: + Gradient di[0] = 247249698816.000, df[0] = 177582653440.000, dc_hat[0] = 126402412544.000 + Gradient do_[0] = 11204199186432.000 +Backward Time Step 1: + Gradient di[0] = 310281863168.000, df[0] = 213967028224.000, dc_hat[0] = 172684066816.000 + Gradient do_[0] = 9879112646656.000 +Backward Time Step 0: + Gradient di[0] = 375664803840.000, df[0] = 267283349504.000, dc_hat[0] = 352500318208.000 + Gradient do_[0] = 5807744221184.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2687958842867712.000, df[0] = -2110224672489472.000, dc_hat[0] = -1215305556688896.000 + Gradient do_[0] = -157168590120812544.000 +Backward Time Step 3: + Gradient di[0] = -4222054320570368.000, df[0] = -3223033653231616.000, dc_hat[0] = -1745398070771712.000 + Gradient do_[0] = -214974211360292864.000 +Backward Time Step 2: + Gradient di[0] = -5533445274992640.000, df[0] = -4169262998487040.000, dc_hat[0] = -3050877942235136.000 + Gradient do_[0] = -246073983651282944.000 +Backward Time Step 1: + Gradient di[0] = -6984863974424576.000, df[0] = -4997567574179840.000, dc_hat[0] = -4230418500943872.000 + Gradient do_[0] = -220892865272741888.000 +Backward Time Step 0: + Gradient di[0] = -8157238648635392.000, df[0] = -5962128645160960.000, dc_hat[0] = -8246695838089216.000 + Gradient do_[0] = -126550945088667648.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 45274275840.000, df[0] = 33194215424.000, dc_hat[0] = 19662186496.000 + Gradient do_[0] = 2807757012992.000 +Backward Time Step 3: + Gradient di[0] = 71056785408.000, df[0] = 51274842112.000, dc_hat[0] = 27831568384.000 + Gradient do_[0] = 3759739240448.000 +Backward Time Step 2: + Gradient di[0] = 90458726400.000, df[0] = 64970395648.000, dc_hat[0] = 46245285888.000 + Gradient do_[0] = 4099152281600.000 +Backward Time Step 1: + Gradient di[0] = 113516748800.000, df[0] = 78279933952.000, dc_hat[0] = 63176474624.000 + Gradient do_[0] = 3614275272704.000 +Backward Time Step 0: + Gradient di[0] = 137435185152.000, df[0] = 97784332288.000, dc_hat[0] = 128960552960.000 + Gradient do_[0] = 2124735447040.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2721959951466496.000, df[0] = -2136919907500032.000, dc_hat[0] = -1230672949673984.000 + Gradient do_[0] = -159156197906186240.000 +Backward Time Step 3: + Gradient di[0] = -4275404256837632.000, df[0] = -3263763096535040.000, dc_hat[0] = -1767438903410688.000 + Gradient do_[0] = -217689730202992640.000 +Backward Time Step 2: + Gradient di[0] = -5603300233707520.000, df[0] = -4221897285828608.000, dc_hat[0] = -3089375344721920.000 + Gradient do_[0] = -249179760402366464.000 +Backward Time Step 1: + Gradient di[0] = -7072899529703424.000, df[0] = -5060555417059328.000, dc_hat[0] = -4283731124682752.000 + Gradient do_[0] = -223676914613616640.000 +Backward Time Step 0: + Gradient di[0] = -8259883535171584.000, df[0] = -6037152060145664.000, dc_hat[0] = -8350467079798784.000 + Gradient do_[0] = -128143372933136384.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -35373416448.000, df[0] = -25935124480.000, dc_hat[0] = -15362240512.000 + Gradient do_[0] = -2193729650688.000 +Backward Time Step 3: + Gradient di[0] = -55516766208.000, df[0] = -40061165568.000, dc_hat[0] = -21744658432.000 + Gradient do_[0] = -2937477136384.000 +Backward Time Step 2: + Gradient di[0] = -70674472960.000, df[0] = -50760744960.000, dc_hat[0] = -36130746368.000 + Gradient do_[0] = -3202617180160.000 +Backward Time Step 1: + Gradient di[0] = -88687484928.000, df[0] = -61157933056.000, dc_hat[0] = -49357950976.000 + Gradient do_[0] = -2823732068352.000 +Backward Time Step 0: + Gradient di[0] = -107372666880.000, df[0] = -76395036672.000, dc_hat[0] = -100751769600.000 + Gradient do_[0] = -1659971698688.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.866 + c_state[0] = 0.725, h_state[0] = 0.092 +Time Step 2: + i_gate[0] = 0.376, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.863 + c_state[0] = 0.877, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.344, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885 + c_state[0] = 0.981, h_state[0] = 0.119 +Time Step 4: + i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.870 + c_state[0] = 1.059, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 18135912546304.000, df[0] = 12441803030528.000, dc_hat[0] = 7585777319936.000 + Gradient do_[0] = 1203732901527552.000 +Backward Time Step 3: + Gradient di[0] = 28505244958720.000, df[0] = 19421506568192.000, dc_hat[0] = 10616921128960.000 + Gradient do_[0] = 1577605643894784.000 +Backward Time Step 2: + Gradient di[0] = 34977192345600.000, df[0] = 23900119367680.000, dc_hat[0] = 16599175135232.000 + Gradient do_[0] = 1619275819253760.000 +Backward Time Step 1: + Gradient di[0] = 43587513352192.000, df[0] = 28957470621696.000, dc_hat[0] = 22271919915008.000 + Gradient do_[0] = 1398003667566592.000 +Backward Time Step 0: + Gradient di[0] = 54296229969920.000, df[0] = 37677218922496.000, dc_hat[0] = 47337355345920.000 + Gradient do_[0] = 837456211476480.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.690, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.973, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1369055821824.000, df[0] = 1003772379136.000, dc_hat[0] = 594535514112.000 + Gradient do_[0] = 84901244174336.000 +Backward Time Step 3: + Gradient di[0] = 2148711268352.000, df[0] = 1550535753728.000, dc_hat[0] = 841555574784.000 + Gradient do_[0] = 113688983896064.000 +Backward Time Step 2: + Gradient di[0] = 2735230156800.000, df[0] = 1964540297216.000, dc_hat[0] = 1398283173888.000 + Gradient do_[0] = 123945894281216.000 +Backward Time Step 1: + Gradient di[0] = 3432536080384.000, df[0] = 2367039995904.000, dc_hat[0] = 1910336651264.000 + Gradient do_[0] = 109288949284864.000 +Backward Time Step 0: + Gradient di[0] = 4157123067904.000, df[0] = 2957769179136.000, dc_hat[0] = 3900783722496.000 + Gradient do_[0] = 64268745572352.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2111750862274560.000, df[0] = -1657874287689728.000, dc_hat[0] = -954738816319488.000 + Gradient do_[0] = -123472827926970368.000 +Backward Time Step 3: + Gradient di[0] = -3316958413979648.000, df[0] = -2532127497256960.000, dc_hat[0] = -1371143613186048.000 + Gradient do_[0] = -168884728328355840.000 +Backward Time Step 2: + Gradient di[0] = -4346831173582848.000, df[0] = -3275201131315200.000, dc_hat[0] = -2396550412107776.000 + Gradient do_[0] = -193302321780752384.000 +Backward Time Step 1: + Gradient di[0] = -5487022047232000.000, df[0] = -3925883576385536.000, dc_hat[0] = -3323234434940928.000 + Gradient do_[0] = -173524220720971776.000 +Backward Time Step 0: + Gradient di[0] = -6409828519903232.000, df[0] = -4684946031509504.000, dc_hat[0] = -6480122639024128.000 + Gradient do_[0] = -99441712811737088.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1329353064448.000, df[0] = 974662270976.000, dc_hat[0] = 577283883008.000 + Gradient do_[0] = 82438072041472.000 +Backward Time Step 3: + Gradient di[0] = 2086373163008.000, df[0] = 1505551974400.000, dc_hat[0] = 817110253568.000 + Gradient do_[0] = 110388628684800.000 +Backward Time Step 2: + Gradient di[0] = 2655855837184.000, df[0] = 1907527778304.000, dc_hat[0] = 1357611663360.000 + Gradient do_[0] = 120345914769408.000 +Backward Time Step 1: + Gradient di[0] = 3332848222208.000, df[0] = 2298285916160.000, dc_hat[0] = 1854656741376.000 + Gradient do_[0] = 106111730450432.000 +Backward Time Step 0: + Gradient di[0] = 4036179787776.000, df[0] = 2871718838272.000, dc_hat[0] = 3787297914880.000 + Gradient do_[0] = 62398970986496.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2137065667952640.000, df[0] = -1677747504021504.000, dc_hat[0] = -966166918987776.000 + Gradient do_[0] = -124951567987113984.000 +Backward Time Step 3: + Gradient di[0] = -3356683103371264.000, df[0] = -2562454261334016.000, dc_hat[0] = -1387512270422016.000 + Gradient do_[0] = -170904273490542592.000 +Backward Time Step 2: + Gradient di[0] = -4398863897067520.000, df[0] = -3314400224083968.000, dc_hat[0] = -2425072484614144.000 + Gradient do_[0] = -195611485177643008.000 +Backward Time Step 1: + Gradient di[0] = -5552604587229184.000, df[0] = -3972788914225152.000, dc_hat[0] = -3362625727496192.000 + Gradient do_[0] = -175593467244707840.000 +Backward Time Step 0: + Gradient di[0] = -6486109286563840.000, df[0] = -4740700075720704.000, dc_hat[0] = -6557240387436544.000 + Gradient do_[0] = -100625139510542336.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1289449111552.000, df[0] = 945404116992.000, dc_hat[0] = 559945220096.000 + Gradient do_[0] = 79962459602944.000 +Backward Time Step 3: + Gradient di[0] = 2023719436288.000, df[0] = 1460340916224.000, dc_hat[0] = 792543428608.000 + Gradient do_[0] = 107071697584128.000 +Backward Time Step 2: + Gradient di[0] = 2576081485824.000, df[0] = 1850228473856.000, dc_hat[0] = 1316741316608.000 + Gradient do_[0] = 116728059133952.000 +Backward Time Step 1: + Gradient di[0] = 3232657309696.000, df[0] = 2229186330624.000, dc_hat[0] = 1798712066048.000 + Gradient do_[0] = 102918766198784.000 +Backward Time Step 0: + Gradient di[0] = 3914639867904.000, df[0] = 2785243824128.000, dc_hat[0] = 3673252691968.000 + Gradient do_[0] = 60519977320448.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2163623195574272.000, df[0] = -1698596617453568.000, dc_hat[0] = -978157662371840.000 + Gradient do_[0] = -126502746965671936.000 +Backward Time Step 3: + Gradient di[0] = -3398355291996160.000, df[0] = -2594268694708224.000, dc_hat[0] = -1404687307767808.000 + Gradient do_[0] = -173023066757005312.000 +Backward Time Step 2: + Gradient di[0] = -4453450247045120.000, df[0] = -3355524267507712.000, dc_hat[0] = -2455002769522688.000 + Gradient do_[0] = -198034018531278848.000 +Backward Time Step 1: + Gradient di[0] = -5621405131472896.000, df[0] = -4021996354535424.000, dc_hat[0] = -3403962103365632.000 + Gradient do_[0] = -177764298334928896.000 +Backward Time Step 0: + Gradient di[0] = -6566135264706560.000, df[0] = -4799190550970368.000, dc_hat[0] = -6638143612649472.000 + Gradient do_[0] = -101866659936993280.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1248551108608.000, df[0] = 915419430912.000, dc_hat[0] = 542182178816.000 + Gradient do_[0] = 77425954258944.000 +Backward Time Step 3: + Gradient di[0] = 1959504510976.000, df[0] = 1414004736000.000, dc_hat[0] = 767389138944.000 + Gradient do_[0] = 103673824804864.000 +Backward Time Step 2: + Gradient di[0] = 2494307762176.000, df[0] = 1791496945664.000, dc_hat[0] = 1274935508992.000 + Gradient do_[0] = 113022332829696.000 +Backward Time Step 1: + Gradient di[0] = 3129966854144.000, df[0] = 2158372192256.000, dc_hat[0] = 1741570834432.000 + Gradient do_[0] = 99649306230784.000 +Backward Time Step 0: + Gradient di[0] = 3790228160512.000, df[0] = 2696725659648.000, dc_hat[0] = 3556512366592.000 + Gradient do_[0] = 58596586946560.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2190970460307456.000, df[0] = -1720068098490368.000, dc_hat[0] = -990516699201536.000 + Gradient do_[0] = -128101316613373952.000 +Backward Time Step 3: + Gradient di[0] = -3441261746847744.000, df[0] = -2627025604968448.000, dc_hat[0] = -1422411564056576.000 + Gradient do_[0] = -175206903008198656.000 +Backward Time Step 2: + Gradient di[0] = -4509624392744960.000, df[0] = -3397850901774336.000, dc_hat[0] = -2485956330389504.000 + Gradient do_[0] = -200531456114556928.000 +Backward Time Step 1: + Gradient di[0] = -5692195319316480.000, df[0] = -4072645561679872.000, dc_hat[0] = -3446825071673344.000 + Gradient do_[0] = -180002886829211648.000 +Backward Time Step 0: + Gradient di[0] = -6648679167426560.000, df[0] = -4859521956577280.000, dc_hat[0] = -6721592142856192.000 + Gradient do_[0] = -103147230206099456.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1206771253248.000, df[0] = 884788166656.000, dc_hat[0] = 524036177920.000 + Gradient do_[0] = 74834771968000.000 +Backward Time Step 3: + Gradient di[0] = 1893906907136.000, df[0] = 1366670704640.000, dc_hat[0] = 741693456384.000 + Gradient do_[0] = 100202719477760.000 +Backward Time Step 2: + Gradient di[0] = 2410766925824.000, df[0] = 1731496116224.000, dc_hat[0] = 1232226746368.000 + Gradient do_[0] = 109236587593728.000 +Backward Time Step 1: + Gradient di[0] = 3025061281792.000, df[0] = 2086031327232.000, dc_hat[0] = 1683197001728.000 + Gradient do_[0] = 96309407121408.000 +Backward Time Step 0: + Gradient di[0] = 3663137603584.000, df[0] = 2606301446144.000, dc_hat[0] = 3437258604544.000 + Gradient do_[0] = 56631773626368.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2218989082116096.000, df[0] = -1742065981456384.000, dc_hat[0] = -1003178464116736.000 + Gradient do_[0] = -129739090722553856.000 +Backward Time Step 3: + Gradient di[0] = -3485218321203200.000, df[0] = -2660584868806656.000, dc_hat[0] = -1440570148913152.000 + Gradient do_[0] = -177444288911638528.000 +Backward Time Step 2: + Gradient di[0] = -4567170512060416.000, df[0] = -3441210475675648.000, dc_hat[0] = -2517663389581312.000 + Gradient do_[0] = -203089813513961472.000 +Backward Time Step 1: + Gradient di[0] = -5764719600205824.000, df[0] = -4124535209066496.000, dc_hat[0] = -3490737085743104.000 + Gradient do_[0] = -182296296286060544.000 +Backward Time Step 0: + Gradient di[0] = -6733257810903040.000, df[0] = -4921340494610432.000, dc_hat[0] = -6807099036139520.000 + Gradient do_[0] = -104459377074765824.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1163511070720.000, df[0] = 853071364096.000, dc_hat[0] = 505247629312.000 + Gradient do_[0] = 72151818305536.000 +Backward Time Step 3: + Gradient di[0] = 1825985658880.000, df[0] = 1317659869184.000, dc_hat[0] = 715087806464.000 + Gradient do_[0] = 96608754597888.000 +Backward Time Step 2: + Gradient di[0] = 2324278280192.000, df[0] = 1669377818624.000, dc_hat[0] = 1188011442176.000 + Gradient do_[0] = 105317346050048.000 +Backward Time Step 1: + Gradient di[0] = 2916461051904.000, df[0] = 2011141898240.000, dc_hat[0] = 1622767173632.000 + Gradient do_[0] = 92851857784832.000 +Backward Time Step 0: + Gradient di[0] = 3531576967168.000, df[0] = 2512696639488.000, dc_hat[0] = 3313810538496.000 + Gradient do_[0] = 54597859147776.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2247540950958080.000, df[0] = -1764483294822400.000, dc_hat[0] = -1016081418289152.000 + Gradient do_[0] = -131408046294302720.000 +Backward Time Step 3: + Gradient di[0] = -3530011072004096.000, df[0] = -2694782472159232.000, dc_hat[0] = -1459073673330688.000 + Gradient do_[0] = -179724263710785536.000 +Backward Time Step 2: + Gradient di[0] = -4625817216745472.000, df[0] = -3485400320442368.000, dc_hat[0] = -2549979260387328.000 + Gradient do_[0] = -205697202260017152.000 +Backward Time Step 1: + Gradient di[0] = -5838628471177216.000, df[0] = -4177415114850304.000, dc_hat[0] = -3535486081564672.000 + Gradient do_[0] = -184633376970375168.000 +Backward Time Step 0: + Gradient di[0] = -6819443309019136.000, df[0] = -4984333706199040.000, dc_hat[0] = -6894229427060736.000 + Gradient do_[0] = -105796460523552768.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1118650236928.000, df[0] = 820180942848.000, dc_hat[0] = 485764169728.000 + Gradient do_[0] = 69369627410432.000 +Backward Time Step 3: + Gradient di[0] = 1755556216832.000, df[0] = 1266838798336.000, dc_hat[0] = 687500623872.000 + Gradient do_[0] = 92882098716672.000 +Backward Time Step 2: + Gradient di[0] = 2234597244928.000, df[0] = 1604967202816.000, dc_hat[0] = 1142165209088.000 + Gradient do_[0] = 101253409406976.000 +Backward Time Step 1: + Gradient di[0] = 2803861815296.000, df[0] = 1933495631872.000, dc_hat[0] = 1560113577984.000 + Gradient do_[0] = 89266977767424.000 +Backward Time Step 0: + Gradient di[0] = 3395176890368.000, df[0] = 2415648833536.000, dc_hat[0] = 3185821089792.000 + Gradient do_[0] = 52489126674432.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2276485675089920.000, df[0] = -1787208906309632.000, dc_hat[0] = -1029162479386624.000 + Gradient do_[0] = -133100005710888960.000 +Backward Time Step 3: + Gradient di[0] = -3575421224353792.000, df[0] = -2729450911301632.000, dc_hat[0] = -1477832479866880.000 + Gradient do_[0] = -182035626130931712.000 +Backward Time Step 2: + Gradient di[0] = -4685270301540352.000, df[0] = -3530196560904192.000, dc_hat[0] = -2582738049695744.000 + Gradient do_[0] = -208340393853452288.000 +Backward Time Step 1: + Gradient di[0] = -5913555249397760.000, df[0] = -4231023286026240.000, dc_hat[0] = -3580851673628672.000 + Gradient do_[0] = -187002704269148160.000 +Backward Time Step 0: + Gradient di[0] = -6906815828721664.000, df[0] = -5048193964310528.000, dc_hat[0] = -6982560261341184.000 + Gradient do_[0] = -107151952202170368.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1072428023808.000, df[0] = 786292408320.000, dc_hat[0] = 465689772032.000 + Gradient do_[0] = 66503072284672.000 +Backward Time Step 3: + Gradient di[0] = 1682990956544.000, df[0] = 1214476320768.000, dc_hat[0] = 659077529600.000 + Gradient do_[0] = 89042507005952.000 +Backward Time Step 2: + Gradient di[0] = 2142200528896.000, df[0] = 1538605318144.000, dc_hat[0] = 1094931972096.000 + Gradient do_[0] = 97066478993408.000 +Backward Time Step 1: + Gradient di[0] = 2687863095296.000, df[0] = 1853504618496.000, dc_hat[0] = 1495568220160.000 + Gradient do_[0] = 85573876318208.000 +Backward Time Step 0: + Gradient di[0] = 3254669017088.000, df[0] = 2315678384128.000, dc_hat[0] = 3053977337856.000 + Gradient do_[0] = 50316884049920.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2305829160091648.000, df[0] = -1810247379320832.000, dc_hat[0] = -1042422385606656.000 + Gradient do_[0] = -134815235260284928.000 +Backward Time Step 3: + Gradient di[0] = -3621461931589632.000, df[0] = -2764601192087552.000, dc_hat[0] = -1496850997706752.000 + Gradient do_[0] = -184378977467498496.000 +Backward Time Step 2: + Gradient di[0] = -4745546946314240.000, df[0] = -3575613961011200.000, dc_hat[0] = -2615950494924800.000 + Gradient do_[0] = -211020281647464448.000 +Backward Time Step 1: + Gradient di[0] = -5989520335962112.000, df[0] = -4285374218108928.000, dc_hat[0] = -3626846209966080.000 + Gradient do_[0] = -189404896657670144.000 +Backward Time Step 0: + Gradient di[0] = -6995392013008896.000, df[0] = -5112934690717696.000, dc_hat[0] = -7072107645108224.000 + Gradient do_[0] = -108526118398590976.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1024842137600.000, df[0] = 751403728896.000, dc_hat[0] = 445023289344.000 + Gradient do_[0] = 63551909658624.000 +Backward Time Step 3: + Gradient di[0] = 1608289484800.000, df[0] = 1160572174336.000, dc_hat[0] = 629818458112.000 + Gradient do_[0] = 85089920745472.000 +Backward Time Step 2: + Gradient di[0] = 2047086821376.000, df[0] = 1470292426752.000, dc_hat[0] = 1046310354944.000 + Gradient do_[0] = 92756445757440.000 +Backward Time Step 1: + Gradient di[0] = 2568454930432.000, df[0] = 1771162828800.000, dc_hat[0] = 1429125595136.000 + Gradient do_[0] = 81772251447296.000 +Backward Time Step 0: + Gradient di[0] = 3110030802944.000, df[0] = 2212769038336.000, dc_hat[0] = 2918258049024.000 + Gradient do_[0] = 48080795729920.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2335488862060544.000, df[0] = -1833534423564288.000, dc_hat[0] = -1055826240339968.000 + Gradient do_[0] = -136548907399249920.000 +Backward Time Step 3: + Gradient di[0] = -3667985285775360.000, df[0] = -2800120034754560.000, dc_hat[0] = -1516068694654976.000 + Gradient do_[0] = -186747016276082688.000 +Backward Time Step 2: + Gradient di[0] = -4806454414409728.000, df[0] = -3621507834052608.000, dc_hat[0] = -2649513785294848.000 + Gradient do_[0] = -213728206987984896.000 +Backward Time Step 1: + Gradient di[0] = -6066275696508928.000, df[0] = -4340290743697408.000, dc_hat[0] = -3673320172027904.000 + Gradient do_[0] = -191832068575985664.000 +Backward Time Step 0: + Gradient di[0] = -7084895910232064.000, df[0] = -5178352948215808.000, dc_hat[0] = -7162593479229440.000 + Gradient do_[0] = -109914664145518592.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 975017017344.000, df[0] = 714873372672.000, dc_hat[0] = 423385137152.000 + Gradient do_[0] = 60461978484736.000 +Backward Time Step 3: + Gradient di[0] = 1530075283456.000, df[0] = 1104132964352.000, dc_hat[0] = 599183523840.000 + Gradient do_[0] = 80951468097536.000 +Backward Time Step 2: + Gradient di[0] = 1947503034368.000, df[0] = 1398768795648.000, dc_hat[0] = 995404546048.000 + Gradient do_[0] = 88243894747136.000 +Backward Time Step 1: + Gradient di[0] = 2443452088320.000, df[0] = 1684962541568.000, dc_hat[0] = 1359569747968.000 + Gradient do_[0] = 77792460931072.000 +Backward Time Step 0: + Gradient di[0] = 2958628487168.000, df[0] = 2105047252992.000, dc_hat[0] = 2776191467520.000 + Gradient do_[0] = 45740130828288.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2365378344779776.000, df[0] = -1857001722216448.000, dc_hat[0] = -1069333644050432.000 + Gradient do_[0] = -138295979836178432.000 +Backward Time Step 3: + Gradient di[0] = -3714878812454912.000, df[0] = -2835921539956736.000, dc_hat[0] = -1535439936684032.000 + Gradient do_[0] = -189133763962208256.000 +Backward Time Step 2: + Gradient di[0] = -4867848287551488.000, df[0] = -3667765705572352.000, dc_hat[0] = -2683339068669952.000 + Gradient do_[0] = -216457538445508608.000 +Backward Time Step 1: + Gradient di[0] = -6143639868669952.000, df[0] = -4395643208466432.000, dc_hat[0] = -3720162964406272.000 + Gradient do_[0] = -194278361688702976.000 +Backward Time Step 0: + Gradient di[0] = -7175097739640832.000, df[0] = -5244281769951232.000, dc_hat[0] = -7253784224858112.000 + Gradient do_[0] = -111314058979835904.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 923929739264.000, df[0] = 677417451520.000, dc_hat[0] = 401198710784.000 + Gradient do_[0] = 57293727072256.000 +Backward Time Step 3: + Gradient di[0] = 1449882812416.000, df[0] = 1046266118144.000, dc_hat[0] = 567775330304.000 + Gradient do_[0] = 76708417896448.000 +Backward Time Step 2: + Gradient di[0] = 1845408169984.000, df[0] = 1325441089536.000, dc_hat[0] = 943215869952.000 + Gradient do_[0] = 83617585823744.000 +Backward Time Step 1: + Gradient di[0] = 2315297226752.000, df[0] = 1596589211648.000, dc_hat[0] = 1288261468160.000 + Gradient do_[0] = 73712342663168.000 +Backward Time Step 0: + Gradient di[0] = 2803409879040.000, df[0] = 1994609917952.000, dc_hat[0] = 2630543736832.000 + Gradient do_[0] = 43340460457984.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2395648737411072.000, df[0] = -1880768192184320.000, dc_hat[0] = -1083013249105920.000 + Gradient do_[0] = -140065343153373184.000 +Backward Time Step 3: + Gradient di[0] = -3762363970879488.000, df[0] = -2872174822031360.000, dc_hat[0] = -1555055454978048.000 + Gradient do_[0] = -191550679498620928.000 +Backward Time Step 2: + Gradient di[0] = -4930008275484672.000, df[0] = -3714603397677056.000, dc_hat[0] = -2717591701291008.000 + Gradient do_[0] = -219221057842708480.000 +Backward Time Step 1: + Gradient di[0] = -6221973092827136.000, df[0] = -4451688505147392.000, dc_hat[0] = -3767590677643264.000 + Gradient do_[0] = -196755423947128832.000 +Backward Time Step 0: + Gradient di[0] = -7266433440415744.000, df[0] = -5311038983503872.000, dc_hat[0] = -7346121726754816.000 + Gradient do_[0] = -112731037410263040.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 871019184128.000, df[0] = 638624727040.000, dc_hat[0] = 378221068288.000 + Gradient do_[0] = 54012472721408.000 +Backward Time Step 3: + Gradient di[0] = 1366833364992.000, df[0] = 986337574912.000, dc_hat[0] = 535248994304.000 + Gradient do_[0] = 72314263699456.000 +Backward Time Step 2: + Gradient di[0] = 1739676712960.000, df[0] = 1249501773824.000, dc_hat[0] = 889169444864.000 + Gradient do_[0] = 78826541416448.000 +Backward Time Step 1: + Gradient di[0] = 2182589579264.000, df[0] = 1505075920896.000, dc_hat[0] = 1214419697664.000 + Gradient do_[0] = 69487294414848.000 +Backward Time Step 0: + Gradient di[0] = 2642684149760.000, df[0] = 1880254578688.000, dc_hat[0] = 2479728885760.000 + Gradient do_[0] = 40855658299392.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2426150789840896.000, df[0] = -1904716661391360.000, dc_hat[0] = -1096796872900608.000 + Gradient do_[0] = -141848269977288704.000 +Backward Time Step 3: + Gradient di[0] = -3810214201524224.000, df[0] = -2908706471673856.000, dc_hat[0] = -1574820760256512.000 + Gradient do_[0] = -193986183653490688.000 +Backward Time Step 2: + Gradient di[0] = -4992653057851392.000, df[0] = -3761805625131008.000, dc_hat[0] = -2752109548142592.000 + Gradient do_[0] = -222006137975734272.000 +Backward Time Step 1: + Gradient di[0] = -6300908149276672.000, df[0] = -4508164640735232.000, dc_hat[0] = -3815382926229504.000 + Gradient do_[0] = -199251555860348928.000 +Backward Time Step 0: + Gradient di[0] = -7358474589569024.000, df[0] = -5378311593132032.000, dc_hat[0] = -7439171656351744.000 + Gradient do_[0] = -114158959417360384.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 816155852800.000, df[0] = 598399844352.000, dc_hat[0] = 354395881472.000 + Gradient do_[0] = 50610166562816.000 +Backward Time Step 3: + Gradient di[0] = 1280720502784.000, df[0] = 924197781504.000, dc_hat[0] = 501522628608.000 + Gradient do_[0] = 67758033207296.000 +Backward Time Step 2: + Gradient di[0] = 1630049271808.000, df[0] = 1170763939840.000, dc_hat[0] = 833132167168.000 + Gradient do_[0] = 73858992308224.000 +Backward Time Step 1: + Gradient di[0] = 2045000286208.000, df[0] = 1410197094400.000, dc_hat[0] = 1137861591040.000 + Gradient do_[0] = 65106826231808.000 +Backward Time Step 0: + Gradient di[0] = 2476052578304.000, df[0] = 1761697202176.000, dc_hat[0] = 2323372310528.000 + Gradient do_[0] = 38279546142720.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2456929330790400.000, df[0] = -1928882160664576.000, dc_hat[0] = -1110705520508928.000 + Gradient do_[0] = -143647371648040960.000 +Backward Time Step 3: + Gradient di[0] = -3858498760736768.000, df[0] = -2945570175975424.000, dc_hat[0] = -1594765380419584.000 + Gradient do_[0] = -196443643681177600.000 +Backward Time Step 2: + Gradient di[0] = -5055857796579328.000, df[0] = -3809429296250880.000, dc_hat[0] = -2786934753591296.000 + Gradient do_[0] = -224816163278815232.000 +Backward Time Step 1: + Gradient di[0] = -6380554559684608.000, df[0] = -4565149729947648.000, dc_hat[0] = -3863605745287168.000 + Gradient do_[0] = -201770073143115776.000 +Backward Time Step 0: + Gradient di[0] = -7451329635024896.000, df[0] = -5446179592601600.000, dc_hat[0] = -7533045682798592.000 + Gradient do_[0] = -115599508628307968.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 759645470720.000, df[0] = 556967526400.000, dc_hat[0] = 329855500288.000 + Gradient do_[0] = 47105766850560.000 +Backward Time Step 3: + Gradient di[0] = 1192024211456.000, df[0] = 860193816576.000, dc_hat[0] = 466785894400.000 + Gradient do_[0] = 63065215205376.000 +Backward Time Step 2: + Gradient di[0] = 1517140705280.000, df[0] = 1089669300224.000, dc_hat[0] = 775418675200.000 + Gradient do_[0] = 68742780289024.000 +Backward Time Step 1: + Gradient di[0] = 1903304638464.000, df[0] = 1312485933056.000, dc_hat[0] = 1059018702848.000 + Gradient do_[0] = 60595634176000.000 +Backward Time Step 0: + Gradient di[0] = 2304457310208.000, df[0] = 1639608090624.000, dc_hat[0] = 2162358353920.000 + Gradient do_[0] = 35626703388672.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2487958053584896.000, df[0] = -1953244557344768.000, dc_hat[0] = -1124727917641728.000 + Gradient do_[0] = -145461007488122880.000 +Backward Time Step 3: + Gradient di[0] = -3907168256393216.000, df[0] = -2982727548665856.000, dc_hat[0] = -1614869451243520.000 + Gradient do_[0] = -198920894918164480.000 +Backward Time Step 2: + Gradient di[0] = -5119574710157312.000, df[0] = -3857438977556480.000, dc_hat[0] = -2822041547833344.000 + Gradient do_[0] = -227648659850788864.000 +Backward Time Step 1: + Gradient di[0] = -6460836625252352.000, df[0] = -4622589011951616.000, dc_hat[0] = -3912213500788736.000 + Gradient do_[0] = -204308742412435456.000 +Backward Time Step 0: + Gradient di[0] = -7544928783564800.000, df[0] = -5514591442305024.000, dc_hat[0] = -7627671328522240.000 + Gradient do_[0] = -117051602711347200.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 701234544640.000, df[0] = 514141683712.000, dc_hat[0] = 304490381312.000 + Gradient do_[0] = 43483494612992.000 +Backward Time Step 3: + Gradient di[0] = 1100351799296.000, df[0] = 794042040320.000, dc_hat[0] = 430884421632.000 + Gradient do_[0] = 58214951419904.000 +Backward Time Step 2: + Gradient di[0] = 1400444944384.000, df[0] = 1005854392320.000, dc_hat[0] = 715770101760.000 + Gradient do_[0] = 63454996070400.000 +Backward Time Step 1: + Gradient di[0] = 1756859596800.000, df[0] = 1211499937792.000, dc_hat[0] = 977534124032.000 + Gradient do_[0] = 55933237460992.000 +Backward Time Step 0: + Gradient di[0] = 2127110864896.000, df[0] = 1513427042304.000, dc_hat[0] = 1995947507712.000 + Gradient do_[0] = 32884939292672.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2519256285577216.000, df[0] = -1977817944293376.000, dc_hat[0] = -1138871110729728.000 + Gradient do_[0] = -147290371498442752.000 +Backward Time Step 3: + Gradient di[0] = -3956264027553792.000, df[0] = -3020210801999872.000, dc_hat[0] = -1635148944637952.000 + Gradient do_[0] = -201419775610454016.000 +Backward Time Step 2: + Gradient di[0] = -5183838158323712.000, df[0] = -3905860975722496.000, dc_hat[0] = -2857451137269760.000 + Gradient do_[0] = -230505792355172352.000 +Backward Time Step 1: + Gradient di[0] = -6541811254296576.000, df[0] = -4680524899549184.000, dc_hat[0] = -3961240284037120.000 + Gradient do_[0] = -206869298835095552.000 +Backward Time Step 0: + Gradient di[0] = -7639330017247232.000, df[0] = -5583589555044352.000, dc_hat[0] = -7723107649323008.000 + Gradient do_[0] = -118516143609610240.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 641061093376.000, df[0] = 470023274496.000, dc_hat[0] = 278360129536.000 + Gradient do_[0] = 39752011087872.000 +Backward Time Step 3: + Gradient di[0] = 1005914750976.000, df[0] = 725894955008.000, dc_hat[0] = 393900392448.000 + Gradient do_[0] = 53218461614080.000 +Backward Time Step 2: + Gradient di[0] = 1280234749952.000, df[0] = 919515561984.000, dc_hat[0] = 654326235136.000 + Gradient do_[0] = 58008038014976.000 +Backward Time Step 1: + Gradient di[0] = 1606017351680.000, df[0] = 1107481591808.000, dc_hat[0] = 893602955264.000 + Gradient do_[0] = 51130855849984.000 +Backward Time Step 0: + Gradient di[0] = 1944451416064.000, df[0] = 1383465877504.000, dc_hat[0] = 1824551206912.000 + Gradient do_[0] = 30061042335744.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2550819731800064.000, df[0] = -2002600039809024.000, dc_hat[0] = -1153134697119744.000 + Gradient do_[0] = -149135334829981696.000 +Backward Time Step 3: + Gradient di[0] = -4005779900203008.000, df[0] = -3058015104139264.000, dc_hat[0] = -1655601444683776.000 + Gradient do_[0] = -203939959340531712.000 +Backward Time Step 2: + Gradient di[0] = -5248657804754944.000, df[0] = -3954701464764416.000, dc_hat[0] = -2893162985029632.000 + Gradient do_[0] = -233387474892619776.000 +Backward Time Step 1: + Gradient di[0] = -6623483815526400.000, df[0] = -4738959003353088.000, dc_hat[0] = -4010691195305984.000 + Gradient do_[0] = -209451965749395456.000 +Backward Time Step 0: + Gradient di[0] = -7734550515941376.000, df[0] = -5653185741979648.000, dc_hat[0] = -7819372361940992.000 + Gradient do_[0] = -119993380431200256.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 578910289920.000, df[0] = 424455077888.000, dc_hat[0] = 251371618304.000 + Gradient do_[0] = 35897919668224.000 +Backward Time Step 3: + Gradient di[0] = 908378374144.000, df[0] = 655511060480.000, dc_hat[0] = 355703586816.000 + Gradient do_[0] = 48058045825024.000 +Backward Time Step 2: + Gradient di[0] = 1156081254400.000, df[0] = 830344200192.000, dc_hat[0] = 590867988480.000 + Gradient do_[0] = 52382448746496.000 +Backward Time Step 1: + Gradient di[0] = 1450233167872.000, df[0] = 1000055635968.000, dc_hat[0] = 806922027008.000 + Gradient do_[0] = 46171112341504.000 +Backward Time Step 0: + Gradient di[0] = 1755812724736.000, df[0] = 1249250639872.000, dc_hat[0] = 1647544631296.000 + Gradient do_[0] = 27144707112960.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2582669867089920.000, df[0] = -2027607084236800.000, dc_hat[0] = -1167527132528640.000 + Gradient do_[0] = -150996996994367488.000 +Backward Time Step 3: + Gradient di[0] = -4055739496660992.000, df[0] = -3096156829646848.000, dc_hat[0] = -1676237688799232.000 + Gradient do_[0] = -206482700238848000.000 +Backward Time Step 2: + Gradient di[0] = -5314055661158400.000, df[0] = -4003977624551424.000, dc_hat[0] = -2929195613159424.000 + Gradient do_[0] = -236294875694235648.000 +Backward Time Step 1: + Gradient di[0] = -6705884910583808.000, df[0] = -4797916019425280.000, dc_hat[0] = -4060581535416320.000 + Gradient do_[0] = -212057619328663552.000 +Backward Time Step 0: + Gradient di[0] = -7830605848903680.000, df[0] = -5723392888012800.000, dc_hat[0] = -7916481572503552.000 + Gradient do_[0] = -121483588054024192.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 515061710848.000, df[0] = 377641959424.000, dc_hat[0] = 223646220288.000 + Gradient do_[0] = 31938565898240.000 +Backward Time Step 3: + Gradient di[0] = 808179990528.000, df[0] = 583206043648.000, dc_hat[0] = 316465053696.000 + Gradient do_[0] = 42756848222208.000 +Backward Time Step 2: + Gradient di[0] = 1028544987136.000, df[0] = 738742829056.000, dc_hat[0] = 525681197056.000 + Gradient do_[0] = 46603607998464.000 +Backward Time Step 1: + Gradient di[0] = 1290216013824.000, df[0] = 889710379008.000, dc_hat[0] = 717885931520.000 + Gradient do_[0] = 41076626817024.000 +Backward Time Step 0: + Gradient di[0] = 1562054623232.000, df[0] = 1111392911360.000, dc_hat[0] = 1465734135808.000 + Gradient do_[0] = 24149223276544.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2614748977823744.000, df[0] = -2052793577766912.000, dc_hat[0] = -1182023184023552.000 + Gradient do_[0] = -152872042276847616.000 +Backward Time Step 3: + Gradient di[0] = -4106052622614528.000, df[0] = -3134569674964992.000, dc_hat[0] = -1697019022278656.000 + Gradient do_[0] = -209043514359545856.000 +Backward Time Step 2: + Gradient di[0] = -5379921132126208.000, df[0] = -4053607045398528.000, dc_hat[0] = -2965487281504256.000 + Gradient do_[0] = -239222943878479872.000 +Backward Time Step 1: + Gradient di[0] = -6788862605000704.000, df[0] = -4857283204874240.000, dc_hat[0] = -4110820036313088.000 + Gradient do_[0] = -214681432029659136.000 +Backward Time Step 0: + Gradient di[0] = -7927340860440576.000, df[0] = -5794096102768640.000, dc_hat[0] = -8014276904091648.000 + Gradient do_[0] = -122984318346723328.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 449361575936.000, df[0] = 329471131648.000, dc_hat[0] = 195117236224.000 + Gradient do_[0] = 27864445485056.000 +Backward Time Step 3: + Gradient di[0] = 705080328192.000, df[0] = 508807184384.000, dc_hat[0] = 276091404288.000 + Gradient do_[0] = 37302181036032.000 +Backward Time Step 2: + Gradient di[0] = 897320091648.000, df[0] = 644492296192.000, dc_hat[0] = 458610245632.000 + Gradient do_[0] = 40657628430336.000 +Backward Time Step 1: + Gradient di[0] = 1125577261056.000, df[0] = 776178302976.000, dc_hat[0] = 626278858752.000 + Gradient do_[0] = 35835013496832.000 +Backward Time Step 0: + Gradient di[0] = 1362705645568.000, df[0] = 969557213184.000, dc_hat[0] = 1278677483520.000 + Gradient do_[0] = 21067305320448.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2647110751092736.000, df[0] = -2078202470072320.000, dc_hat[0] = -1196646608142336.000 + Gradient do_[0] = -154763614593482752.000 +Backward Time Step 3: + Gradient di[0] = -4156812693602304.000, df[0] = -3173323701747712.000, dc_hat[0] = -1717984770916352.000 + Gradient do_[0] = -211626902828351488.000 +Backward Time Step 2: + Gradient di[0] = -5446360518098944.000, df[0] = -4103668647329792.000, dc_hat[0] = -3002093556203520.000 + Gradient do_[0] = -242176713147023360.000 +Backward Time Step 1: + Gradient di[0] = -6872569906987008.000, df[0] = -4917174376333312.000, dc_hat[0] = -4161502261018624.000 + Gradient do_[0] = -217328437554053120.000 +Backward Time Step 0: + Gradient di[0] = -8024916074954752.000, df[0] = -5865414034718720.000, dc_hat[0] = -8112922102333440.000 + Gradient do_[0] = -124498088160133120.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 381364109312.000, df[0] = 279615897600.000, dc_hat[0] = 165591007232.000 + Gradient do_[0] = 23647896993792.000 +Backward Time Step 3: + Gradient di[0] = 598378610688.000, df[0] = 431808741376.000, dc_hat[0] = 234307829760.000 + Gradient do_[0] = 31657006465024.000 +Backward Time Step 2: + Gradient di[0] = 761515540480.000, df[0] = 546952151040.000, dc_hat[0] = 389199659008.000 + Gradient do_[0] = 34504219557888.000 +Backward Time Step 1: + Gradient di[0] = 955204173824.000, df[0] = 658692112384.000, dc_hat[0] = 531481329664.000 + Gradient do_[0] = 30410822123520.000 +Backward Time Step 0: + Gradient di[0] = 1156423090176.000, df[0] = 822788358144.000, dc_hat[0] = 1085114941440.000 + Gradient do_[0] = 17878195634176.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2679727806480384.000, df[0] = -2103812017881088.000, dc_hat[0] = -1211385325289472.000 + Gradient do_[0] = -156670047496962048.000 +Backward Time Step 3: + Gradient di[0] = -4207971391242240.000, df[0] = -3212381865902080.000, dc_hat[0] = -1739114265182208.000 + Gradient do_[0] = -214230683801878528.000 +Backward Time Step 2: + Gradient di[0] = -5513320131985408.000, df[0] = -4154121896591360.000, dc_hat[0] = -3038985714663424.000 + Gradient do_[0] = -245153572159750144.000 +Backward Time Step 1: + Gradient di[0] = -6956931654615040.000, df[0] = -4977532625485824.000, dc_hat[0] = -4212581501763584.000 + Gradient do_[0] = -219996127640944640.000 +Backward Time Step 0: + Gradient di[0] = -8123260088614912.000, df[0] = -5937293533642752.000, dc_hat[0] = -8212344689655808.000 + Gradient do_[0] = -126023797982625792.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 312032002048.000, df[0] = 228781899776.000, dc_hat[0] = 135485702144.000 + Gradient do_[0] = 19348632633344.000 +Backward Time Step 3: + Gradient di[0] = 489585541120.000, df[0] = 353300709376.000, dc_hat[0] = 191705858048.000 + Gradient do_[0] = 25901259554816.000 +Backward Time Step 2: + Gradient di[0] = 623053111296.000, df[0] = 447502974976.000, dc_hat[0] = 318431461376.000 + Gradient do_[0] = 28230421577728.000 +Backward Time Step 1: + Gradient di[0] = 781504544768.000, df[0] = 538911866880.000, dc_hat[0] = 434833162240.000 + Gradient do_[0] = 24880745545728.000 +Backward Time Step 0: + Gradient di[0] = 946117541888.000, df[0] = 673157218304.000, dc_hat[0] = 887777329152.000 + Gradient do_[0] = 14626891759616.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2712580279762944.000, df[0] = -2129606249283584.000, dc_hat[0] = -1226231013965824.000 + Gradient do_[0] = -158590327375003648.000 +Backward Time Step 3: + Gradient di[0] = -4259504824778752.000, df[0] = -3251726450688000.000, dc_hat[0] = -1760401331060736.000 + Gradient do_[0] = -216853517250330624.000 +Backward Time Step 2: + Gradient di[0] = -5580774203981824.000, df[0] = -4204947465830400.000, dc_hat[0] = -3076150066675712.000 + Gradient do_[0] = -248152266786209792.000 +Backward Time Step 1: + Gradient di[0] = -7041906508824576.000, df[0] = -5038330035044352.000, dc_hat[0] = -4264029304389632.000 + Gradient do_[0] = -222683179440406528.000 +Backward Time Step 0: + Gradient di[0] = -8222306329427968.000, df[0] = -6009686818029568.000, dc_hat[0] = -8312478094065664.000 + Gradient do_[0] = -127560399842181120.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 240040394752.000, df[0] = 175997861888.000, dc_hat[0] = 104226013184.000 + Gradient do_[0] = 14884472356864.000 +Backward Time Step 3: + Gradient di[0] = 376623005696.000, df[0] = 271783739392.000, dc_hat[0] = 147472171008.000 + Gradient do_[0] = 19924946780160.000 +Backward Time Step 2: + Gradient di[0] = 479288557568.000, df[0] = 344245436416.000, dc_hat[0] = 244954300416.000 + Gradient do_[0] = 21716405321728.000 +Backward Time Step 1: + Gradient di[0] = 601163825152.000, df[0] = 414551998464.000, dc_hat[0] = 334490271744.000 + Gradient do_[0] = 19139236200448.000 +Backward Time Step 0: + Gradient di[0] = 727779835904.000, df[0] = 517811142656.000, dc_hat[0] = 682902945792.000 + Gradient do_[0] = 11251410272256.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2745722663337984.000, df[0] = -2155627442864128.000, dc_hat[0] = -1241206759620608.000 + Gradient do_[0] = -160527460704714752.000 +Backward Time Step 3: + Gradient di[0] = -4311484934914048.000, df[0] = -3291411680067584.000, dc_hat[0] = -1781869725089792.000 + Gradient do_[0] = -219499114025451520.000 +Backward Time Step 2: + Gradient di[0] = -5648812928401408.000, df[0] = -4256214611394560.000, dc_hat[0] = -3113638151847936.000 + Gradient do_[0] = -251177023274221568.000 +Backward Time Step 1: + Gradient di[0] = -7127614728699904.000, df[0] = -5099652504354816.000, dc_hat[0] = -4315921099259904.000 + Gradient do_[0] = -225393406883397632.000 +Backward Time Step 0: + Gradient di[0] = -8322211563700224.000, df[0] = -6082707704512512.000, dc_hat[0] = -8413478544998400.000 + Gradient do_[0] = -129110324690288640.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 166459424768.000, df[0] = 122048176128.000, dc_hat[0] = 72275935232.000 + Gradient do_[0] = 10321738596352.000 +Backward Time Step 3: + Gradient di[0] = 261171298304.000, df[0] = 188470050816.000, dc_hat[0] = 102262194176.000 + Gradient do_[0] = 13816839536640.000 +Backward Time Step 2: + Gradient di[0] = 332362678272.000, df[0] = 238716796928.000, dc_hat[0] = 169853616128.000 + Gradient do_[0] = 15058904023040.000 +Backward Time Step 1: + Gradient di[0] = 416867418112.000, df[0] = 287463407616.000, dc_hat[0] = 231926415360.000 + Gradient do_[0] = 13271459430400.000 +Backward Time Step 0: + Gradient di[0] = 504644534272.000, df[0] = 359051689984.000, dc_hat[0] = 473526796288.000 + Gradient do_[0] = 7801758810112.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2779217771102208.000, df[0] = -2181923917004800.000, dc_hat[0] = -1256330144776192.000 + Gradient do_[0] = -162484058826211328.000 +Backward Time Step 3: + Gradient di[0] = -4364029296377856.000, df[0] = -3331527211483136.000, dc_hat[0] = -1803528708292608.000 + Gradient do_[0] = -222170858561470464.000 +Backward Time Step 2: + Gradient di[0] = -5717611862032384.000, df[0] = -4308045471416320.000, dc_hat[0] = -3151379103219712.000 + Gradient do_[0] = -254231037079453696.000 +Backward Time Step 1: + Gradient di[0] = -7214292537442304.000, df[0] = -5161649820401664.000, dc_hat[0] = -4368059049443328.000 + Gradient do_[0] = -228129318230818816.000 +Backward Time Step 0: + Gradient di[0] = -8423038068457472.000, df[0] = -6156401827119104.000, dc_hat[0] = -8515410466963456.000 + Gradient do_[0] = -130674534599622656.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 90576175104.000, df[0] = 66410582016.000, dc_hat[0] = 39327514624.000 + Gradient do_[0] = 5616381722624.000 +Backward Time Step 3: + Gradient di[0] = 142110113792.000, df[0] = 102551617536.000, dc_hat[0] = 55643009024.000 + Gradient do_[0] = 7518076010496.000 +Backward Time Step 2: + Gradient di[0] = 180844593152.000, df[0] = 129890222080.000, dc_hat[0] = 92419874816.000 + Gradient do_[0] = 8193802502144.000 +Backward Time Step 1: + Gradient di[0] = 226819244032.000, df[0] = 156409987072.000, dc_hat[0] = 126191927296.000 + Gradient do_[0] = 7221053227008.000 +Backward Time Step 0: + Gradient di[0] = 274574966784.000, df[0] = 195358507008.000, dc_hat[0] = 257643937792.000 + Gradient do_[0] = 4244904083456.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2812910715797504.000, df[0] = -2208378365411328.000, dc_hat[0] = -1271554327445504.000 + Gradient do_[0] = -164453335691165696.000 +Backward Time Step 3: + Gradient di[0] = -4416872158068736.000, df[0] = -3371871449907200.000, dc_hat[0] = -1825353316171776.000 + Gradient do_[0] = -224860212463403008.000 +Backward Time Step 2: + Gradient di[0] = -5786784457818112.000, df[0] = -4360167315472384.000, dc_hat[0] = -3189489690214400.000 + Gradient do_[0] = -257306182123782144.000 +Backward Time Step 1: + Gradient di[0] = -7301432055169024.000, df[0] = -5223996102541312.000, dc_hat[0] = -4420814938046464.000 + Gradient do_[0] = -230884711549894656.000 +Backward Time Step 0: + Gradient di[0] = -8524582906494976.000, df[0] = -6230620472606720.000, dc_hat[0] = -8618068238401536.000 + Gradient do_[0] = -132249894244057088.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 12755316736.000, df[0] = 9352227840.000, dc_hat[0] = 5538235904.000 + Gradient do_[0] = 790919184384.000 +Backward Time Step 3: + Gradient di[0] = 20012228608.000, df[0] = 14441545728.000, dc_hat[0] = 7835696640.000 + Gradient do_[0] = 1058705702912.000 +Backward Time Step 2: + Gradient di[0] = 25466525696.000, df[0] = 18291146752.000, dc_hat[0] = 13014478848.000 + Gradient do_[0] = 1153847001088.000 +Backward Time Step 1: + Gradient di[0] = 31939893248.000, df[0] = 22025105408.000, dc_hat[0] = 17769887744.000 + Gradient do_[0] = 1016842813440.000 +Backward Time Step 0: + Gradient di[0] = 38664093696.000, df[0] = 27509280768.000, dc_hat[0] = 36279959552.000 + Gradient do_[0] = 597743370240.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2846825388179456.000, df[0] = -2235006625775616.000, dc_hat[0] = -1286879978717184.000 + Gradient do_[0] = -166435600537223168.000 +Backward Time Step 3: + Gradient di[0] = -4470060496191488.000, df[0] = -3412479560384512.000, dc_hat[0] = -1847320597495808.000 + Gradient do_[0] = -227567192911118336.000 +Backward Time Step 2: + Gradient di[0] = -5856394603397120.000, df[0] = -4412618261397504.000, dc_hat[0] = -3227840795377664.000 + Gradient do_[0] = -260400706060550144.000 +Backward Time Step 1: + Gradient di[0] = -7389119181225984.000, df[0] = -5286733763575808.000, dc_hat[0] = -4473900733825024.000 + Gradient do_[0] = -233657490896584704.000 +Backward Time Step 0: + Gradient di[0] = -8626802054397952.000, df[0] = -6305333039333376.000, dc_hat[0] = -8721408909639680.000 + Gradient do_[0] = -133835716428824576.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = -66722713600.000, df[0] = -48921309184.000, dc_hat[0] = -28970186752.000 + Gradient do_[0] = -4137259892736.000 +Backward Time Step 3: + Gradient di[0] = -104681857024.000, df[0] = -75542315008.000, dc_hat[0] = -40987357184.000 + Gradient do_[0] = -5537956102144.000 +Backward Time Step 2: + Gradient di[0] = -133210767360.000, df[0] = -95677734912.000, dc_hat[0] = -68075941888.000 + Gradient do_[0] = -6035548405760.000 +Backward Time Step 1: + Gradient di[0] = -167067713536.000, df[0] = -115206520832.000, dc_hat[0] = -92948643840.000 + Gradient do_[0] = -5318789562368.000 +Backward Time Step 0: + Gradient di[0] = -202236706816.000, df[0] = -143890268160.000, dc_hat[0] = -189766254592.000 + Gradient do_[0] = -3126561341440.000 +Time Step 0: + i_gate[0] = 0.566, f_gate[0] = 0.747, o_gate[0] = 0.172, c_hat[0] = 0.829 + c_state[0] = 0.469, h_state[0] = 0.075 +Time Step 1: + i_gate[0] = 0.433, f_gate[0] = 0.747, o_gate[0] = 0.148, c_hat[0] = 0.866 + c_state[0] = 0.725, h_state[0] = 0.092 +Time Step 2: + i_gate[0] = 0.376, f_gate[0] = 0.761, o_gate[0] = 0.149, c_hat[0] = 0.863 + c_state[0] = 0.877, h_state[0] = 0.105 +Time Step 3: + i_gate[0] = 0.344, f_gate[0] = 0.771, o_gate[0] = 0.157, c_hat[0] = 0.885 + c_state[0] = 0.980, h_state[0] = 0.119 +Time Step 4: + i_gate[0] = 0.330, f_gate[0] = 0.787, o_gate[0] = 0.160, c_hat[0] = 0.870 + c_state[0] = 1.059, h_state[0] = 0.126 +Backward Time Step 4: + Gradient di[0] = 19156218937344.000, df[0] = 13142134358016.000, dc_hat[0] = 8010554408960.000 + Gradient do_[0] = 1271238378913792.000 +Backward Time Step 3: + Gradient di[0] = 30098356961280.000, df[0] = 20507969716224.000, dc_hat[0] = 11206314164224.000 + Gradient do_[0] = 1665477721980928.000 +Backward Time Step 2: + Gradient di[0] = 36914476351488.000, df[0] = 25224460369920.000, dc_hat[0] = 17511709933568.000 + Gradient do_[0] = 1708683079712768.000 +Backward Time Step 1: + Gradient di[0] = 45967902507008.000, df[0] = 30538465280000.000, dc_hat[0] = 23479652974592.000 + Gradient do_[0] = 1474196118962176.000 +Backward Time Step 0: + Gradient di[0] = 57253348507648.000, df[0] = 39729219239936.000, dc_hat[0] = 49915472379904.000 + Gradient do_[0] = 883066348240896.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.824, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1345754759168.000, df[0] = 986717749248.000, dc_hat[0] = 584281620480.000 + Gradient do_[0] = 83443371212800.000 +Backward Time Step 3: + Gradient di[0] = 2111409487872.000, df[0] = 1523686309888.000, dc_hat[0] = 826660618240.000 + Gradient do_[0] = 111696756604928.000 +Backward Time Step 2: + Gradient di[0] = 2686690263040.000, df[0] = 1929704767488.000, dc_hat[0] = 1372963471360.000 + Gradient do_[0] = 121728097320960.000 +Backward Time Step 1: + Gradient di[0] = 3369711697920.000, df[0] = 2323685048320.000, dc_hat[0] = 1874749554688.000 + Gradient do_[0] = 107278585823232.000 +Backward Time Step 0: + Gradient di[0] = 4080442802176.000, df[0] = 2903211507712.000, dc_hat[0] = 3828831485952.000 + Gradient do_[0] = 63083267489792.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2209343525093376.000, df[0] = -1734537440657408.000, dc_hat[0] = -998669016891392.000 + Gradient do_[0] = -129162302384504832.000 +Backward Time Step 3: + Gradient di[0] = -3469109241053184.000, df[0] = -2648370518687744.000, dc_hat[0] = -1433582371340288.000 + Gradient do_[0] = -176605361539645440.000 +Backward Time Step 2: + Gradient di[0] = -4544658440978432.000, df[0] = -3424274882756608.000, dc_hat[0] = -2504781440483328.000 + Gradient do_[0] = -202073091675783168.000 +Backward Time Step 1: + Gradient di[0] = -5734208152535040.000, df[0] = -4102685368254464.000, dc_hat[0] = -3471894527344640.000 + Gradient do_[0] = -181326131893370880.000 +Backward Time Step 0: + Gradient di[0] = -6696747904532480.000, df[0] = -4894655325929472.000, dc_hat[0] = -6770188087197696.000 + Gradient do_[0] = -103892965377703936.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1307062960128.000, df[0] = 958347804672.000, dc_hat[0] = 567473537024.000 + Gradient do_[0] = 81043340132352.000 +Backward Time Step 3: + Gradient di[0] = 2050681602048.000, df[0] = 1479862910976.000, dc_hat[0] = 802855518208.000 + Gradient do_[0] = 108482183299072.000 +Backward Time Step 2: + Gradient di[0] = 2609398415360.000, df[0] = 1874187124736.000, dc_hat[0] = 1333374091264.000 + Gradient do_[0] = 118223177515008.000 +Backward Time Step 1: + Gradient di[0] = 3272694300672.000, df[0] = 2256774365184.000, dc_hat[0] = 1820585230336.000 + Gradient do_[0] = 104186863681536.000 +Backward Time Step 0: + Gradient di[0] = 3962756661248.000, df[0] = 2819478519808.000, dc_hat[0] = 3718402539520.000 + Gradient do_[0] = 61263858106368.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2234885695602688.000, df[0] = -1754589435002880.000, dc_hat[0] = -1010197178875904.000 + Gradient do_[0] = -130654030425751552.000 +Backward Time Step 3: + Gradient di[0] = -3509172159119360.000, df[0] = -2678956591415296.000, dc_hat[0] = -1450085246304256.000 + Gradient do_[0] = -178641880412585984.000 +Backward Time Step 2: + Gradient di[0] = -4597113413435392.000, df[0] = -3463792876847104.000, dc_hat[0] = -2533525509111808.000 + Gradient do_[0] = -204400568813223936.000 +Backward Time Step 1: + Gradient di[0] = -5800289445609472.000, df[0] = -4149946383073280.000, dc_hat[0] = -3511574119579648.000 + Gradient do_[0] = -183410840299372544.000 +Backward Time Step 0: + Gradient di[0] = -6773575205781504.000, df[0] = -4950808802099200.000, dc_hat[0] = -6847858812649472.000 + Gradient do_[0] = -105084861752016896.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1267899432960.000, df[0] = 929631961088.000, dc_hat[0] = 550460981248.000 + Gradient do_[0] = 78614049587200.000 +Backward Time Step 3: + Gradient di[0] = 1989213552640.000, df[0] = 1435505000448.000, dc_hat[0] = 778763304960.000 + Gradient do_[0] = 105228678463488.000 +Backward Time Step 2: + Gradient di[0] = 2531163111424.000, df[0] = 1817992626176.000, dc_hat[0] = 1293310885888.000 + Gradient do_[0] = 114675626803200.000 +Backward Time Step 1: + Gradient di[0] = 3174490963968.000, df[0] = 2189045923840.000, dc_hat[0] = 1765774458880.000 + Gradient do_[0] = 101057594130432.000 +Backward Time Step 0: + Gradient di[0] = 3843652583424.000, df[0] = 2734736801792.000, dc_hat[0] = 3606642688000.000 + Gradient do_[0] = 59422516707328.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2261528988352512.000, df[0] = -1775506328387584.000, dc_hat[0] = -1022223087304704.000 + Gradient do_[0] = -132210062717353984.000 +Backward Time Step 3: + Gradient di[0] = -3550965680570368.000, df[0] = -2710864708763648.000, dc_hat[0] = -1467303770193920.000 + Gradient do_[0] = -180766394575486976.000 +Backward Time Step 2: + Gradient di[0] = -4651836665495552.000, df[0] = -3505019999485952.000, dc_hat[0] = -2563520218529792.000 + Gradient do_[0] = -206828994861989888.000 +Backward Time Step 1: + Gradient di[0] = -5869227965677568.000, df[0] = -4199252339195904.000, dc_hat[0] = -3552980288667648.000 + Gradient do_[0] = -185586000716627968.000 +Backward Time Step 0: + Gradient di[0] = -6853733791039488.000, df[0] = -5009395914113024.000, dc_hat[0] = -6928896255590400.000 + Gradient do_[0] = -106328435172835328.000 +Epoch 600, Train Loss=0.011401, Weight Norm=12.822992 +Sample Predictions at Epoch 600: +------------------------------------------------------------- +| Day | Date | Predicted Close | Actual Close | Error | +------------------------------------------------------------- +| 192 | 2024-10-11 | 57.25 | 63.87 | 6.62 | +| 193 | 2024-10-14 | 56.65 | 66.55 | 9.90 | +| 194 | 2024-10-15 | 56.84 | 66.00 | 9.16 | +| 195 | 2024-10-16 | 57.79 | 67.20 | 9.41 | +| 196 | 2024-10-17 | 57.33 | 66.76 | 9.43 | +------------------------------------------------------------- +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1228033490944.000, df[0] = 900403232768.000, dc_hat[0] = 533149712384.000 + Gradient do_[0] = 76141893255168.000 +Backward Time Step 3: + Gradient di[0] = 1926640041984.000, df[0] = 1390351613952.000, dc_hat[0] = 754259197952.000 + Gradient do_[0] = 101918072373248.000 +Backward Time Step 2: + Gradient di[0] = 2451505676288.000, df[0] = 1760780222464.000, dc_hat[0] = 1252601495552.000 + Gradient do_[0] = 111066419822592.000 +Backward Time Step 1: + Gradient di[0] = 3074514485248.000, df[0] = 2120105197568.000, dc_hat[0] = 1710162051072.000 + Gradient do_[0] = 97874905923584.000 +Backward Time Step 0: + Gradient di[0] = 3722542055424.000, df[0] = 2648567185408.000, dc_hat[0] = 3493000380416.000 + Gradient do_[0] = 57550166818816.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2288907525816320.000, df[0] = -1797002908139520.000, dc_hat[0] = -1034593935294464.000 + Gradient do_[0] = -133810230092890112.000 +Backward Time Step 3: + Gradient di[0] = -3593905152983040.000, df[0] = -2743648999440384.000, dc_hat[0] = -1485035676893184.000 + Gradient do_[0] = -182951605216215040.000 +Backward Time Step 2: + Gradient di[0] = -4708038460047360.000, df[0] = -3547367303282688.000, dc_hat[0] = -2594475658444800.000 + Gradient do_[0] = -209327205539381248.000 +Backward Time Step 1: + Gradient di[0] = -5940022985359360.000, df[0] = -4249903962259456.000, dc_hat[0] = -3595833324863488.000 + Gradient do_[0] = -187824434592088064.000 +Backward Time Step 0: + Gradient di[0] = -6936257829535744.000, df[0] = -5069713361076224.000, dc_hat[0] = -7012324921573376.000 + Gradient do_[0] = -107608713384165376.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1227753783296.000, df[0] = 900198236160.000, dc_hat[0] = 533028274176.000 + Gradient do_[0] = 76124562391040.000 +Backward Time Step 3: + Gradient di[0] = 1926198853632.000, df[0] = 1390033502208.000, dc_hat[0] = 754086903808.000 + Gradient do_[0] = 101894752043008.000 +Backward Time Step 2: + Gradient di[0] = 2450942853120.000, df[0] = 1760375865344.000, dc_hat[0] = 1252313661440.000 + Gradient do_[0] = 111040901677056.000 +Backward Time Step 1: + Gradient di[0] = 3073809055744.000, df[0] = 2119618265088.000, dc_hat[0] = 1709766868992.000 + Gradient do_[0] = 97852424454144.000 +Backward Time Step 0: + Gradient di[0] = 3721687203840.000, df[0] = 2647959011328.000, dc_hat[0] = 3492198219776.000 + Gradient do_[0] = 57536950566912.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2288789414215680.000, df[0] = -1796910432124928.000, dc_hat[0] = -1034540717965312.000 + Gradient do_[0] = -133803340965347328.000 +Backward Time Step 3: + Gradient di[0] = -3593715905986560.000, df[0] = -2743503775858688.000, dc_hat[0] = -1484956354215936.000 + Gradient do_[0] = -182942070388817920.000 +Backward Time Step 2: + Gradient di[0] = -4707790962556928.000, df[0] = -3547180472205312.000, dc_hat[0] = -2594337951055872.000 + Gradient do_[0] = -209316210423103488.000 +Backward Time Step 1: + Gradient di[0] = -5939710526488576.000, df[0] = -4249679550218240.000, dc_hat[0] = -3595638709157888.000 + Gradient do_[0] = -187814504627699712.000 +Backward Time Step 0: + Gradient di[0] = -6935891683573760.000, df[0] = -5069445462491136.000, dc_hat[0] = -7011955017515008.000 + Gradient do_[0] = -107603035437400064.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1227730714624.000, df[0] = 900181262336.000, dc_hat[0] = 533018214400.000 + Gradient do_[0] = 76123127939072.000 +Backward Time Step 3: + Gradient di[0] = 1926162153472.000, df[0] = 1390006894592.000, dc_hat[0] = 754072027136.000 + Gradient do_[0] = 101892797497344.000 +Backward Time Step 2: + Gradient di[0] = 2450899861504.000, df[0] = 1760345325568.000, dc_hat[0] = 1252291248128.000 + Gradient do_[0] = 111038980685824.000 +Backward Time Step 1: + Gradient di[0] = 3073755578368.000, df[0] = 2119581564928.000, dc_hat[0] = 1709735411712.000 + Gradient do_[0] = 97850696400896.000 +Backward Time Step 0: + Gradient di[0] = 3721621667840.000, df[0] = 2647912349696.000, dc_hat[0] = 3492136353792.000 + Gradient do_[0] = 57535931351040.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2288931953442816.000, df[0] = -1797022503927808.000, dc_hat[0] = -1034605008257024.000 + Gradient do_[0] = -133811698971705344.000 +Backward Time Step 3: + Gradient di[0] = -3593941660205056.000, df[0] = -2743676379856896.000, dc_hat[0] = -1485049635536896.000 + Gradient do_[0] = -182953443462217728.000 +Backward Time Step 2: + Gradient di[0] = -4708083020333056.000, df[0] = -3547400589279232.000, dc_hat[0] = -2594497133281280.000 + Gradient do_[0] = -209329112504860672.000 +Backward Time Step 1: + Gradient di[0] = -5940081504288768.000, df[0] = -4249944764448768.000, dc_hat[0] = -3595857752489984.000 + Gradient do_[0] = -187826186938744832.000 +Backward Time Step 0: + Gradient di[0] = -6936325475270656.000, df[0] = -5069762753200128.000, dc_hat[0] = -7012393641050112.000 + Gradient do_[0] = -107609761356185600.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1227910152192.000, df[0] = 900312989696.000, dc_hat[0] = 533096136704.000 + Gradient do_[0] = 76134217678848.000 +Backward Time Step 3: + Gradient di[0] = 1926444744704.000, df[0] = 1390210842624.000, dc_hat[0] = 754182586368.000 + Gradient do_[0] = 101907687276544.000 +Backward Time Step 2: + Gradient di[0] = 2451258736640.000, df[0] = 1760602882048.000, dc_hat[0] = 1252473176064.000 + Gradient do_[0] = 111055128756224.000 +Backward Time Step 1: + Gradient di[0] = 3074205155328.000, df[0] = 2119891156992.000, dc_hat[0] = 1709982482432.000 + Gradient do_[0] = 97864915091456.000 +Backward Time Step 0: + Gradient di[0] = 3722165354496.000, df[0] = 2648299274240.000, dc_hat[0] = 3492646748160.000 + Gradient do_[0] = 57544345124864.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2289242264829952.000, df[0] = -1797265706450944.000, dc_hat[0] = -1034744527585280.000 + Gradient do_[0] = -133829797963890688.000 +Backward Time Step 3: + Gradient di[0] = -3594429675864064.000, df[0] = -2744049505140736.000, dc_hat[0] = -1485251364782080.000 + Gradient do_[0] = -182978251193319424.000 +Backward Time Step 2: + Gradient di[0] = -4708725654814720.000, df[0] = -3547885115277312.000, dc_hat[0] = -2594850662776832.000 + Gradient do_[0] = -209357716987052032.000 +Backward Time Step 1: + Gradient di[0] = -5940892179365888.000, df[0] = -4250524853469184.000, dc_hat[0] = -3596344962842624.000 + Gradient do_[0] = -187851716224352256.000 +Backward Time Step 0: + Gradient di[0] = -6937267146850304.000, df[0] = -5070451021709312.000, dc_hat[0] = -7013345513177088.000 + Gradient do_[0] = -107624364244992000.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1228104663040.000, df[0] = 900455530496.000, dc_hat[0] = 533179990016.000 + Gradient do_[0] = 76146314051584.000 +Backward Time Step 3: + Gradient di[0] = 1926751322112.000, df[0] = 1390431698944.000, dc_hat[0] = 754302124032.000 + Gradient do_[0] = 101923969564672.000 +Backward Time Step 2: + Gradient di[0] = 2451648020480.000, df[0] = 1760882196480.000, dc_hat[0] = 1252670963712.000 + Gradient do_[0] = 111072803553280.000 +Backward Time Step 1: + Gradient di[0] = 3074691956736.000, df[0] = 2120226439168.000, dc_hat[0] = 1710250655744.000 + Gradient do_[0] = 97880392073216.000 +Backward Time Step 0: + Gradient di[0] = 3722751508480.000, df[0] = 2648716083200.000, dc_hat[0] = 3493196464128.000 + Gradient do_[0] = 57553400627200.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2289676593397760.000, df[0] = -1797606485262336.000, dc_hat[0] = -1034940217032704.000 + Gradient do_[0] = -133855164040740864.000 +Backward Time Step 3: + Gradient di[0] = -3595113917841408.000, df[0] = -2744571612102656.000, dc_hat[0] = -1485532416704512.000 + Gradient do_[0] = -183013040428417024.000 +Backward Time Step 2: + Gradient di[0] = -4709619008012288.000, df[0] = -3548558082965504.000, dc_hat[0] = -2595341094354944.000 + Gradient do_[0] = -209397333765390336.000 +Backward Time Step 1: + Gradient di[0] = -5942014239571968.000, df[0] = -4251328012353536.000, dc_hat[0] = -3597020078014464.000 + Gradient do_[0] = -187887141114609664.000 +Backward Time Step 0: + Gradient di[0] = -6938573890650112.000, df[0] = -5071406115061760.000, dc_hat[0] = -7014666752491520.000 + Gradient do_[0] = -107644645080563712.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1228482281472.000, df[0] = 900732420096.000, dc_hat[0] = 533343895552.000 + Gradient do_[0] = 76169684713472.000 +Backward Time Step 3: + Gradient di[0] = 1927341670400.000, df[0] = 1390857814016.000, dc_hat[0] = 754533269504.000 + Gradient do_[0] = 101955108077568.000 +Backward Time Step 2: + Gradient di[0] = 2452399849472.000, df[0] = 1761422082048.000, dc_hat[0] = 1253054218240.000 + Gradient do_[0] = 111106810970112.000 +Backward Time Step 1: + Gradient di[0] = 3075633840128.000, df[0] = 2120875900928.000, dc_hat[0] = 1710772846592.000 + Gradient do_[0] = 97910347792384.000 +Backward Time Step 0: + Gradient di[0] = 3723890262016.000, df[0] = 2649526632448.000, dc_hat[0] = 3494264963072.000 + Gradient do_[0] = 57571008315392.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2290193063215104.000, df[0] = -1798012091236352.000, dc_hat[0] = -1035173688770560.000 + Gradient do_[0] = -133885288941355008.000 +Backward Time Step 3: + Gradient di[0] = -3595923519176704.000, df[0] = -2745189282086912.000, dc_hat[0] = -1485866618847232.000 + Gradient do_[0] = -183054203394981888.000 +Backward Time Step 2: + Gradient di[0] = -4710680401805312.000, df[0] = -3549357483753472.000, dc_hat[0] = -2595924673036288.000 + Gradient do_[0] = -209444440966692864.000 +Backward Time Step 1: + Gradient di[0] = -5943355879981056.000, df[0] = -4252287669108736.000, dc_hat[0] = -3597827531866112.000 + Gradient do_[0] = -187929541031755776.000 +Backward Time Step 0: + Gradient di[0] = -6940145311809536.000, df[0] = -5072555018813440.000, dc_hat[0] = -7016255353520128.000 + Gradient do_[0] = -107669023314935808.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1228668272640.000, df[0] = 900868538368.000, dc_hat[0] = 533424635904.000 + Gradient do_[0] = 76181277769728.000 +Backward Time Step 3: + Gradient di[0] = 1927639072768.000, df[0] = 1391072641024.000, dc_hat[0] = 754649726976.000 + Gradient do_[0] = 101970861883392.000 +Backward Time Step 2: + Gradient di[0] = 2452781793280.000, df[0] = 1761696022528.000, dc_hat[0] = 1253248335872.000 + Gradient do_[0] = 111124083113984.000 +Backward Time Step 1: + Gradient di[0] = 3076113825792.000, df[0] = 2121207119872.000, dc_hat[0] = 1711037480960.000 + Gradient do_[0] = 97925581504512.000 +Backward Time Step 0: + Gradient di[0] = 3724465668096.000, df[0] = 2649935839232.000, dc_hat[0] = 3494804717568.000 + Gradient do_[0] = 57579896045568.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2290729128820736.000, df[0] = -1798432998031360.000, dc_hat[0] = -1035415884660736.000 + Gradient do_[0] = -133916616432812032.000 +Backward Time Step 3: + Gradient di[0] = -3596766138073088.000, df[0] = -2745832721874944.000, dc_hat[0] = -1486214376980480.000 + Gradient do_[0] = -183097153067941888.000 +Backward Time Step 2: + Gradient di[0] = -4711788503367680.000, df[0] = -3550191244279808.000, dc_hat[0] = -2596531068731392.000 + Gradient do_[0] = -209493626932166656.000 +Backward Time Step 1: + Gradient di[0] = -5944751744352256.000, df[0] = -4253285980569600.000, dc_hat[0] = -3598667466407936.000 + Gradient do_[0] = -187973624576081920.000 +Backward Time Step 0: + Gradient di[0] = -6941765588221952.000, df[0] = -5073739356045312.000, dc_hat[0] = -7017893346672640.000 + Gradient do_[0] = -107694166053486592.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1228893978624.000, df[0] = 901034147840.000, dc_hat[0] = 533522644992.000 + Gradient do_[0] = 76195211247616.000 +Backward Time Step 3: + Gradient di[0] = 1927991525376.000, df[0] = 1391326658560.000, dc_hat[0] = 754786435072.000 + Gradient do_[0] = 101989434261504.000 +Backward Time Step 2: + Gradient di[0] = 2453227175936.000, df[0] = 1762016231424.000, dc_hat[0] = 1253475745792.000 + Gradient do_[0] = 111144249327616.000 +Backward Time Step 1: + Gradient di[0] = 3076675076096.000, df[0] = 2121593782272.000, dc_hat[0] = 1711346941952.000 + Gradient do_[0] = 97943440850944.000 +Backward Time Step 0: + Gradient di[0] = 3725141213184.000, df[0] = 2650416611328.000, dc_hat[0] = 3495438843904.000 + Gradient do_[0] = 57590348251136.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2291301164777472.000, df[0] = -1798881956331520.000, dc_hat[0] = -1035673918242816.000 + Gradient do_[0] = -133950031278374912.000 +Backward Time Step 3: + Gradient di[0] = -3597663517802496.000, df[0] = -2746517769158656.000, dc_hat[0] = -1486583609950208.000 + Gradient do_[0] = -183142645361541120.000 +Backward Time Step 2: + Gradient di[0] = -4712958345084928.000, df[0] = -3551073323188224.000, dc_hat[0] = -2597174508519424.000 + Gradient do_[0] = -209545596036448256.000 +Backward Time Step 1: + Gradient di[0] = -5946235655553024.000, df[0] = -4254347105927168.000, dc_hat[0] = -3599562161782784.000 + Gradient do_[0] = -188020405359869952.000 +Backward Time Step 0: + Gradient di[0] = -6943495923171328.000, df[0] = -5075003150172160.000, dc_hat[0] = -7019642472103936.000 + Gradient do_[0] = -107721001009152000.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1229508182016.000, df[0] = 901484445696.000, dc_hat[0] = 533789147136.000 + Gradient do_[0] = 76233329082368.000 +Backward Time Step 3: + Gradient di[0] = 1928955166720.000, df[0] = 1392022126592.000, dc_hat[0] = 755163660288.000 + Gradient do_[0] = 102040453775360.000 +Backward Time Step 2: + Gradient di[0] = 2454451388416.000, df[0] = 1762895069184.000, dc_hat[0] = 1254099910656.000 + Gradient do_[0] = 111199706415104.000 +Backward Time Step 1: + Gradient di[0] = 3078206783488.000, df[0] = 2122649960448.000, dc_hat[0] = 1712197468160.000 + Gradient do_[0] = 97992170274816.000 +Backward Time Step 0: + Gradient di[0] = 3726992736256.000, df[0] = 2651733884928.000, dc_hat[0] = 3497176334336.000 + Gradient do_[0] = 57618970181632.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2291905412988928.000, df[0] = -1799356550217728.000, dc_hat[0] = -1035947521081344.000 + Gradient do_[0] = -133985327319613440.000 +Backward Time Step 3: + Gradient di[0] = -3598612437139456.000, df[0] = -2747242276454400.000, dc_hat[0] = -1486975659933696.000 + Gradient do_[0] = -183190955153686528.000 +Backward Time Step 2: + Gradient di[0] = -4714207106826240.000, df[0] = -3552013921026048.000, dc_hat[0] = -2597861166415872.000 + Gradient do_[0] = -209601018294435840.000 +Backward Time Step 1: + Gradient di[0] = -5947802781745152.000, df[0] = -4255468092391424.000, dc_hat[0] = -3600505443975168.000 + Gradient do_[0] = -188069934922727424.000 +Backward Time Step 0: + Gradient di[0] = -6945327189852160.000, df[0] = -5076342106226688.000, dc_hat[0] = -7021494139879424.000 + Gradient do_[0] = -107749407922847744.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1229749616640.000, df[0] = 901661392896.000, dc_hat[0] = 533893971968.000 + Gradient do_[0] = 76248260804608.000 +Backward Time Step 3: + Gradient di[0] = 1929330819072.000, df[0] = 1392293445632.000, dc_hat[0] = 755311247360.000 + Gradient do_[0] = 102060317999104.000 +Backward Time Step 2: + Gradient di[0] = 2454930325504.000, df[0] = 1763239657472.000, dc_hat[0] = 1254344491008.000 + Gradient do_[0] = 111221407744000.000 +Backward Time Step 1: + Gradient di[0] = 3078813646848.000, df[0] = 2123068342272.000, dc_hat[0] = 1712532226048.000 + Gradient do_[0] = 98011447296000.000 +Backward Time Step 0: + Gradient di[0] = 3727732244480.000, df[0] = 2652260270080.000, dc_hat[0] = 3497870229504.000 + Gradient do_[0] = 57630408048640.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2292506439974912.000, df[0] = -1799828459749376.000, dc_hat[0] = -1036218640891904.000 + Gradient do_[0] = -134020460152094720.000 +Backward Time Step 3: + Gradient di[0] = -3599553034977280.000, df[0] = -2747960072863744.000, dc_hat[0] = -1487363146514432.000 + Gradient do_[0] = -183238801089363968.000 +Backward Time Step 2: + Gradient di[0] = -4715432783118336.000, df[0] = -3552938144301056.000, dc_hat[0] = -2598535744716800.000 + Gradient do_[0] = -209655530019356672.000 +Backward Time Step 1: + Gradient di[0] = -5949357559906304.000, df[0] = -4256580488921088.000, dc_hat[0] = -3601442015281152.000 + Gradient do_[0] = -188118966269378560.000 +Backward Time Step 0: + Gradient di[0] = -6947141276663808.000, df[0] = -5077668177379328.000, dc_hat[0] = -7023328090914816.000 + Gradient do_[0] = -107777548548571136.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1230168784896.000, df[0] = 901968494592.000, dc_hat[0] = 534075572224.000 + Gradient do_[0] = 76274273878016.000 +Backward Time Step 3: + Gradient di[0] = 1929991553024.000, df[0] = 1392770154496.000, dc_hat[0] = 755569393664.000 + Gradient do_[0] = 102095273328640.000 +Backward Time Step 2: + Gradient di[0] = 2455771283456.000, df[0] = 1763842981888.000, dc_hat[0] = 1254772178944.000 + Gradient do_[0] = 111259391361024.000 +Backward Time Step 1: + Gradient di[0] = 3079866679296.000, df[0] = 2123794481152.000, dc_hat[0] = 1713115889664.000 + Gradient do_[0] = 98044909453312.000 +Backward Time Step 0: + Gradient di[0] = 3729005740032.000, df[0] = 2653166239744.000, dc_hat[0] = 3499065344000.000 + Gradient do_[0] = 57650096111616.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2293113640976384.000, df[0] = -1800305201119232.000, dc_hat[0] = -1036493116145664.000 + Gradient do_[0] = -134055867862482944.000 +Backward Time Step 3: + Gradient di[0] = -3600509202071552.000, df[0] = -2748690217304064.000, dc_hat[0] = -1487758283505664.000 + Gradient do_[0] = -183287420119154688.000 +Backward Time Step 2: + Gradient di[0] = -4716690671665152.000, df[0] = -3553885184589824.000, dc_hat[0] = -2599225892274176.000 + Gradient do_[0] = -209711347414335488.000 +Backward Time Step 1: + Gradient di[0] = -5950939718483968.000, df[0] = -4257712212803584.000, dc_hat[0] = -3602394961149952.000 + Gradient do_[0] = -188168925328965632.000 +Backward Time Step 0: + Gradient di[0] = -6948979522666496.000, df[0] = -5079011428401152.000, dc_hat[0] = -7025186201141248.000 + Gradient do_[0] = -107806084311285760.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1230543519744.000, df[0] = 902243352576.000, dc_hat[0] = 534238298112.000 + Gradient do_[0] = 76297476767744.000 +Backward Time Step 3: + Gradient di[0] = 1930574954496.000, df[0] = 1393191157760.000, dc_hat[0] = 755797262336.000 + Gradient do_[0] = 102126126628864.000 +Backward Time Step 2: + Gradient di[0] = 2456516034560.000, df[0] = 1764378279936.000, dc_hat[0] = 1255152680960.000 + Gradient do_[0] = 111293147119616.000 +Backward Time Step 1: + Gradient di[0] = 3080803319808.000, df[0] = 2124440535040.000, dc_hat[0] = 1713634410496.000 + Gradient do_[0] = 98074714177536.000 +Backward Time Step 0: + Gradient di[0] = 3730137415680.000, df[0] = 2653971283968.000, dc_hat[0] = 3500127027200.000 + Gradient do_[0] = 57667586359296.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2293729700347904.000, df[0] = -1800788787593216.000, dc_hat[0] = -1036771013951488.000 + Gradient do_[0] = -134091954177703936.000 +Backward Time Step 3: + Gradient di[0] = -3601476106584064.000, df[0] = -2749428414808064.000, dc_hat[0] = -1488157715464192.000 + Gradient do_[0] = -183336657624236032.000 +Backward Time Step 2: + Gradient di[0] = -4717956613275648.000, df[0] = -3554838935764992.000, dc_hat[0] = -2599923556024320.000 + Gradient do_[0] = -209767594306043904.000 +Backward Time Step 1: + Gradient di[0] = -5952534761963520.000, df[0] = -4258853063491584.000, dc_hat[0] = -3603357033824256.000 + Gradient do_[0] = -188219313885282304.000 +Backward Time Step 0: + Gradient di[0] = -6950843538472960.000, df[0] = -5080374006775808.000, dc_hat[0] = -7027071154913280.000 + Gradient do_[0] = -107834989441187840.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1230931361792.000, df[0] = 902527778816.000, dc_hat[0] = 534406692864.000 + Gradient do_[0] = 76321526906880.000 +Backward Time Step 3: + Gradient di[0] = 1931184439296.000, df[0] = 1393630773248.000, dc_hat[0] = 756035878912.000 + Gradient do_[0] = 102158372438016.000 +Backward Time Step 2: + Gradient di[0] = 2457291718656.000, df[0] = 1764934811648.000, dc_hat[0] = 1255548125184.000 + Gradient do_[0] = 111328270221312.000 +Backward Time Step 1: + Gradient di[0] = 3081773514752.000, df[0] = 2125109264384.000, dc_hat[0] = 1714172329984.000 + Gradient do_[0] = 98105575866368.000 +Backward Time Step 0: + Gradient di[0] = 3731310510080.000, df[0] = 2654805950464.000, dc_hat[0] = 3501228032000.000 + Gradient do_[0] = 57685722529792.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2294364013330432.000, df[0] = -1801286869581824.000, dc_hat[0] = -1037057635909632.000 + Gradient do_[0] = -134129036925337600.000 +Backward Time Step 3: + Gradient di[0] = -3602470123077632.000, df[0] = -2750187281842176.000, dc_hat[0] = -1488567079534592.000 + Gradient do_[0] = -183387200799375360.000 +Backward Time Step 2: + Gradient di[0] = -4719255840882688.000, df[0] = -3555817651437568.000, dc_hat[0] = -2600636252160000.000 + Gradient do_[0] = -209825353026240512.000 +Backward Time Step 1: + Gradient di[0] = -5954182955663360.000, df[0] = -4260032032014336.000, dc_hat[0] = -3604351050317824.000 + Gradient do_[0] = -188271403248648192.000 +Backward Time Step 0: + Gradient di[0] = -6952763925725184.000, df[0] = -5081777387339776.000, dc_hat[0] = -7029013017001984.000 + Gradient do_[0] = -107864787924287488.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1231357345792.000, df[0] = 902840123392.000, dc_hat[0] = 534591635456.000 + Gradient do_[0] = 76347917467648.000 +Backward Time Step 3: + Gradient di[0] = 1931854741504.000, df[0] = 1394114428928.000, dc_hat[0] = 756297629696.000 + Gradient do_[0] = 102193730420736.000 +Backward Time Step 2: + Gradient di[0] = 2458142638080.000, df[0] = 1765546393600.000, dc_hat[0] = 1255982235648.000 + Gradient do_[0] = 111366782320640.000 +Backward Time Step 1: + Gradient di[0] = 3082835984384.000, df[0] = 2125841825792.000, dc_hat[0] = 1714760843264.000 + Gradient do_[0] = 98139323236352.000 +Backward Time Step 0: + Gradient di[0] = 3732597899264.000, df[0] = 2655721881600.000, dc_hat[0] = 3502435729408.000 + Gradient do_[0] = 57705624502272.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2294973630251008.000, df[0] = -1801765355782144.000, dc_hat[0] = -1037332849360896.000 + Gradient do_[0] = -134164582074679296.000 +Backward Time Step 3: + Gradient di[0] = -3603432732622848.000, df[0] = -2750922526556160.000, dc_hat[0] = -1488964766662656.000 + Gradient do_[0] = -183436129066811392.000 +Backward Time Step 2: + Gradient di[0] = -4720521245622272.000, df[0] = -3556771134177280.000, dc_hat[0] = -2601332305297408.000 + Gradient do_[0] = -209881531198472192.000 +Backward Time Step 1: + Gradient di[0] = -5955778536013824.000, df[0] = -4261173419573248.000, dc_hat[0] = -3605312049250304.000 + Gradient do_[0] = -188321843344572416.000 +Backward Time Step 0: + Gradient di[0] = -6954614519758848.000, df[0] = -5083129765167104.000, dc_hat[0] = -7030882401517568.000 + Gradient do_[0] = -107893495485693952.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1231771140096.000, df[0] = 903143358464.000, dc_hat[0] = 534771236864.000 + Gradient do_[0] = 76373603385344.000 +Backward Time Step 3: + Gradient di[0] = 1932502368256.000, df[0] = 1394581962752.000, dc_hat[0] = 756551057408.000 + Gradient do_[0] = 102227989495808.000 +Backward Time Step 2: + Gradient di[0] = 2458967605248.000, df[0] = 1766138839040.000, dc_hat[0] = 1256402714624.000 + Gradient do_[0] = 111404161957888.000 +Backward Time Step 1: + Gradient di[0] = 3083879317504.000, df[0] = 2126560886784.000, dc_hat[0] = 1715338608640.000 + Gradient do_[0] = 98172500180992.000 +Backward Time Step 0: + Gradient di[0] = 3733856714752.000, df[0] = 2656617365504.000, dc_hat[0] = 3503616950272.000 + Gradient do_[0] = 57725090267136.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2295601500782592.000, df[0] = -1802258069061632.000, dc_hat[0] = -1037616585637888.000 + Gradient do_[0] = -134201338404798464.000 +Backward Time Step 3: + Gradient di[0] = -3604414937956352.000, df[0] = -2751671729913856.000, dc_hat[0] = -1489369835765760.000 + Gradient do_[0] = -183486191205613568.000 +Backward Time Step 2: + Gradient di[0] = -4721809735811072.000, df[0] = -3557742602092544.000, dc_hat[0] = -2602041511772160.000 + Gradient do_[0] = -209938808882331648.000 +Backward Time Step 1: + Gradient di[0] = -5957400959909888.000, df[0] = -4262333866049536.000, dc_hat[0] = -3606289422745600.000 + Gradient do_[0] = -188373108074217472.000 +Backward Time Step 0: + Gradient di[0] = -6956504842240000.000, df[0] = -5084511670894592.000, dc_hat[0] = -7032793661964288.000 + Gradient do_[0] = -107922812932456448.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1232090169344.000, df[0] = 903377321984.000, dc_hat[0] = 534909124608.000 + Gradient do_[0] = 76393325002752.000 +Backward Time Step 3: + Gradient di[0] = 1933002801152.000, df[0] = 1394943066112.000, dc_hat[0] = 756746420224.000 + Gradient do_[0] = 102254472331264.000 +Backward Time Step 2: + Gradient di[0] = 2459605663744.000, df[0] = 1766596935680.000, dc_hat[0] = 1256727379968.000 + Gradient do_[0] = 111432968437760.000 +Backward Time Step 1: + Gradient di[0] = 3084675186688.000, df[0] = 2127109816320.000, dc_hat[0] = 1715779534848.000 + Gradient do_[0] = 98197833777152.000 +Backward Time Step 0: + Gradient di[0] = 3734817996800.000, df[0] = 2657301561344.000, dc_hat[0] = 3504518987776.000 + Gradient do_[0] = 57739954880512.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2296209507090432.000, df[0] = -1802735481520128.000, dc_hat[0] = -1037890926673920.000 + Gradient do_[0] = -134236814834663424.000 +Backward Time Step 3: + Gradient di[0] = -3605369226002432.000, df[0] = -2752400532176896.000, dc_hat[0] = -1489763227926528.000 + Gradient do_[0] = -183534707156189184.000 +Backward Time Step 2: + Gradient di[0] = -4723056886939648.000, df[0] = -3558681320882176.000, dc_hat[0] = -2602725753749504.000 + Gradient do_[0] = -209994076521496576.000 +Backward Time Step 1: + Gradient di[0] = -5958976139165696.000, df[0] = -4263460221222912.000, dc_hat[0] = -3607237805211648.000 + Gradient do_[0] = -188422757896159232.000 +Backward Time Step 0: + Gradient di[0] = -6958339867017216.000, df[0] = -5085852774432768.000, dc_hat[0] = -7034649087836160.000 + Gradient do_[0] = -107951288565628928.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1232470016000.000, df[0] = 903655784448.000, dc_hat[0] = 535074013184.000 + Gradient do_[0] = 76416880214016.000 +Backward Time Step 3: + Gradient di[0] = 1933600620544.000, df[0] = 1395374424064.000, dc_hat[0] = 756980056064.000 + Gradient do_[0] = 102286097383424.000 +Backward Time Step 2: + Gradient di[0] = 2460367978496.000, df[0] = 1767144423424.000, dc_hat[0] = 1257116663808.000 + Gradient do_[0] = 111467504336896.000 +Backward Time Step 1: + Gradient di[0] = 3085634633728.000, df[0] = 2127771336704.000, dc_hat[0] = 1716310638592.000 + Gradient do_[0] = 98228334755840.000 +Backward Time Step 0: + Gradient di[0] = 3735974051840.000, df[0] = 2658124169216.000, dc_hat[0] = 3505603739648.000 + Gradient do_[0] = 57757818421248.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2296810265640960.000, df[0] = -1803207122616320.000, dc_hat[0] = -1038162382028800.000 + Gradient do_[0] = -134271930487275520.000 +Backward Time Step 3: + Gradient di[0] = -3606312776630272.000, df[0] = -2753120744505344.000, dc_hat[0] = -1490152861990912.000 + Gradient do_[0] = -183582742070427648.000 +Backward Time Step 2: + Gradient di[0] = -4724294374391808.000, df[0] = -3559613597220864.000, dc_hat[0] = -2603406774501376.000 + Gradient do_[0] = -210049103642492928.000 +Backward Time Step 1: + Gradient di[0] = -5960536286035968.000, df[0] = -4264576375848960.000, dc_hat[0] = -3608177060872192.000 + Gradient do_[0] = -188471961041502208.000 +Backward Time Step 0: + Gradient di[0] = -6960165228118016.000, df[0] = -5087186898649088.000, dc_hat[0] = -7036494313160704.000 + Gradient do_[0] = -107979600990044160.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1232821026816.000, df[0] = 903913209856.000, dc_hat[0] = 535226417152.000 + Gradient do_[0] = 76438631874560.000 +Backward Time Step 3: + Gradient di[0] = 1934147584000.000, df[0] = 1395769344000.000, dc_hat[0] = 757194227712.000 + Gradient do_[0] = 102315038081024.000 +Backward Time Step 2: + Gradient di[0] = 2461065281536.000, df[0] = 1767645118464.000, dc_hat[0] = 1257472524288.000 + Gradient do_[0] = 111499053891584.000 +Backward Time Step 1: + Gradient di[0] = 3086506786816.000, df[0] = 2128372563968.000, dc_hat[0] = 1716793507840.000 + Gradient do_[0] = 98256025550848.000 +Backward Time Step 0: + Gradient di[0] = 3737026035712.000, df[0] = 2658872590336.000, dc_hat[0] = 3506590973952.000 + Gradient do_[0] = 57774088126464.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2297430083108864.000, df[0] = -1803694198751232.000, dc_hat[0] = -1038442695753728.000 + Gradient do_[0] = -134308197191122944.000 +Backward Time Step 3: + Gradient di[0] = -3607287197335552.000, df[0] = -2753864579153920.000, dc_hat[0] = -1490555112521728.000 + Gradient do_[0] = -183632340352761856.000 +Backward Time Step 2: + Gradient di[0] = -4725575885258752.000, df[0] = -3560578891120640.000, dc_hat[0] = -2604110343831552.000 + Gradient do_[0] = -210106054908837888.000 +Backward Time Step 1: + Gradient di[0] = -5962159783673856.000, df[0] = -4265737896067072.000, dc_hat[0] = -3609154702802944.000 + Gradient do_[0] = -188523260130885632.000 +Backward Time Step 0: + Gradient di[0] = -6962061993050112.000, df[0] = -5088573636214784.000, dc_hat[0] = -7038412016058368.000 + Gradient do_[0] = -108009038695890944.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1233176887296.000, df[0] = 904174239744.000, dc_hat[0] = 535380754432.000 + Gradient do_[0] = 76460685524992.000 +Backward Time Step 3: + Gradient di[0] = 1934705688576.000, df[0] = 1396171866112.000, dc_hat[0] = 757412528128.000 + Gradient do_[0] = 102344465317888.000 +Backward Time Step 2: + Gradient di[0] = 2461770711040.000, df[0] = 1768151973888.000, dc_hat[0] = 1257832185856.000 + Gradient do_[0] = 111530947379200.000 +Backward Time Step 1: + Gradient di[0] = 3087393095680.000, df[0] = 2128983359488.000, dc_hat[0] = 1717283717120.000 + Gradient do_[0] = 98284219662336.000 +Backward Time Step 0: + Gradient di[0] = 3738102923264.000, df[0] = 2659638837248.000, dc_hat[0] = 3507601539072.000 + Gradient do_[0] = 57790731124736.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2298058758946816.000, df[0] = -1804187180466176.000, dc_hat[0] = -1038726096486400.000 + Gradient do_[0] = -134344936341372928.000 +Backward Time Step 3: + Gradient di[0] = -3608271550152704.000, df[0] = -2754616735301632.000, dc_hat[0] = -1490961926455296.000 + Gradient do_[0] = -183682471211040768.000 +Backward Time Step 2: + Gradient di[0] = -4726862227963904.000, df[0] = -3561548479987712.000, dc_hat[0] = -2604818208129024.000 + Gradient do_[0] = -210163195153743872.000 +Backward Time Step 1: + Gradient di[0] = -5963780596957184.000, df[0] = -4266897268801536.000, dc_hat[0] = -3610132613169152.000 + Gradient do_[0] = -188574473320923136.000 +Backward Time Step 0: + Gradient di[0] = -6963946409951232.000, df[0] = -5089950710104064.000, dc_hat[0] = -7040317370925056.000 + Gradient do_[0] = -108038261653372928.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1233502470144.000, df[0] = 904412790784.000, dc_hat[0] = 535522115584.000 + Gradient do_[0] = 76480876904448.000 +Backward Time Step 3: + Gradient di[0] = 1935217917952.000, df[0] = 1396541489152.000, dc_hat[0] = 757612609536.000 + Gradient do_[0] = 102371585687552.000 +Backward Time Step 2: + Gradient di[0] = 2462428430336.000, df[0] = 1768624095232.000, dc_hat[0] = 1258167205888.000 + Gradient do_[0] = 111560743714816.000 +Backward Time Step 1: + Gradient di[0] = 3088215965696.000, df[0] = 2129550770176.000, dc_hat[0] = 1717739585536.000 + Gradient do_[0] = 98310400507904.000 +Backward Time Step 0: + Gradient di[0] = 3739094089728.000, df[0] = 2660344004608.000, dc_hat[0] = 3508531625984.000 + Gradient do_[0] = 57806061305856.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2298673476141056.000, df[0] = -1804669961633792.000, dc_hat[0] = -1039004061401088.000 + Gradient do_[0] = -134380876627705856.000 +Backward Time Step 3: + Gradient di[0] = -3609239796842496.000, df[0] = -2755354932805632.000, dc_hat[0] = -1491360016236544.000 + Gradient do_[0] = -183731674356383744.000 +Backward Time Step 2: + Gradient di[0] = -4728128706445312.000, df[0] = -3562502768033792.000, dc_hat[0] = -2605515066572800.000 + Gradient do_[0] = -210219510764929024.000 +Backward Time Step 1: + Gradient di[0] = -5965380472274944.000, df[0] = -4268041877585920.000, dc_hat[0] = -3611097370198016.000 + Gradient do_[0] = -188625016496062464.000 +Backward Time Step 0: + Gradient di[0] = -6965812036370432.000, df[0] = -5091314362220544.000, dc_hat[0] = -7042203398438912.000 + Gradient do_[0] = -108067218322882560.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1233884938240.000, df[0] = 904693153792.000, dc_hat[0] = 535688183808.000 + Gradient do_[0] = 76504599887872.000 +Backward Time Step 3: + Gradient di[0] = 1935820062720.000, df[0] = 1396975992832.000, dc_hat[0] = 757848080384.000 + Gradient do_[0] = 102403412066304.000 +Backward Time Step 2: + Gradient di[0] = 2463194152960.000, df[0] = 1769173942272.000, dc_hat[0] = 1258557145088.000 + Gradient do_[0] = 111595405443072.000 +Backward Time Step 1: + Gradient di[0] = 3089176985600.000, df[0] = 2130213470208.000, dc_hat[0] = 1718271606784.000 + Gradient do_[0] = 98340935041024.000 +Backward Time Step 0: + Gradient di[0] = 3740254076928.000, df[0] = 2661169233920.000, dc_hat[0] = 3509619785728.000 + Gradient do_[0] = 57823991955456.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2299288998641664.000, df[0] = -1805153011236864.000, dc_hat[0] = -1039282093424640.000 + Gradient do_[0] = -134416765374431232.000 +Backward Time Step 3: + Gradient di[0] = -3610204017000448.000, df[0] = -2756090982825984.000, dc_hat[0] = -1491757971800064.000 + Gradient do_[0] = -183780722882904064.000 +Backward Time Step 2: + Gradient di[0] = -4729393037443072.000, df[0] = -3563455177031680.000, dc_hat[0] = -2606210314403840.000 + Gradient do_[0] = -210275671757291520.000 +Backward Time Step 1: + Gradient di[0] = -5966971757658112.000, df[0] = -4269179775483904.000, dc_hat[0] = -3612055147905024.000 + Gradient do_[0] = -188675198893948928.000 +Backward Time Step 0: + Gradient di[0] = -6967661019791360.000, df[0] = -5092666203176960.000, dc_hat[0] = -7044072782954496.000 + Gradient do_[0] = -108095900114485248.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1234323636224.000, df[0] = 905014804480.000, dc_hat[0] = 535878303744.000 + Gradient do_[0] = 76531778977792.000 +Backward Time Step 3: + Gradient di[0] = 1936507797504.000, df[0] = 1397472493568.000, dc_hat[0] = 758117367808.000 + Gradient do_[0] = 102439759904768.000 +Backward Time Step 2: + Gradient di[0] = 2464064995328.000, df[0] = 1769799286784.000, dc_hat[0] = 1259001872384.000 + Gradient do_[0] = 111634823512064.000 +Backward Time Step 1: + Gradient di[0] = 3090268028928.000, df[0] = 2130966085632.000, dc_hat[0] = 1718876635136.000 + Gradient do_[0] = 98375655489536.000 +Backward Time Step 0: + Gradient di[0] = 3741580263424.000, df[0] = 2662112690176.000, dc_hat[0] = 3510864445440.000 + Gradient do_[0] = 57844493713408.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2299909084545024.000, df[0] = -1805639684718592.000, dc_hat[0] = -1039562138714112.000 + Gradient do_[0] = -134452980538671104.000 +Backward Time Step 3: + Gradient di[0] = -3611177900834816.000, df[0] = -2756835085910016.000, dc_hat[0] = -1492160490766336.000 + Gradient do_[0] = -183830321165238272.000 +Backward Time Step 2: + Gradient di[0] = -4730667568988160.000, df[0] = -3564415102222336.000, dc_hat[0] = -2606910125637632.000 + Gradient do_[0] = -210332330965860352.000 +Backward Time Step 1: + Gradient di[0] = -5968580759781376.000, df[0] = -4270331095154688.000, dc_hat[0] = -3613025273643008.000 + Gradient do_[0] = -188726068486602752.000 +Backward Time Step 0: + Gradient di[0] = -6969543826079744.000, df[0] = -5094042203324416.000, dc_hat[0] = -7045976527208448.000 + Gradient do_[0] = -108125114482032640.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1234727469056.000, df[0] = 905311092736.000, dc_hat[0] = 536053678080.000 + Gradient do_[0] = 76556785418240.000 +Backward Time Step 3: + Gradient di[0] = 1937141661696.000, df[0] = 1397929803776.000, dc_hat[0] = 758364962816.000 + Gradient do_[0] = 102473280782336.000 +Backward Time Step 2: + Gradient di[0] = 2464872923136.000, df[0] = 1770379804672.000, dc_hat[0] = 1259413569536.000 + Gradient do_[0] = 111671423008768.000 +Backward Time Step 1: + Gradient di[0] = 3091283574784.000, df[0] = 2131666010112.000, dc_hat[0] = 1719438671872.000 + Gradient do_[0] = 98407951630336.000 +Backward Time Step 0: + Gradient di[0] = 3742804738048.000, df[0] = 2662984056832.000, dc_hat[0] = 3512013422592.000 + Gradient do_[0] = 57863422607360.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2300514138062848.000, df[0] = -1806114815475712.000, dc_hat[0] = -1039835003355136.000 + Gradient do_[0] = -134488336709451776.000 +Backward Time Step 3: + Gradient di[0] = -3612123598946304.000, df[0] = -2757556640415744.000, dc_hat[0] = -1492550124830720.000 + Gradient do_[0] = -183878459158691840.000 +Backward Time Step 2: + Gradient di[0] = -4731910425149440.000, df[0] = -3565351941963776.000, dc_hat[0] = -2607593830744064.000 + Gradient do_[0] = -210387547065417728.000 +Backward Time Step 1: + Gradient di[0] = -5970153254682624.000, df[0] = -4271455571279872.000, dc_hat[0] = -3613971777060864.000 + Gradient do_[0] = -188775718308544512.000 +Backward Time Step 0: + Gradient di[0] = -6971375629631488.000, df[0] = -5095381159378944.000, dc_hat[0] = -7047828194983936.000 + Gradient do_[0] = -108153529985662976.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1235178618880.000, df[0] = 905641656320.000, dc_hat[0] = 536249532416.000 + Gradient do_[0] = 76584786591744.000 +Backward Time Step 3: + Gradient di[0] = 1937847091200.000, df[0] = 1398438887424.000, dc_hat[0] = 758641065984.000 + Gradient do_[0] = 102510601699328.000 +Backward Time Step 2: + Gradient di[0] = 2465772863488.000, df[0] = 1771025989632.000, dc_hat[0] = 1259873239040.000 + Gradient do_[0] = 111712132923392.000 +Backward Time Step 1: + Gradient di[0] = 3092416036864.000, df[0] = 2132446674944.000, dc_hat[0] = 1720066637824.000 + Gradient do_[0] = 98443921981440.000 +Backward Time Step 0: + Gradient di[0] = 3744170770432.000, df[0] = 2663956086784.000, dc_hat[0] = 3513295044608.000 + Gradient do_[0] = 57884536733696.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2301129660563456.000, df[0] = -1806597865078784.000, dc_hat[0] = -1040113236705280.000 + Gradient do_[0] = -134524371485065216.000 +Backward Time Step 3: + Gradient di[0] = -3613096945909760.000, df[0] = -2758299938193408.000, dc_hat[0] = -1492952375361536.000 + Gradient do_[0] = -183927902822203392.000 +Backward Time Step 2: + Gradient di[0] = -4733180661727232.000, df[0] = -3566308914364416.000, dc_hat[0] = -2608292568236032.000 + Gradient do_[0] = -210443931396079616.000 +Backward Time Step 1: + Gradient di[0] = -5971750982516736.000, df[0] = -4272598837886976.000, dc_hat[0] = -3614935460347904.000 + Gradient do_[0] = -188826141224599552.000 +Backward Time Step 0: + Gradient di[0] = -6973239108567040.000, df[0] = -5096743200882688.000, dc_hat[0] = -7049712075014144.000 + Gradient do_[0] = -108182443705499648.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1235507216384.000, df[0] = 905882501120.000, dc_hat[0] = 536391745536.000 + Gradient do_[0] = 76605145743360.000 +Backward Time Step 3: + Gradient di[0] = 1938366136320.000, df[0] = 1398813229056.000, dc_hat[0] = 758843834368.000 + Gradient do_[0] = 102538015670272.000 +Backward Time Step 2: + Gradient di[0] = 2466430058496.000, df[0] = 1771498110976.000, dc_hat[0] = 1260207865856.000 + Gradient do_[0] = 111741920870400.000 +Backward Time Step 1: + Gradient di[0] = 3093234450432.000, df[0] = 2133010677760.000, dc_hat[0] = 1720519622656.000 + Gradient do_[0] = 98469935054848.000 +Backward Time Step 0: + Gradient di[0] = 3745161936896.000, df[0] = 2664660992000.000, dc_hat[0] = 3514224869376.000 + Gradient do_[0] = 57899862720512.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2301746525241344.000, df[0] = -1807082391076864.000, dc_hat[0] = -1040392141144064.000 + Gradient do_[0] = -134560423440547840.000 +Backward Time Step 3: + Gradient di[0] = -3614064924164096.000, df[0] = -2759038672568320.000, dc_hat[0] = -1493351538884608.000 + Gradient do_[0] = -183977140327284736.000 +Backward Time Step 2: + Gradient di[0] = -4734449287692288.000, df[0] = -3567264544587776.000, dc_hat[0] = -2608990231986176.000 + Gradient do_[0] = -210500178287788032.000 +Backward Time Step 1: + Gradient di[0] = -5973355689672704.000, df[0] = -4273746399461376.000, dc_hat[0] = -3615901827989504.000 + Gradient do_[0] = -188876804658823168.000 +Backward Time Step 0: + Gradient di[0] = -6975110640566272.000, df[0] = -5098111147966464.000, dc_hat[0] = -7051604008108032.000 + Gradient do_[0] = -108211477684420608.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1235863470080.000, df[0] = 906143858688.000, dc_hat[0] = 536546803712.000 + Gradient do_[0] = 76627216171008.000 +Backward Time Step 3: + Gradient di[0] = 1938923585536.000, df[0] = 1399215751168.000, dc_hat[0] = 759062069248.000 + Gradient do_[0] = 102567484850176.000 +Backward Time Step 2: + Gradient di[0] = 2467139944448.000, df[0] = 1772007981056.000, dc_hat[0] = 1260570017792.000 + Gradient do_[0] = 111774074404864.000 +Backward Time Step 1: + Gradient di[0] = 3094128885760.000, df[0] = 2133627502592.000, dc_hat[0] = 1721014681600.000 + Gradient do_[0] = 98498405990400.000 +Backward Time Step 0: + Gradient di[0] = 3746241183744.000, df[0] = 2665429073920.000, dc_hat[0] = 3515237793792.000 + Gradient do_[0] = 57916547661824.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2302346746920960.000, df[0] = -1807553629519872.000, dc_hat[0] = -1040662858301440.000 + Gradient do_[0] = -134595513323356160.000 +Backward Time Step 3: + Gradient di[0] = -3615010622275584.000, df[0] = -2759761300815872.000, dc_hat[0] = -1493741307166720.000 + Gradient do_[0] = -184025278320738304.000 +Backward Time Step 2: + Gradient di[0] = -4735692143853568.000, df[0] = -3568200847458304.000, dc_hat[0] = -2609673131786240.000 + Gradient do_[0] = -210555394387345408.000 +Backward Time Step 1: + Gradient di[0] = -5974922815864832.000, df[0] = -4274867385925632.000, dc_hat[0] = -3616845378617344.000 + Gradient do_[0] = -188926299861942272.000 +Backward Time Step 0: + Gradient di[0] = -6976934927925248.000, df[0] = -5099444735311872.000, dc_hat[0] = -7053448696561664.000 + Gradient do_[0] = -108239781518901248.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1236194164736.000, df[0] = 906386210816.000, dc_hat[0] = 536689868800.000 + Gradient do_[0] = 76647667597312.000 +Backward Time Step 3: + Gradient di[0] = 1939442630656.000, df[0] = 1399590354944.000, dc_hat[0] = 759264837632.000 + Gradient do_[0] = 102594923986944.000 +Backward Time Step 2: + Gradient di[0] = 2467802906624.000, df[0] = 1772484296704.000, dc_hat[0] = 1260907528192.000 + Gradient do_[0] = 111804038512640.000 +Backward Time Step 1: + Gradient di[0] = 3094956474368.000, df[0] = 2134198190080.000, dc_hat[0] = 1721473040384.000 + Gradient do_[0] = 98524687499264.000 +Backward Time Step 0: + Gradient di[0] = 3747237593088.000, df[0] = 2666137911296.000, dc_hat[0] = 3516172599296.000 + Gradient do_[0] = 57931953340416.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2302951800438784.000, df[0] = -1808028491841536.000, dc_hat[0] = -1040936326922240.000 + Gradient do_[0] = -134630860904202240.000 +Backward Time Step 3: + Gradient di[0] = -3615960883789824.000, df[0] = -2760486344982528.000, dc_hat[0] = -1494133894021120.000 + Gradient do_[0] = -184073588112883712.000 +Backward Time Step 2: + Gradient di[0] = -4736934463143936.000, df[0] = -3569137418764288.000, dc_hat[0] = -2610356568457216.000 + Gradient do_[0] = -210610662026510336.000 +Backward Time Step 1: + Gradient di[0] = -5976491015798784.000, df[0] = -4275988640825344.000, dc_hat[0] = -3617790002987008.000 + Gradient do_[0] = -188975846604668928.000 +Backward Time Step 0: + Gradient di[0] = -6978766194606080.000, df[0] = -5100782617624576.000, dc_hat[0] = -7055299827466240.000 + Gradient do_[0] = -108268179842662400.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1236537835520.000, df[0] = 906638262272.000, dc_hat[0] = 536839061504.000 + Gradient do_[0] = 76669016604672.000 +Backward Time Step 3: + Gradient di[0] = 1939982778368.000, df[0] = 1399980032000.000, dc_hat[0] = 759475863552.000 + Gradient do_[0] = 102623495585792.000 +Backward Time Step 2: + Gradient di[0] = 2468489723904.000, df[0] = 1772977258496.000, dc_hat[0] = 1261257883648.000 + Gradient do_[0] = 111835185414144.000 +Backward Time Step 1: + Gradient di[0] = 3095817355264.000, df[0] = 2134791553024.000, dc_hat[0] = 1721949880320.000 + Gradient do_[0] = 98552067915776.000 +Backward Time Step 0: + Gradient di[0] = 3748280664064.000, df[0] = 2666880303104.000, dc_hat[0] = 3517151707136.000 + Gradient do_[0] = 57948080439296.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2303552290553856.000, df[0] = -1808499998720000.000, dc_hat[0] = -1041207782277120.000 + Gradient do_[0] = -134665916427272192.000 +Backward Time Step 3: + Gradient di[0] = -3616897991966720.000, df[0] = -2761201725472768.000, dc_hat[0] = -1494520575295488.000 + Gradient do_[0] = -184121227890130944.000 +Backward Time Step 2: + Gradient di[0] = -4738157455081472.000, df[0] = -3570058152378368.000, dc_hat[0] = -2611027925532672.000 + Gradient do_[0] = -210664984772870144.000 +Backward Time Step 1: + Gradient di[0] = -5978039351508992.000, df[0] = -4277096205516800.000, dc_hat[0] = -3618723889938432.000 + Gradient do_[0] = -189024688972759040.000 +Backward Time Step 0: + Gradient di[0] = -6980575449579520.000, df[0] = -5102105467551744.000, dc_hat[0] = -7057128946663424.000 + Gradient do_[0] = -108296251748909056.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1236853194752.000, df[0] = 906869473280.000, dc_hat[0] = 536975998976.000 + Gradient do_[0] = 76688536895488.000 +Backward Time Step 3: + Gradient di[0] = 1940473774080.000, df[0] = 1400334319616.000, dc_hat[0] = 759667687424.000 + Gradient do_[0] = 102649483493376.000 +Backward Time Step 2: + Gradient di[0] = 2469115199488.000, df[0] = 1773426573312.000, dc_hat[0] = 1261576257536.000 + Gradient do_[0] = 111863471800320.000 +Backward Time Step 1: + Gradient di[0] = 3096606932992.000, df[0] = 2135336157184.000, dc_hat[0] = 1722386350080.000 + Gradient do_[0] = 98577183408128.000 +Backward Time Step 0: + Gradient di[0] = 3749231984640.000, df[0] = 2667557158912.000, dc_hat[0] = 3518044569600.000 + Gradient do_[0] = 57962789863424.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2304171571150848.000, df[0] = -1808985732677632.000, dc_hat[0] = -1041486619607040.000 + Gradient do_[0] = -134702105821708288.000 +Backward Time Step 3: + Gradient di[0] = -3617872412672000.000, df[0] = -2761946096992256.000, dc_hat[0] = -1494923094261760.000 + Gradient do_[0] = -184170843352334336.000 +Backward Time Step 2: + Gradient di[0] = -4739438965948416.000, df[0] = -3571023983149056.000, dc_hat[0] = -2611733910781952.000 + Gradient do_[0] = -210721918859345920.000 +Backward Time Step 1: + Gradient di[0] = -5979657480437760.000, df[0] = -4278254504509440.000, dc_hat[0] = -3619698579079168.000 + Gradient do_[0] = -189075833443319808.000 +Backward Time Step 0: + Gradient di[0] = -6982452350287872.000, df[0] = -5103476635860992.000, dc_hat[0] = -7059025711595520.000 + Gradient do_[0] = -108325371627175936.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1237343141888.000, df[0] = 907228545024.000, dc_hat[0] = 537188401152.000 + Gradient do_[0] = 76718895267840.000 +Backward Time Step 3: + Gradient di[0] = 1941244215296.000, df[0] = 1400890589184.000, dc_hat[0] = 759969349632.000 + Gradient do_[0] = 102690159853568.000 +Backward Time Step 2: + Gradient di[0] = 2470092734464.000, df[0] = 1774128594944.000, dc_hat[0] = 1262075117568.000 + Gradient do_[0] = 111907738484736.000 +Backward Time Step 1: + Gradient di[0] = 3097832456192.000, df[0] = 2136181309440.000, dc_hat[0] = 1723066220544.000 + Gradient do_[0] = 98616156880896.000 +Backward Time Step 0: + Gradient di[0] = 3750717816832.000, df[0] = 2668614123520.000, dc_hat[0] = 3519438389248.000 + Gradient do_[0] = 57985757872128.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2304789241135104.000, df[0] = -1809471332417536.000, dc_hat[0] = -1041766463569920.000 + Gradient do_[0] = -134738235086602240.000 +Backward Time Step 3: + Gradient di[0] = -3618840927797248.000, df[0] = -2762685368238080.000, dc_hat[0] = -1495322392002560.000 + Gradient do_[0] = -184220132397023232.000 +Backward Time Step 2: + Gradient di[0] = -4740709202526208.000, df[0] = -3571980955549696.000, dc_hat[0] = -2612430769225728.000 + Gradient do_[0] = -210778354729615360.000 +Backward Time Step 1: + Gradient di[0] = -5981256282013696.000, df[0] = -4279397234245632.000, dc_hat[0] = -3620660383318016.000 + Gradient do_[0] = -189126325078851584.000 +Backward Time Step 0: + Gradient di[0] = -6984322271674368.000, df[0] = -5104844046073856.000, dc_hat[0] = -7060917107818496.000 + Gradient do_[0] = -108354379836293120.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1237684977664.000, df[0] = 907479285760.000, dc_hat[0] = 537336807424.000 + Gradient do_[0] = 76740084891648.000 +Backward Time Step 3: + Gradient di[0] = 1941780168704.000, df[0] = 1401277120512.000, dc_hat[0] = 760178606080.000 + Gradient do_[0] = 102718488182784.000 +Backward Time Step 2: + Gradient di[0] = 2470774308864.000, df[0] = 1774617886720.000, dc_hat[0] = 1262422589440.000 + Gradient do_[0] = 111938566619136.000 +Backward Time Step 1: + Gradient di[0] = 3098685210624.000, df[0] = 2136768905216.000, dc_hat[0] = 1723537817600.000 + Gradient do_[0] = 98643235307520.000 +Backward Time Step 0: + Gradient di[0] = 3751748567040.000, df[0] = 2669347602432.000, dc_hat[0] = 3520405700608.000 + Gradient do_[0] = 58001696227328.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2305395368394752.000, df[0] = -1809947134263296.000, dc_hat[0] = -1042039999299584.000 + Gradient do_[0] = -134773617027186688.000 +Backward Time Step 3: + Gradient di[0] = -3619793873666048.000, df[0] = -2763412828323840.000, dc_hat[0] = -1495715381510144.000 + Gradient do_[0] = -184268528088514560.000 +Backward Time Step 2: + Gradient di[0] = -4741955279912960.000, df[0] = -3572919137468416.000, dc_hat[0] = -2613114205896704.000 + Gradient do_[0] = -210833588009041920.000 +Backward Time Step 1: + Gradient di[0] = -5982834145624064.000, df[0] = -4280527079079936.000, dc_hat[0] = -3621612523880448.000 + Gradient do_[0] = -189176129519616000.000 +Backward Time Step 0: + Gradient di[0] = -6986158907064320.000, df[0] = -5106186223353856.000, dc_hat[0] = -7062773607432192.000 + Gradient do_[0] = -108382881239269376.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1238110175232.000, df[0] = 907791040512.000, dc_hat[0] = 537521160192.000 + Gradient do_[0] = 76766458675200.000 +Backward Time Step 3: + Gradient di[0] = 1942449160192.000, df[0] = 1401759727616.000, dc_hat[0] = 760439963648.000 + Gradient do_[0] = 102753871331328.000 +Backward Time Step 2: + Gradient di[0] = 2471627849728.000, df[0] = 1775231041536.000, dc_hat[0] = 1262858010624.000 + Gradient do_[0] = 111977246490624.000 +Backward Time Step 1: + Gradient di[0] = 3099758952448.000, df[0] = 2137509068800.000, dc_hat[0] = 1724132491264.000 + Gradient do_[0] = 98677402107904.000 +Backward Time Step 0: + Gradient di[0] = 3753044606976.000, df[0] = 2670269562880.000, dc_hat[0] = 3521621786624.000 + Gradient do_[0] = 58021728223232.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2305996395380736.000, df[0] = -1810418641141760.000, dc_hat[0] = -1042311521763328.000 + Gradient do_[0] = -134808792809340928.000 +Backward Time Step 3: + Gradient di[0] = -3620738766471168.000, df[0] = -2764133845958656.000, dc_hat[0] = -1496104612921344.000 + Gradient do_[0] = -184316648902098944.000 +Backward Time Step 2: + Gradient di[0] = -4743193304236032.000, df[0] = -3573851950678016.000, dc_hat[0] = -2613794689777664.000 + Gradient do_[0] = -210888632309907456.000 +Backward Time Step 1: + Gradient di[0] = -5984389997527040.000, df[0] = -4281639744045056.000, dc_hat[0] = -3622550168928256.000 + Gradient do_[0] = -189225298305220608.000 +Backward Time Step 0: + Gradient di[0] = -6987972457005056.000, df[0] = -5107511757635584.000, dc_hat[0] = -7064607021596672.000 + Gradient do_[0] = -108411021864992768.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1238540615680.000, df[0] = 908106661888.000, dc_hat[0] = 537708003328.000 + Gradient do_[0] = 76793142837248.000 +Backward Time Step 3: + Gradient di[0] = 1943121821696.000, df[0] = 1402245349376.000, dc_hat[0] = 760703352832.000 + Gradient do_[0] = 102789472583680.000 +Backward Time Step 2: + Gradient di[0] = 2472482701312.000, df[0] = 1775844851712.000, dc_hat[0] = 1263293562880.000 + Gradient do_[0] = 112015917973504.000 +Backward Time Step 1: + Gradient di[0] = 3100828499968.000, df[0] = 2138246742016.000, dc_hat[0] = 1724725067776.000 + Gradient do_[0] = 98711426301952.000 +Backward Time Step 0: + Gradient di[0] = 3754335141888.000, df[0] = 2671187853312.000, dc_hat[0] = 3522832629760.000 + Gradient do_[0] = 58041680527360.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2306596617060352.000, df[0] = -1810890013802496.000, dc_hat[0] = -1042582306029568.000 + Gradient do_[0] = -134843822562607104.000 +Backward Time Step 3: + Gradient di[0] = -3621678022131712.000, df[0] = -2764851105497088.000, dc_hat[0] = -1496492502155264.000 + Gradient do_[0] = -184364477657907200.000 +Backward Time Step 2: + Gradient di[0] = -4744425422979072.000, df[0] = -3574780468920320.000, dc_hat[0] = -2614473026174976.000 + Gradient do_[0] = -210943401732866048.000 +Backward Time Step 1: + Gradient di[0] = -5985945312559104.000, df[0] = -4282752140574720.000, dc_hat[0] = -3623486471798784.000 + Gradient do_[0] = -189274432731086848.000 +Backward Time Step 0: + Gradient di[0] = -6989785470074880.000, df[0] = -5108836755046400.000, dc_hat[0] = -7066439898890240.000 + Gradient do_[0] = -108439136720912384.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1238920069120.000, df[0] = 908384796672.000, dc_hat[0] = 537872924672.000 + Gradient do_[0] = 76816630939648.000 +Backward Time Step 3: + Gradient di[0] = 1943716495360.000, df[0] = 1402674610176.000, dc_hat[0] = 760935940096.000 + Gradient do_[0] = 102820929863680.000 +Backward Time Step 2: + Gradient di[0] = 2473241083904.000, df[0] = 1776389849088.000, dc_hat[0] = 1263681142784.000 + Gradient do_[0] = 112050244157440.000 +Backward Time Step 1: + Gradient di[0] = 3101784539136.000, df[0] = 2138905772032.000, dc_hat[0] = 1725254991872.000 + Gradient do_[0] = 98741784674304.000 +Backward Time Step 0: + Gradient di[0] = 3755494604800.000, df[0] = 2672012558336.000, dc_hat[0] = 3523920527360.000 + Gradient do_[0] = 58059606982656.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2307208918335488.000, df[0] = -1811370647486464.000, dc_hat[0] = -1042858459004928.000 + Gradient do_[0] = -134879625409986560.000 +Backward Time Step 3: + Gradient di[0] = -3622645195079680.000, df[0] = -2765589303001088.000, dc_hat[0] = -1496891128807424.000 + Gradient do_[0] = -184413577724035072.000 +Backward Time Step 2: + Gradient di[0] = -4745694048944128.000, df[0] = -3575736636014592.000, dc_hat[0] = -2615172300537856.000 + Gradient do_[0] = -210999734523920384.000 +Backward Time Step 1: + Gradient di[0] = -5987547335360512.000, df[0] = -4283898628407296.000, dc_hat[0] = -3624452571004928.000 + Gradient do_[0] = -189325010265964544.000 +Backward Time Step 0: + Gradient di[0] = -6991656465203200.000, df[0] = -5110204165259264.000, dc_hat[0] = -7068331831984128.000 + Gradient do_[0] = -108468162109898752.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1239381573632.000, df[0] = 908723159040.000, dc_hat[0] = 538072743936.000 + Gradient do_[0] = 76845244481536.000 +Backward Time Step 3: + Gradient di[0] = 1944442896384.000, df[0] = 1403198373888.000, dc_hat[0] = 761220104192.000 + Gradient do_[0] = 102859324522496.000 +Backward Time Step 2: + Gradient di[0] = 2474165665792.000, df[0] = 1777053466624.000, dc_hat[0] = 1264152608768.000 + Gradient do_[0] = 112092103311360.000 +Backward Time Step 1: + Gradient di[0] = 3102941118464.000, df[0] = 2139703345152.000, dc_hat[0] = 1725896458240.000 + Gradient do_[0] = 98778585497600.000 +Backward Time Step 0: + Gradient di[0] = 3756894453760.000, df[0] = 2673008705536.000, dc_hat[0] = 3525234130944.000 + Gradient do_[0] = 58081249591296.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2307802697564160.000, df[0] = -1811836919873536.000, dc_hat[0] = -1043126558916608.000 + Gradient do_[0] = -134914225666523136.000 +Backward Time Step 3: + Gradient di[0] = -3623570492096512.000, df[0] = -2766295556685824.000, dc_hat[0] = -1497273246679040.000 + Gradient do_[0] = -184460667745468416.000 +Backward Time Step 2: + Gradient di[0] = -4746905766592512.000, df[0] = -3576649316564992.000, dc_hat[0] = -2615837752033280.000 + Gradient do_[0] = -211053541874204672.000 +Backward Time Step 1: + Gradient di[0] = -5989074196234240.000, df[0] = -4284990086971392.000, dc_hat[0] = -3625371694006272.000 + Gradient do_[0] = -189373234158764032.000 +Backward Time Step 0: + Gradient di[0] = -6993438339760128.000, df[0] = -5111506614091776.000, dc_hat[0] = -7070132497022976.000 + Gradient do_[0] = -108495804519415808.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1239675961344.000, df[0] = 908939034624.000, dc_hat[0] = 538200408064.000 + Gradient do_[0] = 76863481315328.000 +Backward Time Step 3: + Gradient di[0] = 1944903221248.000, df[0] = 1403530772480.000, dc_hat[0] = 761399869440.000 + Gradient do_[0] = 102883668262912.000 +Backward Time Step 2: + Gradient di[0] = 2474752081920.000, df[0] = 1777474732032.000, dc_hat[0] = 1264450666496.000 + Gradient do_[0] = 112118653255680.000 +Backward Time Step 1: + Gradient di[0] = 3103680102400.000, df[0] = 2140212953088.000, dc_hat[0] = 1726304616448.000 + Gradient do_[0] = 98802090377216.000 +Backward Time Step 0: + Gradient di[0] = 3757787054080.000, df[0] = 2673643880448.000, dc_hat[0] = 3526071681024.000 + Gradient do_[0] = 58095048851456.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2308419562242048.000, df[0] = -1812321311653888.000, dc_hat[0] = -1043405530464256.000 + Gradient do_[0] = -134950294801874944.000 +Backward Time Step 3: + Gradient di[0] = -3624539275657216.000, df[0] = -2767035096367104.000, dc_hat[0] = -1497672812855296.000 + Gradient do_[0] = -184509991149895680.000 +Backward Time Step 2: + Gradient di[0] = -4748174929428480.000, df[0] = -3577605483659264.000, dc_hat[0] = -2616536757960704.000 + Gradient do_[0] = -211109977744474112.000 +Backward Time Step 1: + Gradient di[0] = -5990679977132032.000, df[0] = -4286138453852160.000, dc_hat[0] = -3626340209131520.000 + Gradient do_[0] = -189423897592987648.000 +Backward Time Step 0: + Gradient di[0] = -6995306650533888.000, df[0] = -5112871876820992.000, dc_hat[0] = -7072021745762304.000 + Gradient do_[0] = -108524786958729216.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1240114659328.000, df[0] = 909260554240.000, dc_hat[0] = 538390855680.000 + Gradient do_[0] = 76890677182464.000 +Backward Time Step 3: + Gradient di[0] = 1945592528896.000, df[0] = 1404028190720.000, dc_hat[0] = 761669287936.000 + Gradient do_[0] = 102920099987456.000 +Backward Time Step 2: + Gradient di[0] = 2475629740032.000, df[0] = 1778105188352.000, dc_hat[0] = 1264898146304.000 + Gradient do_[0] = 112158356537344.000 +Backward Time Step 1: + Gradient di[0] = 3104778485760.000, df[0] = 2140969762816.000, dc_hat[0] = 1726912921600.000 + Gradient do_[0] = 98837020540928.000 +Backward Time Step 0: + Gradient di[0] = 3759109832704.000, df[0] = 2674584977408.000, dc_hat[0] = 3527312670720.000 + Gradient do_[0] = 58115496083456.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2309018978615296.000, df[0] = -1812791744790528.000, dc_hat[0] = -1043676448948224.000 + Gradient do_[0] = -134985307375271936.000 +Backward Time Step 3: + Gradient di[0] = -3625482020978688.000, df[0] = -2767755040260096.000, dc_hat[0] = -1498061507395584.000 + Gradient do_[0] = -184557940164788224.000 +Backward Time Step 2: + Gradient di[0] = -4749408658784256.000, df[0] = -3578534270337024.000, dc_hat[0] = -2617211873132544.000 + Gradient do_[0] = -211164747167432704.000 +Backward Time Step 1: + Gradient di[0] = -5992233144680448.000, df[0] = -4287249508204544.000, dc_hat[0] = -3627274632953856.000 + Gradient do_[0] = -189472946119507968.000 +Backward Time Step 0: + Gradient di[0] = -6997116979249152.000, df[0] = -5114195263619072.000, dc_hat[0] = -7073851938701312.000 + Gradient do_[0] = -108552884634779648.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1240390565888.000, df[0] = 909462798336.000, dc_hat[0] = 538510589952.000 + Gradient do_[0] = 76907764776960.000 +Backward Time Step 3: + Gradient di[0] = 1946024017920.000, df[0] = 1404339617792.000, dc_hat[0] = 761838239744.000 + Gradient do_[0] = 102942917001216.000 +Backward Time Step 2: + Gradient di[0] = 2476177096704.000, df[0] = 1778498142208.000, dc_hat[0] = 1265176936448.000 + Gradient do_[0] = 112183153262592.000 +Backward Time Step 1: + Gradient di[0] = 3105464778752.000, df[0] = 2141442932736.000, dc_hat[0] = 1727292768256.000 + Gradient do_[0] = 98858797367296.000 +Backward Time Step 0: + Gradient di[0] = 3759943974912.000, df[0] = 2675178471424.000, dc_hat[0] = 3528095694848.000 + Gradient do_[0] = 58128397762560.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2309610073489408.000, df[0] = -1813256003911680.000, dc_hat[0] = -1043943676444672.000 + Gradient do_[0] = -135019873272070144.000 +Backward Time Step 3: + Gradient di[0] = -3626410807656448.000, df[0] = -2768464246734848.000, dc_hat[0] = -1498445235879936.000 + Gradient do_[0] = -184605253524520960.000 +Backward Time Step 2: + Gradient di[0] = -4750628429496320.000, df[0] = -3579453661773824.000, dc_hat[0] = -2617884035514368.000 + Gradient do_[0] = -211218949654708224.000 +Backward Time Step 1: + Gradient di[0] = -5993772353585152.000, df[0] = -4288350630445056.000, dc_hat[0] = -3628202614325248.000 + Gradient do_[0] = -189521547969429504.000 +Backward Time Step 0: + Gradient di[0] = -6998910128095232.000, df[0] = -5115506302386176.000, dc_hat[0] = -7075664951771136.000 + Gradient do_[0] = -108580690253053952.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1240768315392.000, df[0] = 909739884544.000, dc_hat[0] = 538674626560.000 + Gradient do_[0] = 76931202547712.000 +Backward Time Step 3: + Gradient di[0] = 1946618560512.000, df[0] = 1404768616448.000, dc_hat[0] = 762070368256.000 + Gradient do_[0] = 102974374281216.000 +Backward Time Step 2: + Gradient di[0] = 2476936003584.000, df[0] = 1779043008512.000, dc_hat[0] = 1265564385280.000 + Gradient do_[0] = 112217487835136.000 +Backward Time Step 1: + Gradient di[0] = 3106417147904.000, df[0] = 2142099603456.000, dc_hat[0] = 1727820333056.000 + Gradient do_[0] = 98889113796608.000 +Backward Time Step 0: + Gradient di[0] = 3761090330624.000, df[0] = 2675994263552.000, dc_hat[0] = 3529171271680.000 + Gradient do_[0] = 58146114502656.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2310208684556288.000, df[0] = -1813726034395136.000, dc_hat[0] = -1044213990948864.000 + Gradient do_[0] = -135054868665597952.000 +Backward Time Step 3: + Gradient di[0] = -3627349258010624.000, df[0] = -2769180700966912.000, dc_hat[0] = -1498832856678400.000 + Gradient do_[0] = -184653013560852480.000 +Backward Time Step 2: + Gradient di[0] = -4751857863884800.000, df[0] = -3580380300967936.000, dc_hat[0] = -2618559419121664.000 + Gradient do_[0] = -211273598818582528.000 +Backward Time Step 1: + Gradient di[0] = -5995332500455424.000, df[0] = -4289466516635648.000, dc_hat[0] = -3629141601550336.000 + Gradient do_[0] = -189570837014118400.000 +Backward Time Step 0: + Gradient di[0] = -7000732804841472.000, df[0] = -5116838279118848.000, dc_hat[0] = -7077507492741120.000 + Gradient do_[0] = -108608976907665408.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1241101238272.000, df[0] = 909983940608.000, dc_hat[0] = 538819133440.000 + Gradient do_[0] = 76951855300608.000 +Backward Time Step 3: + Gradient di[0] = 1947139440640.000, df[0] = 1405144399872.000, dc_hat[0] = 762274054144.000 + Gradient do_[0] = 103001922469888.000 +Backward Time Step 2: + Gradient di[0] = 2477597655040.000, df[0] = 1779518406656.000, dc_hat[0] = 1265901895680.000 + Gradient do_[0] = 112247460331520.000 +Backward Time Step 1: + Gradient di[0] = 3107247357952.000, df[0] = 2142672257024.000, dc_hat[0] = 1728279478272.000 + Gradient do_[0] = 98915512745984.000 +Backward Time Step 0: + Gradient di[0] = 3762100109312.000, df[0] = 2676712800256.000, dc_hat[0] = 3530118922240.000 + Gradient do_[0] = 58161725702144.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2310812664332288.000, df[0] = -1814199957192704.000, dc_hat[0] = -1044485916065792.000 + Gradient do_[0] = -135090130347098112.000 +Backward Time Step 3: + Gradient di[0] = -3628296566734848.000, df[0] = -2769903597649920.000, dc_hat[0] = -1499223698702336.000 + Gradient do_[0] = -184701031295221760.000 +Backward Time Step 2: + Gradient di[0] = -4753102330658816.000, df[0] = -3581317946015744.000, dc_hat[0] = -2619242855792640.000 + Gradient do_[0] = -211328780558401536.000 +Backward Time Step 1: + Gradient di[0] = -5996894257938432.000, df[0] = -4290584013438976.000, dc_hat[0] = -3630083810000896.000 + Gradient do_[0] = -189620194778284032.000 +Backward Time Step 0: + Gradient di[0] = -7002552260362240.000, df[0] = -5118167571496960.000, dc_hat[0] = -7079345738743808.000 + Gradient do_[0] = -108637186252865536.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1241565233152.000, df[0] = 910324072448.000, dc_hat[0] = 539020492800.000 + Gradient do_[0] = 76980603060224.000 +Backward Time Step 3: + Gradient di[0] = 1947868856320.000, df[0] = 1405670653952.000, dc_hat[0] = 762559070208.000 + Gradient do_[0] = 103040417792000.000 +Backward Time Step 2: + Gradient di[0] = 2478525120512.000, df[0] = 1780184383488.000, dc_hat[0] = 1266375065600.000 + Gradient do_[0] = 112289436925952.000 +Backward Time Step 1: + Gradient di[0] = 3108414423040.000, df[0] = 2143476645888.000, dc_hat[0] = 1728926842880.000 + Gradient do_[0] = 98952573616128.000 +Backward Time Step 0: + Gradient di[0] = 3763505987584.000, df[0] = 2677712879616.000, dc_hat[0] = 3531438030848.000 + Gradient do_[0] = 58183460585472.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2311414228189184.000, df[0] = -1814672000942080.000, dc_hat[0] = -1044757639856128.000 + Gradient do_[0] = -135125331899056128.000 +Backward Time Step 3: + Gradient di[0] = -3629244949200896.000, df[0] = -2770627836510208.000, dc_hat[0] = -1499614674944000.000 + Gradient do_[0] = -184749289547759616.000 +Backward Time Step 2: + Gradient di[0] = -4754338207498240.000, df[0] = -3582248611741696.000, dc_hat[0] = -2619922802802688.000 + Gradient do_[0] = -211383687420313600.000 +Backward Time Step 1: + Gradient di[0] = -5998457089163264.000, df[0] = -4291702315548672.000, dc_hat[0] = -3631025750016000.000 + Gradient do_[0] = -189669569722318848.000 +Backward Time Step 0: + Gradient di[0] = -7004382990172160.000, df[0] = -5119505990680576.000, dc_hat[0] = -7081197406519296.000 + Gradient do_[0] = -108665601756495872.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1241903398912.000, df[0] = 910572126208.000, dc_hat[0] = 539167293440.000 + Gradient do_[0] = 77001582968832.000 +Backward Time Step 3: + Gradient di[0] = 1948397993984.000, df[0] = 1406052859904.000, dc_hat[0] = 762766360576.000 + Gradient do_[0] = 103068452519936.000 +Backward Time Step 2: + Gradient di[0] = 2479199354880.000, df[0] = 1780668694528.000, dc_hat[0] = 1266718998528.000 + Gradient do_[0] = 112319988236288.000 +Backward Time Step 1: + Gradient di[0] = 3109257478144.000, df[0] = 2144058081280.000, dc_hat[0] = 1729393065984.000 + Gradient do_[0] = 98979391995904.000 +Backward Time Step 0: + Gradient di[0] = 3764527038464.000, df[0] = 2678439542784.000, dc_hat[0] = 3532396167168.000 + Gradient do_[0] = 58199247945728.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2312009081159680.000, df[0] = -1815139078635520.000, dc_hat[0] = -1045026477965312.000 + Gradient do_[0] = -135160061004611584.000 +Backward Time Step 3: + Gradient di[0] = -3630175078055936.000, df[0] = -2771337848291328.000, dc_hat[0] = -1499997598121984.000 + Gradient do_[0] = -184796654447099904.000 +Backward Time Step 2: + Gradient di[0] = -4755556367597568.000, df[0] = -3583166392565760.000, dc_hat[0] = -2620591207088128.000 + Gradient do_[0] = -211437838367981568.000 +Backward Time Step 1: + Gradient di[0] = -5999996834938880.000, df[0] = -4292803169353728.000, dc_hat[0] = -3631951852339200.000 + Gradient do_[0] = -189718154392371200.000 +Backward Time Step 0: + Gradient di[0] = -7006169159696384.000, df[0] = -5120811660738560.000, dc_hat[0] = -7083003440267264.000 + Gradient do_[0] = -108693312885489664.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1242248773632.000, df[0] = 910825488384.000, dc_hat[0] = 539317272576.000 + Gradient do_[0] = 77022990696448.000 +Backward Time Step 3: + Gradient di[0] = 1948942073856.000, df[0] = 1406445289472.000, dc_hat[0] = 762978828288.000 + Gradient do_[0] = 103097225445376.000 +Backward Time Step 2: + Gradient di[0] = 2479894036480.000, df[0] = 1781167816704.000, dc_hat[0] = 1267073286144.000 + Gradient do_[0] = 112351395184640.000 +Backward Time Step 1: + Gradient di[0] = 3110125436928.000, df[0] = 2144656293888.000, dc_hat[0] = 1729873707008.000 + Gradient do_[0] = 99006998904832.000 +Backward Time Step 0: + Gradient di[0] = 3765572468736.000, df[0] = 2679183245312.000, dc_hat[0] = 3533376847872.000 + Gradient do_[0] = 58215404404736.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2312606618484736.000, df[0] = -1815608303812608.000, dc_hat[0] = -1045296591142912.000 + Gradient do_[0] = -135194979088728064.000 +Backward Time Step 3: + Gradient di[0] = -3631114065281024.000, df[0] = -2772054839394304.000, dc_hat[0] = -1500385621573632.000 + Gradient do_[0] = -184844466023038976.000 +Backward Time Step 2: + Gradient di[0] = -4756787949469696.000, df[0] = -3584094105501696.000, dc_hat[0] = -2621269006614528.000 + Gradient do_[0] = -211492573431201792.000 +Backward Time Step 1: + Gradient di[0] = -6001550539358208.000, df[0] = -4293914760577024.000, dc_hat[0] = -3632887886774272.000 + Gradient do_[0] = -189767168559153152.000 +Backward Time Step 0: + Gradient di[0] = -7007987004604416.000, df[0] = -5122140416245760.000, dc_hat[0] = -7084841686269952.000 + Gradient do_[0] = -108721522230689792.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1242657980416.000, df[0] = 911125512192.000, dc_hat[0] = 539494678528.000 + Gradient do_[0] = 77048341069824.000 +Backward Time Step 3: + Gradient di[0] = 1949582622720.000, df[0] = 1406907580416.000, dc_hat[0] = 763229569024.000 + Gradient do_[0] = 103131107033088.000 +Backward Time Step 2: + Gradient di[0] = 2480708255744.000, df[0] = 1781752266752.000, dc_hat[0] = 1267487997952.000 + Gradient do_[0] = 112388237950976.000 +Backward Time Step 1: + Gradient di[0] = 3111147798528.000, df[0] = 2145361199104.000, dc_hat[0] = 1730440593408.000 + Gradient do_[0] = 99039529926656.000 +Backward Time Step 0: + Gradient di[0] = 3766813982720.000, df[0] = 2680066408448.000, dc_hat[0] = 3534542077952.000 + Gradient do_[0] = 58234601734144.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2313189123424256.000, df[0] = -1816065583611904.000, dc_hat[0] = -1045559523672064.000 + Gradient do_[0] = -135229012409581568.000 +Backward Time Step 3: + Gradient di[0] = -3632027819573248.000, df[0] = -2772752503144448.000, dc_hat[0] = -1500763176042496.000 + Gradient do_[0] = -184890971928920064.000 +Backward Time Step 2: + Gradient di[0] = -4757985708474368.000, df[0] = -3584996853940224.000, dc_hat[0] = -2621927747223552.000 + Gradient do_[0] = -211545676406849536.000 +Backward Time Step 1: + Gradient di[0] = -6003053777911808.000, df[0] = -4294989844578304.000, dc_hat[0] = -3633793856438272.000 + Gradient do_[0] = -189814670897446912.000 +Backward Time Step 0: + Gradient di[0] = -7009736666906624.000, df[0] = -5123418705887232.000, dc_hat[0] = -7086609602183168.000 + Gradient do_[0] = -108748657834065920.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1242952368128.000, df[0] = 911341322240.000, dc_hat[0] = 539622440960.000 + Gradient do_[0] = 77066586292224.000 +Backward Time Step 3: + Gradient di[0] = 1950046879744.000, df[0] = 1407242469376.000, dc_hat[0] = 763410448384.000 + Gradient do_[0] = 103155534659584.000 +Backward Time Step 2: + Gradient di[0] = 2481294934016.000, df[0] = 1782173925376.000, dc_hat[0] = 1267786973184.000 + Gradient do_[0] = 112414829838336.000 +Backward Time Step 1: + Gradient di[0] = 3111888617472.000, df[0] = 2145871855616.000, dc_hat[0] = 1730849931264.000 + Gradient do_[0] = 99063043194880.000 +Backward Time Step 0: + Gradient di[0] = 3767703437312.000, df[0] = 2680699224064.000, dc_hat[0] = 3535376744448.000 + Gradient do_[0] = 58248354856960.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2313785318572032.000, df[0] = -1816533735047168.000, dc_hat[0] = -1045828563107840.000 + Gradient do_[0] = -135263818824548352.000 +Backward Time Step 3: + Gradient di[0] = -3632964390879232.000, df[0] = -2773467346763776.000, dc_hat[0] = -1501150125752320.000 + Gradient do_[0] = -184938663245774848.000 +Backward Time Step 2: + Gradient di[0] = -4759216753475584.000, df[0] = -3585924298440704.000, dc_hat[0] = -2622604741443584.000 + Gradient do_[0] = -211600428649938944.000 +Backward Time Step 1: + Gradient di[0] = -6004617682878464.000, df[0] = -4296108951994368.000, dc_hat[0] = -3634736870195200.000 + Gradient do_[0] = -189863994301874176.000 +Backward Time Step 0: + Gradient di[0] = -7011557733040128.000, df[0] = -5124750145748992.000, dc_hat[0] = -7088450532540416.000 + Gradient do_[0] = -108776910128939008.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1243370094592.000, df[0] = 911647440896.000, dc_hat[0] = 539803516928.000 + Gradient do_[0] = 77092498702336.000 +Backward Time Step 3: + Gradient di[0] = 1950700797952.000, df[0] = 1407714590720.000, dc_hat[0] = 763666563072.000 + Gradient do_[0] = 103190137667584.000 +Backward Time Step 2: + Gradient di[0] = 2482130124800.000, df[0] = 1782773579776.000, dc_hat[0] = 1268212826112.000 + Gradient do_[0] = 112452603740160.000 +Backward Time Step 1: + Gradient di[0] = 3112935358464.000, df[0] = 2146593669120.000, dc_hat[0] = 1731429924864.000 + Gradient do_[0] = 99096345968640.000 +Backward Time Step 0: + Gradient di[0] = 3768970117120.000, df[0] = 2681600475136.000, dc_hat[0] = 3536565305344.000 + Gradient do_[0] = 58267933868032.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2314383392768000.000, df[0] = -1817003362877440.000, dc_hat[0] = -1046099414482944.000 + Gradient do_[0] = -135298822808010752.000 +Backward Time Step 3: + Gradient di[0] = -3633904988717056.000, df[0] = -2774185143173120.000, dc_hat[0] = -1501536941244416.000 + Gradient do_[0] = -184986371742498816.000 +Backward Time Step 2: + Gradient di[0] = -4760444577251328.000, df[0] = -3586849327022080.000, dc_hat[0] = -2623280125050880.000 + Gradient do_[0] = -211654991914467328.000 +Backward Time Step 1: + Gradient di[0] = -6006164944846848.000, df[0] = -4297215711379456.000, dc_hat[0] = -3635668341227520.000 + Gradient do_[0] = -189912888209571840.000 +Backward Time Step 0: + Gradient di[0] = -7013356787466240.000, df[0] = -5126064942612480.000, dc_hat[0] = -7090269988061184.000 + Gradient do_[0] = -108804818826428416.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1243631321088.000, df[0] = 911838937088.000, dc_hat[0] = 539916861440.000 + Gradient do_[0] = 77108688715776.000 +Backward Time Step 3: + Gradient di[0] = 1951110660096.000, df[0] = 1408010420224.000, dc_hat[0] = 763826995200.000 + Gradient do_[0] = 103211830607872.000 +Backward Time Step 2: + Gradient di[0] = 2482651791360.000, df[0] = 1783148052480.000, dc_hat[0] = 1268478509056.000 + Gradient do_[0] = 112476234448896.000 +Backward Time Step 1: + Gradient di[0] = 3113591504896.000, df[0] = 2147045736448.000, dc_hat[0] = 1731792207872.000 + Gradient do_[0] = 99117200048128.000 +Backward Time Step 0: + Gradient di[0] = 3769764413440.000, df[0] = 2682165657600.000, dc_hat[0] = 3537310580736.000 + Gradient do_[0] = 58280218984448.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2314986567237632.000, df[0] = -1817476748804096.000, dc_hat[0] = -1046370869837824.000 + Gradient do_[0] = -135334058719707136.000 +Backward Time Step 3: + Gradient di[0] = -3634851492134912.000, df[0] = -2774907502985216.000, dc_hat[0] = -1501927917486080.000 + Gradient do_[0] = -185034595635298304.000 +Backward Time Step 2: + Gradient di[0] = -4761682601574400.000, df[0] = -3587781334925312.000, dc_hat[0] = -2623958461448192.000 + Gradient do_[0] = -211709950315986944.000 +Backward Time Step 1: + Gradient di[0] = -6007727776071680.000, df[0] = -4298333476618240.000, dc_hat[0] = -3636609207500800.000 + Gradient do_[0] = -189962263153606656.000 +Backward Time Step 0: + Gradient di[0] = -7015188054147072.000, df[0] = -5127403361796096.000, dc_hat[0] = -7092121118965760.000 + Gradient do_[0] = -108833225740124160.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1244059402240.000, df[0] = 912152788992.000, dc_hat[0] = 540102557696.000 + Gradient do_[0] = 77135171551232.000 +Backward Time Step 3: + Gradient di[0] = 1951781748736.000, df[0] = 1408494469120.000, dc_hat[0] = 764089139200.000 + Gradient do_[0] = 103247331196928.000 +Backward Time Step 2: + Gradient di[0] = 2483507691520.000, df[0] = 1783762780160.000, dc_hat[0] = 1268915634176.000 + Gradient do_[0] = 112515006595072.000 +Backward Time Step 1: + Gradient di[0] = 3114662363136.000, df[0] = 2147784458240.000, dc_hat[0] = 1732386095104.000 + Gradient do_[0] = 99151232630784.000 +Backward Time Step 0: + Gradient di[0] = 3771054424064.000, df[0] = 2683083423744.000, dc_hat[0] = 3538520899584.000 + Gradient do_[0] = 58300158705664.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2315564777209856.000, df[0] = -1817930807377920.000, dc_hat[0] = -1046632191754240.000 + Gradient do_[0] = -135367834342522880.000 +Backward Time Step 3: + Gradient di[0] = -3635759340847104.000, df[0] = -2775601140203520.000, dc_hat[0] = -1502303190253568.000 + Gradient do_[0] = -185080792303534080.000 +Backward Time Step 2: + Gradient di[0] = -4762878749966336.000, df[0] = -3588682472751104.000, dc_hat[0] = -2624615591444480.000 + Gradient do_[0] = -211763139190980608.000 +Backward Time Step 1: + Gradient di[0] = -6009234772721664.000, df[0] = -4299411244974080.000, dc_hat[0] = -3637517593083904.000 + Gradient do_[0] = -190009868571115520.000 +Backward Time Step 0: + Gradient di[0] = -7016943085158400.000, df[0] = -5128686483275776.000, dc_hat[0] = -7093896014200832.000 + Gradient do_[0] = -108860455832780800.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1244627861504.000, df[0] = 912569532416.000, dc_hat[0] = 540349333504.000 + Gradient do_[0] = 77170420482048.000 +Backward Time Step 3: + Gradient di[0] = 1952674086912.000, df[0] = 1409138425856.000, dc_hat[0] = 764438511616.000 + Gradient do_[0] = 103294542282752.000 +Backward Time Step 2: + Gradient di[0] = 2484643561472.000, df[0] = 1784578572288.000, dc_hat[0] = 1269494710272.000 + Gradient do_[0] = 112566445539328.000 +Backward Time Step 1: + Gradient di[0] = 3116086329344.000, df[0] = 2148766187520.000, dc_hat[0] = 1733175934976.000 + Gradient do_[0] = 99196531113984.000 +Backward Time Step 0: + Gradient di[0] = 3772779069440.000, df[0] = 2684310519808.000, dc_hat[0] = 3540139376640.000 + Gradient do_[0] = 58326826090496.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2316174125694976.000, df[0] = -1818409427795968.000, dc_hat[0] = -1046907673640960.000 + Gradient do_[0] = -135403448211341312.000 +Backward Time Step 3: + Gradient di[0] = -3636720608215040.000, df[0] = -2776334774304768.000, dc_hat[0] = -1502698729897984.000 + Gradient do_[0] = -185129669031362560.000 +Backward Time Step 2: + Gradient di[0] = -4764135564771328.000, df[0] = -3589629781475328.000, dc_hat[0] = -2625306544308224.000 + Gradient do_[0] = -211818939406090240.000 +Backward Time Step 1: + Gradient di[0] = -6010818541912064.000, df[0] = -4300544311033856.000, dc_hat[0] = -3638472954871808.000 + Gradient do_[0] = -190059896350179328.000 +Backward Time Step 0: + Gradient di[0] = -7018787773612032.000, df[0] = -5130034566135808.000, dc_hat[0] = -7095760030007296.000 + Gradient do_[0] = -108889086084775936.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1244958818304.000, df[0] = 912812212224.000, dc_hat[0] = 540492988416.000 + Gradient do_[0] = 77190939017216.000 +Backward Time Step 3: + Gradient di[0] = 1953192083456.000, df[0] = 1409512505344.000, dc_hat[0] = 764640821248.000 + Gradient do_[0] = 103321922699264.000 +Backward Time Step 2: + Gradient di[0] = 2485299445760.000, df[0] = 1785049513984.000, dc_hat[0] = 1269829337088.000 + Gradient do_[0] = 112596141211648.000 +Backward Time Step 1: + Gradient di[0] = 3116908150784.000, df[0] = 2149332811776.000, dc_hat[0] = 1733631279104.000 + Gradient do_[0] = 99222661627904.000 +Backward Time Step 0: + Gradient di[0] = 3773774430208.000, df[0] = 2685018570752.000, dc_hat[0] = 3541073133568.000 + Gradient do_[0] = 58342206603264.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2316754751586304.000, df[0] = -1818864962764800.000, dc_hat[0] = -1047170002190336.000 + Gradient do_[0] = -135437378452979712.000 +Backward Time Step 3: + Gradient di[0] = -3637625772572672.000, df[0] = -2777026532474880.000, dc_hat[0] = -1503072928923648.000 + Gradient do_[0] = -185175728260644864.000 +Backward Time Step 2: + Gradient di[0] = -4765326881325056.000, df[0] = -3590527161204736.000, dc_hat[0] = -2625962600562688.000 + Gradient do_[0] = -211871784683700224.000 +Backward Time Step 1: + Gradient di[0] = -6012331981012992.000, df[0] = -4301626374356992.000, dc_hat[0] = -3639384293244928.000 + Gradient do_[0] = -190107622026772480.000 +Backward Time Step 0: + Gradient di[0] = -7020550857687040.000, df[0] = -5131323056324608.000, dc_hat[0] = -7097542978306048.000 + Gradient do_[0] = -108916427846582272.000 +Epoch 700, Train Loss=0.011329, Weight Norm=12.909179 +Sample Predictions at Epoch 700: +------------------------------------------------------------- +| Day | Date | Predicted Close | Actual Close | Error | +------------------------------------------------------------- +| 192 | 2024-10-11 | 57.34 | 63.87 | 6.53 | +| 193 | 2024-10-14 | 56.73 | 66.55 | 9.82 | +| 194 | 2024-10-15 | 56.92 | 66.00 | 9.08 | +| 195 | 2024-10-16 | 57.88 | 67.20 | 9.32 | +| 196 | 2024-10-17 | 57.41 | 66.76 | 9.35 | +------------------------------------------------------------- +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1245248356352.000, df[0] = 913024548864.000, dc_hat[0] = 540618391552.000 + Gradient do_[0] = 77208873861120.000 +Backward Time Step 3: + Gradient di[0] = 1953647689728.000, df[0] = 1409841102848.000, dc_hat[0] = 764819079168.000 + Gradient do_[0] = 103345930895360.000 +Backward Time Step 2: + Gradient di[0] = 2485879570432.000, df[0] = 1785466322944.000, dc_hat[0] = 1270125166592.000 + Gradient do_[0] = 112622372388864.000 +Backward Time Step 1: + Gradient di[0] = 3117638483968.000, df[0] = 2149835997184.000, dc_hat[0] = 1734034718720.000 + Gradient do_[0] = 99245881294848.000 +Backward Time Step 0: + Gradient di[0] = 3774652350464.000, df[0] = 2685643522048.000, dc_hat[0] = 3541897052160.000 + Gradient do_[0] = 58355783565312.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2317350678298624.000, df[0] = -1819332979982336.000, dc_hat[0] = -1047439377170432.000 + Gradient do_[0] = -135472219227684864.000 +Backward Time Step 3: + Gradient di[0] = -3638562343878656.000, df[0] = -2777740839223296.000, dc_hat[0] = -1503458804891648.000 + Gradient do_[0] = -185223419577499648.000 +Backward Time Step 2: + Gradient di[0] = -4766555241971712.000, df[0] = -3591452995092480.000, dc_hat[0] = -2626639057911808.000 + Gradient do_[0] = -211926433847574528.000 +Backward Time Step 1: + Gradient di[0] = -6013879779852288.000, df[0] = -4302734207483904.000, dc_hat[0] = -3640314958970880.000 + Gradient do_[0] = -190156515934470144.000 +Backward Time Step 0: + Gradient di[0] = -7022352059596800.000, df[0] = -5132640000671744.000, dc_hat[0] = -7099364044439552.000 + Gradient do_[0] = -108944379493744640.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1245425958912.000, df[0] = 913154637824.000, dc_hat[0] = 540695494656.000 + Gradient do_[0] = 77219888103424.000 +Backward Time Step 3: + Gradient di[0] = 1953926086656.000, df[0] = 1410041905152.000, dc_hat[0] = 764927606784.000 + Gradient do_[0] = 103360644513792.000 +Backward Time Step 2: + Gradient di[0] = 2486234251264.000, df[0] = 1785720733696.000, dc_hat[0] = 1270305128448.000 + Gradient do_[0] = 112638386241536.000 +Backward Time Step 1: + Gradient di[0] = 3118080458752.000, df[0] = 2150140870656.000, dc_hat[0] = 1734278250496.000 + Gradient do_[0] = 99259898658816.000 +Backward Time Step 0: + Gradient di[0] = 3775184240640.000, df[0] = 2686021795840.000, dc_hat[0] = 3542396174336.000 + Gradient do_[0] = 58364004401152.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2317926740787200.000, df[0] = -1819785025290240.000, dc_hat[0] = -1047699222691840.000 + Gradient do_[0] = -135505891771285504.000 +Backward Time Step 3: + Gradient di[0] = -3639468045107200.000, df[0] = -2778432060522496.000, dc_hat[0] = -1503833003917312.000 + Gradient do_[0] = -185269478806781952.000 +Backward Time Step 2: + Gradient di[0] = -4767739579203584.000, df[0] = -3592345006112768.000, dc_hat[0] = -2627290282328064.000 + Gradient do_[0] = -211979038607015936.000 +Backward Time Step 1: + Gradient di[0] = -6015372280987648.000, df[0] = -4303800701550592.000, dc_hat[0] = -3641214754619392.000 + Gradient do_[0] = -190203571596165120.000 +Backward Time Step 0: + Gradient di[0] = -7024085078900736.000, df[0] = -5133905942282240.000, dc_hat[0] = -7101115317354496.000 + Gradient do_[0] = -108971248809148416.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1245969121280.000, df[0] = 913552769024.000, dc_hat[0] = 540930998272.000 + Gradient do_[0] = 77253551587328.000 +Backward Time Step 3: + Gradient di[0] = 1954779496448.000, df[0] = 1410657943552.000, dc_hat[0] = 765261709312.000 + Gradient do_[0] = 103405800390656.000 +Backward Time Step 2: + Gradient di[0] = 2487320576000.000, df[0] = 1786500874240.000, dc_hat[0] = 1270858907648.000 + Gradient do_[0] = 112687585427456.000 +Backward Time Step 1: + Gradient di[0] = 3119444918272.000, df[0] = 2151081312256.000, dc_hat[0] = 1735034667008.000 + Gradient do_[0] = 99303309705216.000 +Backward Time Step 0: + Gradient di[0] = 3776837320704.000, df[0] = 2687198035968.000, dc_hat[0] = 3543947018240.000 + Gradient do_[0] = 58389560295424.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2318525351854080.000, df[0] = -1820255189991424.000, dc_hat[0] = -1047969671413760.000 + Gradient do_[0] = -135540809855401984.000 +Backward Time Step 3: + Gradient di[0] = -3640404884848640.000, df[0] = -2779146904141824.000, dc_hat[0] = -1504218343014400.000 + Gradient do_[0] = -185317101404160000.000 +Backward Time Step 2: + Gradient di[0] = -4768965255495680.000, df[0] = -3593269497823232.000, dc_hat[0] = -2627964323758080.000 + Gradient do_[0] = -212033430072852480.000 +Backward Time Step 1: + Gradient di[0] = -6016917932343296.000, df[0] = -4304907192500224.000, dc_hat[0] = -3642146494087168.000 + Gradient do_[0] = -190252448323993600.000 +Backward Time Step 0: + Gradient di[0] = -7025894333874176.000, df[0] = -5135228792209408.000, dc_hat[0] = -7102944436551680.000 + Gradient do_[0] = -108999329305329664.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1246337957888.000, df[0] = 913823367168.000, dc_hat[0] = 541091463168.000 + Gradient do_[0] = 77276435709952.000 +Backward Time Step 3: + Gradient di[0] = 1955357392896.000, df[0] = 1411074883584.000, dc_hat[0] = 765487480832.000 + Gradient do_[0] = 103436360089600.000 +Backward Time Step 2: + Gradient di[0] = 2488056676352.000, df[0] = 1787029487616.000, dc_hat[0] = 1271234428928.000 + Gradient do_[0] = 112720913367040.000 +Backward Time Step 1: + Gradient di[0] = 3120369238016.000, df[0] = 2151718977536.000, dc_hat[0] = 1735547158528.000 + Gradient do_[0] = 99332745330688.000 +Backward Time Step 0: + Gradient di[0] = 3777956937728.000, df[0] = 2687994691584.000, dc_hat[0] = 3544997691392.000 + Gradient do_[0] = 58406870188032.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2319100877471744.000, df[0] = -1820707101081600.000, dc_hat[0] = -1048229852479488.000 + Gradient do_[0] = -135574533938610176.000 +Backward Time Step 3: + Gradient di[0] = -3641313270431744.000, df[0] = -2779840541360128.000, dc_hat[0] = -1504593749999616.000 + Gradient do_[0] = -185363383971741696.000 +Backward Time Step 2: + Gradient di[0] = -4770150129598464.000, df[0] = -3594160971972608.000, dc_hat[0] = -2628615279738880.000 + Gradient do_[0] = -212086155091378176.000 +Backward Time Step 1: + Gradient di[0] = -6018415265316864.000, df[0] = -4305978249969664.000, dc_hat[0] = -3643047095042048.000 + Gradient do_[0] = -190299744503857152.000 +Backward Time Step 0: + Gradient di[0] = -7027645069918208.000, df[0] = -5136508692463616.000, dc_hat[0] = -7104715036819456.000 + Gradient do_[0] = -109026490678509568.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1246638374912.000, df[0] = 914043502592.000, dc_hat[0] = 541221191680.000 + Gradient do_[0] = 77295058419712.000 +Backward Time Step 3: + Gradient di[0] = 1955828727808.000, df[0] = 1411414884352.000, dc_hat[0] = 765671636992.000 + Gradient do_[0] = 103461282643968.000 +Backward Time Step 2: + Gradient di[0] = 2488653578240.000, df[0] = 1787458486272.000, dc_hat[0] = 1271538778112.000 + Gradient do_[0] = 112747949850624.000 +Backward Time Step 1: + Gradient di[0] = 3121113989120.000, df[0] = 2152232255488.000, dc_hat[0] = 1735959642112.000 + Gradient do_[0] = 99356376039424.000 +Backward Time Step 0: + Gradient di[0] = 3778858188800.000, df[0] = 2688635895808.000, dc_hat[0] = 3545843630080.000 + Gradient do_[0] = 58420812054528.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2319693851394048.000, df[0] = -1821172031291392.000, dc_hat[0] = -1048497012867072.000 + Gradient do_[0] = -135609185734754304.000 +Backward Time Step 3: + Gradient di[0] = -3642244741464064.000, df[0] = -2780551895318528.000, dc_hat[0] = -1504978418008064.000 + Gradient do_[0] = -185410800410689536.000 +Backward Time Step 2: + Gradient di[0] = -4771370974052352.000, df[0] = -3595080094973952.000, dc_hat[0] = -2629283952459776.000 + Gradient do_[0] = -212140374758522880.000 +Backward Time Step 1: + Gradient di[0] = -6019960916672512.000, df[0] = -4307083667177472.000, dc_hat[0] = -3643979371380736.000 + Gradient do_[0] = -190348552512208896.000 +Backward Time Step 0: + Gradient di[0] = -7029441439989760.000, df[0] = -5137821341843456.000, dc_hat[0] = -7106530734243840.000 + Gradient do_[0] = -109054365016260608.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1247095816192.000, df[0] = 914378981376.000, dc_hat[0] = 541419864064.000 + Gradient do_[0] = 77323369971712.000 +Backward Time Step 3: + Gradient di[0] = 1956545953792.000, df[0] = 1411932356608.000, dc_hat[0] = 765952000000.000 + Gradient do_[0] = 103499215929344.000 +Backward Time Step 2: + Gradient di[0] = 2489570820096.000, df[0] = 1788117516288.000, dc_hat[0] = 1272007229440.000 + Gradient do_[0] = 112789448294400.000 +Backward Time Step 1: + Gradient di[0] = 3122265325568.000, df[0] = 2153026551808.000, dc_hat[0] = 1736597045248.000 + Gradient do_[0] = 99392983924736.000 +Backward Time Step 0: + Gradient di[0] = 3780242046976.000, df[0] = 2689620508672.000, dc_hat[0] = 3547142029312.000 + Gradient do_[0] = 58442198810624.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2320294341509120.000, df[0] = -1821643672387584.000, dc_hat[0] = -1048768468221952.000 + Gradient do_[0] = -135644172538347520.000 +Backward Time Step 3: + Gradient di[0] = -3643181044334592.000, df[0] = -2781266738937856.000, dc_hat[0] = -1505364830846976.000 + Gradient do_[0] = -185458371468460032.000 +Backward Time Step 2: + Gradient di[0] = -4772603629666304.000, df[0] = -3596009686958080.000, dc_hat[0] = -2629962288857088.000 + Gradient do_[0] = -212195127001612288.000 +Backward Time Step 1: + Gradient di[0] = -6021516768575488.000, df[0] = -4308196600578048.000, dc_hat[0] = -3644915674251264.000 + Gradient do_[0] = -190397686938075136.000 +Backward Time Step 0: + Gradient di[0] = -7031257674285056.000, df[0] = -5139149023608832.000, dc_hat[0] = -7108367369633792.000 + Gradient do_[0] = -109082540001722368.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1247439093760.000, df[0] = 914630705152.000, dc_hat[0] = 541568860160.000 + Gradient do_[0] = 77344609927168.000 +Backward Time Step 3: + Gradient di[0] = 1957084135424.000, df[0] = 1412320854016.000, dc_hat[0] = 766162632704.000 + Gradient do_[0] = 103527628144640.000 +Backward Time Step 2: + Gradient di[0] = 2490253705216.000, df[0] = 1788607725568.000, dc_hat[0] = 1272355094528.000 + Gradient do_[0] = 112820326760448.000 +Backward Time Step 1: + Gradient di[0] = 3123122274304.000, df[0] = 2153616900096.000, dc_hat[0] = 1737070608384.000 + Gradient do_[0] = 99420230123520.000 +Backward Time Step 0: + Gradient di[0] = 3781281710080.000, df[0] = 2690360279040.000, dc_hat[0] = 3548117729280.000 + Gradient do_[0] = 58458275577856.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2320867988078592.000, df[0] = -1822093838647296.000, dc_hat[0] = -1049027105783808.000 + Gradient do_[0] = -135677699053060096.000 +Backward Time Step 3: + Gradient di[0] = -3644087282434048.000, df[0] = -2781958497107968.000, dc_hat[0] = -1505738493001728.000 + Gradient do_[0] = -185504447877611520.000 +Backward Time Step 2: + Gradient di[0] = -4773786893156352.000, df[0] = -3596900087365632.000, dc_hat[0] = -2630611634225152.000 + Gradient do_[0] = -212247645861707776.000 +Backward Time Step 1: + Gradient di[0] = -6023001753518080.000, df[0] = -4309258799677440.000, dc_hat[0] = -3645810638061568.000 + Gradient do_[0] = -190444519261470720.000 +Backward Time Step 0: + Gradient di[0] = -7032982103654400.000, df[0] = -5140409059639296.000, dc_hat[0] = -7110110052614144.000 + Gradient do_[0] = -109109280468107264.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1247745015808.000, df[0] = 914854838272.000, dc_hat[0] = 541701144576.000 + Gradient do_[0] = 77363576569856.000 +Backward Time Step 3: + Gradient di[0] = 1957562417152.000, df[0] = 1412666228736.000, dc_hat[0] = 766349475840.000 + Gradient do_[0] = 103552944963584.000 +Backward Time Step 2: + Gradient di[0] = 2490863452160.000, df[0] = 1789045506048.000, dc_hat[0] = 1272665735168.000 + Gradient do_[0] = 112847958835200.000 +Backward Time Step 1: + Gradient di[0] = 3123889045504.000, df[0] = 2154145775616.000, dc_hat[0] = 1737495543808.000 + Gradient do_[0] = 99444615806976.000 +Backward Time Step 0: + Gradient di[0] = 3782209437696.000, df[0] = 2691020357632.000, dc_hat[0] = 3548988571648.000 + Gradient do_[0] = 58472615903232.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2321446466486272.000, df[0] = -1822548165656576.000, dc_hat[0] = -1049288293482496.000 + Gradient do_[0] = -135711509035614208.000 +Backward Time Step 3: + Gradient di[0] = -3644993520533504.000, df[0] = -2782650792148992.000, dc_hat[0] = -1506112020938752.000 + Gradient do_[0] = -185550627365978112.000 +Backward Time Step 2: + Gradient di[0] = -4774979820322816.000, df[0] = -3597799883014144.000, dc_hat[0] = -2631269301092352.000 + Gradient do_[0] = -212300680117878784.000 +Backward Time Step 1: + Gradient di[0] = -6024510360780800.000, df[0] = -4310338715516928.000, dc_hat[0] = -3646720097386496.000 + Gradient do_[0] = -190492176218587136.000 +Backward Time Step 0: + Gradient di[0] = -7034742503374848.000, df[0] = -5141695402344448.000, dc_hat[0] = -7111889779687424.000 + Gradient do_[0] = -109136587870175232.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1248160776192.000, df[0] = 915159711744.000, dc_hat[0] = 541881958400.000 + Gradient do_[0] = 77389346373632.000 +Backward Time Step 3: + Gradient di[0] = 1958213844992.000, df[0] = 1413135859712.000, dc_hat[0] = 766604476416.000 + Gradient do_[0] = 103587396976640.000 +Backward Time Step 2: + Gradient di[0] = 2491691040768.000, df[0] = 1789640048640.000, dc_hat[0] = 1273088049152.000 + Gradient do_[0] = 112885455912960.000 +Backward Time Step 1: + Gradient di[0] = 3124929757184.000, df[0] = 2154863394816.000, dc_hat[0] = 1738072391680.000 + Gradient do_[0] = 99477692088320.000 +Backward Time Step 0: + Gradient di[0] = 3783464583168.000, df[0] = 2691913220096.000, dc_hat[0] = 3550166122496.000 + Gradient do_[0] = 58492022947840.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2322032461086720.000, df[0] = -1823008129810432.000, dc_hat[0] = -1049553440604160.000 + Gradient do_[0] = -135745782874636288.000 +Backward Time Step 3: + Gradient di[0] = -3645915596324864.000, df[0] = -2783354361479168.000, dc_hat[0] = -1506492930850816.000 + Gradient do_[0] = -185597425329635328.000 +Backward Time Step 2: + Gradient di[0] = -4776179726811136.000, df[0] = -3598703168323584.000, dc_hat[0] = -2631928847007744.000 + Gradient do_[0] = -212353851813003264.000 +Backward Time Step 1: + Gradient di[0] = -6026023799881728.000, df[0] = -4311419973533696.000, dc_hat[0] = -3647628751405056.000 + Gradient do_[0] = -190539936254918656.000 +Backward Time Step 0: + Gradient di[0] = -7036515251126272.000, df[0] = -5142991945596928.000, dc_hat[0] = -7113682391662592.000 + Gradient do_[0] = -109164110020608000.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1248509034496.000, df[0] = 915415236608.000, dc_hat[0] = 542033149952.000 + Gradient do_[0] = 77410938650624.000 +Backward Time Step 3: + Gradient di[0] = 1958763429888.000, df[0] = 1413532745728.000, dc_hat[0] = 766818975744.000 + Gradient do_[0] = 103616438337536.000 +Backward Time Step 2: + Gradient di[0] = 2492389130240.000, df[0] = 1790141267968.000, dc_hat[0] = 1273443909632.000 + Gradient do_[0] = 112917039022080.000 +Backward Time Step 1: + Gradient di[0] = 3125802172416.000, df[0] = 2155464753152.000, dc_hat[0] = 1738554605568.000 + Gradient do_[0] = 99505449992192.000 +Backward Time Step 0: + Gradient di[0] = 3784523382784.000, df[0] = 2692666621952.000, dc_hat[0] = 3551159386112.000 + Gradient do_[0] = 58508384927744.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2322611744800768.000, df[0] = -1823462993690624.000, dc_hat[0] = -1049814963847168.000 + Gradient do_[0] = -135779644396797952.000 +Backward Time Step 3: + Gradient di[0] = -3646827203133440.000, df[0] = -2784050146181120.000, dc_hat[0] = -1506868606271488.000 + Gradient do_[0] = -185643845336170496.000 +Backward Time Step 2: + Gradient di[0] = -4777378022686720.000, df[0] = -3599606722068480.000, dc_hat[0] = -2632587319181312.000 + Gradient do_[0] = -212407160947081216.000 +Backward Time Step 1: + Gradient di[0] = -6027541533949952.000, df[0] = -4312506063388672.000, dc_hat[0] = -3648544384745472.000 + Gradient do_[0] = -190587868089942016.000 +Backward Time Step 0: + Gradient di[0] = -7038277798330368.000, df[0] = -5144279898914816.000, dc_hat[0] = -7115464266219520.000 + Gradient do_[0] = -109191451782414336.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1248919814144.000, df[0] = 915716177920.000, dc_hat[0] = 542211211264.000 + Gradient do_[0] = 77436389687296.000 +Backward Time Step 3: + Gradient di[0] = 1959406338048.000, df[0] = 1413996609536.000, dc_hat[0] = 767070437376.000 + Gradient do_[0] = 103650445754368.000 +Backward Time Step 2: + Gradient di[0] = 2493207281664.000, df[0] = 1790729125888.000, dc_hat[0] = 1273861373952.000 + Gradient do_[0] = 112954099892224.000 +Backward Time Step 1: + Gradient di[0] = 3126828204032.000, df[0] = 2156171886592.000, dc_hat[0] = 1739123589120.000 + Gradient do_[0] = 99538090065920.000 +Backward Time Step 0: + Gradient di[0] = 3785764634624.000, df[0] = 2693549785088.000, dc_hat[0] = 3552324091904.000 + Gradient do_[0] = 58527578062848.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2323197470965760.000, df[0] = -1823922823626752.000, dc_hat[0] = -1050079641206784.000 + Gradient do_[0] = -135813866696212480.000 +Backward Time Step 3: + Gradient di[0] = -3647742568038400.000, df[0] = -2784749152108544.000, dc_hat[0] = -1507246294958080.000 + Gradient do_[0] = -185690419961528320.000 +Backward Time Step 2: + Gradient di[0] = -4778580613529600.000, df[0] = -3600512423297024.000, dc_hat[0] = -2633248207273984.000 + Gradient do_[0] = -212460590340243456.000 +Backward Time Step 1: + Gradient di[0] = -6029061952372736.000, df[0] = -4313593495420928.000, dc_hat[0] = -3649461360263168.000 + Gradient do_[0] = -190635920184049664.000 +Backward Time Step 0: + Gradient di[0] = -7040045177372672.000, df[0] = -5145571610329088.000, dc_hat[0] = -7117250435743744.000 + Gradient do_[0] = -109218862263697408.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1249243955200.000, df[0] = 915953876992.000, dc_hat[0] = 542351654912.000 + Gradient do_[0] = 77456497180672.000 +Backward Time Step 3: + Gradient di[0] = 1959914635264.000, df[0] = 1414363086848.000, dc_hat[0] = 767269339136.000 + Gradient do_[0] = 103677331243008.000 +Backward Time Step 2: + Gradient di[0] = 2493856612352.000, df[0] = 1791195086848.000, dc_hat[0] = 1274191806464.000 + Gradient do_[0] = 112983451631616.000 +Backward Time Step 1: + Gradient di[0] = 3127640064000.000, df[0] = 2156731957248.000, dc_hat[0] = 1739573297152.000 + Gradient do_[0] = 99563851481088.000 +Backward Time Step 0: + Gradient di[0] = 3786744791040.000, df[0] = 2694247088128.000, dc_hat[0] = 3553243955200.000 + Gradient do_[0] = 58542727888896.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2323785076178944.000, df[0] = -1824383861522432.000, dc_hat[0] = -1050344654110720.000 + Gradient do_[0] = -135848209254711296.000 +Backward Time Step 3: + Gradient di[0] = -3648667328184320.000, df[0] = -2785454868922368.000, dc_hat[0] = -1507627607523328.000 + Gradient do_[0] = -185737441263484928.000 +Backward Time Step 2: + Gradient di[0] = -4779795015532544.000, df[0] = -3601427251331072.000, dc_hat[0] = -2633916343123968.000 + Gradient do_[0] = -212514432050266112.000 +Backward Time Step 1: + Gradient di[0] = -6030589886988288.000, df[0] = -4314686564597760.000, dc_hat[0] = -3650381825441792.000 + Gradient do_[0] = -190684178436587520.000 +Backward Time Step 0: + Gradient di[0] = -7041830273155072.000, df[0] = -5146876743516160.000, dc_hat[0] = -7119055395749888.000 + Gradient do_[0] = -109246556212822016.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1249571504128.000, df[0] = 916194197504.000, dc_hat[0] = 542494130176.000 + Gradient do_[0] = 77476831166464.000 +Backward Time Step 3: + Gradient di[0] = 1960431976448.000, df[0] = 1414736510976.000, dc_hat[0] = 767471386624.000 + Gradient do_[0] = 103704644550656.000 +Backward Time Step 2: + Gradient di[0] = 2494512496640.000, df[0] = 1791666421760.000, dc_hat[0] = 1274526826496.000 + Gradient do_[0] = 113013180858368.000 +Backward Time Step 1: + Gradient di[0] = 3128469487616.000, df[0] = 2157303431168.000, dc_hat[0] = 1740031393792.000 + Gradient do_[0] = 99590216876032.000 +Backward Time Step 0: + Gradient di[0] = 3787744083968.000, df[0] = 2694958022656.000, dc_hat[0] = 3554181644288.000 + Gradient do_[0] = 58558179704832.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2324363286151168.000, df[0] = -1824837920096256.000, dc_hat[0] = -1050605908918272.000 + Gradient do_[0] = -135881967697657856.000 +Backward Time Step 3: + Gradient di[0] = -3649575445331968.000, df[0] = -2786148506140672.000, dc_hat[0] = -1508002880290816.000 + Gradient do_[0] = -185783689471328256.000 +Backward Time Step 2: + Gradient di[0] = -4780974520926208.000, df[0] = -3602316041125888.000, dc_hat[0] = -2634563809443840.000 + Gradient do_[0] = -212566882190884864.000 +Backward Time Step 1: + Gradient di[0] = -6032079703769088.000, df[0] = -4315752521793536.000, dc_hat[0] = -3651278131429376.000 + Gradient do_[0] = -190731131019067392.000 +Backward Time Step 0: + Gradient di[0] = -7043569198039040.000, df[0] = -5148147516964864.000, dc_hat[0] = -7120813647986688.000 + Gradient do_[0] = -109273537197375488.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1249989623808.000, df[0] = 916500709376.000, dc_hat[0] = 542675566592.000 + Gradient do_[0] = 77502743576576.000 +Backward Time Step 3: + Gradient di[0] = 1961083666432.000, df[0] = 1415206666240.000, dc_hat[0] = 767726583808.000 + Gradient do_[0] = 103739113340928.000 +Backward Time Step 2: + Gradient di[0] = 2495344541696.000, df[0] = 1792263585792.000, dc_hat[0] = 1274950713344.000 + Gradient do_[0] = 113050778599424.000 +Backward Time Step 1: + Gradient di[0] = 3129509675008.000, df[0] = 2158020788224.000, dc_hat[0] = 1740608372736.000 + Gradient do_[0] = 99623301545984.000 +Backward Time Step 0: + Gradient di[0] = 3789001064448.000, df[0] = 2695852457984.000, dc_hat[0] = 3555361030144.000 + Gradient do_[0] = 58577616109568.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2324933979930624.000, df[0] = -1825286207307776.000, dc_hat[0] = -1050863338520576.000 + Gradient do_[0] = -135915339593547776.000 +Backward Time Step 3: + Gradient di[0] = -3650467993223168.000, df[0] = -2786829526892544.000, dc_hat[0] = -1508370502647808.000 + Gradient do_[0] = -185829147405189120.000 +Backward Time Step 2: + Gradient di[0] = -4782149731352576.000, df[0] = -3603201878130688.000, dc_hat[0] = -2635210470457344.000 + Gradient do_[0] = -212619074633465856.000 +Backward Time Step 1: + Gradient di[0] = -6033562004357120.000, df[0] = -4316813110280192.000, dc_hat[0] = -3652172558368768.000 + Gradient do_[0] = -190777980522332160.000 +Backward Time Step 0: + Gradient di[0] = -7045301680472064.000, df[0] = -5149413458575360.000, dc_hat[0] = -7122564920901632.000 + Gradient do_[0] = -109300415102713856.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1250321891328.000, df[0] = 916744306688.000, dc_hat[0] = 542819844096.000 + Gradient do_[0] = 77523329220608.000 +Backward Time Step 3: + Gradient di[0] = 1961605595136.000, df[0] = 1415583629312.000, dc_hat[0] = 767930793984.000 + Gradient do_[0] = 103766669918208.000 +Backward Time Step 2: + Gradient di[0] = 2496007241728.000, df[0] = 1792739770368.000, dc_hat[0] = 1275288485888.000 + Gradient do_[0] = 113080751095808.000 +Backward Time Step 1: + Gradient di[0] = 3130343292928.000, df[0] = 2158595538944.000, dc_hat[0] = 1741070139392.000 + Gradient do_[0] = 99649784381440.000 +Backward Time Step 0: + Gradient di[0] = 3790010056704.000, df[0] = 2696570470400.000, dc_hat[0] = 3556307894272.000 + Gradient do_[0] = 58593218920448.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2325530711949312.000, df[0] = -1825754627178496.000, dc_hat[0] = -1051133585915904.000 + Gradient do_[0] = -135950120238710784.000 +Backward Time Step 3: + Gradient di[0] = -3651399464255488.000, df[0] = -2787540880850944.000, dc_hat[0] = -1508755573309440.000 + Gradient do_[0] = -185876512304529408.000 +Backward Time Step 2: + Gradient di[0] = -4783373260161024.000, df[0] = -3604123417051136.000, dc_hat[0] = -2635882901274624.000 + Gradient do_[0] = -212673431739564032.000 +Backward Time Step 1: + Gradient di[0] = -6035103897616384.000, df[0] = -4317915574697984.000, dc_hat[0] = -3653100002869248.000 + Gradient do_[0] = -190826685451468800.000 +Backward Time Step 0: + Gradient di[0] = -7047099661156352.000, df[0] = -5150727718567936.000, dc_hat[0] = -7124382765809664.000 + Gradient do_[0] = -109328306620334080.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1250656649216.000, df[0] = 916989804544.000, dc_hat[0] = 542965202944.000 + Gradient do_[0] = 77544099414016.000 +Backward Time Step 3: + Gradient di[0] = 1962131718144.000, df[0] = 1415963082752.000, dc_hat[0] = 768136314880.000 + Gradient do_[0] = 103794511708160.000 +Backward Time Step 2: + Gradient di[0] = 2496676495360.000, df[0] = 1793220411392.000, dc_hat[0] = 1275629666304.000 + Gradient do_[0] = 113111075913728.000 +Backward Time Step 1: + Gradient di[0] = 3131184775168.000, df[0] = 2159175401472.000, dc_hat[0] = 1741535576064.000 + Gradient do_[0] = 99676544040960.000 +Backward Time Step 0: + Gradient di[0] = 3791024553984.000, df[0] = 2697292414976.000, dc_hat[0] = 3557259739136.000 + Gradient do_[0] = 58608901423104.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2326087178649600.000, df[0] = -1826191237447680.000, dc_hat[0] = -1051385177047040.000 + Gradient do_[0] = -135982650321010688.000 +Backward Time Step 3: + Gradient di[0] = -3652279932551168.000, df[0] = -2788213311668224.000, dc_hat[0] = -1509118363828224.000 + Gradient do_[0] = -185921265863753728.000 +Backward Time Step 2: + Gradient di[0] = -4784525385138176.000, df[0] = -3604991537315840.000, dc_hat[0] = -2636515872079872.000 + Gradient do_[0] = -212724644929601536.000 +Backward Time Step 1: + Gradient di[0] = -6036554522820608.000, df[0] = -4318952809299968.000, dc_hat[0] = -3653974297149440.000 + Gradient do_[0] = -190872521342451712.000 +Backward Time Step 0: + Gradient di[0] = -7048777382756352.000, df[0] = -5151953931730944.000, dc_hat[0] = -7126078204149760.000 + Gradient do_[0] = -109354334122147840.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1250971877376.000, df[0] = 917220753408.000, dc_hat[0] = 543101321216.000 + Gradient do_[0] = 77563628093440.000 +Backward Time Step 3: + Gradient di[0] = 1962626383872.000, df[0] = 1416320122880.000, dc_hat[0] = 768329580544.000 + Gradient do_[0] = 103820667387904.000 +Backward Time Step 2: + Gradient di[0] = 2497304854528.000, df[0] = 1793671561216.000, dc_hat[0] = 1275950399488.000 + Gradient do_[0] = 113139504906240.000 +Backward Time Step 1: + Gradient di[0] = 3131970420736.000, df[0] = 2159717253120.000, dc_hat[0] = 1741970210816.000 + Gradient do_[0] = 99701508538368.000 +Backward Time Step 0: + Gradient di[0] = 3791977447424.000, df[0] = 2697970057216.000, dc_hat[0] = 3558153650176.000 + Gradient do_[0] = 58623627624448.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2326676394475520.000, df[0] = -1826653751738368.000, dc_hat[0] = -1051650861039616.000 + Gradient do_[0] = -136017078778855424.000 +Backward Time Step 3: + Gradient di[0] = -3653202276777984.000, df[0] = -2788916880998400.000, dc_hat[0] = -1509498065780736.000 + Gradient do_[0] = -185968184086495232.000 +Backward Time Step 2: + Gradient di[0] = -4785729049722880.000, df[0] = -3605898043850752.000, dc_hat[0] = -2637176223301632.000 + Gradient do_[0] = -212778125862371328.000 +Backward Time Step 1: + Gradient di[0] = -6038082457436160.000, df[0] = -4320046146912256.000, dc_hat[0] = -3654894493892608.000 + Gradient do_[0] = -190920693695643648.000 +Backward Time Step 0: + Gradient di[0] = -7050566236635136.000, df[0] = -5153261212401664.000, dc_hat[0] = -7127887459123200.000 + Gradient do_[0] = -109382088200814592.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1251348316160.000, df[0] = 917496659968.000, dc_hat[0] = 543264669696.000 + Gradient do_[0] = 77586973589504.000 +Backward Time Step 3: + Gradient di[0] = 1963214766080.000, df[0] = 1416744534016.000, dc_hat[0] = 768559480832.000 + Gradient do_[0] = 103851763957760.000 +Backward Time Step 2: + Gradient di[0] = 2498055110656.000, df[0] = 1794210267136.000, dc_hat[0] = 1276332212224.000 + Gradient do_[0] = 113173495545856.000 +Backward Time Step 1: + Gradient di[0] = 3132913090560.000, df[0] = 2160366977024.000, dc_hat[0] = 1742491746304.000 + Gradient do_[0] = 99731497811968.000 +Backward Time Step 0: + Gradient di[0] = 3793119084544.000, df[0] = 2698782441472.000, dc_hat[0] = 3559225032704.000 + Gradient do_[0] = 58641281449984.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2327254872883200.000, df[0] = -1827108078747648.000, dc_hat[0] = -1051911511867392.000 + Gradient do_[0] = -136050905941278720.000 +Backward Time Step 3: + Gradient di[0] = -3654114152022016.000, df[0] = -2789613202571264.000, dc_hat[0] = -1509874814943232.000 + Gradient do_[0] = -186014604093030400.000 +Backward Time Step 2: + Gradient di[0] = -4786927345598464.000, df[0] = -3606801060724736.000, dc_hat[0] = -2637835769217024.000 + Gradient do_[0] = -212831314737364992.000 +Backward Time Step 1: + Gradient di[0] = -6039582474764288.000, df[0] = -4321119351865344.000, dc_hat[0] = -3655798316072960.000 + Gradient do_[0] = -190968110134591488.000 +Backward Time Step 0: + Gradient di[0] = -7052314825195520.000, df[0] = -5154539502043136.000, dc_hat[0] = -7129654301294592.000 + Gradient do_[0] = -109409215214256128.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1251858186240.000, df[0] = 917870608384.000, dc_hat[0] = 543486083072.000 + Gradient do_[0] = 77618590253056.000 +Backward Time Step 3: + Gradient di[0] = 1964016271360.000, df[0] = 1417323085824.000, dc_hat[0] = 768873267200.000 + Gradient do_[0] = 103894168371200.000 +Backward Time Step 2: + Gradient di[0] = 2499078520832.000, df[0] = 1794945318912.000, dc_hat[0] = 1276854009856.000 + Gradient do_[0] = 113219825827840.000 +Backward Time Step 1: + Gradient di[0] = 3134193664000.000, df[0] = 2161250271232.000, dc_hat[0] = 1743202418688.000 + Gradient do_[0] = 99772232892416.000 +Backward Time Step 0: + Gradient di[0] = 3794661277696.000, df[0] = 2699879776256.000, dc_hat[0] = 3560672329728.000 + Gradient do_[0] = 58665126068224.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2327817245163520.000, df[0] = -1827549520855040.000, dc_hat[0] = -1052165653135360.000 + Gradient do_[0] = -136083745261223936.000 +Backward Time Step 3: + Gradient di[0] = -3654994888753152.000, df[0] = -2790285901824000.000, dc_hat[0] = -1510238679203840.000 + Gradient do_[0] = -186059443551600640.000 +Backward Time Step 2: + Gradient di[0] = -4788080007446528.000, df[0] = -3607668912553984.000, dc_hat[0] = -2638470350635008.000 + Gradient do_[0] = -212882545107271680.000 +Backward Time Step 1: + Gradient di[0] = -6041049742966784.000, df[0] = -4322168934498304.000, dc_hat[0] = -3656680931852288.000 + Gradient do_[0] = -191014461421649920.000 +Backward Time Step 0: + Gradient di[0] = -7054018853470208.000, df[0] = -5155785042558976.000, dc_hat[0] = -7131378193793024.000 + Gradient do_[0] = -109435655032930304.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1252142874624.000, df[0] = 918079340544.000, dc_hat[0] = 543609651200.000 + Gradient do_[0] = 77636181164032.000 +Backward Time Step 3: + Gradient di[0] = 1964464013312.000, df[0] = 1417646047232.000, dc_hat[0] = 769048051712.000 + Gradient do_[0] = 103917782302720.000 +Backward Time Step 2: + Gradient di[0] = 2499643703296.000, df[0] = 1795351248896.000, dc_hat[0] = 1277142237184.000 + Gradient do_[0] = 113245385916416.000 +Backward Time Step 1: + Gradient di[0] = 3134903549952.000, df[0] = 2161739431936.000, dc_hat[0] = 1743594717184.000 + Gradient do_[0] = 99794781470720.000 +Backward Time Step 0: + Gradient di[0] = 3795520585728.000, df[0] = 2700491096064.000, dc_hat[0] = 3561478684672.000 + Gradient do_[0] = 58678401040384.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2328404313505792.000, df[0] = -1828010290315264.000, dc_hat[0] = -1052430867365888.000 + Gradient do_[0] = -136117993330442240.000 +Backward Time Step 3: + Gradient di[0] = -3655916964544512.000, df[0] = -2790989739589632.000, dc_hat[0] = -1510618649591808.000 + Gradient do_[0] = -186106275874996224.000 +Backward Time Step 2: + Gradient di[0] = -4789284745773056.000, df[0] = -3608577029701632.000, dc_hat[0] = -2639131775598592.000 + Gradient do_[0] = -212936077579649024.000 +Backward Time Step 1: + Gradient di[0] = -6042569087647744.000, df[0] = -4323255024353280.000, dc_hat[0] = -3657597370499072.000 + Gradient do_[0] = -191062427616411648.000 +Backward Time Step 0: + Gradient di[0] = -7055797506801664.000, df[0] = -5157084807036928.000, dc_hat[0] = -7133176174477312.000 + Gradient do_[0] = -109463245902839808.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1252556668928.000, df[0] = 918382706688.000, dc_hat[0] = 543789252608.000 + Gradient do_[0] = 77661841915904.000 +Backward Time Step 3: + Gradient di[0] = 1965112688640.000, df[0] = 1418114498560.000, dc_hat[0] = 769302462464.000 + Gradient do_[0] = 103952091709440.000 +Backward Time Step 2: + Gradient di[0] = 2500469456896.000, df[0] = 1795944611840.000, dc_hat[0] = 1277563895808.000 + Gradient do_[0] = 113282782330880.000 +Backward Time Step 1: + Gradient di[0] = 3135940329472.000, df[0] = 2162454429696.000, dc_hat[0] = 1744169074688.000 + Gradient do_[0] = 99827757088768.000 +Backward Time Step 0: + Gradient di[0] = 3796777041920.000, df[0] = 2701384744960.000, dc_hat[0] = 3562657284096.000 + Gradient do_[0] = 58697824862208.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2328966417350656.000, df[0] = -1828451463987200.000, dc_hat[0] = -1052684471762944.000 + Gradient do_[0] = -136150918549733376.000 +Backward Time Step 3: + Gradient di[0] = -3656803606855680.000, df[0] = -2791666465374208.000, dc_hat[0] = -1510984527118336.000 + Gradient do_[0] = -186151458930950144.000 +Backward Time Step 2: + Gradient di[0] = -4790449755652096.000, df[0] = -3609454276771840.000, dc_hat[0] = -2639770383548416.000 + Gradient do_[0] = -212987857705369600.000 +Backward Time Step 1: + Gradient di[0] = -6044027765915648.000, df[0] = -4324299238277120.000, dc_hat[0] = -3658475422875648.000 + Gradient do_[0] = -191108469665824768.000 +Backward Time Step 0: + Gradient di[0] = -7057494018883584.000, df[0] = -5158324978843648.000, dc_hat[0] = -7134891477041152.000 + Gradient do_[0] = -109489574052364288.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1252880809984.000, df[0] = 918620340224.000, dc_hat[0] = 543929696256.000 + Gradient do_[0] = 77681932632064.000 +Backward Time Step 3: + Gradient di[0] = 1965623214080.000, df[0] = 1418482679808.000, dc_hat[0] = 769500905472.000 + Gradient do_[0] = 103979077861376.000 +Backward Time Step 2: + Gradient di[0] = 2501119049728.000, df[0] = 1796411228160.000, dc_hat[0] = 1277895901184.000 + Gradient do_[0] = 113312150847488.000 +Backward Time Step 1: + Gradient di[0] = 3136752975872.000, df[0] = 2163014631424.000, dc_hat[0] = 1744619175936.000 + Gradient do_[0] = 99853577224192.000 +Backward Time Step 0: + Gradient di[0] = 3797760606208.000, df[0] = 2702084931584.000, dc_hat[0] = 3563580293120.000 + Gradient do_[0] = 58713033408512.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2329550532902912.000, df[0] = -1828910220181504.000, dc_hat[0] = -1052948478033920.000 + Gradient do_[0] = -136185020590063616.000 +Backward Time Step 3: + Gradient di[0] = -3657721387679744.000, df[0] = -2792367081914368.000, dc_hat[0] = -1511362752675840.000 + Gradient do_[0] = -186198033556307968.000 +Backward Time Step 2: + Gradient di[0] = -4791652346494976.000, df[0] = -3610360514871296.000, dc_hat[0] = -2640432882253824.000 + Gradient do_[0] = -213041166839447552.000 +Backward Time Step 1: + Gradient di[0] = -6045546573725696.000, df[0] = -4325385596567552.000, dc_hat[0] = -3659390519345152.000 + Gradient do_[0] = -191156418680717312.000 +Backward Time Step 0: + Gradient di[0] = -7059261397925888.000, df[0] = -5159616690257920.000, dc_hat[0] = -7136677646565376.000 + Gradient do_[0] = -109516984533647360.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1253276254208.000, df[0] = 918910337024.000, dc_hat[0] = 544101400576.000 + Gradient do_[0] = 77706427367424.000 +Backward Time Step 3: + Gradient di[0] = 1966240956416.000, df[0] = 1418928324608.000, dc_hat[0] = 769742602240.000 + Gradient do_[0] = 104011734712320.000 +Backward Time Step 2: + Gradient di[0] = 2501905481728.000, df[0] = 1796975755264.000, dc_hat[0] = 1278296457216.000 + Gradient do_[0] = 113347752099840.000 +Backward Time Step 1: + Gradient di[0] = 3137739948032.000, df[0] = 2163695026176.000, dc_hat[0] = 1745166008320.000 + Gradient do_[0] = 99884984172544.000 +Backward Time Step 0: + Gradient di[0] = 3798954147840.000, df[0] = 2702934016000.000, dc_hat[0] = 3564700172288.000 + Gradient do_[0] = 58731488346112.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2330137869680640.000, df[0] = -1829371392294912.000, dc_hat[0] = -1053213960699904.000 + Gradient do_[0] = -136219388918366208.000 +Backward Time Step 3: + Gradient di[0] = -3658643731906560.000, df[0] = -2793071188115456.000, dc_hat[0] = -1511743125716992.000 + Gradient do_[0] = -186244934599180288.000 +Backward Time Step 2: + Gradient di[0] = -4792858158563328.000, df[0] = -3611269168889856.000, dc_hat[0] = -2641096454701056.000 + Gradient do_[0] = -213094716491694080.000 +Backward Time Step 1: + Gradient di[0] = -6047076118953984.000, df[0] = -4326479739486208.000, dc_hat[0] = -3660312326701056.000 + Gradient do_[0] = -191204745652731904.000 +Backward Time Step 0: + Gradient di[0] = -7061042735611904.000, df[0] = -5160918602219520.000, dc_hat[0] = -7138478848475136.000 + Gradient do_[0] = -109544618353229824.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1253485445120.000, df[0] = 919063429120.000, dc_hat[0] = 544191873024.000 + Gradient do_[0] = 77719421321216.000 +Backward Time Step 3: + Gradient di[0] = 1966572830720.000, df[0] = 1419167793152.000, dc_hat[0] = 769872625664.000 + Gradient do_[0] = 104029283680256.000 +Backward Time Step 2: + Gradient di[0] = 2502329106432.000, df[0] = 1797279973376.000, dc_hat[0] = 1278511546368.000 + Gradient do_[0] = 113366936846336.000 +Backward Time Step 1: + Gradient di[0] = 3138268430336.000, df[0] = 2164059275264.000, dc_hat[0] = 1745457119232.000 + Gradient do_[0] = 99901744611328.000 +Backward Time Step 0: + Gradient di[0] = 3799582769152.000, df[0] = 2703381233664.000, dc_hat[0] = 3565290258432.000 + Gradient do_[0] = 58741206548480.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2330701852573696.000, df[0] = -1829814176579584.000, dc_hat[0] = -1053468370403328.000 + Gradient do_[0] = -136252365677264896.000 +Backward Time Step 3: + Gradient di[0] = -3659528495169536.000, df[0] = -2793746840158208.000, dc_hat[0] = -1512107795283968.000 + Gradient do_[0] = -186289963036311552.000 +Backward Time Step 2: + Gradient di[0] = -4794015652249600.000, df[0] = -3612141047250944.000, dc_hat[0] = -2641731572989952.000 + Gradient do_[0] = -213146170199900160.000 +Backward Time Step 1: + Gradient di[0] = -6048537481576448.000, df[0] = -4327524758716416.000, dc_hat[0] = -3661191452819456.000 + Gradient do_[0] = -191250856421621760.000 +Backward Time Step 0: + Gradient di[0] = -7062761259401216.000, df[0] = -5162174880153600.000, dc_hat[0] = -7140216699617280.000 + Gradient do_[0] = -109571272920268800.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1254010519552.000, df[0] = 919448322048.000, dc_hat[0] = 544419774464.000 + Gradient do_[0] = 77751969120256.000 +Backward Time Step 3: + Gradient di[0] = 1967392161792.000, df[0] = 1419759190016.000, dc_hat[0] = 770193096704.000 + Gradient do_[0] = 104072627617792.000 +Backward Time Step 2: + Gradient di[0] = 2503372963840.000, df[0] = 1798029443072.000, dc_hat[0] = 1279043698688.000 + Gradient do_[0] = 113414198263808.000 +Backward Time Step 1: + Gradient di[0] = 3139584655360.000, df[0] = 2164966948864.000, dc_hat[0] = 1746187321344.000 + Gradient do_[0] = 99943595376640.000 +Backward Time Step 0: + Gradient di[0] = 3801177653248.000, df[0] = 2704516055040.000, dc_hat[0] = 3566786838528.000 + Gradient do_[0] = 58765860667392.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2331275230707712.000, df[0] = -1830264074403840.000, dc_hat[0] = -1053727142182912.000 + Gradient do_[0] = -136285892191977472.000 +Backward Time Step 3: + Gradient di[0] = -3660426411769856.000, df[0] = -2794431887441920.000, dc_hat[0] = -1512478638866432.000 + Gradient do_[0] = -186335678668210176.000 +Backward Time Step 2: + Gradient di[0] = -4795198378868736.000, df[0] = -3613031716093952.000, dc_hat[0] = -2642380918358016.000 + Gradient do_[0] = -213198723419734016.000 +Backward Time Step 1: + Gradient di[0] = -6050021929648128.000, df[0] = -4328586689380352.000, dc_hat[0] = -3662086685065216.000 + Gradient do_[0] = -191297740284624896.000 +Backward Time Step 0: + Gradient di[0] = -7064478709448704.000, df[0] = -5163430084345856.000, dc_hat[0] = -7141951866404864.000 + Gradient do_[0] = -109597918897373184.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1254301761536.000, df[0] = 919661969408.000, dc_hat[0] = 544546193408.000 + Gradient do_[0] = 77770004627456.000 +Backward Time Step 3: + Gradient di[0] = 1967851569152.000, df[0] = 1420090802176.000, dc_hat[0] = 770372927488.000 + Gradient do_[0] = 104096946192384.000 +Backward Time Step 2: + Gradient di[0] = 2503957020672.000, df[0] = 1798449004544.000, dc_hat[0] = 1279342018560.000 + Gradient do_[0] = 113440672710656.000 +Backward Time Step 1: + Gradient di[0] = 3140316823552.000, df[0] = 2165471707136.000, dc_hat[0] = 1746592333824.000 + Gradient do_[0] = 99966898929664.000 +Backward Time Step 0: + Gradient di[0] = 3802056622080.000, df[0] = 2705141268480.000, dc_hat[0] = 3567611543552.000 + Gradient do_[0] = 58779450212352.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2331836260810752.000, df[0] = -1830704711204864.000, dc_hat[0] = -1053981015015424.000 + Gradient do_[0] = -136318602662903808.000 +Backward Time Step 3: + Gradient di[0] = -3661307685371904.000, df[0] = -2795105123565568.000, dc_hat[0] = -1512842637344768.000 + Gradient do_[0] = -186380500946911232.000 +Backward Time Step 2: + Gradient di[0] = -4796347282620416.000, df[0] = -3613897420439552.000, dc_hat[0] = -2643013889163264.000 + Gradient do_[0] = -213249764811079680.000 +Backward Time Step 1: + Gradient di[0] = -6051477923561472.000, df[0] = -4329628218949632.000, dc_hat[0] = -3662963663699968.000 + Gradient do_[0] = -191343747974299648.000 +Backward Time Step 0: + Gradient di[0] = -7066182200852480.000, df[0] = -5164675624861696.000, dc_hat[0] = -7143674685161472.000 + Gradient do_[0] = -109624358716047360.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1254649102336.000, df[0] = 919916511232.000, dc_hat[0] = 544696958976.000 + Gradient do_[0] = 77791496241152.000 +Backward Time Step 3: + Gradient di[0] = 1968394993664.000, df[0] = 1420483100672.000, dc_hat[0] = 770585133056.000 + Gradient do_[0] = 104125593288704.000 +Backward Time Step 2: + Gradient di[0] = 2504649080832.000, df[0] = 1798945898496.000, dc_hat[0] = 1279694471168.000 + Gradient do_[0] = 113471995772928.000 +Backward Time Step 1: + Gradient di[0] = 3141184782336.000, df[0] = 2166069788672.000, dc_hat[0] = 1747072450560.000 + Gradient do_[0] = 99994480672768.000 +Backward Time Step 0: + Gradient di[0] = 3803107033088.000, df[0] = 2705888641024.000, dc_hat[0] = 3568596942848.000 + Gradient do_[0] = 58795686363136.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2332412860170240.000, df[0] = -1831157293383680.000, dc_hat[0] = -1054241531625472.000 + Gradient do_[0] = -136352275206504448.000 +Backward Time Step 3: + Gradient di[0] = -3662213923471360.000, df[0] = -2795796881735680.000, dc_hat[0] = -1513216836370432.000 + Gradient do_[0] = -186426646075539456.000 +Backward Time Step 2: + Gradient di[0] = -4797538062303232.000, df[0] = -3614795337039872.000, dc_hat[0] = -2643668871675904.000 + Gradient do_[0] = -213302730347773952.000 +Backward Time Step 1: + Gradient di[0] = -6052987067695104.000, df[0] = -4330707597918208.000, dc_hat[0] = -3663871512412160.000 + Gradient do_[0] = -191391387751546880.000 +Backward Time Step 0: + Gradient di[0] = -7067937231863808.000, df[0] = -5165957672599552.000, dc_hat[0] = -7145449043525632.000 + Gradient do_[0] = -109651580218769408.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1255059357696.000, df[0] = 920217321472.000, dc_hat[0] = 544874954752.000 + Gradient do_[0] = 77816938889216.000 +Backward Time Step 3: + Gradient di[0] = 1969039736832.000, df[0] = 1420948144128.000, dc_hat[0] = 770837577728.000 + Gradient do_[0] = 104159701368832.000 +Backward Time Step 2: + Gradient di[0] = 2505468018688.000, df[0] = 1799534280704.000, dc_hat[0] = 1280112197632.000 + Gradient do_[0] = 113509090197504.000 +Backward Time Step 1: + Gradient di[0] = 3142209241088.000, df[0] = 2166776397824.000, dc_hat[0] = 1747639861248.000 + Gradient do_[0] = 100027078803456.000 +Backward Time Step 0: + Gradient di[0] = 3804353789952.000, df[0] = 2706775998464.000, dc_hat[0] = 3569767153664.000 + Gradient do_[0] = 58814967578624.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2332970400612352.000, df[0] = -1831595111612416.000, dc_hat[0] = -1054493324083200.000 + Gradient do_[0] = -136384865418346496.000 +Backward Time Step 3: + Gradient di[0] = -3663093049589760.000, df[0] = -2796468238811136.000, dc_hat[0] = -1513579090018304.000 + Gradient do_[0] = -186471313735417856.000 +Backward Time Step 2: + Gradient di[0] = -4798686966054912.000, df[0] = -3615660504514560.000, dc_hat[0] = -2644299694997504.000 + Gradient do_[0] = -213353685839773696.000 +Backward Time Step 1: + Gradient di[0] = -6054430176706560.000, df[0] = -4331739732246528.000, dc_hat[0] = -3664740169547776.000 + Gradient do_[0] = -191436897225015296.000 +Backward Time Step 0: + Gradient di[0] = -7069621395914752.000, df[0] = -5167188717600768.000, dc_hat[0] = -7147150924316672.000 + Gradient do_[0] = -109677702209863680.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1255378911232.000, df[0] = 920451678208.000, dc_hat[0] = 545013465088.000 + Gradient do_[0] = 77836752781312.000 +Backward Time Step 3: + Gradient di[0] = 1969541349376.000, df[0] = 1421310164992.000, dc_hat[0] = 771033333760.000 + Gradient do_[0] = 104186242924544.000 +Backward Time Step 2: + Gradient di[0] = 2506109222912.000, df[0] = 1799994736640.000, dc_hat[0] = 1280438960128.000 + Gradient do_[0] = 113538056060928.000 +Backward Time Step 1: + Gradient di[0] = 3143014547456.000, df[0] = 2167331618816.000, dc_hat[0] = 1748085768192.000 + Gradient do_[0] = 100052655669248.000 +Backward Time Step 0: + Gradient di[0] = 3805322936320.000, df[0] = 2707465437184.000, dc_hat[0] = 3570676269056.000 + Gradient do_[0] = 58829949632512.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2333537067859968.000, df[0] = -1832039506509824.000, dc_hat[0] = -1054748807528448.000 + Gradient do_[0] = -136417988206133248.000 +Backward Time Step 3: + Gradient di[0] = -3663977812852736.000, df[0] = -2797143353982976.000, dc_hat[0] = -1513943625367552.000 + Gradient do_[0] = -186516342172549120.000 +Backward Time Step 2: + Gradient di[0] = -4799850365321216.000, df[0] = -3616537214713856.000, dc_hat[0] = -2644938571382784.000 + Gradient do_[0] = -213405208267456512.000 +Backward Time Step 1: + Gradient di[0] = -6055898518650880.000, df[0] = -4332790120185856.000, dc_hat[0] = -3665625469681664.000 + Gradient do_[0] = -191483282871812096.000 +Backward Time Step 0: + Gradient di[0] = -7071340456574976.000, df[0] = -5168444995534848.000, dc_hat[0] = -7148888775458816.000 + Gradient do_[0] = -109704373956771840.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1255745781760.000, df[0] = 920720637952.000, dc_hat[0] = 545172979712.000 + Gradient do_[0] = 77859485908992.000 +Backward Time Step 3: + Gradient di[0] = 1970114265088.000, df[0] = 1421723435008.000, dc_hat[0] = 771257139200.000 + Gradient do_[0] = 104216525799424.000 +Backward Time Step 2: + Gradient di[0] = 2506839556096.000, df[0] = 1800519417856.000, dc_hat[0] = 1280811597824.000 + Gradient do_[0] = 113571157508096.000 +Backward Time Step 1: + Gradient di[0] = 3143933100032.000, df[0] = 2167964827648.000, dc_hat[0] = 1748594589696.000 + Gradient do_[0] = 100081864802304.000 +Backward Time Step 0: + Gradient di[0] = 3806432591872.000, df[0] = 2708255014912.000, dc_hat[0] = 3571717767168.000 + Gradient do_[0] = 58847104335872.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2334109640687616.000, df[0] = -1832489270116352.000, dc_hat[0] = -1055007646416896.000 + Gradient do_[0] = -136451480361107456.000 +Backward Time Step 3: + Gradient di[0] = -3664876266323968.000, df[0] = -2797829743443968.000, dc_hat[0] = -1514315811127296.000 + Gradient do_[0] = -186562057804447744.000 +Backward Time Step 2: + Gradient di[0] = -4801027186360320.000, df[0] = -3617423320154112.000, dc_hat[0] = -2645584427089920.000 + Gradient do_[0] = -213457469429514240.000 +Backward Time Step 1: + Gradient di[0] = -6057388872302592.000, df[0] = -4333856614252544.000, dc_hat[0] = -3666523923152896.000 + Gradient do_[0] = -191530321353637888.000 +Backward Time Step 0: + Gradient di[0] = -7073071328395264.000, df[0] = -5169710400274432.000, dc_hat[0] = -7150639511502848.000 + Gradient do_[0] = -109731226092306432.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1256068087808.000, df[0] = 920957026304.000, dc_hat[0] = 545312342016.000 + Gradient do_[0] = 77879501127680.000 +Backward Time Step 3: + Gradient di[0] = 1970622955520.000, df[0] = 1422090305536.000, dc_hat[0] = 771456040960.000 + Gradient do_[0] = 104243419676672.000 +Backward Time Step 2: + Gradient di[0] = 2507484954624.000, df[0] = 1800982757376.000, dc_hat[0] = 1281140850688.000 + Gradient do_[0] = 113600383418368.000 +Backward Time Step 1: + Gradient di[0] = 3144738930688.000, df[0] = 2168520572928.000, dc_hat[0] = 1749040758784.000 + Gradient do_[0] = 100107450056704.000 +Backward Time Step 0: + Gradient di[0] = 3807403835392.000, df[0] = 2708945764352.000, dc_hat[0] = 3572628979712.000 + Gradient do_[0] = 58862119944192.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2334678992289792.000, df[0] = -1832936617803776.000, dc_hat[0] = -1055264941801472.000 + Gradient do_[0] = -136484731997913088.000 +Backward Time Step 3: + Gradient di[0] = -3665771498569728.000, df[0] = -2798512643244032.000, dc_hat[0] = -1514683567702016.000 + Gradient do_[0] = -186607653177262080.000 +Backward Time Step 2: + Gradient di[0] = -4802201859915776.000, df[0] = -3618307814981632.000, dc_hat[0] = -2646230282797056.000 + Gradient do_[0] = -213509661872095232.000 +Backward Time Step 1: + Gradient di[0] = -6058870636019712.000, df[0] = -4334916665868288.000, dc_hat[0] = -3667415934173184.000 + Gradient do_[0] = -191577153677033472.000 +Backward Time Step 0: + Gradient di[0] = -7074793610280960.000, df[0] = -5170969362563072.000, dc_hat[0] = -7152380046999552.000 + Gradient do_[0] = -109757940788887552.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1256432467968.000, df[0] = 921224085504.000, dc_hat[0] = 545470513152.000 + Gradient do_[0] = 77902083260416.000 +Backward Time Step 3: + Gradient di[0] = 1971194953728.000, df[0] = 1422503313408.000, dc_hat[0] = 771679911936.000 + Gradient do_[0] = 104273677385728.000 +Backward Time Step 2: + Gradient di[0] = 2508211093504.000, df[0] = 1801504161792.000, dc_hat[0] = 1281510342656.000 + Gradient do_[0] = 113633266761728.000 +Backward Time Step 1: + Gradient di[0] = 3145655123968.000, df[0] = 2169152208896.000, dc_hat[0] = 1749548662784.000 + Gradient do_[0] = 100136600469504.000 +Backward Time Step 0: + Gradient di[0] = 3808516112384.000, df[0] = 2709737177088.000, dc_hat[0] = 3573672574976.000 + Gradient do_[0] = 58879316590592.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2335242438311936.000, df[0] = -1833378462564352.000, dc_hat[0] = -1055519015960576.000 + Gradient do_[0] = -136517682987008000.000 +Backward Time Step 3: + Gradient di[0] = -3666655456526336.000, df[0] = -2799187489980416.000, dc_hat[0] = -1515048505704448.000 + Gradient do_[0] = -186652595715047424.000 +Backward Time Step 2: + Gradient di[0] = -4803356669247488.000, df[0] = -3619177814294528.000, dc_hat[0] = -2646865401085952.000 + Gradient do_[0] = -213561046860824576.000 +Backward Time Step 1: + Gradient di[0] = -6060326629933056.000, df[0] = -4335957927002112.000, dc_hat[0] = -3668291839066112.000 + Gradient do_[0] = -191623161366708224.000 +Backward Time Step 0: + Gradient di[0] = -7076503544135680.000, df[0] = -5172219198046208.000, dc_hat[0] = -7154108771336192.000 + Gradient do_[0] = -109784475096842240.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1256852553728.000, df[0] = 921532039168.000, dc_hat[0] = 545652834304.000 + Gradient do_[0] = 77928096333824.000 +Backward Time Step 3: + Gradient di[0] = 1971855032320.000, df[0] = 1422979891200.000, dc_hat[0] = 771938451456.000 + Gradient do_[0] = 104308557217792.000 +Backward Time Step 2: + Gradient di[0] = 2509053886464.000, df[0] = 1802109452288.000, dc_hat[0] = 1281940783104.000 + Gradient do_[0] = 113671384596480.000 +Backward Time Step 1: + Gradient di[0] = 3146709204992.000, df[0] = 2169879003136.000, dc_hat[0] = 1750132064256.000 + Gradient do_[0] = 100170138124288.000 +Backward Time Step 0: + Gradient di[0] = 3809785151488.000, df[0] = 2710640263168.000, dc_hat[0] = 3574863757312.000 + Gradient do_[0] = 58898933350400.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2335799710318592.000, df[0] = -1833816012357632.000, dc_hat[0] = -1055770741309440.000 + Gradient do_[0] = -136550281788784640.000 +Backward Time Step 3: + Gradient di[0] = -3667535119515648.000, df[0] = -2799859115491328.000, dc_hat[0] = -1515411162005504.000 + Gradient do_[0] = -186697297734664192.000 +Backward Time Step 2: + Gradient di[0] = -4804514699804672.000, df[0] = -3620050766397440.000, dc_hat[0] = -2647502129987584.000 + Gradient do_[0] = -213612449029423104.000 +Backward Time Step 1: + Gradient di[0] = -6061784234459136.000, df[0] = -4337000261877760.000, dc_hat[0] = -3669169623007232.000 + Gradient do_[0] = -191669151876513792.000 +Backward Time Step 0: + Gradient di[0] = -7078192540024832.000, df[0] = -5173453464272896.000, dc_hat[0] = -7155817094578176.000 + Gradient do_[0] = -109810682987282432.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1257187180544.000, df[0] = 921777405952.000, dc_hat[0] = 545798160384.000 + Gradient do_[0] = 77948849750016.000 +Backward Time Step 3: + Gradient di[0] = 1972377878528.000, df[0] = 1423357247488.000, dc_hat[0] = 772142989312.000 + Gradient do_[0] = 104336231235584.000 +Backward Time Step 2: + Gradient di[0] = 2509718683648.000, df[0] = 1802586685440.000, dc_hat[0] = 1282279342080.000 + Gradient do_[0] = 113701466144768.000 +Backward Time Step 1: + Gradient di[0] = 3147544920064.000, df[0] = 2170454933504.000, dc_hat[0] = 1750593961984.000 + Gradient do_[0] = 100196713234432.000 +Backward Time Step 0: + Gradient di[0] = 3810799124480.000, df[0] = 2711361945600.000, dc_hat[0] = 3575815340032.000 + Gradient do_[0] = 58914611658752.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2336370404098048.000, df[0] = -1834264165351424.000, dc_hat[0] = -1056028640673792.000 + Gradient do_[0] = -136583645094739968.000 +Backward Time Step 3: + Gradient di[0] = -3668431157067776.000, df[0] = -2800543357468672.000, dc_hat[0] = -1515780529192960.000 + Gradient do_[0] = -186742858747740160.000 +Backward Time Step 2: + Gradient di[0] = -4805684541521920.000, df[0] = -3620932308434944.000, dc_hat[0] = -2648144764469248.000 + Gradient do_[0] = -213664435313573888.000 +Backward Time Step 1: + Gradient di[0] = -6063268145659904.000, df[0] = -4338061924106240.000, dc_hat[0] = -3670064586817536.000 + Gradient do_[0] = -191716035739516928.000 +Backward Time Step 0: + Gradient di[0] = -7079923948716032.000, df[0] = -5174718332141568.000, dc_hat[0] = -7157566756880384.000 + Gradient do_[0] = -109837543712751616.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1257487597568.000, df[0] = 921997606912.000, dc_hat[0] = 545928183808.000 + Gradient do_[0] = 77967447293952.000 +Backward Time Step 3: + Gradient di[0] = 1972849344512.000, df[0] = 1423697510400.000, dc_hat[0] = 772327079936.000 + Gradient do_[0] = 104361128624128.000 +Backward Time Step 2: + Gradient di[0] = 2510317944832.000, df[0] = 1803017125888.000, dc_hat[0] = 1282584346624.000 + Gradient do_[0] = 113728544571392.000 +Backward Time Step 1: + Gradient di[0] = 3148292030464.000, df[0] = 2170970177536.000, dc_hat[0] = 1751007756288.000 + Gradient do_[0] = 100220452995072.000 +Backward Time Step 0: + Gradient di[0] = 3811700375552.000, df[0] = 2712002625536.000, dc_hat[0] = 3576660230144.000 + Gradient do_[0] = 58928540942336.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2336920428347392.000, df[0] = -1834695943782400.000, dc_hat[0] = -1056277077688320.000 + Gradient do_[0] = -136615719910506496.000 +Backward Time Step 3: + Gradient di[0] = -3669292029575168.000, df[0] = -2801200487464960.000, dc_hat[0] = -1516136206172160.000 + Gradient do_[0] = -186786650234290176.000 +Backward Time Step 2: + Gradient di[0] = -4806811970437120.000, df[0] = -3621781369782272.000, dc_hat[0] = -2648765655678976.000 + Gradient do_[0] = -213714548991983616.000 +Backward Time Step 1: + Gradient di[0] = -6064687095480320.000, df[0] = -4339076610129920.000, dc_hat[0] = -3670917943132160.000 + Gradient do_[0] = -191760823658479616.000 +Backward Time Step 0: + Gradient di[0] = -7081585564188672.000, df[0] = -5175933271015424.000, dc_hat[0] = -7159247162834944.000 + Gradient do_[0] = -109863322106462208.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1257894838272.000, df[0] = 922296320000.000, dc_hat[0] = 546105294848.000 + Gradient do_[0] = 77992688615424.000 +Backward Time Step 3: + Gradient di[0] = 1973489238016.000, df[0] = 1424159277056.000, dc_hat[0] = 772577099776.000 + Gradient do_[0] = 104394959880192.000 +Backward Time Step 2: + Gradient di[0] = 2511131901952.000, df[0] = 1803601838080.000, dc_hat[0] = 1283000238080.000 + Gradient do_[0] = 113765454446592.000 +Backward Time Step 1: + Gradient di[0] = 3149314392064.000, df[0] = 2171674951680.000, dc_hat[0] = 1751574249472.000 + Gradient do_[0] = 100252992405504.000 +Backward Time Step 0: + Gradient di[0] = 3812939268096.000, df[0] = 2712884215808.000, dc_hat[0] = 3577822838784.000 + Gradient do_[0] = 58947700523008.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2337491658997760.000, df[0] = -1835144499429376.000, dc_hat[0] = -1056535245488128.000 + Gradient do_[0] = -136649126166134784.000 +Backward Time Step 3: + Gradient di[0] = -3670187798691840.000, df[0] = -2801883924135936.000, dc_hat[0] = -1516506244448256.000 + Gradient do_[0] = -186832245607104512.000 +Backward Time Step 2: + Gradient di[0] = -4807982349025280.000, df[0] = -3622663448690688.000, dc_hat[0] = -2649409095467008.000 + Gradient do_[0] = -213766500916396032.000 +Backward Time Step 1: + Gradient di[0] = -6066166711713792.000, df[0] = -4340135051132928.000, dc_hat[0] = -3671809954152448.000 + Gradient do_[0] = -191807518542921728.000 +Backward Time Step 0: + Gradient di[0] = -7083305698590720.000, df[0] = -5177191159562240.000, dc_hat[0] = -7160986087718912.000 + Gradient do_[0] = -109890011033239552.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1258289496064.000, df[0] = 922585595904.000, dc_hat[0] = 546276343808.000 + Gradient do_[0] = 78017141407744.000 +Backward Time Step 3: + Gradient di[0] = 1974108160000.000, df[0] = 1424605708288.000, dc_hat[0] = 772818927616.000 + Gradient do_[0] = 104427658674176.000 +Backward Time Step 2: + Gradient di[0] = 2511918071808.000, df[0] = 1804166234112.000, dc_hat[0] = 1283400531968.000 + Gradient do_[0] = 113801038921728.000 +Backward Time Step 1: + Gradient di[0] = 3150301102080.000, df[0] = 2172355346432.000, dc_hat[0] = 1752120950784.000 + Gradient do_[0] = 100284332244992.000 +Backward Time Step 0: + Gradient di[0] = 3814138052608.000, df[0] = 2713737232384.000, dc_hat[0] = 3578947960832.000 + Gradient do_[0] = 58966226763776.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2338064768696320.000, df[0] = -1835594263035904.000, dc_hat[0] = -1056794218594304.000 + Gradient do_[0] = -136682601141239808.000 +Backward Time Step 3: + Gradient di[0] = -3671087594340352.000, df[0] = -2802571118903296.000, dc_hat[0] = -1516877088030720.000 + Gradient do_[0] = -186878064318218240.000 +Backward Time Step 2: + Gradient di[0] = -4809172591837184.000, df[0] = -3623560023113728.000, dc_hat[0] = -2650063809544192.000 + Gradient do_[0] = -213819363373875200.000 +Backward Time Step 1: + Gradient di[0] = -6067669413396480.000, df[0] = -4341209598263296.000, dc_hat[0] = -3672714044768256.000 + Gradient do_[0] = -191854969341607936.000 +Backward Time Step 0: + Gradient di[0] = -7085051602796544.000, df[0] = -5178466764849152.000, dc_hat[0] = -7162750782406656.000 + Gradient do_[0] = -109917086507073536.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1258579951616.000, df[0] = 922798587904.000, dc_hat[0] = 546402336768.000 + Gradient do_[0] = 78035168526336.000 +Backward Time Step 3: + Gradient di[0] = 1974563241984.000, df[0] = 1424934305792.000, dc_hat[0] = 772997513216.000 + Gradient do_[0] = 104451750756352.000 +Backward Time Step 2: + Gradient di[0] = 2512497934336.000, df[0] = 1804583043072.000, dc_hat[0] = 1283696754688.000 + Gradient do_[0] = 113827303653376.000 +Backward Time Step 1: + Gradient di[0] = 3151030910976.000, df[0] = 2172858138624.000, dc_hat[0] = 1752524259328.000 + Gradient do_[0] = 100307535134720.000 +Backward Time Step 0: + Gradient di[0] = 3815011254272.000, df[0] = 2714358513664.000, dc_hat[0] = 3579767160832.000 + Gradient do_[0] = 58979728228352.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2338632509685760.000, df[0] = -1836040268546048.000, dc_hat[0] = -1057050440237056.000 + Gradient do_[0] = -136715818418307072.000 +Backward Time Step 3: + Gradient di[0] = -3671981752844288.000, df[0] = -2803253481832448.000, dc_hat[0] = -1517246455218176.000 + Gradient do_[0] = -186923539431948288.000 +Backward Time Step 2: + Gradient di[0] = -4810337064845312.000, df[0] = -3624437001748480.000, dc_hat[0] = -2650701343752192.000 + Gradient do_[0] = -213871160679464960.000 +Backward Time Step 1: + Gradient di[0] = -6069139365953536.000, df[0] = -4342261328379904.000, dc_hat[0] = -3673599613337600.000 + Gradient do_[0] = -191901354988404736.000 +Backward Time Step 0: + Gradient di[0] = -7086769052844032.000, df[0] = -5179721969041408.000, dc_hat[0] = -7164487022936064.000 + Gradient do_[0] = -109943732484177920.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1258921132032.000, df[0] = 923048804352.000, dc_hat[0] = 546550382592.000 + Gradient do_[0] = 78056324595712.000 +Backward Time Step 3: + Gradient di[0] = 1975098540032.000, df[0] = 1425320574976.000, dc_hat[0] = 773206573056.000 + Gradient do_[0] = 104480062308352.000 +Backward Time Step 2: + Gradient di[0] = 2513178984448.000, df[0] = 1805071810560.000, dc_hat[0] = 1284043309056.000 + Gradient do_[0] = 113858047901696.000 +Backward Time Step 1: + Gradient di[0] = 3151884451840.000, df[0] = 2173446651904.000, dc_hat[0] = 1752997036032.000 + Gradient do_[0] = 100334638727168.000 +Backward Time Step 0: + Gradient di[0] = 3816043053056.000, df[0] = 2715092516864.000, dc_hat[0] = 3580735520768.000 + Gradient do_[0] = 58995674972160.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2339188439515136.000, df[0] = -1836476476162048.000, dc_hat[0] = -1057301091844096.000 + Gradient do_[0] = -136748271191195648.000 +Backward Time Step 3: + Gradient di[0] = -3672851752157184.000, df[0] = -2803918128021504.000, dc_hat[0] = -1517605756076032.000 + Gradient do_[0] = -186967811954835456.000 +Backward Time Step 2: + Gradient di[0] = -4811478989275136.000, df[0] = -3625297605820416.000, dc_hat[0] = -2651330556461056.000 + Gradient do_[0] = -213921944372772864.000 +Backward Time Step 1: + Gradient di[0] = -6070584085577728.000, df[0] = -4343294804885504.000, dc_hat[0] = -3674470417956864.000 + Gradient do_[0] = -191947001900826624.000 +Backward Time Step 0: + Gradient di[0] = -7088452143153152.000, df[0] = -5180951940300800.000, dc_hat[0] = -7166188903727104.000 + Gradient do_[0] = -109969845885337600.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1259247501312.000, df[0] = 923288076288.000, dc_hat[0] = 546692104192.000 + Gradient do_[0] = 78076557918208.000 +Backward Time Step 3: + Gradient di[0] = 1975609720832.000, df[0] = 1425689149440.000, dc_hat[0] = 773406326784.000 + Gradient do_[0] = 104507090403328.000 +Backward Time Step 2: + Gradient di[0] = 2513830150144.000, df[0] = 1805539737600.000, dc_hat[0] = 1284375576576.000 + Gradient do_[0] = 113887575801856.000 +Backward Time Step 1: + Gradient di[0] = 3152698933248.000, df[0] = 2174008295424.000, dc_hat[0] = 1753447137280.000 + Gradient do_[0] = 100360551137280.000 +Backward Time Step 0: + Gradient di[0] = 3817031335936.000, df[0] = 2715795849216.000, dc_hat[0] = 3581662724096.000 + Gradient do_[0] = 59010963210240.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2339768260100096.000, df[0] = -1836932011130880.000, dc_hat[0] = -1057563286175744.000 + Gradient do_[0] = -136782175663030272.000 +Backward Time Step 3: + Gradient di[0] = -3673763090530304.000, df[0] = -2804613375852544.000, dc_hat[0] = -1517980894625792.000 + Gradient do_[0] = -187014146062024704.000 +Backward Time Step 2: + Gradient di[0] = -4812675674537984.000, df[0] = -3626199012081664.000, dc_hat[0] = -2651987954892800.000 + Gradient do_[0] = -213974995808813056.000 +Backward Time Step 1: + Gradient di[0] = -6072087324131328.000, df[0] = -4344370694193152.000, dc_hat[0] = -3675375582314496.000 + Gradient do_[0] = -191994487059251200.000 +Backward Time Step 0: + Gradient di[0] = -7090205026680832.000, df[0] = -5182233451167744.000, dc_hat[0] = -7167960577736704.000 + Gradient do_[0] = -109997041618255872.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1259551457280.000, df[0] = 923510964224.000, dc_hat[0] = 546824028160.000 + Gradient do_[0] = 78095356788736.000 +Backward Time Step 3: + Gradient di[0] = 1976087085056.000, df[0] = 1426033999872.000, dc_hat[0] = 773592842240.000 + Gradient do_[0] = 104532306558976.000 +Backward Time Step 2: + Gradient di[0] = 2514437799936.000, df[0] = 1805976076288.000, dc_hat[0] = 1284685168640.000 + Gradient do_[0] = 113915098824704.000 +Backward Time Step 1: + Gradient di[0] = 3153460723712.000, df[0] = 2174533763072.000, dc_hat[0] = 1753868926976.000 + Gradient do_[0] = 100384794214400.000 +Backward Time Step 0: + Gradient di[0] = 3817949102080.000, df[0] = 2716448849920.000, dc_hat[0] = 3582523867136.000 + Gradient do_[0] = 59025144152064.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2340301372915712.000, df[0] = -1837350367789056.000, dc_hat[0] = -1057803603017728.000 + Gradient do_[0] = -136813331355795456.000 +Backward Time Step 3: + Gradient di[0] = -3674602219765760.000, df[0] = -2805254131286016.000, dc_hat[0] = -1518326907928576.000 + Gradient do_[0] = -187056872396685312.000 +Backward Time Step 2: + Gradient di[0] = -4813770891198464.000, df[0] = -3627023914237952.000, dc_hat[0] = -2652589518749696.000 + Gradient do_[0] = -214023614838603776.000 +Backward Time Step 1: + Gradient di[0] = -6073469229858816.000, df[0] = -4345359073542144.000, dc_hat[0] = -3676208537534464.000 + Gradient do_[0] = -192038123926978560.000 +Backward Time Step 0: + Gradient di[0] = -7091821008125952.000, df[0] = -5183414567174144.000, dc_hat[0] = -7169594812792832.000 + Gradient do_[0] = -110022107047395328.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1260009422848.000, df[0] = 923846443008.000, dc_hat[0] = 547022143488.000 + Gradient do_[0] = 78123727060992.000 +Backward Time Step 3: + Gradient di[0] = 1976804835328.000, df[0] = 1426551603200.000, dc_hat[0] = 773873664000.000 + Gradient do_[0] = 104570248232960.000 +Backward Time Step 2: + Gradient di[0] = 2515352420352.000, df[0] = 1806633009152.000, dc_hat[0] = 1285151391744.000 + Gradient do_[0] = 113956471439360.000 +Backward Time Step 1: + Gradient di[0] = 3154609438720.000, df[0] = 2175325569024.000, dc_hat[0] = 1754505936896.000 + Gradient do_[0] = 100421318213632.000 +Backward Time Step 0: + Gradient di[0] = 3819336630272.000, df[0] = 2717436084224.000, dc_hat[0] = 3583825674240.000 + Gradient do_[0] = 59046598017024.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2340879046017024.000, df[0] = -1837803889491968.000, dc_hat[0] = -1058064656498688.000 + Gradient do_[0] = -136847003899396096.000 +Backward Time Step 3: + Gradient di[0] = -3675507384123392.000, df[0] = -2805945084149760.000, dc_hat[0] = -1518700033212416.000 + Gradient do_[0] = -187102862906490880.000 +Backward Time Step 2: + Gradient di[0] = -4814955765301248.000, df[0] = -3627917267435520.000, dc_hat[0] = -2653242085343232.000 + Gradient do_[0] = -214076253957783552.000 +Backward Time Step 1: + Gradient di[0] = -6074966025961472.000, df[0] = -4346429057269760.000, dc_hat[0] = -3677108601618432.000 + Gradient do_[0] = -192085282667888640.000 +Backward Time Step 0: + Gradient di[0] = -7093557248655360.000, df[0] = -5184683193139200.000, dc_hat[0] = -7171349306933248.000 + Gradient do_[0] = -110049036492341248.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1260288868352.000, df[0] = 924051374080.000, dc_hat[0] = 547143483392.000 + Gradient do_[0] = 78141083090944.000 +Backward Time Step 3: + Gradient di[0] = 1977243533312.000, df[0] = 1426868142080.000, dc_hat[0] = 774044975104.000 + Gradient do_[0] = 104593434345472.000 +Backward Time Step 2: + Gradient di[0] = 2515908952064.000, df[0] = 1807032516608.000, dc_hat[0] = 1285435031552.000 + Gradient do_[0] = 113981679206400.000 +Backward Time Step 1: + Gradient di[0] = 3155307790336.000, df[0] = 2175807389696.000, dc_hat[0] = 1754892206080.000 + Gradient do_[0] = 100443489304576.000 +Backward Time Step 0: + Gradient di[0] = 3820180471808.000, df[0] = 2718036393984.000, dc_hat[0] = 3584617611264.000 + Gradient do_[0] = 59059638108160.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2341424506863616.000, df[0] = -1838232044044288.000, dc_hat[0] = -1058311080247296.000 + Gradient do_[0] = -136878881146667008.000 +Backward Time Step 3: + Gradient di[0] = -3676364230098944.000, df[0] = -2806599261356032.000, dc_hat[0] = -1519053965361152.000 + Gradient do_[0] = -187146465414479872.000 +Backward Time Step 2: + Gradient di[0] = -4816079972990976.000, df[0] = -3628764181299200.000, dc_hat[0] = -2653860292198400.000 + Gradient do_[0] = -214126230197239808.000 +Backward Time Step 1: + Gradient di[0] = -6076386049523712.000, df[0] = -4347445353906176.000, dc_hat[0] = -3677964910723072.000 + Gradient do_[0] = -192130190845935616.000 +Backward Time Step 0: + Gradient di[0] = -7095217790386176.000, df[0] = -5185897058271232.000, dc_hat[0] = -7173028639145984.000 + Gradient do_[0] = -110074806296117248.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1260611698688.000, df[0] = 924288090112.000, dc_hat[0] = 547283599360.000 + Gradient do_[0] = 78161056366592.000 +Backward Time Step 3: + Gradient di[0] = 1977749471232.000, df[0] = 1427233439744.000, dc_hat[0] = 774242566144.000 + Gradient do_[0] = 104620210782208.000 +Backward Time Step 2: + Gradient di[0] = 2516553826304.000, df[0] = 1807495462912.000, dc_hat[0] = 1285763891200.000 + Gradient do_[0] = 114010821230592.000 +Backward Time Step 1: + Gradient di[0] = 3156117815296.000, df[0] = 2176365625344.000, dc_hat[0] = 1755339948032.000 + Gradient do_[0] = 100469250719744.000 +Backward Time Step 0: + Gradient di[0] = 3821159055360.000, df[0] = 2718732648448.000, dc_hat[0] = 3585535901696.000 + Gradient do_[0] = 59074771156992.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2341971578322944.000, df[0] = -1838661540773888.000, dc_hat[0] = -1058558376411136.000 + Gradient do_[0] = -136910861473153024.000 +Backward Time Step 3: + Gradient di[0] = -3677223491993600.000, df[0] = -2807255049175040.000, dc_hat[0] = -1519408434380800.000 + Gradient do_[0] = -187190205361422336.000 +Backward Time Step 2: + Gradient di[0] = -4817202570067968.000, df[0] = -3629610021421056.000, dc_hat[0] = -2654476620005376.000 + Gradient do_[0] = -214176086177611776.000 +Backward Time Step 1: + Gradient di[0] = -6077802851860480.000, df[0] = -4348458160881664.000, dc_hat[0] = -3678817998602240.000 + Gradient do_[0] = -192174910045421568.000 +Backward Time Step 0: + Gradient di[0] = -7096868131569664.000, df[0] = -5187103407210496.000, dc_hat[0] = -7174696697069568.000 + Gradient do_[0] = -110100412891136000.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1260980797440.000, df[0] = 924558819328.000, dc_hat[0] = 547443867648.000 + Gradient do_[0] = 78183957266432.000 +Backward Time Step 3: + Gradient di[0] = 1978329333760.000, df[0] = 1427651690496.000, dc_hat[0] = 774469255168.000 + Gradient do_[0] = 104650871144448.000 +Backward Time Step 2: + Gradient di[0] = 2517289926656.000, df[0] = 1808024338432.000, dc_hat[0] = 1286139281408.000 + Gradient do_[0] = 114044191113216.000 +Backward Time Step 1: + Gradient di[0] = 3157045280768.000, df[0] = 2177004863488.000, dc_hat[0] = 1755853094912.000 + Gradient do_[0] = 100498686345216.000 +Backward Time Step 0: + Gradient di[0] = 3822278148096.000, df[0] = 2719529041920.000, dc_hat[0] = 3586586050560.000 + Gradient do_[0] = 59092076855296.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2342538514006016.000, df[0] = -1839106875195392.000, dc_hat[0] = -1058814396727296.000 + Gradient do_[0] = -136944018620678144.000 +Backward Time Step 3: + Gradient di[0] = -3678110402740224.000, df[0] = -2807932580265984.000, dc_hat[0] = -1519774043471872.000 + Gradient do_[0] = -187235405597245440.000 +Backward Time Step 2: + Gradient di[0] = -4818371338043392.000, df[0] = -3630490758152192.000, dc_hat[0] = -2655118180745216.000 + Gradient do_[0] = -214228055281893376.000 +Backward Time Step 1: + Gradient di[0] = -6079282468093952.000, df[0] = -4349516601884672.000, dc_hat[0] = -3679708399009792.000 + Gradient do_[0] = -192221656469471232.000 +Backward Time Step 0: + Gradient di[0] = -7098593634680832.000, df[0] = -5188365053853696.000, dc_hat[0] = -7176441527533568.000 + Gradient do_[0] = -110127187717259264.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1261256310784.000, df[0] = 924760801280.000, dc_hat[0] = 547563503616.000 + Gradient do_[0] = 78201036472320.000 +Backward Time Step 3: + Gradient di[0] = 1978764361728.000, df[0] = 1427965607936.000, dc_hat[0] = 774638993408.000 + Gradient do_[0] = 104673830764544.000 +Backward Time Step 2: + Gradient di[0] = 2517843312640.000, df[0] = 1808421486592.000, dc_hat[0] = 1286420955136.000 + Gradient do_[0] = 114069231108096.000 +Backward Time Step 1: + Gradient di[0] = 3157735505920.000, df[0] = 2177480654848.000, dc_hat[0] = 1756235300864.000 + Gradient do_[0] = 100520647720960.000 +Backward Time Step 0: + Gradient di[0] = 3823110979584.000, df[0] = 2720121487360.000, dc_hat[0] = 3587367763968.000 + Gradient do_[0] = 59104949174272.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2343093906964480.000, df[0] = -1839542545940480.000, dc_hat[0] = -1059064243027968.000 + Gradient do_[0] = -136976462803632128.000 +Backward Time Step 3: + Gradient di[0] = -3678980670488576.000, df[0] = -2808596958019584.000, dc_hat[0] = -1520133612765184.000 + Gradient do_[0] = -187279609400655872.000 +Backward Time Step 2: + Gradient di[0] = -4819508430635008.000, df[0] = -3631346798821376.000, dc_hat[0] = -2655743903793152.000 + Gradient do_[0] = -214278461018079232.000 +Backward Time Step 1: + Gradient di[0] = -6080719671525376.000, df[0] = -4350544978116608.000, dc_hat[0] = -3680574103355392.000 + Gradient do_[0] = -192267045683855360.000 +Backward Time Step 0: + Gradient di[0] = -7100268135055360.000, df[0] = -5189588582662144.000, dc_hat[0] = -7178134281519104.000 + Gradient do_[0] = -110153155089530880.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1261608894464.000, df[0] = 925019406336.000, dc_hat[0] = 547716628480.000 + Gradient do_[0] = 78222922350592.000 +Backward Time Step 3: + Gradient di[0] = 1979316436992.000, df[0] = 1428364066816.000, dc_hat[0] = 774855196672.000 + Gradient do_[0] = 104703014731776.000 +Backward Time Step 2: + Gradient di[0] = 2518545858560.000, df[0] = 1808926113792.000, dc_hat[0] = 1286778519552.000 + Gradient do_[0] = 114101032321024.000 +Backward Time Step 1: + Gradient di[0] = 3158615523328.000, df[0] = 2178087518208.000, dc_hat[0] = 1756722495488.000 + Gradient do_[0] = 100548640505856.000 +Backward Time Step 0: + Gradient di[0] = 3824176332800.000, df[0] = 2720879607808.000, dc_hat[0] = 3588367319040.000 + Gradient do_[0] = 59121416011776.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2343645810262016.000, df[0] = -1839975666548736.000, dc_hat[0] = -1059313552457728.000 + Gradient do_[0] = -137008718008025088.000 +Backward Time Step 3: + Gradient di[0] = -3679853891026944.000, df[0] = -2809263751692288.000, dc_hat[0] = -1520494255800320.000 + Gradient do_[0] = -187324019362496512.000 +Backward Time Step 2: + Gradient di[0] = -4820650891935744.000, df[0] = -3632207939764224.000, dc_hat[0] = -2656371774324736.000 + Gradient do_[0] = -214329227531517952.000 +Backward Time Step 1: + Gradient di[0] = -6082154727473152.000, df[0] = -4351570669993984.000, dc_hat[0] = -3681437391781888.000 + Gradient do_[0] = -192312331819024384.000 +Backward Time Step 0: + Gradient di[0] = -7101942098558976.000, df[0] = -5190811574599680.000, dc_hat[0] = -7179826498633728.000 + Gradient do_[0] = -110179122461802496.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1261972488192.000, df[0] = 925286006784.000, dc_hat[0] = 547874439168.000 + Gradient do_[0] = 78245454151680.000 +Backward Time Step 3: + Gradient di[0] = 1979884371968.000, df[0] = 1428773797888.000, dc_hat[0] = 775077101568.000 + Gradient do_[0] = 104733054337024.000 +Backward Time Step 2: + Gradient di[0] = 2519268851712.000, df[0] = 1809445289984.000, dc_hat[0] = 1287147749376.000 + Gradient do_[0] = 114133789835264.000 +Backward Time Step 1: + Gradient di[0] = 3159528046592.000, df[0] = 2178716794880.000, dc_hat[0] = 1757228171264.000 + Gradient do_[0] = 100577665089536.000 +Backward Time Step 0: + Gradient di[0] = 3825282580480.000, df[0] = 2721666564096.000, dc_hat[0] = 3589405671424.000 + Gradient do_[0] = 59138524577792.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2344190734237696.000, df[0] = -1840403686883328.000, dc_hat[0] = -1059559976206336.000 + Gradient do_[0] = -137040552305623040.000 +Backward Time Step 3: + Gradient di[0] = -3680706173599744.000, df[0] = -2809913902366720.000, dc_hat[0] = -1520844966723584.000 + Gradient do_[0] = -187367381352316928.000 +Backward Time Step 2: + Gradient di[0] = -4821768657174528.000, df[0] = -3633049484918784.000, dc_hat[0] = -2656985417777152.000 + Gradient do_[0] = -214378860173590528.000 +Backward Time Step 1: + Gradient di[0] = -6083568308584448.000, df[0] = -4352582403227648.000, dc_hat[0] = -3682289942790144.000 + Gradient do_[0] = -192356930759426048.000 +Backward Time Step 0: + Gradient di[0] = -7103585997291520.000, df[0] = -5192013091700736.000, dc_hat[0] = -7181488114106368.000 + Gradient do_[0] = -110204634567540736.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1262233845760.000, df[0] = 925477371904.000, dc_hat[0] = 547987456000.000 + Gradient do_[0] = 78261585444864.000 +Backward Time Step 3: + Gradient di[0] = 1980296200192.000, df[0] = 1429071069184.000, dc_hat[0] = 775238189056.000 + Gradient do_[0] = 104754831163392.000 +Backward Time Step 2: + Gradient di[0] = 2519797334016.000, df[0] = 1809825136640.000, dc_hat[0] = 1287416971264.000 + Gradient do_[0] = 114157680590848.000 +Backward Time Step 1: + Gradient di[0] = 3160186290176.000, df[0] = 2179170566144.000, dc_hat[0] = 1757591764992.000 + Gradient do_[0] = 100598586277888.000 +Backward Time Step 0: + Gradient di[0] = 3826073468928.000, df[0] = 2722229125120.000, dc_hat[0] = 3590147276800.000 + Gradient do_[0] = 59150746779648.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2344753106518016.000, df[0] = -1840845263208448.000, dc_hat[0] = -1059813916147712.000 + Gradient do_[0] = -137073408805437440.000 +Backward Time Step 3: + Gradient di[0] = -3681587178766336.000, df[0] = -2810586601619456.000, dc_hat[0] = -1521209099419648.000 + Gradient do_[0] = -187412203631017984.000 +Backward Time Step 2: + Gradient di[0] = -4822922929635328.000, df[0] = -3633920021102592.000, dc_hat[0] = -2657620267630592.000 + Gradient do_[0] = -214430124903235584.000 +Backward Time Step 1: + Gradient di[0] = -6085026449981440.000, df[0] = -4353625543409664.000, dc_hat[0] = -3683168532037632.000 + Gradient do_[0] = -192403007168577536.000 +Backward Time Step 0: + Gradient di[0] = -7105290025566208.000, df[0] = -5193259169087488.000, dc_hat[0] = -7183210932862976.000 + Gradient do_[0] = -110231074386214912.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1262608842752.000, df[0] = 925752426496.000, dc_hat[0] = 548150280192.000 + Gradient do_[0] = 78284847054848.000 +Backward Time Step 3: + Gradient di[0] = 1980883009536.000, df[0] = 1429494562816.000, dc_hat[0] = 775467433984.000 + Gradient do_[0] = 104785869012992.000 +Backward Time Step 2: + Gradient di[0] = 2520542085120.000, df[0] = 1810359648256.000, dc_hat[0] = 1287796293632.000 + Gradient do_[0] = 114191386017792.000 +Backward Time Step 1: + Gradient di[0] = 3161120047104.000, df[0] = 2179814129664.000, dc_hat[0] = 1758108712960.000 + Gradient do_[0] = 100628240007168.000 +Backward Time Step 0: + Gradient di[0] = 3827204882432.000, df[0] = 2723034169344.000, dc_hat[0] = 3591208960000.000 + Gradient do_[0] = 59168241221632.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2345306083557376.000, df[0] = -1841279323340800.000, dc_hat[0] = -1060063963774976.000 + Gradient do_[0] = -137105732729307136.000 +Backward Time Step 3: + Gradient di[0] = -3682459325562880.000, df[0] = -2811252321550336.000, dc_hat[0] = -1521567863406592.000 + Gradient do_[0] = -187456613592858624.000 +Backward Time Step 2: + Gradient di[0] = -4824065927806976.000, df[0] = -3634780356739072.000, dc_hat[0] = -2658248138162176.000 + Gradient do_[0] = -214480960136151040.000 +Backward Time Step 1: + Gradient di[0] = -6086464190283776.000, df[0] = -4354653919641600.000, dc_hat[0] = -3684033431076864.000 + Gradient do_[0] = -192448362023223296.000 +Backward Time Step 0: + Gradient di[0] = -7106962915328000.000, df[0] = -5194481624154112.000, dc_hat[0] = -7184902613106688.000 + Gradient do_[0] = -110257015988682752.000 +Epoch 800, Train Loss=0.011262, Weight Norm=13.014078 +Sample Predictions at Epoch 800: +------------------------------------------------------------- +| Day | Date | Predicted Close | Actual Close | Error | +------------------------------------------------------------- +| 192 | 2024-10-11 | 57.42 | 63.87 | 6.45 | +| 193 | 2024-10-14 | 56.80 | 66.55 | 9.75 | +| 194 | 2024-10-15 | 56.99 | 66.00 | 9.01 | +| 195 | 2024-10-16 | 57.95 | 67.20 | 9.25 | +| 196 | 2024-10-17 | 57.48 | 66.76 | 9.28 | +------------------------------------------------------------- +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1263036137472.000, df[0] = 926065623040.000, dc_hat[0] = 548335648768.000 + Gradient do_[0] = 78311287947264.000 +Backward Time Step 3: + Gradient di[0] = 1981556195328.000, df[0] = 1429980315648.000, dc_hat[0] = 775730954240.000 + Gradient do_[0] = 104821445099520.000 +Backward Time Step 2: + Gradient di[0] = 2521398509568.000, df[0] = 1810975162368.000, dc_hat[0] = 1288233943040.000 + Gradient do_[0] = 114230166552576.000 +Backward Time Step 1: + Gradient di[0] = 3162195361792.000, df[0] = 2180555735040.000, dc_hat[0] = 1758705090560.000 + Gradient do_[0] = 100662465527808.000 +Backward Time Step 0: + Gradient di[0] = 3828499087360.000, df[0] = 2723955081216.000, dc_hat[0] = 3592423211008.000 + Gradient do_[0] = 59188248051712.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2345841880727552.000, df[0] = -1841699693264896.000, dc_hat[0] = -1060305287249920.000 + Gradient do_[0] = -137137060220764160.000 +Backward Time Step 3: + Gradient di[0] = -3683299528540160.000, df[0] = -2811893345419264.000, dc_hat[0] = -1521914547798016.000 + Gradient do_[0] = -187499271208042496.000 +Backward Time Step 2: + Gradient di[0] = -4825164902563840.000, df[0] = -3635609016991744.000, dc_hat[0] = -2658853191680000.000 + Gradient do_[0] = -214529716604895232.000 +Backward Time Step 1: + Gradient di[0] = -6087848243494912.000, df[0] = -4355643909603328.000, dc_hat[0] = -3684866654732288.000 + Gradient do_[0] = -192492084790296576.000 +Backward Time Step 0: + Gradient di[0] = -7108584802353152.000, df[0] = -5195667035127808.000, dc_hat[0] = -7186541680001024.000 + Gradient do_[0] = -110282175907102720.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1263385313280.000, df[0] = 926321410048.000, dc_hat[0] = 548487266304.000 + Gradient do_[0] = 78332980887552.000 +Backward Time Step 3: + Gradient di[0] = 1982102765568.000, df[0] = 1430374711296.000, dc_hat[0] = 775944536064.000 + Gradient do_[0] = 104850335465472.000 +Backward Time Step 2: + Gradient di[0] = 2522092666880.000, df[0] = 1811473498112.000, dc_hat[0] = 1288587444224.000 + Gradient do_[0] = 114261581889536.000 +Backward Time Step 1: + Gradient di[0] = 3163065679872.000, df[0] = 2181155782656.000, dc_hat[0] = 1759186518016.000 + Gradient do_[0] = 100690097602560.000 +Backward Time Step 0: + Gradient di[0] = 3829553168384.000, df[0] = 2724705075200.000, dc_hat[0] = 3593412280320.000 + Gradient do_[0] = 59204542922752.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2346419016957952.000, df[0] = -1842152946532352.000, dc_hat[0] = -1060566206513152.000 + Gradient do_[0] = -137170741354299392.000 +Backward Time Step 3: + Gradient di[0] = -3684208182558720.000, df[0] = -2812587251073024.000, dc_hat[0] = -1522290089000960.000 + Gradient do_[0] = -187545536595755008.000 +Backward Time Step 2: + Gradient di[0] = -4826354071633920.000, df[0] = -3636504517672960.000, dc_hat[0] = -2659505489838080.000 + Gradient do_[0] = -214582579062374400.000 +Backward Time Step 1: + Gradient di[0] = -6089352018919424.000, df[0] = -4356719262040064.000, dc_hat[0] = -3685771819089920.000 + Gradient do_[0] = -192539552768851968.000 +Backward Time Step 0: + Gradient di[0] = -7110333927784448.000, df[0] = -5196945324769280.000, dc_hat[0] = -7188310669656064.000 + Gradient do_[0] = -110309320100413440.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1263829385216.000, df[0] = 926646992896.000, dc_hat[0] = 548680007680.000 + Gradient do_[0] = 78360478744576.000 +Backward Time Step 3: + Gradient di[0] = 1982800068608.000, df[0] = 1430878027776.000, dc_hat[0] = 776217559040.000 + Gradient do_[0] = 104887144677376.000 +Backward Time Step 2: + Gradient di[0] = 2522979762176.000, df[0] = 1812110639104.000, dc_hat[0] = 1289039773696.000 + Gradient do_[0] = 114301738156032.000 +Backward Time Step 1: + Gradient di[0] = 3164177432576.000, df[0] = 2181922422784.000, dc_hat[0] = 1759802556416.000 + Gradient do_[0] = 100725447196672.000 +Backward Time Step 0: + Gradient di[0] = 3830897967104.000, df[0] = 2725661900800.000, dc_hat[0] = 3594674765824.000 + Gradient do_[0] = 59225338281984.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2346969041207296.000, df[0] = -1842584993398784.000, dc_hat[0] = -1060814777745408.000 + Gradient do_[0] = -137202876299608064.000 +Backward Time Step 3: + Gradient di[0] = -3685074692210688.000, df[0] = -2813248676036608.000, dc_hat[0] = -1522646705504256.000 + Gradient do_[0] = -187589517060866048.000 +Backward Time Step 2: + Gradient di[0] = -4827493311709184.000, df[0] = -3637362437390336.000, dc_hat[0] = -2660132286627840.000 + Gradient do_[0] = -214633139417382912.000 +Backward Time Step 1: + Gradient di[0] = -6090789222350848.000, df[0] = -4357747101401088.000, dc_hat[0] = -3686637791870976.000 + Gradient do_[0] = -192584924803366912.000 +Backward Time Step 0: + Gradient di[0] = -7112004133191680.000, df[0] = -5198166169223168.000, dc_hat[0] = -7189999128674304.000 + Gradient do_[0] = -110335244523012096.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1264081436672.000, df[0] = 926831869952.000, dc_hat[0] = 548789485568.000 + Gradient do_[0] = 78376115109888.000 +Backward Time Step 3: + Gradient di[0] = 1983194988544.000, df[0] = 1431163109376.000, dc_hat[0] = 776372158464.000 + Gradient do_[0] = 104908040699904.000 +Backward Time Step 2: + Gradient di[0] = 2523483602944.000, df[0] = 1812472528896.000, dc_hat[0] = 1289296674816.000 + Gradient do_[0] = 114324546781184.000 +Backward Time Step 1: + Gradient di[0] = 3164809723904.000, df[0] = 2182358499328.000, dc_hat[0] = 1760152518656.000 + Gradient do_[0] = 100745571467264.000 +Backward Time Step 0: + Gradient di[0] = 3831655038976.000, df[0] = 2726200344576.000, dc_hat[0] = 3595384651776.000 + Gradient do_[0] = 59237044584448.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2347518528585728.000, df[0] = -1843016234958848.000, dc_hat[0] = -1061063013433344.000 + Gradient do_[0] = -137235011244916736.000 +Backward Time Step 3: + Gradient di[0] = -3685931001315328.000, df[0] = -2813902853242880.000, dc_hat[0] = -1523000637652992.000 + Gradient do_[0] = -187633136748724224.000 +Backward Time Step 2: + Gradient di[0] = -4828610540077056.000, df[0] = -3638204787851264.000, dc_hat[0] = -2660747809128448.000 + Gradient do_[0] = -214682754879586304.000 +Backward Time Step 1: + Gradient di[0] = -6092200119107584.000, df[0] = -4358756955586560.000, dc_hat[0] = -3687489000701952.000 + Gradient do_[0] = -192629506563899392.000 +Backward Time Step 0: + Gradient di[0] = -7113655011246080.000, df[0] = -5199373055033344.000, dc_hat[0] = -7191668260339712.000 + Gradient do_[0] = -110360851118030848.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1264424321024.000, df[0] = 927083397120.000, dc_hat[0] = 548937826304.000 + Gradient do_[0] = 78397380231168.000 +Backward Time Step 3: + Gradient di[0] = 1983734611968.000, df[0] = 1431552393216.000, dc_hat[0] = 776582856704.000 + Gradient do_[0] = 104936578744320.000 +Backward Time Step 2: + Gradient di[0] = 2524164915200.000, df[0] = 1812961427456.000, dc_hat[0] = 1289643491328.000 + Gradient do_[0] = 114355391692800.000 +Backward Time Step 1: + Gradient di[0] = 3165661167616.000, df[0] = 2182945177600.000, dc_hat[0] = 1760624115712.000 + Gradient do_[0] = 100772649893888.000 +Backward Time Step 0: + Gradient di[0] = 3832690245632.000, df[0] = 2726936969216.000, dc_hat[0] = 3596356157440.000 + Gradient do_[0] = 59253045854208.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2348046809563136.000, df[0] = -1843431101956096.000, dc_hat[0] = -1061301384118272.000 + Gradient do_[0] = -137265892059774976.000 +Backward Time Step 3: + Gradient di[0] = -3686765030277120.000, df[0] = -2814539313709056.000, dc_hat[0] = -1523345040343040.000 + Gradient do_[0] = -187675605385347072.000 +Backward Time Step 2: + Gradient di[0] = -4829707904221184.000, df[0] = -3639031300620288.000, dc_hat[0] = -2661349104549888.000 + Gradient do_[0] = -214731494168461312.000 +Backward Time Step 1: + Gradient di[0] = -6093580414222336.000, df[0] = -4359744261193728.000, dc_hat[0] = -3688319808438272.000 + Gradient do_[0] = -192673109071888384.000 +Backward Time Step 0: + Gradient di[0] = -7115261329014784.000, df[0] = -5200546654846976.000, dc_hat[0] = -7193291221106688.000 + Gradient do_[0] = -110385761928347648.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1264741908480.000, df[0] = 927316115456.000, dc_hat[0] = 549075681280.000 + Gradient do_[0] = 78417076682752.000 +Backward Time Step 3: + Gradient di[0] = 1984232685568.000, df[0] = 1431911661568.000, dc_hat[0] = 776777760768.000 + Gradient do_[0] = 104962918973440.000 +Backward Time Step 2: + Gradient di[0] = 2524800876544.000, df[0] = 1813418475520.000, dc_hat[0] = 1289968418816.000 + Gradient do_[0] = 114384206561280.000 +Backward Time Step 1: + Gradient di[0] = 3166463066112.000, df[0] = 2183498039296.000, dc_hat[0] = 1761067008000.000 + Gradient do_[0] = 100798151262208.000 +Backward Time Step 0: + Gradient di[0] = 3833665159168.000, df[0] = 2727630864384.000, dc_hat[0] = 3597271040000.000 + Gradient do_[0] = 59268115988480.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2348598712860672.000, df[0] = -1843864088346624.000, dc_hat[0] = -1061550760656896.000 + Gradient do_[0] = -137298138674233344.000 +Backward Time Step 3: + Gradient di[0] = -3687629392445440.000, df[0] = -2815199128059904.000, dc_hat[0] = -1523701254193152.000 + Gradient do_[0] = -187719551490719744.000 +Backward Time Step 2: + Gradient di[0] = -4830834796265472.000, df[0] = -3639880630403072.000, dc_hat[0] = -2661968653582336.000 + Gradient do_[0] = -214781642206609408.000 +Backward Time Step 1: + Gradient di[0] = -6095009564590080.000, df[0] = -4360766463410176.000, dc_hat[0] = -3689179607203840.000 + Gradient do_[0] = -192718189048627200.000 +Backward Time Step 0: + Gradient di[0] = -7116927239454720.000, df[0] = -5201764814946304.000, dc_hat[0] = -7194976458899456.000 + Gradient do_[0] = -110411600451600384.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1265069850624.000, df[0] = 927556632576.000, dc_hat[0] = 549217959936.000 + Gradient do_[0] = 78437385502720.000 +Backward Time Step 3: + Gradient di[0] = 1984745308160.000, df[0] = 1432281677824.000, dc_hat[0] = 776978038784.000 + Gradient do_[0] = 104990047731712.000 +Backward Time Step 2: + Gradient di[0] = 2525455974400.000, df[0] = 1813888892928.000, dc_hat[0] = 1290302390272.000 + Gradient do_[0] = 114413860290560.000 +Backward Time Step 1: + Gradient di[0] = 3167288295424.000, df[0] = 2184066891776.000, dc_hat[0] = 1761523400704.000 + Gradient do_[0] = 100824407605248.000 +Backward Time Step 0: + Gradient di[0] = 3834658160640.000, df[0] = 2728337342464.000, dc_hat[0] = 3598202699776.000 + Gradient do_[0] = 59283467141120.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2349143099965440.000, df[0] = -1844291571810304.000, dc_hat[0] = -1061796647534592.000 + Gradient do_[0] = -137329930022158336.000 +Backward Time Step 3: + Gradient di[0] = -3688485969985536.000, df[0] = -2815853305266176.000, dc_hat[0] = -1524054246817792.000 + Gradient do_[0] = -187763102459101184.000 +Backward Time Step 2: + Gradient di[0] = -4831962225180672.000, df[0] = -3640729960185856.000, dc_hat[0] = -2662587665743872.000 + Gradient do_[0] = -214831549726588928.000 +Backward Time Step 1: + Gradient di[0] = -6096422608830464.000, df[0] = -4361777122902016.000, dc_hat[0] = -3690029742292992.000 + Gradient do_[0] = -192762856708505600.000 +Backward Time Step 0: + Gradient di[0] = -7118573285670912.000, df[0] = -5202967405789184.000, dc_hat[0] = -7196639684984832.000 + Gradient do_[0] = -110437155507011584.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1265375641600.000, df[0] = 927781093376.000, dc_hat[0] = 549350703104.000 + Gradient do_[0] = 78456318590976.000 +Backward Time Step 3: + Gradient di[0] = 1985223983104.000, df[0] = 1432627052544.000, dc_hat[0] = 777164750848.000 + Gradient do_[0] = 105015372939264.000 +Backward Time Step 2: + Gradient di[0] = 2526065721344.000, df[0] = 1814326804480.000, dc_hat[0] = 1290612113408.000 + Gradient do_[0] = 114441408479232.000 +Backward Time Step 1: + Gradient di[0] = 3168048775168.000, df[0] = 2184591048704.000, dc_hat[0] = 1761944403968.000 + Gradient do_[0] = 100848541630464.000 +Backward Time Step 0: + Gradient di[0] = 3835574616064.000, df[0] = 2728989294592.000, dc_hat[0] = 3599062794240.000 + Gradient do_[0] = 59297635500032.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2349681313054720.000, df[0] = -1844713955000320.000, dc_hat[0] = -1062039111860224.000 + Gradient do_[0] = -137361274693484544.000 +Backward Time Step 3: + Gradient di[0] = -3689328052011008.000, df[0] = -2816495939747840.000, dc_hat[0] = -1524402139168768.000 + Gradient do_[0] = -187805914693107712.000 +Backward Time Step 2: + Gradient di[0] = -4833064421163008.000, df[0] = -3641560499486720.000, dc_hat[0] = -2663194598309888.000 + Gradient do_[0] = -214880546713501696.000 +Backward Time Step 1: + Gradient di[0] = -6097822768168960.000, df[0] = -4362778655588352.000, dc_hat[0] = -3690873703366656.000 + Gradient do_[0] = -192807008972308480.000 +Backward Time Step 0: + Gradient di[0] = -7120201078276096.000, df[0] = -5204157648601088.000, dc_hat[0] = -7198285731201024.000 + Gradient do_[0] = -110462392734842880.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1265747755008.000, df[0] = 928053919744.000, dc_hat[0] = 549512249344.000 + Gradient do_[0] = 78479395651584.000 +Backward Time Step 3: + Gradient di[0] = 1985808826368.000, df[0] = 1433048973312.000, dc_hat[0] = 777393209344.000 + Gradient do_[0] = 105046284959744.000 +Backward Time Step 2: + Gradient di[0] = 2526809686016.000, df[0] = 1814861185024.000, dc_hat[0] = 1290992222208.000 + Gradient do_[0] = 114475097128960.000 +Backward Time Step 1: + Gradient di[0] = 3168987512832.000, df[0] = 2185238282240.000, dc_hat[0] = 1762464497664.000 + Gradient do_[0] = 100878405074944.000 +Backward Time Step 0: + Gradient di[0] = 3836712321024.000, df[0] = 2729798533120.000, dc_hat[0] = 3600130244608.000 + Gradient do_[0] = 59315222216704.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2350227310772224.000, df[0] = -1845142914859008.000, dc_hat[0] = -1062286005370880.000 + Gradient do_[0] = -137393229250166784.000 +Backward Time Step 3: + Gradient di[0] = -3690184092680192.000, df[0] = -2817149580083200.000, dc_hat[0] = -1524754997575680.000 + Gradient do_[0] = -187849448481619968.000 +Backward Time Step 2: + Gradient di[0] = -4834185407627264.000, df[0] = -3642404192124928.000, dc_hat[0] = -2663809583939584.000 + Gradient do_[0] = -214930385514004480.000 +Backward Time Step 1: + Gradient di[0] = -6099235812409344.000, df[0] = -4363789315080192.000, dc_hat[0] = -3691725449068544.000 + Gradient do_[0] = -192851693812056064.000 +Backward Time Step 0: + Gradient di[0] = -7121860009394176.000, df[0] = -5205370439991296.000, dc_hat[0] = -7199962915930112.000 + Gradient do_[0] = -110488119588945920.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1266070454272.000, df[0] = 928290439168.000, dc_hat[0] = 549652267008.000 + Gradient do_[0] = 78499377315840.000 +Backward Time Step 3: + Gradient di[0] = 1986315419648.000, df[0] = 1433414533120.000, dc_hat[0] = 777591390208.000 + Gradient do_[0] = 105073069785088.000 +Backward Time Step 2: + Gradient di[0] = 2527455346688.000, df[0] = 1815324786688.000, dc_hat[0] = 1291321212928.000 + Gradient do_[0] = 114504297873408.000 +Backward Time Step 1: + Gradient di[0] = 3169793605632.000, df[0] = 2185794158592.000, dc_hat[0] = 1762910797824.000 + Gradient do_[0] = 100903998717952.000 +Backward Time Step 0: + Gradient di[0] = 3837681991680.000, df[0] = 2730488758272.000, dc_hat[0] = 3601040146432.000 + Gradient do_[0] = 59330216853504.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2350781629988864.000, df[0] = -1845578182950912.000, dc_hat[0] = -1062536522760192.000 + Gradient do_[0] = -137425613303578624.000 +Backward Time Step 3: + Gradient di[0] = -3691054360428544.000, df[0] = -2817814494707712.000, dc_hat[0] = -1525114566868992.000 + Gradient do_[0] = -187893789723983872.000 +Backward Time Step 2: + Gradient di[0] = -4835323573960704.000, df[0] = -3643261843406848.000, dc_hat[0] = -2664434501681152.000 + Gradient do_[0] = -214980928689143808.000 +Backward Time Step 1: + Gradient di[0] = -6100681068904448.000, df[0] = -4364823596892160.000, dc_hat[0] = -3692595985252352.000 + Gradient do_[0] = -192897289184870400.000 +Backward Time Step 0: + Gradient di[0] = -7123543636574208.000, df[0] = -5206600948121600.000, dc_hat[0] = -7201664796721152.000 + Gradient do_[0] = -110514258759909376.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1266371788800.000, df[0] = 928511229952.000, dc_hat[0] = 549782945792.000 + Gradient do_[0] = 78518109077504.000 +Backward Time Step 3: + Gradient di[0] = 1986789507072.000, df[0] = 1433756762112.000, dc_hat[0] = 777776988160.000 + Gradient do_[0] = 105098109779968.000 +Backward Time Step 2: + Gradient di[0] = 2528060637184.000, df[0] = 1815759421440.000, dc_hat[0] = 1291629887488.000 + Gradient do_[0] = 114531745398784.000 +Backward Time Step 1: + Gradient di[0] = 3170551988224.000, df[0] = 2186317135872.000, dc_hat[0] = 1763329966080.000 + Gradient do_[0] = 100928124354560.000 +Backward Time Step 0: + Gradient di[0] = 3838595039232.000, df[0] = 2731138351104.000, dc_hat[0] = 3601896833024.000 + Gradient do_[0] = 59344334880768.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2351324138045440.000, df[0] = -1846003921584128.000, dc_hat[0] = -1062781201678336.000 + Gradient do_[0] = -137457318752157696.000 +Backward Time Step 3: + Gradient di[0] = -3691906374565888.000, df[0] = -2818464645382144.000, dc_hat[0] = -1525466083098624.000 + Gradient do_[0] = -187937134533935104.000 +Backward Time Step 2: + Gradient di[0] = -4836443486683136.000, df[0] = -3644105536045056.000, dc_hat[0] = -2665050024181760.000 + Gradient do_[0] = -215030698770169856.000 +Backward Time Step 1: + Gradient di[0] = -6102083912597504.000, df[0] = -4365826740191232.000, dc_hat[0] = -3693439677890560.000 + Gradient do_[0] = -192941561707757568.000 +Backward Time Step 0: + Gradient di[0] = -7125167671083008.000, df[0] = -5207787432837120.000, dc_hat[0] = -7203307084840960.000 + Gradient do_[0] = -110539444448133120.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1266900271104.000, df[0] = 928898613248.000, dc_hat[0] = 550012387328.000 + Gradient do_[0] = 78550841425920.000 +Backward Time Step 3: + Gradient di[0] = 1987616309248.000, df[0] = 1434353270784.000, dc_hat[0] = 778100211712.000 + Gradient do_[0] = 105141831204864.000 +Backward Time Step 2: + Gradient di[0] = 2529110786048.000, df[0] = 1816513740800.000, dc_hat[0] = 1292165709824.000 + Gradient do_[0] = 114579233308672.000 +Backward Time Step 1: + Gradient di[0] = 3171871621120.000, df[0] = 2187226513408.000, dc_hat[0] = 1764062003200.000 + Gradient do_[0] = 100970084171776.000 +Backward Time Step 0: + Gradient di[0] = 3840196214784.000, df[0] = 2732277366784.000, dc_hat[0] = 3603399180288.000 + Gradient do_[0] = 59369081274368.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2351869598892032.000, df[0] = -1846432344571904.000, dc_hat[0] = -1063027826753536.000 + Gradient do_[0] = -137489221769232384.000 +Backward Time Step 3: + Gradient di[0] = -3692764294283264.000, df[0] = -2819119359459328.000, dc_hat[0] = -1525819612594176.000 + Gradient do_[0] = -187980771401662464.000 +Backward Time Step 2: + Gradient di[0] = -4837562862534656.000, df[0] = -3644949228683264.000, dc_hat[0] = -2665666351988736.000 + Gradient do_[0] = -215080451671326720.000 +Backward Time Step 1: + Gradient di[0] = -6103497493708800.000, df[0] = -4366837668118528.000, dc_hat[0] = -3694290349850624.000 + Gradient do_[0] = -192986229367635968.000 +Backward Time Step 0: + Gradient di[0] = -7126817475395584.000, df[0] = -5208993781776384.000, dc_hat[0] = -7204975142764544.000 + Gradient do_[0] = -110565051043151872.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1267055853568.000, df[0] = 929012711424.000, dc_hat[0] = 550079627264.000 + Gradient do_[0] = 78560513490944.000 +Backward Time Step 3: + Gradient di[0] = 1987861544960.000, df[0] = 1434530349056.000, dc_hat[0] = 778195763200.000 + Gradient do_[0] = 105154816770048.000 +Backward Time Step 2: + Gradient di[0] = 2529424048128.000, df[0] = 1816738791424.000, dc_hat[0] = 1292324962304.000 + Gradient do_[0] = 114593410056192.000 +Backward Time Step 1: + Gradient di[0] = 3172262739968.000, df[0] = 2187496521728.000, dc_hat[0] = 1764277354496.000 + Gradient do_[0] = 100982516088832.000 +Backward Time Step 0: + Gradient di[0] = 3840664141824.000, df[0] = 2732610289664.000, dc_hat[0] = 3603838533632.000 + Gradient do_[0] = 59376320643072.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2352411033206784.000, df[0] = -1846857009463296.000, dc_hat[0] = -1063272237236224.000 + Gradient do_[0] = -137520824138596352.000 +Backward Time Step 3: + Gradient di[0] = -3693615771549696.000, df[0] = -2819769241698304.000, dc_hat[0] = -1526170323517440.000 + Gradient do_[0] = -188024099031744512.000 +Backward Time Step 2: + Gradient di[0] = -4838684922740736.000, df[0] = -3645794531934208.000, dc_hat[0] = -2666281874489344.000 + Gradient do_[0] = -215130221752352768.000 +Backward Time Step 1: + Gradient di[0] = -6104915369787392.000, df[0] = -4367852354142208.000, dc_hat[0] = -3695145048342528.000 + Gradient do_[0] = -193031017286598656.000 +Backward Time Step 0: + Gradient di[0] = -7128474259030016.000, df[0] = -5210204425682944.000, dc_hat[0] = -7206649106268160.000 + Gradient do_[0] = -110590734947581952.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1267581321216.000, df[0] = 929397997568.000, dc_hat[0] = 550307823616.000 + Gradient do_[0] = 78593086455808.000 +Backward Time Step 3: + Gradient di[0] = 1988685594624.000, df[0] = 1435125022720.000, dc_hat[0] = 778518331392.000 + Gradient do_[0] = 105198362034176.000 +Backward Time Step 2: + Gradient di[0] = 2530470264832.000, df[0] = 1817490096128.000, dc_hat[0] = 1292858425344.000 + Gradient do_[0] = 114640805691392.000 +Backward Time Step 1: + Gradient di[0] = 3173576343552.000, df[0] = 2188401967104.000, dc_hat[0] = 1765005459456.000 + Gradient do_[0] = 101024282968064.000 +Backward Time Step 0: + Gradient di[0] = 3842255093760.000, df[0] = 2733742227456.000, dc_hat[0] = 3605331181568.000 + Gradient do_[0] = 59400920236032.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2352940119490560.000, df[0] = -1847272547549184.000, dc_hat[0] = -1063511413227520.000 + Gradient do_[0] = -137551756493062144.000 +Backward Time Step 3: + Gradient di[0] = -3694446847721472.000, df[0] = -2820403823116288.000, dc_hat[0] = -1526513786683392.000 + Gradient do_[0] = -188066258430722048.000 +Backward Time Step 2: + Gradient di[0] = -4839769938853888.000, df[0] = -3646611381026816.000, dc_hat[0] = -2666878606508032.000 + Gradient do_[0] = -215178325386067968.000 +Backward Time Step 1: + Gradient di[0] = -6106274190065664.000, df[0] = -4368824090492928.000, dc_hat[0] = -3695963239612416.000 + Gradient do_[0] = -193073881060212736.000 +Backward Time Step 0: + Gradient di[0] = -7130059638833152.000, df[0] = -5211362993111040.000, dc_hat[0] = -7208252739682304.000 + Gradient do_[0] = -110615345110188032.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1267724189696.000, df[0] = 929502789632.000, dc_hat[0] = 550369820672.000 + Gradient do_[0] = 78601894494208.000 +Backward Time Step 3: + Gradient di[0] = 1988910383104.000, df[0] = 1435287158784.000, dc_hat[0] = 778606018560.000 + Gradient do_[0] = 105210257080320.000 +Backward Time Step 2: + Gradient di[0] = 2530757312512.000, df[0] = 1817696403456.000, dc_hat[0] = 1293004570624.000 + Gradient do_[0] = 114653808033792.000 +Backward Time Step 1: + Gradient di[0] = 3173937577984.000, df[0] = 2188651266048.000, dc_hat[0] = 1765204426752.000 + Gradient do_[0] = 101035758583808.000 +Backward Time Step 0: + Gradient di[0] = 3842692874240.000, df[0] = 2734053654528.000, dc_hat[0] = 3605742223360.000 + Gradient do_[0] = 59407681454080.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2353478064144384.000, df[0] = -1847694930739200.000, dc_hat[0] = -1063754011770880.000 + Gradient do_[0] = -137583161293930496.000 +Backward Time Step 3: + Gradient di[0] = -3695289198182400.000, df[0] = -2821046726033408.000, dc_hat[0] = -1526860471074816.000 + Gradient do_[0] = -188109139384205312.000 +Backward Time Step 2: + Gradient di[0] = -4840876966674432.000, df[0] = -3647445946859520.000, dc_hat[0] = -2667487149686784.000 + Gradient do_[0] = -215227562891149312.000 +Backward Time Step 1: + Gradient di[0] = -6107680254984192.000, df[0] = -4369829381275648.000, dc_hat[0] = -3696808811298816.000 + Gradient do_[0] = -193118291022053376.000 +Backward Time Step 0: + Gradient di[0] = -7131701926952960.000, df[0] = -5212563973341184.000, dc_hat[0] = -7209912744542208.000 + Gradient do_[0] = -110640814266253312.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1268075331584.000, df[0] = 929760083968.000, dc_hat[0] = 550521929728.000 + Gradient do_[0] = 78623662931968.000 +Backward Time Step 3: + Gradient di[0] = 1989460099072.000, df[0] = 1435684044800.000, dc_hat[0] = 778821173248.000 + Gradient do_[0] = 105239315218432.000 +Backward Time Step 2: + Gradient di[0] = 2531455401984.000, df[0] = 1818197622784.000, dc_hat[0] = 1293360824320.000 + Gradient do_[0] = 114685374365696.000 +Backward Time Step 1: + Gradient di[0] = 3174813925376.000, df[0] = 2189255245824.000, dc_hat[0] = 1765689262080.000 + Gradient do_[0] = 101063575207936.000 +Backward Time Step 0: + Gradient di[0] = 3843754557440.000, df[0] = 2734809153536.000, dc_hat[0] = 3606738370560.000 + Gradient do_[0] = 59424097959936.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2354013324443648.000, df[0] = -1848114763792384.000, dc_hat[0] = -1063995872116736.000 + Gradient do_[0] = -137614497375322112.000 +Backward Time Step 3: + Gradient di[0] = -3696128595853312.000, df[0] = -2821687481466880.000, dc_hat[0] = -1527206484377600.000 + Gradient do_[0] = -188151917258473472.000 +Backward Time Step 2: + Gradient di[0] = -4841978625785856.000, df[0] = -3648276486160384.000, dc_hat[0] = -2668093008510976.000 + Gradient do_[0] = -215276491158585344.000 +Backward Time Step 1: + Gradient di[0] = -6109072898129920.000, df[0] = -4370825545252864.000, dc_hat[0] = -3697647940534272.000 + Gradient do_[0] = -193162254307295232.000 +Backward Time Step 0: + Gradient di[0] = -7133310929076224.000, df[0] = -5213739183767552.000, dc_hat[0] = -7211538926534656.000 + Gradient do_[0] = -110665776616177664.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1268458979328.000, df[0] = 930041495552.000, dc_hat[0] = 550688456704.000 + Gradient do_[0] = 78647436247040.000 +Backward Time Step 3: + Gradient di[0] = 1990063685632.000, df[0] = 1436119465984.000, dc_hat[0] = 779057496064.000 + Gradient do_[0] = 105271242260480.000 +Backward Time Step 2: + Gradient di[0] = 2532227153920.000, df[0] = 1818751795200.000, dc_hat[0] = 1293754433536.000 + Gradient do_[0] = 114720304529408.000 +Backward Time Step 1: + Gradient di[0] = 3175777566720.000, df[0] = 2189919649792.000, dc_hat[0] = 1766223642624.000 + Gradient do_[0] = 101094235570176.000 +Backward Time Step 0: + Gradient di[0] = 3844914544640.000, df[0] = 2735634644992.000, dc_hat[0] = 3607826792448.000 + Gradient do_[0] = 59442032803840.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2354551805968384.000, df[0] = -1848537415417856.000, dc_hat[0] = -1064238940422144.000 + Gradient do_[0] = -137645936535928832.000 +Backward Time Step 3: + Gradient di[0] = -3696975241281536.000, df[0] = -2822333874044928.000, dc_hat[0] = -1527556121559040.000 + Gradient do_[0] = -188194970010648576.000 +Backward Time Step 2: + Gradient di[0] = -4843089411702784.000, df[0] = -3649112931041280.000, dc_hat[0] = -2668702893867008.000 + Gradient do_[0] = -215325848922750976.000 +Backward Time Step 1: + Gradient di[0] = -6110471446855680.000, df[0] = -4371826272632832.000, dc_hat[0] = -3698491096301568.000 + Gradient do_[0] = -193206458110705664.000 +Backward Time Step 0: + Gradient di[0] = -7134946774745088.000, df[0] = -5214934795288576.000, dc_hat[0] = -7213192488943616.000 + Gradient do_[0] = -110691151282962432.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1268836073472.000, df[0] = 930317926400.000, dc_hat[0] = 550852165632.000 + Gradient do_[0] = 78670806908928.000 +Backward Time Step 3: + Gradient di[0] = 1990653509632.000, df[0] = 1436545056768.000, dc_hat[0] = 779287789568.000 + Gradient do_[0] = 105302414327808.000 +Backward Time Step 2: + Gradient di[0] = 2532975312896.000, df[0] = 1819289059328.000, dc_hat[0] = 1294135066624.000 + Gradient do_[0] = 114754152562688.000 +Backward Time Step 1: + Gradient di[0] = 3176722071552.000, df[0] = 2190570946560.000, dc_hat[0] = 1766746226688.000 + Gradient do_[0] = 101124266786816.000 +Backward Time Step 0: + Gradient di[0] = 3846055919616.000, df[0] = 2736446767104.000, dc_hat[0] = 3608897650688.000 + Gradient do_[0] = 59459674046464.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2355116057296896.000, df[0] = -1848980602355712.000, dc_hat[0] = -1064493954105344.000 + Gradient do_[0] = -137678921884762112.000 +Backward Time Step 3: + Gradient di[0] = -3697865641689088.000, df[0] = -2823014357925888.000, dc_hat[0] = -1527924146569216.000 + Gradient do_[0] = -188240135886733312.000 +Backward Time Step 2: + Gradient di[0] = -4844245294776320.000, df[0] = -3649983735660544.000, dc_hat[0] = -2669338280591360.000 + Gradient do_[0] = -215377182371872768.000 +Backward Time Step 1: + Gradient di[0] = -6111931735736320.000, df[0] = -4372869949685760.000, dc_hat[0] = -3699369417113600.000 + Gradient do_[0] = -193252465800380416.000 +Backward Time Step 0: + Gradient di[0] = -7136644897439744.000, df[0] = -5216176577708032.000, dc_hat[0] = -7214909402120192.000 + Gradient do_[0] = -110717505202290688.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1269155102720.000, df[0] = 930551889920.000, dc_hat[0] = 550990708736.000 + Gradient do_[0] = 78690604023808.000 +Backward Time Step 3: + Gradient di[0] = 1991152893952.000, df[0] = 1436905373696.000, dc_hat[0] = 779483283456.000 + Gradient do_[0] = 105328830054400.000 +Backward Time Step 2: + Gradient di[0] = 2533612060672.000, df[0] = 1819746369536.000, dc_hat[0] = 1294459863040.000 + Gradient do_[0] = 114782992596992.000 +Backward Time Step 1: + Gradient di[0] = 3177517416448.000, df[0] = 2191119089664.000, dc_hat[0] = 1767186235392.000 + Gradient do_[0] = 101149558439936.000 +Backward Time Step 0: + Gradient di[0] = 3847016153088.000, df[0] = 2737129914368.000, dc_hat[0] = 3609798639616.000 + Gradient do_[0] = 59474517688320.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2355649975418880.000, df[0] = -1849399495884800.000, dc_hat[0] = -1064734740709376.000 + Gradient do_[0] = -137710120527200256.000 +Backward Time Step 3: + Gradient di[0] = -3698703160311808.000, df[0] = -2823653234311168.000, dc_hat[0] = -1528269220347904.000 + Gradient do_[0] = -188282759142178816.000 +Backward Time Step 2: + Gradient di[0] = -4845344806404096.000, df[0] = -3650811590606848.000, dc_hat[0] = -2669940381319168.000 + Gradient do_[0] = -215426076279570432.000 +Backward Time Step 1: + Gradient di[0] = -6113322768269312.000, df[0] = -4373866113662976.000, dc_hat[0] = -3700207472607232.000 + Gradient do_[0] = -193296446265491456.000 +Backward Time Step 0: + Gradient di[0] = -7138271616303104.000, df[0] = -5217365209907200.000, dc_hat[0] = -7216554374594560.000 + Gradient do_[0] = -110742733840187392.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1269461024768.000, df[0] = 930776154112.000, dc_hat[0] = 551123156992.000 + Gradient do_[0] = 78709511946240.000 +Backward Time Step 3: + Gradient di[0] = 1991631831040.000, df[0] = 1437251403776.000, dc_hat[0] = 779670323200.000 + Gradient do_[0] = 105354096541696.000 +Backward Time Step 2: + Gradient di[0] = 2534219186176.000, df[0] = 1820182446080.000, dc_hat[0] = 1294768668672.000 + Gradient do_[0] = 114810473676800.000 +Backward Time Step 1: + Gradient di[0] = 3178280255488.000, df[0] = 2191645081600.000, dc_hat[0] = 1767608025088.000 + Gradient do_[0] = 101173835071488.000 +Backward Time Step 0: + Gradient di[0] = 3847943094272.000, df[0] = 2737789206528.000, dc_hat[0] = 3610668695552.000 + Gradient do_[0] = 59488849625088.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2356188188508160.000, df[0] = -1849822147510272.000, dc_hat[0] = -1064978010341376.000 + Gradient do_[0] = -137741568277741568.000 +Backward Time Step 3: + Gradient di[0] = -3699546047643648.000, df[0] = -2824296405663744.000, dc_hat[0] = -1528616441610240.000 + Gradient do_[0] = -188325622915792896.000 +Backward Time Step 2: + Gradient di[0] = -4846445391773696.000, df[0] = -3651641056165888.000, dc_hat[0] = -2670547045449728.000 + Gradient do_[0] = -215474918647660544.000 +Backward Time Step 1: + Gradient di[0] = -6114708432093184.000, df[0] = -4374856908931072.000, dc_hat[0] = -3701042575310848.000 + Gradient do_[0] = -193340220572172288.000 +Backward Time Step 0: + Gradient di[0] = -7139892966457344.000, df[0] = -5218550084009984.000, dc_hat[0] = -7218192904617984.000 + Gradient do_[0] = -110767893758607360.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1269813215232.000, df[0] = 931034300416.000, dc_hat[0] = 551276052480.000 + Gradient do_[0] = 78731355881472.000 +Backward Time Step 3: + Gradient di[0] = 1992187183104.000, df[0] = 1437651959808.000, dc_hat[0] = 779887247360.000 + Gradient do_[0] = 105383465058304.000 +Backward Time Step 2: + Gradient di[0] = 2534931431424.000, df[0] = 1820694020096.000, dc_hat[0] = 1295132524544.000 + Gradient do_[0] = 114842677542912.000 +Backward Time Step 1: + Gradient di[0] = 3179173380096.000, df[0] = 2192260988928.000, dc_hat[0] = 1768103084032.000 + Gradient do_[0] = 101202222120960.000 +Backward Time Step 0: + Gradient di[0] = 3849015787520.000, df[0] = 2738552569856.000, dc_hat[0] = 3611675066368.000 + Gradient do_[0] = 59505438097408.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2356732038742016.000, df[0] = -1850249228320768.000, dc_hat[0] = -1065223695892480.000 + Gradient do_[0] = -137773273726320640.000 +Backward Time Step 3: + Gradient di[0] = -3700399403958272.000, df[0] = -2824948166950912.000, dc_hat[0] = -1528968897363968.000 + Gradient do_[0] = -188369087984828416.000 +Backward Time Step 2: + Gradient di[0] = -4847570673205248.000, df[0] = -3652488506900480.000, dc_hat[0] = -2671165520740352.000 + Gradient do_[0] = -215524929246855168.000 +Backward Time Step 1: + Gradient di[0] = -6116131140009984.000, df[0] = -4375874547744768.000, dc_hat[0] = -3701898347544576.000 + Gradient do_[0] = -193385180289826816.000 +Backward Time Step 0: + Gradient di[0] = -7141545455124480.000, df[0] = -5219758043561984.000, dc_hat[0] = -7219863646896128.000 + Gradient do_[0] = -110793517533495296.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1270181265408.000, df[0] = 931304177664.000, dc_hat[0] = 551435829248.000 + Gradient do_[0] = 78754181283840.000 +Backward Time Step 3: + Gradient di[0] = 1992765210624.000, df[0] = 1438069030912.000, dc_hat[0] = 780113412096.000 + Gradient do_[0] = 105414049923072.000 +Backward Time Step 2: + Gradient di[0] = 2535665434624.000, df[0] = 1821220929536.000, dc_hat[0] = 1295506604032.000 + Gradient do_[0] = 114875896430592.000 +Backward Time Step 1: + Gradient di[0] = 3180090359808.000, df[0] = 2192893018112.000, dc_hat[0] = 1768610594816.000 + Gradient do_[0] = 101231372533760.000 +Backward Time Step 0: + Gradient di[0] = 3850124132352.000, df[0] = 2739341099008.000, dc_hat[0] = 3612715253760.000 + Gradient do_[0] = 59522576023552.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2357267835912192.000, df[0] = -1850669732462592.000, dc_hat[0] = -1065465287802880.000 + Gradient do_[0] = -137804626987581440.000 +Backward Time Step 3: + Gradient di[0] = -3701235848839168.000, df[0] = -2825587311771648.000, dc_hat[0] = -1529315447537664.000 + Gradient do_[0] = -188411694060404736.000 +Backward Time Step 2: + Gradient di[0] = -4848668574220288.000, df[0] = -3653316093411328.000, dc_hat[0] = -2671768695209984.000 + Gradient do_[0] = -215573668535730176.000 +Backward Time Step 1: + Gradient di[0] = -6117517340704768.000, df[0] = -4376865879883776.000, dc_hat[0] = -3702733987119104.000 + Gradient do_[0] = -193428868697161728.000 +Backward Time Step 0: + Gradient di[0] = -7143160899698688.000, df[0] = -5220938622697472.000, dc_hat[0] = -7221497345081344.000 + Gradient do_[0] = -110818600142503936.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1270388752384.000, df[0] = 931456221184.000, dc_hat[0] = 551525744640.000 + Gradient do_[0] = 78767024242688.000 +Backward Time Step 3: + Gradient di[0] = 1993092104192.000, df[0] = 1438305222656.000, dc_hat[0] = 780241272832.000 + Gradient do_[0] = 105431246569472.000 +Backward Time Step 2: + Gradient di[0] = 2536078835712.000, df[0] = 1821517938688.000, dc_hat[0] = 1295717236736.000 + Gradient do_[0] = 114894619803648.000 +Backward Time Step 1: + Gradient di[0] = 3180609929216.000, df[0] = 2193251368960.000, dc_hat[0] = 1768897511424.000 + Gradient do_[0] = 101247822594048.000 +Backward Time Step 0: + Gradient di[0] = 3850754588672.000, df[0] = 2739789627392.000, dc_hat[0] = 3613306388480.000 + Gradient do_[0] = 59532319391744.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2357781352939520.000, df[0] = -1851073325170688.000, dc_hat[0] = -1065697484472320.000 + Gradient do_[0] = -137834665988849664.000 +Backward Time Step 3: + Gradient di[0] = -3702043302690816.000, df[0] = -2826203371143168.000, dc_hat[0] = -1529647770632192.000 + Gradient do_[0] = -188452736767885312.000 +Backward Time Step 2: + Gradient di[0] = -4849724062433280.000, df[0] = -3654111199232000.000, dc_hat[0] = -2672348247359488.000 + Gradient do_[0] = -215620672657817600.000 +Backward Time Step 1: + Gradient di[0] = -6118854149275648.000, df[0] = -4377822046978048.000, dc_hat[0] = -3703537682874368.000 + Gradient do_[0] = -193471096815616000.000 +Backward Time Step 0: + Gradient di[0] = -7144715677859840.000, df[0] = -5222075178418176.000, dc_hat[0] = -7223068766240768.000 + Gradient do_[0] = -110842729268772864.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1270813294592.000, df[0] = 931767582720.000, dc_hat[0] = 551710031872.000 + Gradient do_[0] = 78793339305984.000 +Backward Time Step 3: + Gradient di[0] = 1993756770304.000, df[0] = 1438784552960.000, dc_hat[0] = 780500860928.000 + Gradient do_[0] = 105466420002816.000 +Backward Time Step 2: + Gradient di[0] = 2536922939392.000, df[0] = 1822124277760.000, dc_hat[0] = 1296147939328.000 + Gradient do_[0] = 114932855078912.000 +Backward Time Step 1: + Gradient di[0] = 3181671088128.000, df[0] = 2193983012864.000, dc_hat[0] = 1769486417920.000 + Gradient do_[0] = 101281611907072.000 +Backward Time Step 0: + Gradient di[0] = 3852040667136.000, df[0] = 2740704772096.000, dc_hat[0] = 3614513561600.000 + Gradient do_[0] = 59552204587008.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2358321445076992.000, df[0] = -1851497184755712.000, dc_hat[0] = -1065941559410688.000 + Gradient do_[0] = -137866251178344448.000 +Backward Time Step 3: + Gradient di[0] = -3702898001182720.000, df[0] = -2826855669301248.000, dc_hat[0] = -1529999823732736.000 + Gradient do_[0] = -188496201836920832.000 +Backward Time Step 2: + Gradient di[0] = -4850844512026624.000, df[0] = -3654955697176576.000, dc_hat[0] = -2672965917343744.000 + Gradient do_[0] = -215670339659628544.000 +Backward Time Step 1: + Gradient di[0] = -6120261824806912.000, df[0] = -4378829485244416.000, dc_hat[0] = -3704385402044416.000 + Gradient do_[0] = -193515506777456640.000 +Backward Time Step 0: + Gradient di[0] = -7146353134141440.000, df[0] = -5223271863681024.000, dc_hat[0] = -7224724476133376.000 + Gradient do_[0] = -110868112525492224.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1271032578048.000, df[0] = 931928408064.000, dc_hat[0] = 551804993536.000 + Gradient do_[0] = 78806937239552.000 +Backward Time Step 3: + Gradient di[0] = 1994100965376.000, df[0] = 1439033065472.000, dc_hat[0] = 780635602944.000 + Gradient do_[0] = 105484606504960.000 +Backward Time Step 2: + Gradient di[0] = 2537364652032.000, df[0] = 1822441209856.000, dc_hat[0] = 1296371810304.000 + Gradient do_[0] = 114952836743168.000 +Backward Time Step 1: + Gradient di[0] = 3182222376960.000, df[0] = 2194362728448.000, dc_hat[0] = 1769789587456.000 + Gradient do_[0] = 101299110543360.000 +Backward Time Step 0: + Gradient di[0] = 3852699435008.000, df[0] = 2741173223424.000, dc_hat[0] = 3615131435008.000 + Gradient do_[0] = 59562379968512.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2358875764293632.000, df[0] = -1851932318629888.000, dc_hat[0] = -1066191875473408.000 + Gradient do_[0] = -137898566512279552.000 +Backward Time Step 3: + Gradient di[0] = -3703768537366528.000, df[0] = -2827520315490304.000, dc_hat[0] = -1530358721937408.000 + Gradient do_[0] = -188540491539677184.000 +Backward Time Step 2: + Gradient di[0] = -4851983215230976.000, df[0] = -3655813348458496.000, dc_hat[0] = -2673590835085312.000 + Gradient do_[0] = -215720900014637056.000 +Backward Time Step 1: + Gradient di[0] = -6121701712592896.000, df[0] = -4379858935218176.000, dc_hat[0] = -3705253253873664.000 + Gradient do_[0] = -193560930351579136.000 +Backward Time Step 0: + Gradient di[0] = -7148036224450560.000, df[0] = -5224502371811328.000, dc_hat[0] = -7226425820053504.000 + Gradient do_[0] = -110894225926651904.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1271612309504.000, df[0] = 932353409024.000, dc_hat[0] = 552056651776.000 + Gradient do_[0] = 78842857259008.000 +Backward Time Step 3: + Gradient di[0] = 1995011522560.000, df[0] = 1439690260480.000, dc_hat[0] = 780992053248.000 + Gradient do_[0] = 105532773892096.000 +Backward Time Step 2: + Gradient di[0] = 2538519658496.000, df[0] = 1823270764544.000, dc_hat[0] = 1296961896448.000 + Gradient do_[0] = 115005173268480.000 +Backward Time Step 1: + Gradient di[0] = 3183674654720.000, df[0] = 2195364642816.000, dc_hat[0] = 1770595549184.000 + Gradient do_[0] = 101345331773440.000 +Backward Time Step 0: + Gradient di[0] = 3854456848384.000, df[0] = 2742423912448.000, dc_hat[0] = 3616780582912.000 + Gradient do_[0] = 59589550669824.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2359413172076544.000, df[0] = -1852354030731264.000, dc_hat[0] = -1066434943778816.000 + Gradient do_[0] = -137929954133278720.000 +Backward Time Step 3: + Gradient di[0] = -3704607129731072.000, df[0] = -2828161070923776.000, dc_hat[0] = -1530705272111104.000 + Gradient do_[0] = -188583114795122688.000 +Backward Time Step 2: + Gradient di[0] = -4853082189987840.000, df[0] = -3656640934969344.000, dc_hat[0] = -2674194814861312.000 + Gradient do_[0] = -215769725202857984.000 +Backward Time Step 1: + Gradient di[0] = -6123085765804032.000, df[0] = -4380848925179904.000, dc_hat[0] = -3706085940658176.000 + Gradient do_[0] = -193604670298521600.000 +Backward Time Step 0: + Gradient di[0] = -7149643615961088.000, df[0] = -5225677582237696.000, dc_hat[0] = -7228051465175040.000 + Gradient do_[0] = -110919179686641664.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1271901978624.000, df[0] = 932565745664.000, dc_hat[0] = 552182349824.000 + Gradient do_[0] = 78860808880128.000 +Backward Time Step 3: + Gradient di[0] = 1995466866688.000, df[0] = 1440018726912.000, dc_hat[0] = 781170245632.000 + Gradient do_[0] = 105556849197056.000 +Backward Time Step 2: + Gradient di[0] = 2539099783168.000, df[0] = 1823687704576.000, dc_hat[0] = 1297257463808.000 + Gradient do_[0] = 115031396057088.000 +Backward Time Step 1: + Gradient di[0] = 3184401580032.000, df[0] = 2195865468928.000, dc_hat[0] = 1770997153792.000 + Gradient do_[0] = 101368433999872.000 +Backward Time Step 0: + Gradient di[0] = 3855337390080.000, df[0] = 2743050436608.000, dc_hat[0] = 3617607122944.000 + Gradient do_[0] = 59603169574912.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2359953264214016.000, df[0] = -1852777756098560.000, dc_hat[0] = -1066678146301952.000 + Gradient do_[0] = -137961599452315648.000 +Backward Time Step 3: + Gradient di[0] = -3705459949174784.000, df[0] = -2828811490033664.000, dc_hat[0] = -1531056922558464.000 + Gradient do_[0] = -188626528324550656.000 +Backward Time Step 2: + Gradient di[0] = -4854203713323008.000, df[0] = -3657485969784832.000, dc_hat[0] = -2674810337361920.000 + Gradient do_[0] = -215819426564407296.000 +Backward Time Step 1: + Gradient di[0] = -6124502031269888.000, df[0] = -4381862805897216.000, dc_hat[0] = -3706939565408256.000 + Gradient do_[0] = -193649406677876736.000 +Backward Time Step 0: + Gradient di[0] = -7151295030886400.000, df[0] = -5226884468047872.000, dc_hat[0] = -7229720596840448.000 + Gradient do_[0] = -110944786281660416.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1272093605888.000, df[0] = 932706189312.000, dc_hat[0] = 552265056256.000 + Gradient do_[0] = 78872687149056.000 +Backward Time Step 3: + Gradient di[0] = 1995765710848.000, df[0] = 1440234209280.000, dc_hat[0] = 781286244352.000 + Gradient do_[0] = 105572644945920.000 +Backward Time Step 2: + Gradient di[0] = 2539483824128.000, df[0] = 1823963217920.000, dc_hat[0] = 1297453154304.000 + Gradient do_[0] = 115048760475648.000 +Backward Time Step 1: + Gradient di[0] = 3184881303552.000, df[0] = 2196196294656.000, dc_hat[0] = 1771262050304.000 + Gradient do_[0] = 101383676100608.000 +Backward Time Step 0: + Gradient di[0] = 3855915417600.000, df[0] = 2743461740544.000, dc_hat[0] = 3618149236736.000 + Gradient do_[0] = 59612099248128.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2360465975934976.000, df[0] = -1853180543500288.000, dc_hat[0] = -1066909470556160.000 + Gradient do_[0] = -137991492424695808.000 +Backward Time Step 3: + Gradient di[0] = -3706263644930048.000, df[0] = -2829425133486080.000, dc_hat[0] = -1531388842999808.000 + Gradient do_[0] = -188667399233339392.000 +Backward Time Step 2: + Gradient di[0] = -4855250611601408.000, df[0] = -3658275706896384.000, dc_hat[0] = -2675387473592320.000 + Gradient do_[0] = -215865984009895936.000 +Backward Time Step 1: + Gradient di[0] = -6125820586229760.000, df[0] = -4382805282783232.000, dc_hat[0] = -3707731986874368.000 + Gradient do_[0] = -193691050680778752.000 +Backward Time Step 0: + Gradient di[0] = -7152840682242048.000, df[0] = -5228014044446720.000, dc_hat[0] = -7231283428065280.000 + Gradient do_[0] = -110968769379041280.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1272442257408.000, df[0] = 932961779712.000, dc_hat[0] = 552416444416.000 + Gradient do_[0] = 78894329757696.000 +Backward Time Step 3: + Gradient di[0] = 1996312936448.000, df[0] = 1440628998144.000, dc_hat[0] = 781500416000.000 + Gradient do_[0] = 105601602420736.000 +Backward Time Step 2: + Gradient di[0] = 2540177981440.000, df[0] = 1824461946880.000, dc_hat[0] = 1297807310848.000 + Gradient do_[0] = 115080192589824.000 +Backward Time Step 1: + Gradient di[0] = 3185754505216.000, df[0] = 2196798308352.000, dc_hat[0] = 1771745705984.000 + Gradient do_[0] = 101411417227264.000 +Backward Time Step 0: + Gradient di[0] = 3856971595776.000, df[0] = 2744213045248.000, dc_hat[0] = 3619140403200.000 + Gradient do_[0] = 59628431867904.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2360949696626688.000, df[0] = -1853560513888256.000, dc_hat[0] = -1067128513888256.000 + Gradient do_[0] = -138019753309503488.000 +Backward Time Step 3: + Gradient di[0] = -3707026001625088.000, df[0] = -2830007101554688.000, dc_hat[0] = -1531703046701056.000 + Gradient do_[0] = -188706122658480128.000 +Backward Time Step 2: + Gradient di[0] = -4856253486465024.000, df[0] = -3659030278963200.000, dc_hat[0] = -2675937497841664.000 + Gradient do_[0] = -215910514230820864.000 +Backward Time Step 1: + Gradient di[0] = -6127089212194816.000, df[0] = -4383712863059968.000, dc_hat[0] = -3708495685746688.000 + Gradient do_[0] = -193731062596108288.000 +Backward Time Step 0: + Gradient di[0] = -7154312245411840.000, df[0] = -5229089396883456.000, dc_hat[0] = -7232770560491520.000 + Gradient do_[0] = -110991592835252224.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1272636768256.000, df[0] = 933104386048.000, dc_hat[0] = 552500854784.000 + Gradient do_[0] = 78906367410176.000 +Backward Time Step 3: + Gradient di[0] = 1996618072064.000, df[0] = 1440849592320.000, dc_hat[0] = 781619953664.000 + Gradient do_[0] = 105617733713920.000 +Backward Time Step 2: + Gradient di[0] = 2540570673152.000, df[0] = 1824743751680.000, dc_hat[0] = 1298005884928.000 + Gradient do_[0] = 115097968050176.000 +Backward Time Step 1: + Gradient di[0] = 3186249957376.000, df[0] = 2197139750912.000, dc_hat[0] = 1772019122176.000 + Gradient do_[0] = 101427120701440.000 +Backward Time Step 0: + Gradient di[0] = 3857560895488.000, df[0] = 2744632475648.000, dc_hat[0] = 3619693527040.000 + Gradient do_[0] = 59637546090496.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2361534617485312.000, df[0] = -1854019404300288.000, dc_hat[0] = -1067392385941504.000 + Gradient do_[0] = -138053932659245056.000 +Backward Time Step 3: + Gradient di[0] = -3707943245578240.000, df[0] = -2830707449659392.000, dc_hat[0] = -1532082077564928.000 + Gradient do_[0] = -188752834722791424.000 +Backward Time Step 2: + Gradient di[0] = -4857450708598784.000, df[0] = -3659933027401728.000, dc_hat[0] = -2676596238450688.000 + Gradient do_[0] = -215963737465552896.000 +Backward Time Step 1: + Gradient di[0] = -6128608020004864.000, df[0] = -4384799758221312.000, dc_hat[0] = -3709411587522560.000 + Gradient do_[0] = -193779045970739200.000 +Backward Time Step 0: + Gradient di[0] = -7156077476970496.000, df[0] = -5230379497684992.000, dc_hat[0] = -7234555119403008.000 + Gradient do_[0] = -111018977546731520.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1273039552512.000, df[0] = 933399494656.000, dc_hat[0] = 552675704832.000 + Gradient do_[0] = 78931357073408.000 +Backward Time Step 3: + Gradient di[0] = 1997250494464.000, df[0] = 1441305722880.000, dc_hat[0] = 781866696704.000 + Gradient do_[0] = 105651170705408.000 +Backward Time Step 2: + Gradient di[0] = 2541374930944.000, df[0] = 1825321123840.000, dc_hat[0] = 1298416664576.000 + Gradient do_[0] = 115134374608896.000 +Backward Time Step 1: + Gradient di[0] = 3187256328192.000, df[0] = 2197833515008.000, dc_hat[0] = 1772576309248.000 + Gradient do_[0] = 101459131629568.000 +Backward Time Step 0: + Gradient di[0] = 3858779865088.000, df[0] = 2745499648000.000, dc_hat[0] = 3620836737024.000 + Gradient do_[0] = 59656382709760.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2362061556285440.000, df[0] = -1854433331773440.000, dc_hat[0] = -1067630018428928.000 + Gradient do_[0] = -138084744754626560.000 +Backward Time Step 3: + Gradient di[0] = -3708770563653632.000, df[0] = -2831339078287360.000, dc_hat[0] = -1532422051069952.000 + Gradient do_[0] = -188794942582161408.000 +Backward Time Step 2: + Gradient di[0] = -4858539482808320.000, df[0] = -3660752292413440.000, dc_hat[0] = -2677192433598464.000 + Gradient do_[0] = -216012081617436672.000 +Backward Time Step 1: + Gradient di[0] = -6129972208992256.000, df[0] = -4385774715797504.000, dc_hat[0] = -3710231657840640.000 + Gradient do_[0] = -193822133082652672.000 +Backward Time Step 0: + Gradient di[0] = -7157676278546432.000, df[0] = -5231547728789504.000, dc_hat[0] = -7236171100848128.000 + Gradient do_[0] = -111043776687898624.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1273401442304.000, df[0] = 933664915456.000, dc_hat[0] = 552832794624.000 + Gradient do_[0] = 78953779822592.000 +Backward Time Step 3: + Gradient di[0] = 1997816463360.000, df[0] = 1441714012160.000, dc_hat[0] = 782088273920.000 + Gradient do_[0] = 105681118035968.000 +Backward Time Step 2: + Gradient di[0] = 2542093467648.000, df[0] = 1825837678592.000, dc_hat[0] = 1298783666176.000 + Gradient do_[0] = 115166888853504.000 +Backward Time Step 1: + Gradient di[0] = 3188158627840.000, df[0] = 2198455451648.000, dc_hat[0] = 1773075824640.000 + Gradient do_[0] = 101487854223360.000 +Backward Time Step 0: + Gradient di[0] = 3859869335552.000, df[0] = 2746274807808.000, dc_hat[0] = 3621859360768.000 + Gradient do_[0] = 59673227034624.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2362579905150976.000, df[0] = -1854840279924736.000, dc_hat[0] = -1067864027037696.000 + Gradient do_[0] = -138115058633801728.000 +Backward Time Step 3: + Gradient di[0] = -3709584728391680.000, df[0] = -2831960506368000.000, dc_hat[0] = -1532757729607680.000 + Gradient do_[0] = -188836277347418112.000 +Backward Time Step 2: + Gradient di[0] = -4859602487214080.000, df[0] = -3661553303814144.000, dc_hat[0] = -2677777891328000.000 + Gradient do_[0] = -216059291897954304.000 +Backward Time Step 1: + Gradient di[0] = -6131314386272256.000, df[0] = -4386735446294528.000, dc_hat[0] = -3711039916998656.000 + Gradient do_[0] = -193864481460191232.000 +Backward Time Step 0: + Gradient di[0] = -7159238036029440.000, df[0] = -5232689653219328.000, dc_hat[0] = -7237751111942144.000 + Gradient do_[0] = -111068017483317248.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1273709068288.000, df[0] = 933890686976.000, dc_hat[0] = 552965767168.000 + Gradient do_[0] = 78972863905792.000 +Backward Time Step 3: + Gradient di[0] = 1998301036544.000, df[0] = 1442063712256.000, dc_hat[0] = 782277410816.000 + Gradient do_[0] = 105706661347328.000 +Backward Time Step 2: + Gradient di[0] = 2542712389632.000, df[0] = 1826281488384.000, dc_hat[0] = 1299097583616.000 + Gradient do_[0] = 115194839695360.000 +Backward Time Step 1: + Gradient di[0] = 3188932476928.000, df[0] = 2198989045760.000, dc_hat[0] = 1773504299008.000 + Gradient do_[0] = 101512449622016.000 +Backward Time Step 0: + Gradient di[0] = 3860807024640.000, df[0] = 2746942226432.000, dc_hat[0] = 3622739378176.000 + Gradient do_[0] = 59687722549248.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2363108991434752.000, df[0] = -1855255683792896.000, dc_hat[0] = -1068103068811264.000 + Gradient do_[0] = -138145982398332928.000 +Backward Time Step 3: + Gradient di[0] = -3710413925515264.000, df[0] = -2832592671866880.000, dc_hat[0] = -1533099582160896.000 + Gradient do_[0] = -188878488286003200.000 +Backward Time Step 2: + Gradient di[0] = -4860693408907264.000, df[0] = -3662374984744960.000, dc_hat[0] = -2678376770830336.000 + Gradient do_[0] = -216107687589445632.000 +Backward Time Step 1: + Gradient di[0] = -6132696828870656.000, df[0] = -4387724094078976.000, dc_hat[0] = -3711871798476800.000 + Gradient do_[0] = -193908118327918592.000 +Backward Time Step 0: + Gradient di[0] = -7160853480603648.000, df[0] = -5233870232354816.000, dc_hat[0] = -7239383736385536.000 + Gradient do_[0] = -111093082912456704.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1274019577856.000, df[0] = 934118031360.000, dc_hat[0] = 553100574720.000 + Gradient do_[0] = 78992107372544.000 +Backward Time Step 3: + Gradient di[0] = 1998790852608.000, df[0] = 1442417213440.000, dc_hat[0] = 782468710400.000 + Gradient do_[0] = 105732565368832.000 +Backward Time Step 2: + Gradient di[0] = 2543333408768.000, df[0] = 1826727788544.000, dc_hat[0] = 1299414646784.000 + Gradient do_[0] = 115222958309376.000 +Backward Time Step 1: + Gradient di[0] = 3189712879616.000, df[0] = 2199527096320.000, dc_hat[0] = 1773936181248.000 + Gradient do_[0] = 101537229570048.000 +Backward Time Step 0: + Gradient di[0] = 3861752053760.000, df[0] = 2747614625792.000, dc_hat[0] = 3623625949184.000 + Gradient do_[0] = 59702339698688.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2363633245880320.000, df[0] = -1855667195346944.000, dc_hat[0] = -1068340097318912.000 + Gradient do_[0] = -138176571155415040.000 +Backward Time Step 3: + Gradient di[0] = -3711232922091520.000, df[0] = -2833218931785728.000, dc_hat[0] = -1533438079270912.000 + Gradient do_[0] = -188920166648643584.000 +Backward Time Step 2: + Gradient di[0] = -4861763392634880.000, df[0] = -3663181633290240.000, dc_hat[0] = -2678964912914432.000 + Gradient do_[0] = -216155258647216128.000 +Backward Time Step 1: + Gradient di[0] = -6134043301117952.000, df[0] = -4388686972059648.000, dc_hat[0] = -3712682741989376.000 + Gradient do_[0] = -193950655684018176.000 +Backward Time Step 0: + Gradient di[0] = -7162428659859456.000, df[0] = -5235021820461056.000, dc_hat[0] = -7240976632381440.000 + Gradient do_[0] = -111117512686436352.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1274457358336.000, df[0] = 934439092224.000, dc_hat[0] = 553290629120.000 + Gradient do_[0] = 79019210964992.000 +Backward Time Step 3: + Gradient di[0] = 1999475441664.000, df[0] = 1442911092736.000, dc_hat[0] = 782736621568.000 + Gradient do_[0] = 105768778989568.000 +Backward Time Step 2: + Gradient di[0] = 2544204513280.000, df[0] = 1827353526272.000, dc_hat[0] = 1299859767296.000 + Gradient do_[0] = 115262460264448.000 +Backward Time Step 1: + Gradient di[0] = 3190806806528.000, df[0] = 2200281284608.000, dc_hat[0] = 1774541864960.000 + Gradient do_[0] = 101572033904640.000 +Backward Time Step 0: + Gradient di[0] = 3863071948800.000, df[0] = 2748553363456.000, dc_hat[0] = 3624864579584.000 + Gradient do_[0] = 59722744987648.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2364167700873216.000, df[0] = -1856086759964672.000, dc_hat[0] = -1068581555011584.000 + Gradient do_[0] = -138207744028049408.000 +Backward Time Step 3: + Gradient di[0] = -3712073930375168.000, df[0] = -2833860760961024.000, dc_hat[0] = -1533784629444608.000 + Gradient do_[0] = -188962996062519296.000 +Backward Time Step 2: + Gradient di[0] = -4862867736100864.000, df[0] = -3664013246332928.000, dc_hat[0] = -2679572113915904.000 + Gradient do_[0] = -216204204094521344.000 +Backward Time Step 1: + Gradient di[0] = -6135434870521856.000, df[0] = -4389683136036864.000, dc_hat[0] = -3713521871224832.000 + Gradient do_[0] = -193994601789390848.000 +Backward Time Step 0: + Gradient di[0] = -7164047325659136.000, df[0] = -5236205083951104.000, dc_hat[0] = -7242613551792128.000 + Gradient do_[0] = -111142629655183360.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1274913751040.000, df[0] = 934773719040.000, dc_hat[0] = 553488744448.000 + Gradient do_[0] = 79047505739776.000 +Backward Time Step 3: + Gradient di[0] = 2000191881216.000, df[0] = 1443428171776.000, dc_hat[0] = 783016984576.000 + Gradient do_[0] = 105806670331904.000 +Backward Time Step 2: + Gradient di[0] = 2545114677248.000, df[0] = 1828007182336.000, dc_hat[0] = 1300323631104.000 + Gradient do_[0] = 115303656718336.000 +Backward Time Step 1: + Gradient di[0] = 3191949492224.000, df[0] = 2201069289472.000, dc_hat[0] = 1775175467008.000 + Gradient do_[0] = 101608348188672.000 +Backward Time Step 0: + Gradient di[0] = 3864453971968.000, df[0] = 2749536665600.000, dc_hat[0] = 3626161143808.000 + Gradient do_[0] = 59744106577920.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2364679875723264.000, df[0] = -1856488742060032.000, dc_hat[0] = -1068812476612608.000 + Gradient do_[0] = -138237748669579264.000 +Backward Time Step 3: + Gradient di[0] = -3712877089259520.000, df[0] = -2834473599107072.000, dc_hat[0] = -1534115476144128.000 + Gradient do_[0] = -189003884151177216.000 +Backward Time Step 2: + Gradient di[0] = -4863927519281152.000, df[0] = -3664811841814528.000, dc_hat[0] = -2680155155726336.000 + Gradient do_[0] = -216251328475693056.000 +Backward Time Step 1: + Gradient di[0] = -6136770068480000.000, df[0] = -4390637424082944.000, dc_hat[0] = -3714324224802816.000 + Gradient do_[0] = -194036692468891648.000 +Backward Time Step 0: + Gradient di[0] = -7165596198240256.000, df[0] = -5237336807833600.000, dc_hat[0] = -7244178530500608.000 + Gradient do_[0] = -111166664292171776.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1275099480064.000, df[0] = 934909837312.000, dc_hat[0] = 553569353728.000 + Gradient do_[0] = 79058989744128.000 +Backward Time Step 3: + Gradient di[0] = 2000482861056.000, df[0] = 1443638411264.000, dc_hat[0] = 783130755072.000 + Gradient do_[0] = 105822038261760.000 +Backward Time Step 2: + Gradient di[0] = 2545486659584.000, df[0] = 1828274044928.000, dc_hat[0] = 1300512374784.000 + Gradient do_[0] = 115320408768512.000 +Backward Time Step 1: + Gradient di[0] = 3192413749248.000, df[0] = 2201389367296.000, dc_hat[0] = 1775431581696.000 + Gradient do_[0] = 101623120527360.000 +Backward Time Step 0: + Gradient di[0] = 3865018368000.000, df[0] = 2749938532352.000, dc_hat[0] = 3626691198976.000 + Gradient do_[0] = 59752834924544.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2365210035748864.000, df[0] = -1856904682799104.000, dc_hat[0] = -1069051451277312.000 + Gradient do_[0] = -138268741153587200.000 +Backward Time Step 3: + Gradient di[0] = -3713709507608576.000, df[0] = -2835109254266880.000, dc_hat[0] = -1534459341963264.000 + Gradient do_[0] = -189046215348846592.000 +Backward Time Step 2: + Gradient di[0] = -4865009314168832.000, df[0] = -3665627348729856.000, dc_hat[0] = -2680750008696832.000 + Gradient do_[0] = -216299414929539072.000 +Backward Time Step 1: + Gradient di[0] = -6138142310531072.000, df[0] = -4391618824110080.000, dc_hat[0] = -3715149932265472.000 + Gradient do_[0] = -194080020098973696.000 +Backward Time Step 0: + Gradient di[0] = -7167196073558016.000, df[0] = -5238506112679936.000, dc_hat[0] = -7245796659429376.000 + Gradient do_[0] = -111191480613208064.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1275472510976.000, df[0] = 935183515648.000, dc_hat[0] = 553731293184.000 + Gradient do_[0] = 79082125524992.000 +Backward Time Step 3: + Gradient di[0] = 2001068883968.000, df[0] = 1444061118464.000, dc_hat[0] = 783360131072.000 + Gradient do_[0] = 105853009002496.000 +Backward Time Step 2: + Gradient di[0] = 2546231672832.000, df[0] = 1828809211904.000, dc_hat[0] = 1300892745728.000 + Gradient do_[0] = 115354198081536.000 +Backward Time Step 1: + Gradient di[0] = 3193348554752.000, df[0] = 2202033717248.000, dc_hat[0] = 1775948791808.000 + Gradient do_[0] = 101652832976896.000 +Backward Time Step 0: + Gradient di[0] = 3866144800768.000, df[0] = 2750739906560.000, dc_hat[0] = 3627748163584.000 + Gradient do_[0] = 59770253869056.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2365740464209920.000, df[0] = -1857321294626816.000, dc_hat[0] = -1069291298357248.000 + Gradient do_[0] = -138299673508052992.000 +Backward Time Step 3: + Gradient di[0] = -3714541925957632.000, df[0] = -2835744640991232.000, dc_hat[0] = -1534803073564672.000 + Gradient do_[0] = -189088598086123520.000 +Backward Time Step 2: + Gradient di[0] = -4866099698991104.000, df[0] = -3666447955918848.000, dc_hat[0] = -2681346740715520.000 + Gradient do_[0] = -216347827800899584.000 +Backward Time Step 1: + Gradient di[0] = -6139512405098496.000, df[0] = -4392599150395392.000, dc_hat[0] = -3715975908163584.000 + Gradient do_[0] = -194123330549186560.000 +Backward Time Step 0: + Gradient di[0] = -7168798096359424.000, df[0] = -5239677028139008.000, dc_hat[0] = -7247415862099968.000 + Gradient do_[0] = -111216331293982720.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1275730984960.000, df[0] = 935372980224.000, dc_hat[0] = 553843490816.000 + Gradient do_[0] = 79098147766272.000 +Backward Time Step 3: + Gradient di[0] = 2001474945024.000, df[0] = 1444354195456.000, dc_hat[0] = 783518531584.000 + Gradient do_[0] = 105874450284544.000 +Backward Time Step 2: + Gradient di[0] = 2546746523648.000, df[0] = 1829179228160.000, dc_hat[0] = 1301155676160.000 + Gradient do_[0] = 115377493245952.000 +Backward Time Step 1: + Gradient di[0] = 3193996050432.000, df[0] = 2202480148480.000, dc_hat[0] = 1776307142656.000 + Gradient do_[0] = 101673401843712.000 +Backward Time Step 0: + Gradient di[0] = 3866923892736.000, df[0] = 2751294078976.000, dc_hat[0] = 3628478758912.000 + Gradient do_[0] = 59782291521536.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2366248612528128.000, df[0] = -1857720458149888.000, dc_hat[0] = -1069521011998720.000 + Gradient do_[0] = -138329411861610496.000 +Backward Time Step 3: + Gradient di[0] = -3715344816406528.000, df[0] = -2836357747572736.000, dc_hat[0] = -1535134725570560.000 + Gradient do_[0] = -189129400275435520.000 +Backward Time Step 2: + Gradient di[0] = -4867153576591360.000, df[0] = -3667242793304064.000, dc_hat[0] = -2681926292865024.000 + Gradient do_[0] = -216394711663902720.000 +Backward Time Step 1: + Gradient di[0] = -6140846529314816.000, df[0] = -4393553438441472.000, dc_hat[0] = -3716778261741568.000 + Gradient do_[0] = -194165386868948992.000 +Backward Time Step 0: + Gradient di[0] = -7170342673973248.000, df[0] = -5240806067666944.000, dc_hat[0] = -7248977082712064.000 + Gradient do_[0] = -111240297211494400.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1276172435456.000, df[0] = 935696596992.000, dc_hat[0] = 554034659328.000 + Gradient do_[0] = 79125511405568.000 +Backward Time Step 3: + Gradient di[0] = 2002166480896.000, df[0] = 1444853055488.000, dc_hat[0] = 783788736512.000 + Gradient do_[0] = 105911016226816.000 +Backward Time Step 2: + Gradient di[0] = 2547629686784.000, df[0] = 1829813485568.000, dc_hat[0] = 1301605777408.000 + Gradient do_[0] = 115417448185856.000 +Backward Time Step 1: + Gradient di[0] = 3195102035968.000, df[0] = 2203242463232.000, dc_hat[0] = 1776919904256.000 + Gradient do_[0] = 101708575277056.000 +Backward Time Step 0: + Gradient di[0] = 3868260827136.000, df[0] = 2752245399552.000, dc_hat[0] = 3629733380096.000 + Gradient do_[0] = 59802965245952.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2366781456908288.000, df[0] = -1858138546372608.000, dc_hat[0] = -1069761395949568.000 + Gradient do_[0] = -138360550374506496.000 +Backward Time Step 3: + Gradient di[0] = -3716181798158336.000, df[0] = -2836996355522560.000, dc_hat[0] = -1535480202002432.000 + Gradient do_[0] = -189171989171142656.000 +Backward Time Step 2: + Gradient di[0] = -4868252551348224.000, df[0] = -3668070111379456.000, dc_hat[0] = -2682529735770112.000 + Gradient do_[0] = -216443502492385280.000 +Backward Time Step 1: + Gradient di[0] = -6142224676945920.000, df[0] = -4394539401871360.000, dc_hat[0] = -3717608801042432.000 + Gradient do_[0] = -194208886297722880.000 +Backward Time Step 0: + Gradient di[0] = -7171955971063808.000, df[0] = -5241985036189696.000, dc_hat[0] = -7250608096542720.000 + Gradient do_[0] = -111265311101026304.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1276449783808.000, df[0] = 935900020736.000, dc_hat[0] = 554155048960.000 + Gradient do_[0] = 79142666108928.000 +Backward Time Step 3: + Gradient di[0] = 2002597969920.000, df[0] = 1445164482560.000, dc_hat[0] = 783957622784.000 + Gradient do_[0] = 105933841629184.000 +Backward Time Step 2: + Gradient di[0] = 2548177043456.000, df[0] = 1830206701568.000, dc_hat[0] = 1301884698624.000 + Gradient do_[0] = 115442244911104.000 +Backward Time Step 1: + Gradient di[0] = 3195791474688.000, df[0] = 2203717730304.000, dc_hat[0] = 1777301454848.000 + Gradient do_[0] = 101730519875584.000 +Backward Time Step 0: + Gradient di[0] = 3869096542208.000, df[0] = 2752839680000.000, dc_hat[0] = 3630517452800.000 + Gradient do_[0] = 59815879507968.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2367296584548352.000, df[0] = -1858543212822528.000, dc_hat[0] = -1069994330816512.000 + Gradient do_[0] = -138390658095251456.000 +Backward Time Step 3: + Gradient di[0] = -3716986299219968.000, df[0] = -2837611072716800.000, dc_hat[0] = -1535811719790592.000 + Gradient do_[0] = -189212963159146496.000 +Backward Time Step 2: + Gradient di[0] = -4869303207723008.000, df[0] = -3668861459103744.000, dc_hat[0] = -2683108214177792.000 + Gradient do_[0] = -216490231736565760.000 +Backward Time Step 1: + Gradient di[0] = -6143572222935040.000, df[0] = -4395502816722944.000, dc_hat[0] = -3718420012990464.000 + Gradient do_[0] = -194251440833691648.000 +Backward Time Step 0: + Gradient di[0] = -7173525781610496.000, df[0] = -5243132866199552.000, dc_hat[0] = -7252195623829504.000 + Gradient do_[0] = -111289672155529216.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1276820979712.000, df[0] = 936172126208.000, dc_hat[0] = 554316267520.000 + Gradient do_[0] = 79165676060672.000 +Backward Time Step 3: + Gradient di[0] = 2003180978176.000, df[0] = 1445585485824.000, dc_hat[0] = 784185360384.000 + Gradient do_[0] = 105964694929408.000 +Backward Time Step 2: + Gradient di[0] = 2548919435264.000, df[0] = 1830739247104.000, dc_hat[0] = 1302262054912.000 + Gradient do_[0] = 115475824508928.000 +Backward Time Step 1: + Gradient di[0] = 3196721823744.000, df[0] = 2204359458816.000, dc_hat[0] = 1777816174592.000 + Gradient do_[0] = 101760072941568.000 +Backward Time Step 0: + Gradient di[0] = 3870226907136.000, df[0] = 2753643937792.000, dc_hat[0] = 3631578087424.000 + Gradient do_[0] = 59833357172736.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2367818154639360.000, df[0] = -1858952442675200.000, dc_hat[0] = -1070229815820288.000 + Gradient do_[0] = -138421118003314688.000 +Backward Time Step 3: + Gradient di[0] = -3717806101102592.000, df[0] = -2838236527329280.000, dc_hat[0] = -1536149545811968.000 + Gradient do_[0] = -189254675881525248.000 +Backward Time Step 2: + Gradient di[0] = -4870374802063360.000, df[0] = -3669668912955392.000, dc_hat[0] = -2683696893132800.000 + Gradient do_[0] = -216537837154074624.000 +Backward Time Step 1: + Gradient di[0] = -6144914937085952.000, df[0] = -4396463547219968.000, dc_hat[0] = -3719229077454848.000 + Gradient do_[0] = -194293892290445312.000 +Backward Time Step 0: + Gradient di[0] = -7175083780997120.000, df[0] = -5244271569403904.000, dc_hat[0] = -7253770803085312.000 + Gradient do_[0] = -111313852821405696.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1276939206656.000, df[0] = 936258895872.000, dc_hat[0] = 554367516672.000 + Gradient do_[0] = 79173007704064.000 +Backward Time Step 3: + Gradient di[0] = 2003368017920.000, df[0] = 1445720358912.000, dc_hat[0] = 784258564096.000 + Gradient do_[0] = 105974568321024.000 +Backward Time Step 2: + Gradient di[0] = 2549160083456.000, df[0] = 1830912655360.000, dc_hat[0] = 1302385655808.000 + Gradient do_[0] = 115486746476544.000 +Backward Time Step 1: + Gradient di[0] = 3197024337920.000, df[0] = 2204567863296.000, dc_hat[0] = 1777982636032.000 + Gradient do_[0] = 101769652731904.000 +Backward Time Step 0: + Gradient di[0] = 3870585782272.000, df[0] = 2753899528192.000, dc_hat[0] = 3631914942464.000 + Gradient do_[0] = 59838902042624.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2368346167181312.000, df[0] = -1859367175454720.000, dc_hat[0] = -1070468522049536.000 + Gradient do_[0] = -138451904328892416.000 +Backward Time Step 3: + Gradient di[0] = -3718634224484352.000, df[0] = -2838869229699072.000, dc_hat[0] = -1536491398365184.000 + Gradient do_[0] = -189296663481810944.000 +Backward Time Step 2: + Gradient di[0] = -4871464650014720.000, df[0] = -3670490862321664.000, dc_hat[0] = -2684297114812416.000 + Gradient do_[0] = -216586232845565952.000 +Backward Time Step 1: + Gradient di[0] = -6146295232200704.000, df[0] = -4397450047520768.000, dc_hat[0] = -3720059348320256.000 + Gradient do_[0] = -194337494798434304.000 +Backward Time Step 0: + Gradient di[0] = -7176694393733120.000, df[0] = -5245448390443008.000, dc_hat[0] = -7255398595690496.000 + Gradient do_[0] = -111338832351199232.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1277371351040.000, df[0] = 936575762432.000, dc_hat[0] = 554555211776.000 + Gradient do_[0] = 79199775752192.000 +Backward Time Step 3: + Gradient di[0] = 2004045398016.000, df[0] = 1446209126400.000, dc_hat[0] = 784523526144.000 + Gradient do_[0] = 106010320568320.000 +Backward Time Step 2: + Gradient di[0] = 2550018080768.000, df[0] = 1831528955904.000, dc_hat[0] = 1302823436288.000 + Gradient do_[0] = 115525577342976.000 +Backward Time Step 1: + Gradient di[0] = 3198100701184.000, df[0] = 2205309992960.000, dc_hat[0] = 1778579013632.000 + Gradient do_[0] = 101803895029760.000 +Backward Time Step 0: + Gradient di[0] = 3871888637952.000, df[0] = 2754826469376.000, dc_hat[0] = 3633137844224.000 + Gradient do_[0] = 59859051479040.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2368848141484032.000, df[0] = -1859761238704128.000, dc_hat[0] = -1070695618445312.000 + Gradient do_[0] = -138481247545458688.000 +Backward Time Step 3: + Gradient di[0] = -3719420740370432.000, df[0] = -2839469182943232.000, dc_hat[0] = -1536815399960576.000 + Gradient do_[0] = -189336658217271296.000 +Backward Time Step 2: + Gradient di[0] = -4872492220940288.000, df[0] = -3671263956434944.000, dc_hat[0] = -2684860023963648.000 + Gradient do_[0] = -216631793858641920.000 +Backward Time Step 1: + Gradient di[0] = -6147584796131328.000, df[0] = -4398372928618496.000, dc_hat[0] = -3720834858352640.000 + Gradient do_[0] = -194378193908531200.000 +Backward Time Step 0: + Gradient di[0] = -7178208906575872.000, df[0] = -5246555418263552.000, dc_hat[0] = -7256929214660608.000 + Gradient do_[0] = -111362317232373760.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1277719740416.000, df[0] = 936830959616.000, dc_hat[0] = 554706337792.000 + Gradient do_[0] = 79221376417792.000 +Backward Time Step 3: + Gradient di[0] = 2004594720768.000, df[0] = 1446605488128.000, dc_hat[0] = 784738484224.000 + Gradient do_[0] = 106039353540608.000 +Backward Time Step 2: + Gradient di[0] = 2550718005248.000, df[0] = 1832031617024.000, dc_hat[0] = 1303179689984.000 + Gradient do_[0] = 115557244338176.000 +Backward Time Step 1: + Gradient di[0] = 3198977310720.000, df[0] = 2205914234880.000, dc_hat[0] = 1779063717888.000 + Gradient do_[0] = 101831745208320.000 +Backward Time Step 0: + Gradient di[0] = 3872946651136.000, df[0] = 2755579084800.000, dc_hat[0] = 3634130321408.000 + Gradient do_[0] = 59875400876032.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2369368906268672.000, df[0] = -1860169931685888.000, dc_hat[0] = -1070930767904768.000 + Gradient do_[0] = -138511707453521920.000 +Backward Time Step 3: + Gradient di[0] = -3720243226607616.000, df[0] = -2840097053474816.000, dc_hat[0] = -1537154031288320.000 + Gradient do_[0] = -189378594277949440.000 +Backward Time Step 2: + Gradient di[0] = -4873572942086144.000, df[0] = -3672078121172992.000, dc_hat[0] = -2685454608498688.000 + Gradient do_[0] = -216679674154057728.000 +Backward Time Step 1: + Gradient di[0] = -6148942005796864.000, df[0] = -4399343859662848.000, dc_hat[0] = -3721652781187072.000 + Gradient do_[0] = -194421023322406912.000 +Backward Time Step 0: + Gradient di[0] = -7179783548960768.000, df[0] = -5247706469498880.000, dc_hat[0] = -7258521573785600.000 + Gradient do_[0] = -111386755596288000.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1278110334976.000, df[0] = 937117351936.000, dc_hat[0] = 554875355136.000 + Gradient do_[0] = 79245594329088.000 +Backward Time Step 3: + Gradient di[0] = 2005206433792.000, df[0] = 1447047069696.000, dc_hat[0] = 784977494016.000 + Gradient do_[0] = 106071716790272.000 +Backward Time Step 2: + Gradient di[0] = 2551498407936.000, df[0] = 1832591687680.000, dc_hat[0] = 1303577493504.000 + Gradient do_[0] = 115592526823424.000 +Backward Time Step 1: + Gradient di[0] = 3199954583552.000, df[0] = 2206588207104.000, dc_hat[0] = 1779605569536.000 + Gradient do_[0] = 101862841778176.000 +Backward Time Step 0: + Gradient di[0] = 3874129182720.000, df[0] = 2756420567040.000, dc_hat[0] = 3635239714816.000 + Gradient do_[0] = 59893688041472.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2369887255134208.000, df[0] = -1860576611401728.000, dc_hat[0] = -1071163904098304.000 + Gradient do_[0] = -138541978383024128.000 +Backward Time Step 3: + Gradient di[0] = -3721056317603840.000, df[0] = -2840717944684544.000, dc_hat[0] = -1537489978261504.000 + Gradient do_[0] = -189419963402944512.000 +Backward Time Step 2: + Gradient di[0] = -4874638630846464.000, df[0] = -3672881816928256.000, dc_hat[0] = -2686040871534592.000 + Gradient do_[0] = -216727056233267200.000 +Backward Time Step 1: + Gradient di[0] = -6150288478044160.000, df[0] = -4400307274514432.000, dc_hat[0] = -3722463456264192.000 + Gradient do_[0] = -194463577858375680.000 +Backward Time Step 0: + Gradient di[0] = -7181347990798336.000, df[0] = -5248850004541440.000, dc_hat[0] = -7260102658621440.000 + Gradient do_[0] = -111411030751444992.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1278472224768.000, df[0] = 937382707200.000, dc_hat[0] = 555032772608.000 + Gradient do_[0] = 79268033855488.000 +Backward Time Step 3: + Gradient di[0] = 2005773320192.000, df[0] = 1447456145408.000, dc_hat[0] = 785199398912.000 + Gradient do_[0] = 106101697675264.000 +Backward Time Step 2: + Gradient di[0] = 2552219566080.000, df[0] = 1833109553152.000, dc_hat[0] = 1303945281536.000 + Gradient do_[0] = 115625183674368.000 +Backward Time Step 1: + Gradient di[0] = 3200861339648.000, df[0] = 2207213420544.000, dc_hat[0] = 1780107182080.000 + Gradient do_[0] = 101891665035264.000 +Backward Time Step 0: + Gradient di[0] = 3875221012480.000, df[0] = 2757197561856.000, dc_hat[0] = 3636264435712.000 + Gradient do_[0] = 59910570115072.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2370400772161536.000, df[0] = -1860980069892096.000, dc_hat[0] = -1071396033658880.000 + Gradient do_[0] = -138571983024553984.000 +Backward Time Step 3: + Gradient di[0] = -3721864308326400.000, df[0] = -2841334809362432.000, dc_hat[0] = -1537823643533312.000 + Gradient do_[0] = -189461006110425088.000 +Backward Time Step 2: + Gradient di[0] = -4875693045317632.000, df[0] = -3673676385878016.000, dc_hat[0] = -2686620423684096.000 + Gradient do_[0] = -216773957276139520.000 +Backward Time Step 1: + Gradient di[0] = -6151625286615040.000, df[0] = -4401263710044160.000, dc_hat[0] = -3723268494196736.000 + Gradient do_[0] = -194505702897614848.000 +Backward Time Step 0: + Gradient di[0] = -7182915116990464.000, df[0] = -5249995150196736.000, dc_hat[0] = -7261687501553664.000 + Gradient do_[0] = -111435340266340352.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1278737776640.000, df[0] = 937577480192.000, dc_hat[0] = 555148378112.000 + Gradient do_[0] = 79284509081600.000 +Backward Time Step 3: + Gradient di[0] = 2006191046656.000, df[0] = 1447757611008.000, dc_hat[0] = 785362386944.000 + Gradient do_[0] = 106123793268736.000 +Backward Time Step 2: + Gradient di[0] = 2552752242688.000, df[0] = 1833492283392.000, dc_hat[0] = 1304216469504.000 + Gradient do_[0] = 115649326088192.000 +Backward Time Step 1: + Gradient di[0] = 3201528233984.000, df[0] = 2207673221120.000, dc_hat[0] = 1780475756544.000 + Gradient do_[0] = 101912879824896.000 +Backward Time Step 0: + Gradient di[0] = 3876025532416.000, df[0] = 2757769822208.000, dc_hat[0] = 3637019410432.000 + Gradient do_[0] = 59923006226432.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2370925026607104.000, df[0] = -1861391715663872.000, dc_hat[0] = -1071632793731072.000 + Gradient do_[0] = -138602606141374464.000 +Backward Time Step 3: + Gradient di[0] = -3722686794563584.000, df[0] = -2841962411458560.000, dc_hat[0] = -1538162677514240.000 + Gradient do_[0] = -189502890631495680.000 +Backward Time Step 2: + Gradient di[0] = -4876772692721664.000, df[0] = -3674488940003328.000, dc_hat[0] = -2687212860735488.000 + Gradient do_[0] = -216821906291032064.000 +Backward Time Step 1: + Gradient di[0] = -6152990012473344.000, df[0] = -4402238936055808.000, dc_hat[0] = -3724089101385728.000 + Gradient do_[0] = -194548790009528320.000 +Backward Time Step 0: + Gradient di[0] = -7184504254889984.000, df[0] = -5251156938850304.000, dc_hat[0] = -7263293819322368.000 + Gradient do_[0] = -111459993378619392.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1279085641728.000, df[0] = 937832415232.000, dc_hat[0] = 555298717696.000 + Gradient do_[0] = 79306067804160.000 +Backward Time Step 3: + Gradient di[0] = 2006736437248.000, df[0] = 1448151089152.000, dc_hat[0] = 785575378944.000 + Gradient do_[0] = 106152633303040.000 +Backward Time Step 2: + Gradient di[0] = 2553445351424.000, df[0] = 1833989963776.000, dc_hat[0] = 1304569708544.000 + Gradient do_[0] = 115680707870720.000 +Backward Time Step 1: + Gradient di[0] = 3202396454912.000, df[0] = 2208271695872.000, dc_hat[0] = 1780956528640.000 + Gradient do_[0] = 101940453179392.000 +Backward Time Step 0: + Gradient di[0] = 3877079613440.000, df[0] = 2758519816192.000, dc_hat[0] = 3638008217600.000 + Gradient do_[0] = 59939301097472.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2371436396150784.000, df[0] = -1861792892452864.000, dc_hat[0] = -1071863312678912.000 + Gradient do_[0] = -138632473343950848.000 +Backward Time Step 3: + Gradient di[0] = -3723488074399744.000, df[0] = -2842574712733696.000, dc_hat[0] = -1538493121560576.000 + Gradient do_[0] = -189543641281200128.000 +Backward Time Step 2: + Gradient di[0] = -4877826570321920.000, df[0] = -3675283240517632.000, dc_hat[0] = -2687792144449536.000 + Gradient do_[0] = -216868704254689280.000 +Backward Time Step 1: + Gradient di[0] = -6154321989206016.000, df[0] = -4403191881924608.000, dc_hat[0] = -3724890918092800.000 + Gradient do_[0] = -194590880689029120.000 +Backward Time Step 0: + Gradient di[0] = -7186057959309312.000, df[0] = -5252292420829184.000, dc_hat[0] = -7264864166739968.000 + Gradient do_[0] = -111484088145149952.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1279297847296.000, df[0] = 937987997696.000, dc_hat[0] = 555391123456.000 + Gradient do_[0] = 79319221141504.000 +Backward Time Step 3: + Gradient di[0] = 2007070146560.000, df[0] = 1448391868416.000, dc_hat[0] = 785706057728.000 + Gradient do_[0] = 106170299711488.000 +Backward Time Step 2: + Gradient di[0] = 2553870024704.000, df[0] = 1834295099392.000, dc_hat[0] = 1304786108416.000 + Gradient do_[0] = 115699959726080.000 +Backward Time Step 1: + Gradient di[0] = 3202932277248.000, df[0] = 2208640532480.000, dc_hat[0] = 1781252227072.000 + Gradient do_[0] = 101957498830848.000 +Backward Time Step 0: + Gradient di[0] = 3877726322688.000, df[0] = 2758979878912.000, dc_hat[0] = 3638615343104.000 + Gradient do_[0] = 59949300318208.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2371929780518912.000, df[0] = -1862180513251328.000, dc_hat[0] = -1072086718087168.000 + Gradient do_[0] = -138661318344310784.000 +Backward Time Step 3: + Gradient di[0] = -3724261705383936.000, df[0] = -2843165270736896.000, dc_hat[0] = -1538812828188672.000 + Gradient do_[0] = -189582914462154752.000 +Backward Time Step 2: + Gradient di[0] = -4878834813894656.000, df[0] = -3676043181293568.000, dc_hat[0] = -2688345658359808.000 + Gradient do_[0] = -216913492173651968.000 +Backward Time Step 1: + Gradient di[0] = -6155593299525632.000, df[0] = -4404101609684992.000, dc_hat[0] = -3725657032884224.000 + Gradient do_[0] = -194631012863442944.000 +Backward Time Step 0: + Gradient di[0] = -7187540796768256.000, df[0] = -5253376363200512.000, dc_hat[0] = -7266363647197184.000 + Gradient do_[0] = -111507100579921920.000 +Epoch 900, Train Loss=0.011198, Weight Norm=13.137237 +Sample Predictions at Epoch 900: +------------------------------------------------------------- +| Day | Date | Predicted Close | Actual Close | Error | +------------------------------------------------------------- +| 192 | 2024-10-11 | 57.49 | 63.87 | 6.38 | +| 193 | 2024-10-14 | 56.87 | 66.55 | 9.68 | +| 194 | 2024-10-15 | 57.05 | 66.00 | 8.95 | +| 195 | 2024-10-16 | 58.02 | 67.20 | 9.18 | +| 196 | 2024-10-17 | 57.54 | 66.76 | 9.22 | +------------------------------------------------------------- +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1279672713216.000, df[0] = 938262790144.000, dc_hat[0] = 555553587200.000 + Gradient do_[0] = 79342432419840.000 +Backward Time Step 3: + Gradient di[0] = 2007656431616.000, df[0] = 1448814706688.000, dc_hat[0] = 785934778368.000 + Gradient do_[0] = 106201245286400.000 +Backward Time Step 2: + Gradient di[0] = 2554616348672.000, df[0] = 1834830790656.000, dc_hat[0] = 1305166086144.000 + Gradient do_[0] = 115733681930240.000 +Backward Time Step 1: + Gradient di[0] = 3203867607040.000, df[0] = 2209285931008.000, dc_hat[0] = 1781770747904.000 + Gradient do_[0] = 101987219668992.000 +Backward Time Step 0: + Gradient di[0] = 3878860619776.000, df[0] = 2759786758144.000, dc_hat[0] = 3639679647744.000 + Gradient do_[0] = 59966832508928.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2372427728289792.000, df[0] = -1862571086839808.000, dc_hat[0] = -1072311264346112.000 + Gradient do_[0] = -138690446812512256.000 +Backward Time Step 3: + Gradient di[0] = -3725043389431808.000, df[0] = -2843761465884672.000, dc_hat[0] = -1539135621824512.000 + Gradient do_[0] = -189622823298269184.000 +Backward Time Step 2: + Gradient di[0] = -4879867753529344.000, df[0] = -3676820033503232.000, dc_hat[0] = -2688911520301056.000 + Gradient do_[0] = -216959413963980800.000 +Backward Time Step 1: + Gradient di[0] = -6156894137745408.000, df[0] = -4405031738540032.000, dc_hat[0] = -3726439253803008.000 + Gradient do_[0] = -194672089930661888.000 +Backward Time Step 0: + Gradient di[0] = -7189048330289152.000, df[0] = -5254478022311936.000, dc_hat[0] = -7267888360587264.000 + Gradient do_[0] = -111530482381881344.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1280000000000.000, df[0] = 938502848512.000, dc_hat[0] = 555695734784.000 + Gradient do_[0] = 79362724462592.000 +Backward Time Step 3: + Gradient di[0] = 2008169971712.000, df[0] = 1449185378304.000, dc_hat[0] = 786135711744.000 + Gradient do_[0] = 106228390821888.000 +Backward Time Step 2: + Gradient di[0] = 2555269873664.000, df[0] = 1835300421632.000, dc_hat[0] = 1305499926528.000 + Gradient do_[0] = 115763285327872.000 +Backward Time Step 1: + Gradient di[0] = 3204688117760.000, df[0] = 2209851899904.000, dc_hat[0] = 1782225305600.000 + Gradient do_[0] = 102013316628480.000 +Backward Time Step 0: + Gradient di[0] = 3879848902656.000, df[0] = 2760490090496.000, dc_hat[0] = 3640606851072.000 + Gradient do_[0] = 59982116552704.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2372963525459968.000, df[0] = -1862992127852544.000, dc_hat[0] = -1072553393127424.000 + Gradient do_[0] = -138721731354296320.000 +Backward Time Step 3: + Gradient di[0] = -3725890840166400.000, df[0] = -2844408395333632.000, dc_hat[0] = -1539484051046400.000 + Gradient do_[0] = -189665841690705920.000 +Backward Time Step 2: + Gradient di[0] = -4880970486382592.000, df[0] = -3677651109675008.000, dc_hat[0] = -2689517110689792.000 + Gradient do_[0] = -217008290691809280.000 +Backward Time Step 1: + Gradient di[0] = -6158284633407488.000, df[0] = -4406026828775424.000, dc_hat[0] = -3727277040861184.000 + Gradient do_[0] = -194715932956819456.000 +Backward Time Step 0: + Gradient di[0] = -7190673975410688.000, df[0] = -5255666117640192.000, dc_hat[0] = -7269531185577984.000 + Gradient do_[0] = -111555711019778048.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1280346161152.000, df[0] = 938756603904.000, dc_hat[0] = 555845877760.000 + Gradient do_[0] = 79384182521856.000 +Backward Time Step 3: + Gradient di[0] = 2008714706944.000, df[0] = 1449578463232.000, dc_hat[0] = 786348900352.000 + Gradient do_[0] = 106257197301760.000 +Backward Time Step 2: + Gradient di[0] = 2555964555264.000, df[0] = 1835799281664.000, dc_hat[0] = 1305853952000.000 + Gradient do_[0] = 115794742607872.000 +Backward Time Step 1: + Gradient di[0] = 3205558697984.000, df[0] = 2210451947520.000, dc_hat[0] = 1782706339840.000 + Gradient do_[0] = 102040982257664.000 +Backward Time Step 0: + Gradient di[0] = 3880899051520.000, df[0] = 2761237463040.000, dc_hat[0] = 3641592250368.000 + Gradient do_[0] = 59998352703488.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2373470063165440.000, df[0] = -1863389546545152.000, dc_hat[0] = -1072782301462528.000 + Gradient do_[0] = -138751323678965760.000 +Backward Time Step 3: + Gradient di[0] = -3726679771971584.000, df[0] = -2845010764496896.000, dc_hat[0] = -1539809260601344.000 + Gradient do_[0] = -189706008224858112.000 +Backward Time Step 2: + Gradient di[0] = -4882005036630016.000, df[0] = -3678430914674688.000, dc_hat[0] = -2690085656985600.000 + Gradient do_[0] = -217054298381484032.000 +Backward Time Step 1: + Gradient di[0] = -6159588155981824.000, df[0] = -4406959105114112.000, dc_hat[0] = -3728061677699072.000 + Gradient do_[0] = -194757130283122688.000 +Backward Time Step 0: + Gradient di[0] = -7192191709478912.000, df[0] = -5256775829815296.000, dc_hat[0] = -7271065562644480.000 + Gradient do_[0] = -111579256030494720.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1280724041728.000, df[0] = 939033690112.000, dc_hat[0] = 556009848832.000 + Gradient do_[0] = 79407611904000.000 +Backward Time Step 3: + Gradient di[0] = 2009308594176.000, df[0] = 1450007068672.000, dc_hat[0] = 786580963328.000 + Gradient do_[0] = 106288587472896.000 +Backward Time Step 2: + Gradient di[0] = 2556717694976.000, df[0] = 1836339822592.000, dc_hat[0] = 1306238255104.000 + Gradient do_[0] = 115828850688000.000 +Backward Time Step 1: + Gradient di[0] = 3206499270656.000, df[0] = 2211100491776.000, dc_hat[0] = 1783227744256.000 + Gradient do_[0] = 102070912811008.000 +Backward Time Step 0: + Gradient di[0] = 3882035445760.000, df[0] = 2762045915136.000, dc_hat[0] = 3642658652160.000 + Gradient do_[0] = 60015914254336.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2374001296932864.000, df[0] = -1863806829461504.000, dc_hat[0] = -1073022416977920.000 + Gradient do_[0] = -138782290393169920.000 +Backward Time Step 3: + Gradient di[0] = -3727513532497920.000, df[0] = -2845646956527616.000, dc_hat[0] = -1540153126420480.000 + Gradient do_[0] = -189748390962135040.000 +Backward Time Step 2: + Gradient di[0] = -4883098642677760.000, df[0] = -3679254743089152.000, dc_hat[0] = -2690687757713408.000 + Gradient do_[0] = -217102762792452096.000 +Backward Time Step 1: + Gradient di[0] = -6160972209192960.000, df[0] = -4407949095075840.000, dc_hat[0] = -3728894096048128.000 + Gradient do_[0] = -194800801510588416.000 +Backward Time Step 0: + Gradient di[0] = -7193809838407680.000, df[0] = -5257958019563520.000, dc_hat[0] = -7272701408313344.000 + Gradient do_[0] = -111604347229437952.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1281062207488.000, df[0] = 939281547264.000, dc_hat[0] = 556156059648.000 + Gradient do_[0] = 79428575035392.000 +Backward Time Step 3: + Gradient di[0] = 2009836421120.000, df[0] = 1450387963904.000, dc_hat[0] = 786787598336.000 + Gradient do_[0] = 106316513148928.000 +Backward Time Step 2: + Gradient di[0] = 2557390094336.000, df[0] = 1836823085056.000, dc_hat[0] = 1306581270528.000 + Gradient do_[0] = 115859309723648.000 +Backward Time Step 1: + Gradient di[0] = 3207349403648.000, df[0] = 2211686383616.000, dc_hat[0] = 1783697637376.000 + Gradient do_[0] = 102097932517376.000 +Backward Time Step 0: + Gradient di[0] = 3883062001664.000, df[0] = 2762776248320.000, dc_hat[0] = 3643622031360.000 + Gradient do_[0] = 60031793889280.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2374497902526464.000, df[0] = -1864196731961344.000, dc_hat[0] = -1073246963236864.000 + Gradient do_[0] = -138811418861371392.000 +Backward Time Step 3: + Gradient di[0] = -3728298169335808.000, df[0] = -2846246372900864.000, dc_hat[0] = -1540477262233600.000 + Gradient do_[0] = -189788334157987840.000 +Backward Time Step 2: + Gradient di[0] = -4884124066119680.000, df[0] = -3680027568766976.000, dc_hat[0] = -2691251203735552.000 + Gradient do_[0] = -217148392525004800.000 +Backward Time Step 1: + Gradient di[0] = -6162268752445440.000, df[0] = -4408876002705408.000, dc_hat[0] = -3729675243225088.000 + Gradient do_[0] = -194841723958984704.000 +Backward Time Step 0: + Gradient di[0] = -7195321130024960.000, df[0] = -5259062363029504.000, dc_hat[0] = -7274228806057984.000 + Gradient do_[0] = -111627797750874112.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1281330905088.000, df[0] = 939478614016.000, dc_hat[0] = 556273238016.000 + Gradient do_[0] = 79445226422272.000 +Backward Time Step 3: + Gradient di[0] = 2010260439040.000, df[0] = 1450694148096.000, dc_hat[0] = 786953601024.000 + Gradient do_[0] = 106338919120896.000 +Backward Time Step 2: + Gradient di[0] = 2557929848832.000, df[0] = 1837210402816.000, dc_hat[0] = 1306855342080.000 + Gradient do_[0] = 115883703795712.000 +Backward Time Step 1: + Gradient di[0] = 3208023900160.000, df[0] = 2212151427072.000, dc_hat[0] = 1784070668288.000 + Gradient do_[0] = 102119382188032.000 +Backward Time Step 0: + Gradient di[0] = 3883880939520.000, df[0] = 2763358994432.000, dc_hat[0] = 3644390375424.000 + Gradient do_[0] = 60044448104448.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2375032089083904.000, df[0] = -1864616162361344.000, dc_hat[0] = -1073488420929536.000 + Gradient do_[0] = -138842600323940352.000 +Backward Time Step 3: + Gradient di[0] = -3729133540474880.000, df[0] = -2846883907108864.000, dc_hat[0] = -1540821396488192.000 + Gradient do_[0] = -189830751255003136.000 +Backward Time Step 2: + Gradient di[0] = -4885216061554688.000, df[0] = -3680849518133248.000, dc_hat[0] = -2691851156979712.000 + Gradient do_[0] = -217196856935972864.000 +Backward Time Step 1: + Gradient di[0] = -6163651731914752.000, df[0] = -4409864650489856.000, dc_hat[0] = -3730507124703232.000 + Gradient do_[0] = -194885378006581248.000 +Backward Time Step 0: + Gradient di[0] = -7196930669019136.000, df[0] = -5260239184068608.000, dc_hat[0] = -7275856061792256.000 + Gradient do_[0] = -111652768690733056.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1281652424704.000, df[0] = 939714347008.000, dc_hat[0] = 556412174336.000 + Gradient do_[0] = 79465115811840.000 +Backward Time Step 3: + Gradient di[0] = 2010763100160.000, df[0] = 1451056824320.000, dc_hat[0] = 787150340096.000 + Gradient do_[0] = 106365494231040.000 +Backward Time Step 2: + Gradient di[0] = 2558569218048.000, df[0] = 1837669810176.000, dc_hat[0] = 1307181580288.000 + Gradient do_[0] = 115912636104704.000 +Backward Time Step 1: + Gradient di[0] = 3208827633664.000, df[0] = 2212705599488.000, dc_hat[0] = 1784515657728.000 + Gradient do_[0] = 102144883556352.000 +Backward Time Step 0: + Gradient di[0] = 3884846153728.000, df[0] = 2764045811712.000, dc_hat[0] = 3645296082944.000 + Gradient do_[0] = 60059367243776.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2375539968966656.000, df[0] = -1865015191666688.000, dc_hat[0] = -1073717933244416.000 + Gradient do_[0] = -138872287137890304.000 +Backward Time Step 3: + Gradient di[0] = -3729930793779200.000, df[0] = -2847492987158528.000, dc_hat[0] = -1541150901010432.000 + Gradient do_[0] = -189871347285884928.000 +Backward Time Step 2: + Gradient di[0] = -4886261886091264.000, df[0] = -3681638181502976.000, dc_hat[0] = -2692425608855552.000 + Gradient do_[0] = -217243328482115584.000 +Backward Time Step 1: + Gradient di[0] = -6164956328230912.000, df[0] = -4410799074312192.000, dc_hat[0] = -3731293103718400.000 + Gradient do_[0] = -194926592512753664.000 +Backward Time Step 0: + Gradient di[0] = -7198460214247424.000, df[0] = -5261356949307392.000, dc_hat[0] = -7277402786889728.000 + Gradient do_[0] = -111676502680010752.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1281944584192.000, df[0] = 939928453120.000, dc_hat[0] = 556539052032.000 + Gradient do_[0] = 79483218427904.000 +Backward Time Step 3: + Gradient di[0] = 2011221196800.000, df[0] = 1451387125760.000, dc_hat[0] = 787329056768.000 + Gradient do_[0] = 106389712142336.000 +Backward Time Step 2: + Gradient di[0] = 2559152750592.000, df[0] = 1838088585216.000, dc_hat[0] = 1307478458368.000 + Gradient do_[0] = 115939018276864.000 +Backward Time Step 1: + Gradient di[0] = 3209559277568.000, df[0] = 2213209702400.000, dc_hat[0] = 1784920801280.000 + Gradient do_[0] = 102168136777728.000 +Backward Time Step 0: + Gradient di[0] = 3885738229760.000, df[0] = 2764680462336.000, dc_hat[0] = 3646133370880.000 + Gradient do_[0] = 60073162309632.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2376043017011200.000, df[0] = -1865409926004736.000, dc_hat[0] = -1073944157224960.000 + Gradient do_[0] = -138901664714194944.000 +Backward Time Step 3: + Gradient di[0] = -3730721336197120.000, df[0] = -2848096430063616.000, dc_hat[0] = -1541477452742656.000 + Gradient do_[0] = -189911599719383040.000 +Backward Time Step 2: + Gradient di[0] = -4887296973209600.000, df[0] = -3682417181196288.000, dc_hat[0] = -2692994423586816.000 + Gradient do_[0] = -217289336171790336.000 +Backward Time Step 1: + Gradient di[0] = -6166277030674432.000, df[0] = -4411743430246400.000, dc_hat[0] = -3732088477974528.000 + Gradient do_[0] = -194968305235132416.000 +Backward Time Step 0: + Gradient di[0] = -7199991906959360.000, df[0] = -5262476325158912.000, dc_hat[0] = -7278951659470848.000 + Gradient do_[0] = -111700253849157632.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1282231631872.000, df[0] = 940139020288.000, dc_hat[0] = 556663701504.000 + Gradient do_[0] = 79501077774336.000 +Backward Time Step 3: + Gradient di[0] = 2011671429120.000, df[0] = 1451712184320.000, dc_hat[0] = 787505348608.000 + Gradient do_[0] = 106413535789056.000 +Backward Time Step 2: + Gradient di[0] = 2559729205248.000, df[0] = 1838502510592.000, dc_hat[0] = 1307772321792.000 + Gradient do_[0] = 115965157179392.000 +Backward Time Step 1: + Gradient di[0] = 3210280435712.000, df[0] = 2213706989568.000, dc_hat[0] = 1785319522304.000 + Gradient do_[0] = 102191079620608.000 +Backward Time Step 0: + Gradient di[0] = 3886603829248.000, df[0] = 2765296238592.000, dc_hat[0] = 3646945230848.000 + Gradient do_[0] = 60086542139392.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2376558949957632.000, df[0] = -1865814458236928.000, dc_hat[0] = -1074177293418496.000 + Gradient do_[0] = -138931781024874496.000 +Backward Time Step 3: + Gradient di[0] = -3731528253177856.000, df[0] = -2848711952564224.000, dc_hat[0] = -1541809775837184.000 + Gradient do_[0] = -189952573707386880.000 +Backward Time Step 2: + Gradient di[0] = -4888359977615360.000, df[0] = -3683217924161536.000, dc_hat[0] = -2693578807574528.000 + Gradient do_[0] = -217336512092569600.000 +Backward Time Step 1: + Gradient di[0] = -6167615449858048.000, df[0] = -4412701476388864.000, dc_hat[0] = -3732895126519808.000 + Gradient do_[0] = -195010584893194240.000 +Backward Time Step 0: + Gradient di[0] = -7201550980087808.000, df[0] = -5263616102105088.000, dc_hat[0] = -7280527912468480.000 + Gradient do_[0] = -111724468874772480.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1282642542592.000, df[0] = 940440027136.000, dc_hat[0] = 556842090496.000 + Gradient do_[0] = 79526503645184.000 +Backward Time Step 3: + Gradient di[0] = 2012316565504.000, df[0] = 1452177883136.000, dc_hat[0] = 787757531136.000 + Gradient do_[0] = 106447652257792.000 +Backward Time Step 2: + Gradient di[0] = 2560546308096.000, df[0] = 1839089975296.000, dc_hat[0] = 1308190179328.000 + Gradient do_[0] = 116002125774848.000 +Backward Time Step 1: + Gradient di[0] = 3211307778048.000, df[0] = 2214415040512.000, dc_hat[0] = 1785888112640.000 + Gradient do_[0] = 102223702917120.000 +Backward Time Step 0: + Gradient di[0] = 3887841148928.000, df[0] = 2766176518144.000, dc_hat[0] = 3648106266624.000 + Gradient do_[0] = 60105676554240.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2377044549697536.000, df[0] = -1866195905019904.000, dc_hat[0] = -1074396806512640.000 + Gradient do_[0] = -138960222298308608.000 +Backward Time Step 3: + Gradient di[0] = -3732298931372032.000, df[0] = -2849300899954688.000, dc_hat[0] = -1542128274505728.000 + Gradient do_[0] = -189991829708472320.000 +Backward Time Step 2: + Gradient di[0] = -4889368758059008.000, df[0] = -3683978401808384.000, dc_hat[0] = -2694133663662080.000 + Gradient do_[0] = -217381334371270656.000 +Backward Time Step 1: + Gradient di[0] = -6168881391468544.000, df[0] = -4413606103875584.000, dc_hat[0] = -3733655872602112.000 + Gradient do_[0] = -195050545268916224.000 +Backward Time Step 0: + Gradient di[0] = -7203025227612160.000, df[0] = -5264693065154560.000, dc_hat[0] = -7282017729249280.000 + Gradient do_[0] = -111747326690721792.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1282993029120.000, df[0] = 940697124864.000, dc_hat[0] = 556994199552.000 + Gradient do_[0] = 79548246917120.000 +Backward Time Step 3: + Gradient di[0] = 2012868247552.000, df[0] = 1452575817728.000, dc_hat[0] = 787973079040.000 + Gradient do_[0] = 106476811059200.000 +Backward Time Step 2: + Gradient di[0] = 2561249902592.000, df[0] = 1839594995712.000, dc_hat[0] = 1308548530176.000 + Gradient do_[0] = 116033960542208.000 +Backward Time Step 1: + Gradient di[0] = 3212191727616.000, df[0] = 2215024787456.000, dc_hat[0] = 1786377928704.000 + Gradient do_[0] = 102251855085568.000 +Backward Time Step 0: + Gradient di[0] = 3888913580032.000, df[0] = 2766939619328.000, dc_hat[0] = 3649112899584.000 + Gradient do_[0] = 60122252443648.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2377567998836736.000, df[0] = -1866606477049856.000, dc_hat[0] = -1074632425734144.000 + Gradient do_[0] = -138990768105717760.000 +Backward Time Step 3: + Gradient di[0] = -3733113901416448.000, df[0] = -2849922864906240.000, dc_hat[0] = -1542463684608000.000 + Gradient do_[0] = -190033319092551680.000 +Backward Time Step 2: + Gradient di[0] = -4890435520561152.000, df[0] = -3684782634434560.000, dc_hat[0] = -2694719926697984.000 + Gradient do_[0] = -217428664910872576.000 +Backward Time Step 1: + Gradient di[0] = -6170233769295872.000, df[0] = -4414574082129920.000, dc_hat[0] = -3734470574211072.000 + Gradient do_[0] = -195093185704230912.000 +Backward Time Step 0: + Gradient di[0] = -7204599333126144.000, df[0] = -5265844653260800.000, dc_hat[0] = -7283610088374272.000 + Gradient do_[0] = -111771747874766848.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1283232366592.000, df[0] = 940872564736.000, dc_hat[0] = 557097811968.000 + Gradient do_[0] = 79563077976064.000 +Backward Time Step 3: + Gradient di[0] = 2013240885248.000, df[0] = 1452844777472.000, dc_hat[0] = 788118372352.000 + Gradient do_[0] = 106496423624704.000 +Backward Time Step 2: + Gradient di[0] = 2561721499648.000, df[0] = 1839933554688.000, dc_hat[0] = 1308788129792.000 + Gradient do_[0] = 116055301160960.000 +Backward Time Step 1: + Gradient di[0] = 3212783386624.000, df[0] = 2215432421376.000, dc_hat[0] = 1786704429056.000 + Gradient do_[0] = 102270662344704.000 +Backward Time Step 0: + Gradient di[0] = 3889633165312.000, df[0] = 2767451586560.000, dc_hat[0] = 3649787920384.000 + Gradient do_[0] = 60133371543552.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2378066483478528.000, df[0] = -1866997855944704.000, dc_hat[0] = -1074858247061504.000 + Gradient do_[0] = -139019845034311680.000 +Backward Time Step 3: + Gradient di[0] = -3733896659206144.000, df[0] = -2850520402231296.000, dc_hat[0] = -1542787015114752.000 + Gradient do_[0] = -190073124849451008.000 +Backward Time Step 2: + Gradient di[0] = -4891459333390336.000, df[0] = -3685553581064192.000, dc_hat[0] = -2695282030542848.000 + Gradient do_[0] = -217474174384340992.000 +Backward Time Step 1: + Gradient di[0] = -6171525480710144.000, df[0] = -4415498305404928.000, dc_hat[0] = -3735248500162560.000 + Gradient do_[0] = -195133970713673728.000 +Backward Time Step 0: + Gradient di[0] = -7206103645421568.000, df[0] = -5266943628017664.000, dc_hat[0] = -7285129969926144.000 + Gradient do_[0] = -111795086727053312.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1283553755136.000, df[0] = 941108166656.000, dc_hat[0] = 557237272576.000 + Gradient do_[0] = 79583009308672.000 +Backward Time Step 3: + Gradient di[0] = 2013747347456.000, df[0] = 1453210075136.000, dc_hat[0] = 788316225536.000 + Gradient do_[0] = 106523250393088.000 +Backward Time Step 2: + Gradient di[0] = 2562365849600.000, df[0] = 1840396238848.000, dc_hat[0] = 1309116727296.000 + Gradient do_[0] = 116084476739584.000 +Backward Time Step 1: + Gradient di[0] = 3213590528000.000, df[0] = 2215989215232.000, dc_hat[0] = 1787151908864.000 + Gradient do_[0] = 102296306319360.000 +Backward Time Step 0: + Gradient di[0] = 3890606243840.000, df[0] = 2768143908864.000, dc_hat[0] = 3650700967936.000 + Gradient do_[0] = 60148424900608.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2378556915056640.000, df[0] = -1867383195041792.000, dc_hat[0] = -1075079504986112.000 + Gradient do_[0] = -139048526825914368.000 +Backward Time Step 3: + Gradient di[0] = -3734669753319424.000, df[0] = -2851110691799040.000, dc_hat[0] = -1543106319089664.000 + Gradient do_[0] = -190112363670667264.000 +Backward Time Step 2: + Gradient di[0] = -4892474019414016.000, df[0] = -3686317548371968.000, dc_hat[0] = -2695838497243136.000 + Gradient do_[0] = -217519065382518784.000 +Backward Time Step 1: + Gradient di[0] = -6172804307222528.000, df[0] = -4416412596568064.000, dc_hat[0] = -3736017299308544.000 + Gradient do_[0] = -195174360586125312.000 +Backward Time Step 0: + Gradient di[0] = -7207592388460544.000, df[0] = -5268031865356288.000, dc_hat[0] = -7286634819092480.000 + Gradient do_[0] = -111818176471236608.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1283836215296.000, df[0] = 941315391488.000, dc_hat[0] = 557359955968.000 + Gradient do_[0] = 79600482779136.000 +Backward Time Step 3: + Gradient di[0] = 2014189453312.000, df[0] = 1453529366528.000, dc_hat[0] = 788489306112.000 + Gradient do_[0] = 106546612666368.000 +Backward Time Step 2: + Gradient di[0] = 2562929459200.000, df[0] = 1840800989184.000, dc_hat[0] = 1309403906048.000 + Gradient do_[0] = 116110011662336.000 +Backward Time Step 1: + Gradient di[0] = 3214296743936.000, df[0] = 2216476278784.000, dc_hat[0] = 1787542110208.000 + Gradient do_[0] = 102318771011584.000 +Backward Time Step 0: + Gradient di[0] = 3891458473984.000, df[0] = 2768750247936.000, dc_hat[0] = 3651500244992.000 + Gradient do_[0] = 60161590820864.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2379071505825792.000, df[0] = -1867786787749888.000, dc_hat[0] = -1075311433220096.000 + Gradient do_[0] = -139078583007051776.000 +Backward Time Step 3: + Gradient di[0] = -3735475596558336.000, df[0] = -2851725677428736.000, dc_hat[0] = -1543438239531008.000 + Gradient do_[0] = -190153389198278656.000 +Backward Time Step 2: + Gradient di[0] = -4893527897014272.000, df[0] = -3687112117321728.000, dc_hat[0] = -2696419123134464.000 + Gradient do_[0] = -217565983605260288.000 +Backward Time Step 1: + Gradient di[0] = -6174143800147968.000, df[0] = -4417370642710528.000, dc_hat[0] = -3736823679418368.000 + Gradient do_[0] = -195216657424056320.000 +Backward Time Step 0: + Gradient di[0] = -7209151461588992.000, df[0] = -5269171642302464.000, dc_hat[0] = -7288211072090112.000 + Gradient do_[0] = -111842374316982272.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1284133224448.000, df[0] = 941533036544.000, dc_hat[0] = 557488340992.000 + Gradient do_[0] = 79618887385088.000 +Backward Time Step 3: + Gradient di[0] = 2014655414272.000, df[0] = 1453865566208.000, dc_hat[0] = 788671627264.000 + Gradient do_[0] = 106571258396672.000 +Backward Time Step 2: + Gradient di[0] = 2563525050368.000, df[0] = 1841228677120.000, dc_hat[0] = 1309707075584.000 + Gradient do_[0] = 116136913928192.000 +Backward Time Step 1: + Gradient di[0] = 3215045689344.000, df[0] = 2216992178176.000, dc_hat[0] = 1787956166656.000 + Gradient do_[0] = 102342544326656.000 +Backward Time Step 0: + Gradient di[0] = 3892352909312.000, df[0] = 2769386733568.000, dc_hat[0] = 3652340154368.000 + Gradient do_[0] = 60175419441152.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2379567842983936.000, df[0] = -1868176556032000.000, dc_hat[0] = -1075535643934720.000 + Gradient do_[0] = -139107599806103552.000 +Backward Time Step 3: + Gradient di[0] = -3736253254074368.000, df[0] = -2852319725092864.000, dc_hat[0] = -1543758751465472.000 + Gradient do_[0] = -190192988796747776.000 +Backward Time Step 2: + Gradient di[0] = -4894552783585280.000, df[0] = -3687883600822272.000, dc_hat[0] = -2696981226979328.000 + Gradient do_[0] = -217611493078728704.000 +Backward Time Step 1: + Gradient di[0] = -6175428532240384.000, df[0] = -4418289765711872.000, dc_hat[0] = -3737598384144384.000 + Gradient do_[0] = -195257253454938112.000 +Backward Time Step 0: + Gradient di[0] = -7210653626400768.000, df[0] = -5270269006446592.000, dc_hat[0] = -7289729879900160.000 + Gradient do_[0] = -111865661629661184.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1284572971008.000, df[0] = 941855408128.000, dc_hat[0] = 557679247360.000 + Gradient do_[0] = 79646133583872.000 +Backward Time Step 3: + Gradient di[0] = 2015344721920.000, df[0] = 1454362853376.000, dc_hat[0] = 788940914688.000 + Gradient do_[0] = 106607706898432.000 +Backward Time Step 2: + Gradient di[0] = 2564400349184.000, df[0] = 1841857298432.000, dc_hat[0] = 1310152982528.000 + Gradient do_[0] = 116176558489600.000 +Backward Time Step 1: + Gradient di[0] = 3216140926976.000, df[0] = 2217747677184.000, dc_hat[0] = 1788563685376.000 + Gradient do_[0] = 102377398992896.000 +Backward Time Step 0: + Gradient di[0] = 3893690105856.000, df[0] = 2770338054144.000, dc_hat[0] = 3653594513408.000 + Gradient do_[0] = 60196093165568.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2380064180142080.000, df[0] = -1868566190096384.000, dc_hat[0] = -1075759988867072.000 + Gradient do_[0] = -139136565065547776.000 +Backward Time Step 3: + Gradient di[0] = -3737036817170432.000, df[0] = -2852917799288832.000, dc_hat[0] = -1544082216189952.000 + Gradient do_[0] = -190232725834170368.000 +Backward Time Step 2: + Gradient di[0] = -4895578207027200.000, df[0] = -3688655889629184.000, dc_hat[0] = -2697545209872384.000 + Gradient do_[0] = -217657105631412224.000 +Backward Time Step 1: + Gradient di[0] = -6176724538621952.000, df[0] = -4419216404905984.000, dc_hat[0] = -3738376578531328.000 + Gradient do_[0] = -195298141543596032.000 +Backward Time Step 0: + Gradient di[0] = -7212169749856256.000, df[0] = -5271377108008960.000, dc_hat[0] = -7291262109483008.000 + Gradient do_[0] = -111889189460508672.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1284826202112.000, df[0] = 942041137152.000, dc_hat[0] = 557789216768.000 + Gradient do_[0] = 79661837058048.000 +Backward Time Step 3: + Gradient di[0] = 2015742787584.000, df[0] = 1454650032128.000, dc_hat[0] = 789096497152.000 + Gradient do_[0] = 106628728750080.000 +Backward Time Step 2: + Gradient di[0] = 2564908122112.000, df[0] = 1842221940736.000, dc_hat[0] = 1310412111872.000 + Gradient do_[0] = 116199518109696.000 +Backward Time Step 1: + Gradient di[0] = 3216779509760.000, df[0] = 2218187816960.000, dc_hat[0] = 1788916793344.000 + Gradient do_[0] = 102397665869824.000 +Backward Time Step 0: + Gradient di[0] = 3894460284928.000, df[0] = 2770885935104.000, dc_hat[0] = 3654317244416.000 + Gradient do_[0] = 60208004988928.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2380562396348416.000, df[0] = -1868957300555776.000, dc_hat[0] = -1075984937779200.000 + Gradient do_[0] = -139165693533749248.000 +Backward Time Step 3: + Gradient di[0] = -3737817695911936.000, df[0] = -2853513994436608.000, dc_hat[0] = -1544403936083968.000 + Gradient do_[0] = -190272462871592960.000 +Backward Time Step 2: + Gradient di[0] = -4896603093598208.000, df[0] = -3689428446871552.000, dc_hat[0] = -2698109461200896.000 + Gradient do_[0] = -217702597925011456.000 +Backward Time Step 1: + Gradient di[0] = -6178018397519872.000, df[0] = -4420141701922816.000, dc_hat[0] = -3739155846660096.000 + Gradient do_[0] = -195338995272515584.000 +Backward Time Step 0: + Gradient di[0] = -7213675672764416.000, df[0] = -5272478230249472.000, dc_hat[0] = -7292785212260352.000 + Gradient do_[0] = -111912554082598912.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1285111021568.000, df[0] = 942249869312.000, dc_hat[0] = 557912555520.000 + Gradient do_[0] = 79679495077888.000 +Backward Time Step 3: + Gradient di[0] = 2016188956672.000, df[0] = 1454972076032.000, dc_hat[0] = 789271150592.000 + Gradient do_[0] = 106652334292992.000 +Backward Time Step 2: + Gradient di[0] = 2565476188160.000, df[0] = 1842630098944.000, dc_hat[0] = 1310702305280.000 + Gradient do_[0] = 116225254359040.000 +Backward Time Step 1: + Gradient di[0] = 3217493065728.000, df[0] = 2218679599104.000, dc_hat[0] = 1789310402560.000 + Gradient do_[0] = 102420340277248.000 +Backward Time Step 0: + Gradient di[0] = 3895318282240.000, df[0] = 2771496468480.000, dc_hat[0] = 3655122550784.000 + Gradient do_[0] = 60221267378176.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2381066518134784.000, df[0] = -1869353511288832.000, dc_hat[0] = -1076212772372480.000 + Gradient do_[0] = -139195191369138176.000 +Backward Time Step 3: + Gradient di[0] = -3738609580507136.000, df[0] = -2854119047954432.000, dc_hat[0] = -1544730890469376.000 + Gradient do_[0] = -190312784024567808.000 +Backward Time Step 2: + Gradient di[0] = -4897633348878336.000, df[0] = -3690204225339392.000, dc_hat[0] = -2698673712529408.000 + Gradient do_[0] = -217748347916648448.000 +Backward Time Step 1: + Gradient di[0] = -6179317625126912.000, df[0] = -4421071293906944.000, dc_hat[0] = -3739938336014336.000 + Gradient do_[0] = -195380037979996160.000 +Backward Time Step 0: + Gradient di[0] = -7215195017445376.000, df[0] = -5273588479295488.000, dc_hat[0] = -7294321199939584.000 + Gradient do_[0] = -111936124863119360.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1285443551232.000, df[0] = 942493728768.000, dc_hat[0] = 558056603648.000 + Gradient do_[0] = 79700080721920.000 +Backward Time Step 3: + Gradient di[0] = 2016712851456.000, df[0] = 1455350218752.000, dc_hat[0] = 789475819520.000 + Gradient do_[0] = 106680067031040.000 +Backward Time Step 2: + Gradient di[0] = 2566145179648.000, df[0] = 1843110739968.000, dc_hat[0] = 1311043616768.000 + Gradient do_[0] = 116255461736448.000 +Backward Time Step 1: + Gradient di[0] = 3218329305088.000, df[0] = 2219256315904.000, dc_hat[0] = 1789773611008.000 + Gradient do_[0] = 102446906998784.000 +Backward Time Step 0: + Gradient di[0] = 3896323342336.000, df[0] = 2772211597312.000, dc_hat[0] = 3656065482752.000 + Gradient do_[0] = 60236811468800.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2381585940742144.000, df[0] = -1869761130528768.000, dc_hat[0] = -1076446982307840.000 + Gradient do_[0] = -139225496658378752.000 +Backward Time Step 3: + Gradient di[0] = -3739425892728832.000, df[0] = -2854741818212352.000, dc_hat[0] = -1545067105878016.000 + Gradient do_[0] = -190354273408647168.000 +Backward Time Step 2: + Gradient di[0] = -4898705480089600.000, df[0] = -3691012484497408.000, dc_hat[0] = -2699263465226240.000 + Gradient do_[0] = -217795987693895680.000 +Backward Time Step 1: + Gradient di[0] = -6180674834792448.000, df[0] = -4422041956515840.000, dc_hat[0] = -3740754379800576.000 + Gradient do_[0] = -195422815854264320.000 +Backward Time Step 0: + Gradient di[0] = -7216766975475712.000, df[0] = -5274737919918080.000, dc_hat[0] = -7295911411580928.000 + Gradient do_[0] = -111960520277360640.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1285770444800.000, df[0] = 942733459456.000, dc_hat[0] = 558198489088.000 + Gradient do_[0] = 79720355987456.000 +Backward Time Step 3: + Gradient di[0] = 2017224818688.000, df[0] = 1455719448576.000, dc_hat[0] = 789676097536.000 + Gradient do_[0] = 106707128680448.000 +Backward Time Step 2: + Gradient di[0] = 2566793199616.000, df[0] = 1843575914496.000, dc_hat[0] = 1311373918208.000 + Gradient do_[0] = 116284805087232.000 +Backward Time Step 1: + Gradient di[0] = 3219143524352.000, df[0] = 2219817566208.000, dc_hat[0] = 1790224629760.000 + Gradient do_[0] = 102472819408896.000 +Backward Time Step 0: + Gradient di[0] = 3897312411648.000, df[0] = 2772915453952.000, dc_hat[0] = 3656993472512.000 + Gradient do_[0] = 60252095512576.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2382075030142976.000, df[0] = -1870145127448576.000, dc_hat[0] = -1076667434926080.000 + Gradient do_[0] = -139254083960700928.000 +Backward Time Step 3: + Gradient di[0] = -3740192812826624.000, df[0] = -2855327275941888.000, dc_hat[0] = -1545383859716096.000 + Gradient do_[0] = -190393288891564032.000 +Backward Time Step 2: + Gradient di[0] = -4899708891824128.000, df[0] = -3691768667176960.000, dc_hat[0] = -2699814831652864.000 + Gradient do_[0] = -217840603814166528.000 +Backward Time Step 1: + Gradient di[0] = -6181937018306560.000, df[0] = -4422944436518912.000, dc_hat[0] = -3741514320576512.000 + Gradient do_[0] = -195462673150771200.000 +Backward Time Step 0: + Gradient di[0] = -7218238001774592.000, df[0] = -5275812198612992.000, dc_hat[0] = -7297397470265344.000 + Gradient do_[0] = -111983335143636992.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1286065750016.000, df[0] = 942949924864.000, dc_hat[0] = 558326677504.000 + Gradient do_[0] = 79738659930112.000 +Backward Time Step 3: + Gradient di[0] = 2017687240704.000, df[0] = 1456053420032.000, dc_hat[0] = 789856714752.000 + Gradient do_[0] = 106731564695552.000 +Backward Time Step 2: + Gradient di[0] = 2567383810048.000, df[0] = 1844000063488.000, dc_hat[0] = 1311674466304.000 + Gradient do_[0] = 116311556358144.000 +Backward Time Step 1: + Gradient di[0] = 3219884343296.000, df[0] = 2220328222720.000, dc_hat[0] = 1790634360832.000 + Gradient do_[0] = 102496374620160.000 +Backward Time Step 0: + Gradient di[0] = 3898207633408.000, df[0] = 2773552463872.000, dc_hat[0] = 3657833644032.000 + Gradient do_[0] = 60265936715776.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2382580225671168.000, df[0] = -1870541875052544.000, dc_hat[0] = -1076896410370048.000 + Gradient do_[0] = -139283633335697408.000 +Backward Time Step 3: + Gradient di[0] = -3740983355244544.000, df[0] = -2855930181976064.000, dc_hat[0] = -1545708800835584.000 + Gradient do_[0] = -190433558504931328.000 +Backward Time Step 2: + Gradient di[0] = -4900747200167936.000, df[0] = -3692551156531200.000, dc_hat[0] = -2700386330738688.000 + Gradient do_[0] = -217886611503841280.000 +Backward Time Step 1: + Gradient di[0] = -6183244835848192.000, df[0] = -4423880739389440.000, dc_hat[0] = -3742302178639872.000 + Gradient do_[0] = -195503990736158720.000 +Backward Time Step 0: + Gradient di[0] = -7219769157615616.000, df[0] = -5276932111335424.000, dc_hat[0] = -7298945805975552.000 + Gradient do_[0] = -112007094902718464.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1286428426240.000, df[0] = 943215869952.000, dc_hat[0] = 558484160512.000 + Gradient do_[0] = 79761141399552.000 +Backward Time Step 3: + Gradient di[0] = 2018257141760.000, df[0] = 1456464330752.000, dc_hat[0] = 790079668224.000 + Gradient do_[0] = 106761671409664.000 +Backward Time Step 2: + Gradient di[0] = 2568108376064.000, df[0] = 1844519895040.000, dc_hat[0] = 1312042123264.000 + Gradient do_[0] = 116344347426816.000 +Backward Time Step 1: + Gradient di[0] = 3220794245120.000, df[0] = 2220955795456.000, dc_hat[0] = 1791137808384.000 + Gradient do_[0] = 102525290151936.000 +Backward Time Step 0: + Gradient di[0] = 3899307589632.000, df[0] = 2774334963712.000, dc_hat[0] = 3658865704960.000 + Gradient do_[0] = 60282948812800.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2383086494941184.000, df[0] = -1870939025309696.000, dc_hat[0] = -1077124446289920.000 + Gradient do_[0] = -139313156940890112.000 +Backward Time Step 3: + Gradient di[0] = -3741779803242496.000, df[0] = -2856538993590272.000, dc_hat[0] = -1546037768486912.000 + Gradient do_[0] = -190474120176074752.000 +Backward Time Step 2: + Gradient di[0] = -4901789266608128.000, df[0] = -3693336330240000.000, dc_hat[0] = -2700960245743616.000 + Gradient do_[0] = -217932962790899712.000 +Backward Time Step 1: + Gradient di[0] = -6184566075162624.000, df[0] = -4424824558452736.000, dc_hat[0] = -3743096747589632.000 + Gradient do_[0] = -195545703458537472.000 +Backward Time Step 0: + Gradient di[0] = -7221307292778496.000, df[0] = -5278056319025152.000, dc_hat[0] = -7300501121007616.000 + Gradient do_[0] = -112030949151080448.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1286752567296.000, df[0] = 943453503488.000, dc_hat[0] = 558624866304.000 + Gradient do_[0] = 79781223727104.000 +Backward Time Step 3: + Gradient di[0] = 2018765570048.000, df[0] = 1456831332352.000, dc_hat[0] = 790278307840.000 + Gradient do_[0] = 106788531732480.000 +Backward Time Step 2: + Gradient di[0] = 2568756396032.000, df[0] = 1844985593856.000, dc_hat[0] = 1312373211136.000 + Gradient do_[0] = 116373665611776.000 +Backward Time Step 1: + Gradient di[0] = 3221604794368.000, df[0] = 2221514686464.000, dc_hat[0] = 1791586598912.000 + Gradient do_[0] = 102551059955712.000 +Backward Time Step 0: + Gradient di[0] = 3900284338176.000, df[0] = 2775029907456.000, dc_hat[0] = 3659782422528.000 + Gradient do_[0] = 60298039918592.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2383601085710336.000, df[0] = -1871342752235520.000, dc_hat[0] = -1077357045612544.000 + Gradient do_[0] = -139343195942158336.000 +Backward Time Step 3: + Gradient di[0] = -3742583230562304.000, df[0] = -2857152368607232.000, dc_hat[0] = -1546369152057344.000 + Gradient do_[0] = -190514991084863488.000 +Backward Time Step 2: + Gradient di[0] = -4902843144208384.000, df[0] = -3694130630754304.000, dc_hat[0] = -2701538992586752.000 + Gradient do_[0] = -217979726394818560.000 +Backward Time Step 1: + Gradient di[0] = -6185897515024384.000, df[0] = -4425777772756992.000, dc_hat[0] = -3743898564296704.000 + Gradient do_[0] = -195587759778299904.000 +Backward Time Step 0: + Gradient di[0] = -7222863681552384.000, df[0] = -5279193948487680.000, dc_hat[0] = -7302074152779776.000 + Gradient do_[0] = -112055095457218560.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1287035944960.000, df[0] = 943661252608.000, dc_hat[0] = 558747877376.000 + Gradient do_[0] = 79798781083648.000 +Backward Time Step 3: + Gradient di[0] = 2019209117696.000, df[0] = 1457151541248.000, dc_hat[0] = 790451912704.000 + Gradient do_[0] = 106812003057664.000 +Backward Time Step 2: + Gradient di[0] = 2569316859904.000, df[0] = 1845388247040.000, dc_hat[0] = 1312659341312.000 + Gradient do_[0] = 116399074705408.000 +Backward Time Step 1: + Gradient di[0] = 3222309175296.000, df[0] = 2222000439296.000, dc_hat[0] = 1791976407040.000 + Gradient do_[0] = 102573482704896.000 +Backward Time Step 0: + Gradient di[0] = 3901138403328.000, df[0] = 2775637819392.000, dc_hat[0] = 3660583534592.000 + Gradient do_[0] = 60311247781888.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2384090443546624.000, df[0] = -1871726883373056.000, dc_hat[0] = -1077577632448512.000 + Gradient do_[0] = -139371877733761024.000 +Backward Time Step 3: + Gradient di[0] = -3743357666852864.000, df[0] = -2857743463481344.000, dc_hat[0] = -1546688992903168.000 + Gradient do_[0] = -190554384524902400.000 +Backward Time Step 2: + Gradient di[0] = -4903862125199360.000, df[0] = -3694897550852096.000, dc_hat[0] = -2702098143641600.000 + Gradient do_[0] = -218024960990380032.000 +Backward Time Step 1: + Gradient di[0] = -6187181710245888.000, df[0] = -4426696358887424.000, dc_hat[0] = -3744671658409984.000 + Gradient do_[0] = -195628269909835776.000 +Backward Time Step 0: + Gradient di[0] = -7224345982140416.000, df[0] = -5280276817117184.000, dc_hat[0] = -7303572559495168.000 + Gradient do_[0] = -112078090712121344.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1287361658880.000, df[0] = 943900196864.000, dc_hat[0] = 558889304064.000 + Gradient do_[0] = 79818980851712.000 +Backward Time Step 3: + Gradient di[0] = 2019720822784.000, df[0] = 1457520640000.000, dc_hat[0] = 790651994112.000 + Gradient do_[0] = 106839056318464.000 +Backward Time Step 2: + Gradient di[0] = 2569971957760.000, df[0] = 1845858271232.000, dc_hat[0] = 1312992526336.000 + Gradient do_[0] = 116428711657472.000 +Backward Time Step 1: + Gradient di[0] = 3223131258880.000, df[0] = 2222566670336.000, dc_hat[0] = 1792430571520.000 + Gradient do_[0] = 102599571275776.000 +Backward Time Step 0: + Gradient di[0] = 3902127472640.000, df[0] = 2776341151744.000, dc_hat[0] = 3661511786496.000 + Gradient do_[0] = 60326536019968.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2384583559479296.000, df[0] = -1872114101518336.000, dc_hat[0] = -1077800568094720.000 + Gradient do_[0] = -139400722734120960.000 +Backward Time Step 3: + Gradient di[0] = -3744130760966144.000, df[0] = -2858333484613632.000, dc_hat[0] = -1547007625789440.000 + Gradient do_[0] = -190593726425333760.000 +Backward Time Step 2: + Gradient di[0] = -4904869295030272.000, df[0] = -3695656686321664.000, dc_hat[0] = -2702650583810048.000 + Gradient do_[0] = -218069783269081088.000 +Backward Time Step 1: + Gradient di[0] = -6188447114985472.000, df[0] = -4427601523245056.000, dc_hat[0] = -3745432404492288.000 + Gradient do_[0] = -195668230285557760.000 +Backward Time Step 0: + Gradient di[0] = -7225842778243072.000, df[0] = -5281370960035840.000, dc_hat[0] = -7305085461725184.000 + Gradient do_[0] = -112101300715388928.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1287719092224.000, df[0] = 944162209792.000, dc_hat[0] = 559044296704.000 + Gradient do_[0] = 79841135165440.000 +Backward Time Step 3: + Gradient di[0] = 2020282073088.000, df[0] = 1457925652480.000, dc_hat[0] = 790871474176.000 + Gradient do_[0] = 106868735213568.000 +Backward Time Step 2: + Gradient di[0] = 2570685251584.000, df[0] = 1846371024896.000, dc_hat[0] = 1313357168640.000 + Gradient do_[0] = 116460974243840.000 +Backward Time Step 1: + Gradient di[0] = 3224024121344.000, df[0] = 2223182446592.000, dc_hat[0] = 1792925368320.000 + Gradient do_[0] = 102627958325248.000 +Backward Time Step 0: + Gradient di[0] = 3903205670912.000, df[0] = 2777108185088.000, dc_hat[0] = 3662523138048.000 + Gradient do_[0] = 60343208378368.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2385081507250176.000, df[0] = -1872505211977728.000, dc_hat[0] = -1078025315680256.000 + Gradient do_[0] = -139429748123107328.000 +Backward Time Step 3: + Gradient di[0] = -3744909760659456.000, df[0] = -2858928606019584.000, dc_hat[0] = -1547328674594816.000 + Gradient do_[0] = -190633291664064512.000 +Backward Time Step 2: + Gradient di[0] = -4905894718472192.000, df[0] = -3696429511999488.000, dc_hat[0] = -2703215103574016.000 + Gradient do_[0] = -218115258382811136.000 +Backward Time Step 1: + Gradient di[0] = -6189741510754304.000, df[0] = -4428527625568256.000, dc_hat[0] = -3746212209491968.000 + Gradient do_[0] = -195709032474869760.000 +Backward Time Step 0: + Gradient di[0] = -7227335816249344.000, df[0] = -5282462418599936.000, dc_hat[0] = -7306595142729728.000 + Gradient do_[0] = -112124484948852736.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1287902461952.000, df[0] = 944296558592.000, dc_hat[0] = 559123857408.000 + Gradient do_[0] = 79852459786240.000 +Backward Time Step 3: + Gradient di[0] = 2020570300416.000, df[0] = 1458134056960.000, dc_hat[0] = 790983933952.000 + Gradient do_[0] = 106883935371264.000 +Backward Time Step 2: + Gradient di[0] = 2571052515328.000, df[0] = 1846634872832.000, dc_hat[0] = 1313544077312.000 + Gradient do_[0] = 116477566910464.000 +Backward Time Step 1: + Gradient di[0] = 3224486281216.000, df[0] = 2223500951552.000, dc_hat[0] = 1793180303360.000 + Gradient do_[0] = 102642638389248.000 +Backward Time Step 0: + Gradient di[0] = 3903766659072.000, df[0] = 2777507430400.000, dc_hat[0] = 3663050047488.000 + Gradient do_[0] = 60351882199040.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2385566838554624.000, df[0] = -1872886256107520.000, dc_hat[0] = -1078244962992128.000 + Gradient do_[0] = -139458129266999296.000 +Backward Time Step 3: + Gradient di[0] = -3745676680757248.000, df[0] = -2859513526878208.000, dc_hat[0] = -1547645428432896.000 + Gradient do_[0] = -190672324326850560.000 +Backward Time Step 2: + Gradient di[0] = -4906898667077632.000, df[0] = -3697185426243584.000, dc_hat[0] = -2703766201565184.000 + Gradient do_[0] = -218159891682951168.000 +Backward Time Step 1: + Gradient di[0] = -6191002083655680.000, df[0] = -4429428763394048.000, dc_hat[0] = -3746971344961536.000 + Gradient do_[0] = -195748872591507456.000 +Backward Time Step 0: + Gradient di[0] = -7228812211257344.000, df[0] = -5283541529133056.000, dc_hat[0] = -7308087643865088.000 + Gradient do_[0] = -112147385714475008.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1288276672512.000, df[0] = 944571088896.000, dc_hat[0] = 559286321152.000 + Gradient do_[0] = 79875679453184.000 +Backward Time Step 3: + Gradient di[0] = 2021157502976.000, df[0] = 1458557681664.000, dc_hat[0] = 791213309952.000 + Gradient do_[0] = 106914956443648.000 +Backward Time Step 2: + Gradient di[0] = 2571798577152.000, df[0] = 1847170826240.000, dc_hat[0] = 1313924710400.000 + Gradient do_[0] = 116511406555136.000 +Backward Time Step 1: + Gradient di[0] = 3225422921728.000, df[0] = 2224146612224.000, dc_hat[0] = 1793699217408.000 + Gradient do_[0] = 102672426336256.000 +Backward Time Step 0: + Gradient di[0] = 3904897810432.000, df[0] = 2778312212480.000, dc_hat[0] = 3664111206400.000 + Gradient do_[0] = 60369364058112.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2386069349728256.000, df[0] = -1873280453574656.000, dc_hat[0] = -1078471723843584.000 + Gradient do_[0] = -139487498253369344.000 +Backward Time Step 3: + Gradient di[0] = -3746462928207872.000, df[0] = -2860114016993280.000, dc_hat[0] = -1547969698463744.000 + Gradient do_[0] = -190712267522703360.000 +Backward Time Step 2: + Gradient di[0] = -4907920869294080.000, df[0] = -3697955567566848.000, dc_hat[0] = -2704327231668224.000 + Gradient do_[0] = -218205315257073664.000 +Backward Time Step 1: + Gradient di[0] = -6192302385004544.000, df[0] = -4430359697555456.000, dc_hat[0] = -3747754102751232.000 + Gradient do_[0] = -195789880939249664.000 +Backward Time Step 0: + Gradient di[0] = -7230323502874624.000, df[0] = -5284645872599040.000, dc_hat[0] = -7309615041609728.000 + Gradient do_[0] = -112170836235911168.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1288628338688.000, df[0] = 944828907520.000, dc_hat[0] = 559438692352.000 + Gradient do_[0] = 79897473056768.000 +Backward Time Step 3: + Gradient di[0] = 2021708267520.000, df[0] = 1458954829824.000, dc_hat[0] = 791428268032.000 + Gradient do_[0] = 106944090079232.000 +Backward Time Step 2: + Gradient di[0] = 2572499288064.000, df[0] = 1847673618432.000, dc_hat[0] = 1314281488384.000 + Gradient do_[0] = 116543123881984.000 +Backward Time Step 1: + Gradient di[0] = 3226302939136.000, df[0] = 2224753475584.000, dc_hat[0] = 1794185887744.000 + Gradient do_[0] = 102700435898368.000 +Backward Time Step 0: + Gradient di[0] = 3905963425792.000, df[0] = 2779070332928.000, dc_hat[0] = 3665111023616.000 + Gradient do_[0] = 60385839284224.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2386564613144576.000, df[0] = -1873669416550400.000, dc_hat[0] = -1078695062142976.000 + Gradient do_[0] = -139516454922878976.000 +Backward Time Step 3: + Gradient di[0] = -3747240317288448.000, df[0] = -2860707259351040.000, dc_hat[0] = -1548289941962752.000 + Gradient do_[0] = -190751849941303296.000 +Backward Time Step 2: + Gradient di[0] = -4908950050832384.000, df[0] = -3698730540728320.000, dc_hat[0] = -2704891482996736.000 + Gradient do_[0] = -218250979349364736.000 +Backward Time Step 1: + Gradient di[0] = -6193598391386112.000, df[0] = -4431286336749568.000, dc_hat[0] = -3748533370880000.000 + Gradient do_[0] = -195830803387645952.000 +Backward Time Step 0: + Gradient di[0] = -7231826741428224.000, df[0] = -5285744847355904.000, dc_hat[0] = -7311135460032512.000 + Gradient do_[0] = -112194149318393856.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1288790605824.000, df[0] = 944947855360.000, dc_hat[0] = 559509143552.000 + Gradient do_[0] = 79907522609152.000 +Backward Time Step 3: + Gradient di[0] = 2021962416128.000, df[0] = 1459138330624.000, dc_hat[0] = 791527555072.000 + Gradient do_[0] = 106957495074816.000 +Backward Time Step 2: + Gradient di[0] = 2572823035904.000, df[0] = 1847906795520.000, dc_hat[0] = 1314446770176.000 + Gradient do_[0] = 116557736837120.000 +Backward Time Step 1: + Gradient di[0] = 3226707689472.000, df[0] = 2225032134656.000, dc_hat[0] = 1794408579072.000 + Gradient do_[0] = 102713245302784.000 +Backward Time Step 0: + Gradient di[0] = 3906454159360.000, df[0] = 2779419770880.000, dc_hat[0] = 3665571610624.000 + Gradient do_[0] = 60393430974464.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2387050212884480.000, df[0] = -1874050729115648.000, dc_hat[0] = -1078915179216896.000 + Gradient do_[0] = -139544861836574720.000 +Backward Time Step 3: + Gradient di[0] = -3748007237386240.000, df[0] = -2861292717080576.000, dc_hat[0] = -1548606830018560.000 + Gradient do_[0] = -190790831064481792.000 +Backward Time Step 2: + Gradient di[0] = -4909950241341440.000, df[0] = -3699484575924224.000, dc_hat[0] = -2705442580987904.000 + Gradient do_[0] = -218295354951467008.000 +Backward Time Step 1: + Gradient di[0] = -6194857353674752.000, df[0] = -4432186669268992.000, dc_hat[0] = -3749291969478656.000 + Gradient do_[0] = -195870609144545280.000 +Backward Time Step 0: + Gradient di[0] = -7233302599565312.000, df[0] = -5286823421018112.000, dc_hat[0] = -7312627424296960.000 + Gradient do_[0] = -112217050084016128.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1289152495616.000, df[0] = 945212948480.000, dc_hat[0] = 559666233344.000 + Gradient do_[0] = 79929953746944.000 +Backward Time Step 3: + Gradient di[0] = 2022529302528.000, df[0] = 1459547537408.000, dc_hat[0] = 791749525504.000 + Gradient do_[0] = 106987492737024.000 +Backward Time Step 2: + Gradient di[0] = 2573543145472.000, df[0] = 1848423350272.000, dc_hat[0] = 1314813247488.000 + Gradient do_[0] = 116590343356416.000 +Backward Time Step 1: + Gradient di[0] = 3227611561984.000, df[0] = 2225655513088.000, dc_hat[0] = 1794909536256.000 + Gradient do_[0] = 102741976285184.000 +Backward Time Step 0: + Gradient di[0] = 3907539697664.000, df[0] = 2780192047104.000, dc_hat[0] = 3666590564352.000 + Gradient do_[0] = 60410212384768.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2387544402558976.000, df[0] = -1874438752567296.000, dc_hat[0] = -1079137846427648.000 + Gradient do_[0] = -139573655297327104.000 +Backward Time Step 3: + Gradient di[0] = -3748781673676800.000, df[0] = -2861884080390144.000, dc_hat[0] = -1548924926033920.000 + Gradient do_[0] = -190830172964913152.000 +Backward Time Step 2: + Gradient di[0] = -4910960632397824.000, df[0] = -3700245590441984.000, dc_hat[0] = -2705996900204544.000 + Gradient do_[0] = -218340245949644800.000 +Backward Time Step 1: + Gradient di[0] = -6196140475154432.000, df[0] = -4433103913222144.000, dc_hat[0] = -3750062916108288.000 + Gradient do_[0] = -195911119276081152.000 +Backward Time Step 0: + Gradient di[0] = -7234795637571584.000, df[0] = -5287914342711296.000, dc_hat[0] = -7314137105301504.000 + Gradient do_[0] = -112240217137610752.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1289525133312.000, df[0] = 945486102528.000, dc_hat[0] = 559827648512.000 + Gradient do_[0] = 79953055973376.000 +Backward Time Step 3: + Gradient di[0] = 2023115063296.000, df[0] = 1459970375680.000, dc_hat[0] = 791978835968.000 + Gradient do_[0] = 107018488643584.000 +Backward Time Step 2: + Gradient di[0] = 2574289993728.000, df[0] = 1848960090112.000, dc_hat[0] = 1315194667008.000 + Gradient do_[0] = 116624132669440.000 +Backward Time Step 1: + Gradient di[0] = 3228547678208.000, df[0] = 2226300911616.000, dc_hat[0] = 1795428450304.000 + Gradient do_[0] = 102771730677760.000 +Backward Time Step 0: + Gradient di[0] = 3908676878336.000, df[0] = 2781001023488.000, dc_hat[0] = 3667657228288.000 + Gradient do_[0] = 60427790712832.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2388035907878912.000, df[0] = -1874824225882112.000, dc_hat[0] = -1079359439896576.000 + Gradient do_[0] = -139602362858733568.000 +Backward Time Step 3: + Gradient di[0] = -3749549130645504.000, df[0] = -2862469806555136.000, dc_hat[0] = -1549241948307456.000 + Gradient do_[0] = -190869257167306752.000 +Backward Time Step 2: + Gradient di[0] = -4911967265357824.000, df[0] = -3701003920605184.000, dc_hat[0] = -2706550145679360.000 + Gradient do_[0] = -218384965149130752.000 +Backward Time Step 1: + Gradient di[0] = -6197411248603136.000, df[0] = -4434012835676160.000, dc_hat[0] = -3750828494028800.000 + Gradient do_[0] = -195951182731018240.000 +Backward Time Step 0: + Gradient di[0] = -7236280622514176.000, df[0] = -5289000432566272.000, dc_hat[0] = -7315638196371456.000 + Gradient do_[0] = -112263238162317312.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1289776660480.000, df[0] = 945670520832.000, dc_hat[0] = 559936831488.000 + Gradient do_[0] = 79968658784256.000 +Backward Time Step 3: + Gradient di[0] = 2023510376448.000, df[0] = 1460255457280.000, dc_hat[0] = 792133042176.000 + Gradient do_[0] = 107039393054720.000 +Backward Time Step 2: + Gradient di[0] = 2574790688768.000, df[0] = 1849319620608.000, dc_hat[0] = 1315449339904.000 + Gradient do_[0] = 116646815465472.000 +Backward Time Step 1: + Gradient di[0] = 3229178134528.000, df[0] = 2226736070656.000, dc_hat[0] = 1795776839680.000 + Gradient do_[0] = 102791771062272.000 +Backward Time Step 0: + Gradient di[0] = 3909436047360.000, df[0] = 2781541040128.000, dc_hat[0] = 3668369735680.000 + Gradient do_[0] = 60439522181120.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2388525534150656.000, df[0] = -1875208893890560.000, dc_hat[0] = -1079580832038912.000 + Gradient do_[0] = -139630993110728704.000 +Backward Time Step 3: + Gradient di[0] = -3750322224758784.000, df[0] = -2863060364558336.000, dc_hat[0] = -1549561386500096.000 + Gradient do_[0] = -190908616247607296.000 +Backward Time Step 2: + Gradient di[0] = -4912984098865152.000, df[0] = -3701769766961152.000, dc_hat[0] = -2707109565169664.000 + Gradient do_[0] = -218430148205084672.000 +Backward Time Step 1: + Gradient di[0] = -6198691148857344.000, df[0] = -4434928469016576.000, dc_hat[0] = -3751598098481152.000 + Gradient do_[0] = -195991641322946560.000 +Backward Time Step 0: + Gradient di[0] = -7237769365553152.000, df[0] = -5290088669904896.000, dc_hat[0] = -7317143582408704.000 + Gradient do_[0] = -112286353676304384.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1289987031040.000, df[0] = 945824727040.000, dc_hat[0] = 560027992064.000 + Gradient do_[0] = 79981694681088.000 +Backward Time Step 3: + Gradient di[0] = 2023839760384.000, df[0] = 1460493090816.000, dc_hat[0] = 792262410240.000 + Gradient do_[0] = 107056832970752.000 +Backward Time Step 2: + Gradient di[0] = 2575212216320.000, df[0] = 1849622134784.000, dc_hat[0] = 1315664035840.000 + Gradient do_[0] = 116665891160064.000 +Backward Time Step 1: + Gradient di[0] = 3229707403264.000, df[0] = 2227100712960.000, dc_hat[0] = 1796069392384.000 + Gradient do_[0] = 102808598609920.000 +Backward Time Step 0: + Gradient di[0] = 3910073319424.000, df[0] = 2781994811392.000, dc_hat[0] = 3668967686144.000 + Gradient do_[0] = 60449382989824.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2389007912665088.000, df[0] = -1875587522101248.000, dc_hat[0] = -1079798667411456.000 + Gradient do_[0] = -139659159506255872.000 +Backward Time Step 3: + Gradient di[0] = -3751075723083776.000, df[0] = -2863635621740544.000, dc_hat[0] = -1549872637411328.000 + Gradient do_[0] = -190946910176018432.000 +Backward Time Step 2: + Gradient di[0] = -4913969793859584.000, df[0] = -3702512259432448.000, dc_hat[0] = -2707649657307136.000 + Gradient do_[0] = -218473802252681216.000 +Backward Time Step 1: + Gradient di[0] = -6199942594953216.000, df[0] = -4435823969697792.000, dc_hat[0] = -3752352133677056.000 + Gradient do_[0] = -196031086302593024.000 +Backward Time Step 0: + Gradient di[0] = -7239229654433792.000, df[0] = -5291155969277952.000, dc_hat[0] = -7318619977416704.000 + Gradient do_[0] = -112308996743888896.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1290433855488.000, df[0] = 946152407040.000, dc_hat[0] = 560221978624.000 + Gradient do_[0] = 80009402253312.000 +Backward Time Step 3: + Gradient di[0] = 2024540471296.000, df[0] = 1460998635520.000, dc_hat[0] = 792535760896.000 + Gradient do_[0] = 107093885452288.000 +Backward Time Step 2: + Gradient di[0] = 2576102719488.000, df[0] = 1850261766144.000, dc_hat[0] = 1316118855680.000 + Gradient do_[0] = 116706231975936.000 +Backward Time Step 1: + Gradient di[0] = 3230823350272.000, df[0] = 2227870105600.000, dc_hat[0] = 1796687527936.000 + Gradient do_[0] = 102844065644544.000 +Backward Time Step 0: + Gradient di[0] = 3911423361024.000, df[0] = 2782955044864.000, dc_hat[0] = 3670234365952.000 + Gradient do_[0] = 60470249652224.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2389511766016000.000, df[0] = -1875983061745664.000, dc_hat[0] = -1080026300678144.000 + Gradient do_[0] = -139688640161775616.000 +Backward Time Step 3: + Gradient di[0] = -3751865728630784.000, df[0] = -2864238796210176.000, dc_hat[0] = -1550199189143552.000 + Gradient do_[0] = -190987128249778176.000 +Backward Time Step 2: + Gradient di[0] = -4915005954719744.000, df[0] = -3703293138173952.000, dc_hat[0] = -2708219277344768.000 + Gradient do_[0] = -218519895841701888.000 +Backward Time Step 1: + Gradient di[0] = -6201252023107584.000, df[0] = -4436760541003776.000, dc_hat[0] = -3753140528611328.000 + Gradient do_[0] = -196072455427588096.000 +Backward Time Step 0: + Gradient di[0] = -7240745241018368.000, df[0] = -5292262997098496.000, dc_hat[0] = -7320152206999552.000 + Gradient do_[0] = -112332507394867200.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1290705305600.000, df[0] = 946351505408.000, dc_hat[0] = 560340008960.000 + Gradient do_[0] = 80026204635136.000 +Backward Time Step 3: + Gradient di[0] = 2024965144576.000, df[0] = 1461305081856.000, dc_hat[0] = 792701501440.000 + Gradient do_[0] = 107116333367296.000 +Backward Time Step 2: + Gradient di[0] = 2576648241152.000, df[0] = 1850653278208.000, dc_hat[0] = 1316396335104.000 + Gradient do_[0] = 116730902872064.000 +Backward Time Step 1: + Gradient di[0] = 3231511740416.000, df[0] = 2228344324096.000, dc_hat[0] = 1797067505664.000 + Gradient do_[0] = 102865951522816.000 +Backward Time Step 0: + Gradient di[0] = 3912254357504.000, df[0] = 2783546179584.000, dc_hat[0] = 3671014244352.000 + Gradient do_[0] = 60483096805376.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2389998707933184.000, df[0] = -1876365313835008.000, dc_hat[0] = -1080245679554560.000 + Gradient do_[0] = -139717072845275136.000 +Backward Time Step 3: + Gradient di[0] = -3752631574986752.000, df[0] = -2864823448633344.000, dc_hat[0] = -1550514869239808.000 + Gradient do_[0] = -191026057833349120.000 +Backward Time Step 2: + Gradient di[0] = -4916008829583360.000, df[0] = -3704048515547136.000, dc_hat[0] = -2708769838465024.000 + Gradient do_[0] = -218564443242496000.000 +Backward Time Step 1: + Gradient di[0] = -6202509374783488.000, df[0] = -4437659799781376.000, dc_hat[0] = -3753895905984512.000 + Gradient do_[0] = -196112123745533952.000 +Backward Time Step 0: + Gradient di[0] = -7242219488542720.000, df[0] = -5293340497018880.000, dc_hat[0] = -7321641486909440.000 + Gradient do_[0] = -112355365210816512.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1290968498176.000, df[0] = 946544508928.000, dc_hat[0] = 560454238208.000 + Gradient do_[0] = 80042503700480.000 +Backward Time Step 3: + Gradient di[0] = 2025379069952.000, df[0] = 1461603794944.000, dc_hat[0] = 792863506432.000 + Gradient do_[0] = 107138219245568.000 +Backward Time Step 2: + Gradient di[0] = 2577173577728.000, df[0] = 1851030634496.000, dc_hat[0] = 1316664639488.000 + Gradient do_[0] = 116754642632704.000 +Backward Time Step 1: + Gradient di[0] = 3232167362560.000, df[0] = 2228796260352.000, dc_hat[0] = 1797430444032.000 + Gradient do_[0] = 102886797213696.000 +Backward Time Step 0: + Gradient di[0] = 3913047867392.000, df[0] = 2784111099904.000, dc_hat[0] = 3671758733312.000 + Gradient do_[0] = 60495360950272.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2390474912432128.000, df[0] = -1876739110207488.000, dc_hat[0] = -1080460897681408.000 + Gradient do_[0] = -139744912823287808.000 +Backward Time Step 3: + Gradient di[0] = -3753385341747200.000, df[0] = -2865398437380096.000, dc_hat[0] = -1550825449062400.000 + Gradient do_[0] = -191064420481236992.000 +Backward Time Step 2: + Gradient di[0] = -4916991840223232.000, df[0] = -3704789128970240.000, dc_hat[0] = -2709309125296128.000 + Gradient do_[0] = -218608114469961728.000 +Backward Time Step 1: + Gradient di[0] = -6203748472848384.000, df[0] = -4438545636786176.000, dc_hat[0] = -3754642424987648.000 + Gradient do_[0] = -196151242307665920.000 +Backward Time Step 0: + Gradient di[0] = -7243659376328704.000, df[0] = -5294392764006400.000, dc_hat[0] = -7323098017693696.000 + Gradient do_[0] = -112377707630690304.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1291467751424.000, df[0] = 946910527488.000, dc_hat[0] = 560670507008.000 + Gradient do_[0] = 80073432498176.000 +Backward Time Step 3: + Gradient di[0] = 2026163666944.000, df[0] = 1462169763840.000, dc_hat[0] = 793170214912.000 + Gradient do_[0] = 107179617026048.000 +Backward Time Step 2: + Gradient di[0] = 2578171297792.000, df[0] = 1851747336192.000, dc_hat[0] = 1317172543488.000 + Gradient do_[0] = 116799798509568.000 +Backward Time Step 1: + Gradient di[0] = 3233417003008.000, df[0] = 2229658451968.000, dc_hat[0] = 1798123683840.000 + Gradient do_[0] = 102926559215616.000 +Backward Time Step 0: + Gradient di[0] = 3914560700416.000, df[0] = 2785187463168.000, dc_hat[0] = 3673178243072.000 + Gradient do_[0] = 60518760972288.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2390959975301120.000, df[0] = -1877120020119552.000, dc_hat[0] = -1080680142340096.000 + Gradient do_[0] = -139773268197376000.000 +Backward Time Step 3: + Gradient di[0] = -3754142598168576.000, df[0] = -2865977184223232.000, dc_hat[0] = -1551137505280000.000 + Gradient do_[0] = -191102920568078336.000 +Backward Time Step 2: + Gradient di[0] = -4917989883248640.000, df[0] = -3705541285117952.000, dc_hat[0] = -2709858612674560.000 + Gradient do_[0] = -218652438532456448.000 +Backward Time Step 1: + Gradient di[0] = -6205013340717056.000, df[0] = -4439450801143808.000, dc_hat[0] = -3755404513247232.000 + Gradient do_[0] = -196191219863257088.000 +Backward Time Step 0: + Gradient di[0] = -7245136845078528.000, df[0] = -5295473485152256.000, dc_hat[0] = -7324592129441792.000 + Gradient do_[0] = -112400651345985536.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1291683495936.000, df[0] = 947068796928.000, dc_hat[0] = 560764485632.000 + Gradient do_[0] = 80086829105152.000 +Backward Time Step 3: + Gradient di[0] = 2026502881280.000, df[0] = 1462414868480.000, dc_hat[0] = 793303121920.000 + Gradient do_[0] = 107197585424384.000 +Backward Time Step 2: + Gradient di[0] = 2578601213952.000, df[0] = 1852055748608.000, dc_hat[0] = 1317390909440.000 + Gradient do_[0] = 116819285245952.000 +Backward Time Step 1: + Gradient di[0] = 3233954922496.000, df[0] = 2230028861440.000, dc_hat[0] = 1798420430848.000 + Gradient do_[0] = 102943588089856.000 +Backward Time Step 0: + Gradient di[0] = 3915204001792.000, df[0] = 2785645166592.000, dc_hat[0] = 3673781960704.000 + Gradient do_[0] = 60528701472768.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2391460875862016.000, df[0] = -1877512875409408.000, dc_hat[0] = -1080905896558592.000 + Gradient do_[0] = -139802516924661760.000 +Backward Time Step 3: + Gradient di[0] = -3754930724667392.000, df[0] = -2866578479644672.000, dc_hat[0] = -1551462177964032.000 + Gradient do_[0] = -191142984023015424.000 +Backward Time Step 2: + Gradient di[0] = -4919017991045120.000, df[0] = -3706315989843968.000, dc_hat[0] = -2710424743051264.000 + Gradient do_[0] = -218698068265009152.000 +Backward Time Step 1: + Gradient di[0] = -6206310957711360.000, df[0] = -4440379050950656.000, dc_hat[0] = -3756184586682368.000 + Gradient do_[0] = -196232125131784192.000 +Backward Time Step 0: + Gradient di[0] = -7246648673566720.000, df[0] = -5296578365489152.000, dc_hat[0] = -7326120600928256.000 + Gradient do_[0] = -112424101867421696.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1291941576704.000, df[0] = 947257868288.000, dc_hat[0] = 560876224512.000 + Gradient do_[0] = 80102817792000.000 +Backward Time Step 3: + Gradient di[0] = 2026906583040.000, df[0] = 1462706241536.000, dc_hat[0] = 793461129216.000 + Gradient do_[0] = 107218942820352.000 +Backward Time Step 2: + Gradient di[0] = 2579115802624.000, df[0] = 1852425502720.000, dc_hat[0] = 1317653577728.000 + Gradient do_[0] = 116842538467328.000 +Backward Time Step 1: + Gradient di[0] = 3234605563904.000, df[0] = 2230477651968.000, dc_hat[0] = 1798779568128.000 + Gradient do_[0] = 102964282785792.000 +Backward Time Step 0: + Gradient di[0] = 3915987812352.000, df[0] = 2786202746880.000, dc_hat[0] = 3674517536768.000 + Gradient do_[0] = 60540818817024.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2391945401860096.000, df[0] = -1877893919539200.000, dc_hat[0] = -1081125409652736.000 + Gradient do_[0] = -139830837939011584.000 +Backward Time Step 3: + Gradient di[0] = -3755693081362432.000, df[0] = -2867160716148736.000, dc_hat[0] = -1551776381665280.000 + Gradient do_[0] = -191181793347502080.000 +Backward Time Step 2: + Gradient di[0] = -4920017644683264.000, df[0] = -3707068145991680.000, dc_hat[0] = -2710973156687872.000 + Gradient do_[0] = -218742564126195712.000 +Backward Time Step 1: + Gradient di[0] = -6207569920000000.000, df[0] = -4441279383470080.000, dc_hat[0] = -3756942111539200.000 + Gradient do_[0] = -196271879349075968.000 +Backward Time Step 0: + Gradient di[0] = -7248111109931008.000, df[0] = -5297646738604032.000, dc_hat[0] = -7327598069678080.000 + Gradient do_[0] = -112446787884679168.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1292147884032.000, df[0] = 947409190912.000, dc_hat[0] = 560965484544.000 + Gradient do_[0] = 80115602030592.000 +Backward Time Step 3: + Gradient di[0] = 2027230724096.000, df[0] = 1462939942912.000, dc_hat[0] = 793587548160.000 + Gradient do_[0] = 107236072357888.000 +Backward Time Step 2: + Gradient di[0] = 2579528941568.000, df[0] = 1852722118656.000, dc_hat[0] = 1317863948288.000 + Gradient do_[0] = 116861219897344.000 +Backward Time Step 1: + Gradient di[0] = 3235121725440.000, df[0] = 2230833119232.000, dc_hat[0] = 1799064649728.000 + Gradient do_[0] = 102980682514432.000 +Backward Time Step 0: + Gradient di[0] = 3916615122944.000, df[0] = 2786649178112.000, dc_hat[0] = 3675106050048.000 + Gradient do_[0] = 60550511853568.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2392436370309120.000, df[0] = -1878278855983104.000, dc_hat[0] = -1081346600468480.000 + Gradient do_[0] = -139859528320548864.000 +Backward Time Step 3: + Gradient di[0] = -3756461343637504.000, df[0] = -2867746710749184.000, dc_hat[0] = -1552092867067904.000 + Gradient do_[0] = -191220877549895680.000 +Backward Time Step 2: + Gradient di[0] = -4921025888256000.000, df[0] = -3707827818332160.000, dc_hat[0] = -2711528281210880.000 + Gradient do_[0] = -218787283325681664.000 +Backward Time Step 1: + Gradient di[0] = -6208846599028736.000, df[0] = -4442192600891392.000, dc_hat[0] = -3757711179120640.000 + Gradient do_[0] = -196312217681920000.000 +Backward Time Step 0: + Gradient di[0] = -7249604684808192.000, df[0] = -5298738734039040.000, dc_hat[0] = -7329108287553536.000 + Gradient do_[0] = -112469946348339200.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1292659326976.000, df[0] = 947784122368.000, dc_hat[0] = 561187520512.000 + Gradient do_[0] = 80147310968832.000 +Backward Time Step 3: + Gradient di[0] = 2028032491520.000, df[0] = 1463518494720.000, dc_hat[0] = 793901268992.000 + Gradient do_[0] = 107278476771328.000 +Backward Time Step 2: + Gradient di[0] = 2580549468160.000, df[0] = 1853455204352.000, dc_hat[0] = 1318384828416.000 + Gradient do_[0] = 116907457904640.000 +Backward Time Step 1: + Gradient di[0] = 3236397580288.000, df[0] = 2231712874496.000, dc_hat[0] = 1799772045312.000 + Gradient do_[0] = 103021274988544.000 +Backward Time Step 0: + Gradient di[0] = 3918156791808.000, df[0] = 2787745988608.000, dc_hat[0] = 3676552560640.000 + Gradient do_[0] = 60574352277504.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2392916601339904.000, df[0] = -1878656142016512.000, dc_hat[0] = -1081564234514432.000 + Gradient do_[0] = -139887608816730112.000 +Backward Time Step 3: + Gradient di[0] = -3757213499785216.000, df[0] = -2868321162625024.000, dc_hat[0] = -1552403715325952.000 + Gradient do_[0] = -191259188658176000.000 +Backward Time Step 2: + Gradient di[0] = -4922017488830464.000, df[0] = -3708574605770752.000, dc_hat[0] = -2712071594573824.000 + Gradient do_[0] = -218831298150531072.000 +Backward Time Step 1: + Gradient di[0] = -6210088918319104.000, df[0] = -4443081659121664.000, dc_hat[0] = -3758459845607424.000 + Gradient do_[0] = -196351387783659520.000 +Backward Time Step 0: + Gradient di[0] = -7251047793819648.000, df[0] = -5299793148510208.000, dc_hat[0] = -7330566965821440.000 + Gradient do_[0] = -112492331717885952.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1292928286720.000, df[0] = 947981320192.000, dc_hat[0] = 561304305664.000 + Gradient do_[0] = 80163987521536.000 +Backward Time Step 3: + Gradient di[0] = 2028455723008.000, df[0] = 1463823892480.000, dc_hat[0] = 794066878464.000 + Gradient do_[0] = 107300773691392.000 +Backward Time Step 2: + Gradient di[0] = 2581087911936.000, df[0] = 1853841866752.000, dc_hat[0] = 1318659686400.000 + Gradient do_[0] = 116931801645056.000 +Backward Time Step 1: + Gradient di[0] = 3237076008960.000, df[0] = 2232180539392.000, dc_hat[0] = 1800146649088.000 + Gradient do_[0] = 103042842099712.000 +Backward Time Step 0: + Gradient di[0] = 3918972583936.000, df[0] = 2788326375424.000, dc_hat[0] = 3677318283264.000 + Gradient do_[0] = 60586964549632.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2393386631823360.000, df[0] = -1879024703897600.000, dc_hat[0] = -1081775828762624.000 + Gradient do_[0] = -139915036477882368.000 +Backward Time Step 3: + Gradient di[0] = -3757955992256512.000, df[0] = -2868888098308096.000, dc_hat[0] = -1552710537052160.000 + Gradient do_[0] = -191296898471034880.000 +Backward Time Step 2: + Gradient di[0] = -4922985467084800.000, df[0] = -3709303676469248.000, dc_hat[0] = -2712604438953984.000 + Gradient do_[0] = -218874282183229440.000 +Backward Time Step 1: + Gradient di[0] = -6211313520869376.000, df[0] = -4443957295579136.000, dc_hat[0] = -3759195358756864.000 + Gradient do_[0] = -196390042489323520.000 +Backward Time Step 0: + Gradient di[0] = -7252474796703744.000, df[0] = -5300836288692224.000, dc_hat[0] = -7332010074832896.000 + Gradient do_[0] = -112514485159198720.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1293271695360.000, df[0] = 948233175040.000, dc_hat[0] = 561453072384.000 + Gradient do_[0] = 80185277808640.000 +Backward Time Step 3: + Gradient di[0] = 2028994297856.000, df[0] = 1464212652032.000, dc_hat[0] = 794277380096.000 + Gradient do_[0] = 107329311735808.000 +Backward Time Step 2: + Gradient di[0] = 2581776826368.000, df[0] = 1854336532480.000, dc_hat[0] = 1319009910784.000 + Gradient do_[0] = 116962973712384.000 +Backward Time Step 1: + Gradient di[0] = 3237938200576.000, df[0] = 2232775344128.000, dc_hat[0] = 1800624275456.000 + Gradient do_[0] = 103070256070656.000 +Backward Time Step 0: + Gradient di[0] = 3920019062784.000, df[0] = 2789070864384.000, dc_hat[0] = 3678300012544.000 + Gradient do_[0] = 60603146174464.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2393880553062400.000, df[0] = -1879412861566976.000, dc_hat[0] = -1081999099953152.000 + Gradient do_[0] = -139943941607784448.000 +Backward Time Step 3: + Gradient di[0] = -3758728281063424.000, df[0] = -2869477582569472.000, dc_hat[0] = -1553029572591616.000 + Gradient do_[0] = -191336171651989504.000 +Backward Time Step 2: + Gradient di[0] = -4923995858141184.000, df[0] = -3710065496293376.000, dc_hat[0] = -2713157952864256.000 + Gradient do_[0] = -218919156001538048.000 +Backward Time Step 1: + Gradient di[0] = -6212595031736320.000, df[0] = -4444873734225920.000, dc_hat[0] = -3759966573821952.000 + Gradient do_[0] = -196430518261121024.000 +Backward Time Step 0: + Gradient di[0] = -7253969982193664.000, df[0] = -5301929357869056.000, dc_hat[0] = -7333521366450176.000 + Gradient do_[0] = -112537669392662528.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1293596360704.000, df[0] = 948471005184.000, dc_hat[0] = 561593974784.000 + Gradient do_[0] = 80205385302016.000 +Backward Time Step 3: + Gradient di[0] = 2029503381504.000, df[0] = 1464580046848.000, dc_hat[0] = 794476019712.000 + Gradient do_[0] = 107356205613056.000 +Backward Time Step 2: + Gradient di[0] = 2582421176320.000, df[0] = 1854799347712.000, dc_hat[0] = 1319338377216.000 + Gradient do_[0] = 116992098959360.000 +Backward Time Step 1: + Gradient di[0] = 3238748749824.000, df[0] = 2233333972992.000, dc_hat[0] = 1801072934912.000 + Gradient do_[0] = 103096025874432.000 +Backward Time Step 0: + Gradient di[0] = 3921002627072.000, df[0] = 2789770526720.000, dc_hat[0] = 3679222759424.000 + Gradient do_[0] = 60618342137856.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2394359441915904.000, df[0] = -1879788671205376.000, dc_hat[0] = -1082214989168640.000 + Gradient do_[0] = -139971850305273856.000 +Backward Time Step 3: + Gradient di[0] = -3759481510952960.000, df[0] = -2870052839751680.000, dc_hat[0] = -1553340689285120.000 + Gradient do_[0] = -191374448400531456.000 +Backward Time Step 2: + Gradient di[0] = -4924985311232000.000, df[0] = -3710810404683776.000, dc_hat[0] = -2713701803098112.000 + Gradient do_[0] = -218963016207564800.000 +Backward Time Step 1: + Gradient di[0] = -6213836814155776.000, df[0] = -4445762524020736.000, dc_hat[0] = -3760714435002368.000 + Gradient do_[0] = -196469774262206464.000 +Backward Time Step 0: + Gradient di[0] = -7255417386172416.000, df[0] = -5302986993565696.000, dc_hat[0] = -7334984876556288.000 + Gradient do_[0] = -112560132071620608.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1293841727488.000, df[0] = 948651032576.000, dc_hat[0] = 561700536320.000 + Gradient do_[0] = 80220593848320.000 +Backward Time Step 3: + Gradient di[0] = 2029887946752.000, df[0] = 1464857395200.000, dc_hat[0] = 794626293760.000 + Gradient do_[0] = 107376522821632.000 +Backward Time Step 2: + Gradient di[0] = 2582909812736.000, df[0] = 1855150227456.000, dc_hat[0] = 1319587807232.000 + Gradient do_[0] = 117014236495872.000 +Backward Time Step 1: + Gradient di[0] = 3239361380352.000, df[0] = 2233756024832.000, dc_hat[0] = 1801411493888.000 + Gradient do_[0] = 103115462279168.000 +Backward Time Step 0: + Gradient di[0] = 3921740824576.000, df[0] = 2790295863296.000, dc_hat[0] = 3679915606016.000 + Gradient do_[0] = 60629759033344.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2394851752542208.000, df[0] = -1880175218262016.000, dc_hat[0] = -1082437522161664.000 + Gradient do_[0] = -140000635176091648.000 +Backward Time Step 3: + Gradient di[0] = -3760256484114432.000, df[0] = -2870644471496704.000, dc_hat[0] = -1553659187953664.000 + Gradient do_[0] = -191413910560047104.000 +Backward Time Step 2: + Gradient di[0] = -4925998923513856.000, df[0] = -3711574908862464.000, dc_hat[0] = -2714258001362944.000 + Gradient do_[0] = -219008096184303616.000 +Backward Time Step 1: + Gradient di[0] = -6215121009377280.000, df[0] = -4446681110151168.000, dc_hat[0] = -3761486723809280.000 + Gradient do_[0] = -196510250034003968.000 +Backward Time Step 0: + Gradient di[0] = -7256906666082304.000, df[0] = -5304075767775232.000, dc_hat[0] = -7336490262593536.000 + Gradient do_[0] = -112583238995673088.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1294130610176.000, df[0] = 948862779392.000, dc_hat[0] = 561825710080.000 + Gradient do_[0] = 80238503526400.000 +Backward Time Step 3: + Gradient di[0] = 2030340276224.000, df[0] = 1465183764480.000, dc_hat[0] = 794802913280.000 + Gradient do_[0] = 107400430354432.000 +Backward Time Step 2: + Gradient di[0] = 2583486791680.000, df[0] = 1855564546048.000, dc_hat[0] = 1319881539584.000 + Gradient do_[0] = 117040358621184.000 +Backward Time Step 1: + Gradient di[0] = 3240088567808.000, df[0] = 2234257506304.000, dc_hat[0] = 1801813360640.000 + Gradient do_[0] = 103138556116992.000 +Backward Time Step 0: + Gradient di[0] = 3922616123392.000, df[0] = 2790918717440.000, dc_hat[0] = 3680736903168.000 + Gradient do_[0] = 60643289858048.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2395329836089344.000, df[0] = -1880550759464960.000, dc_hat[0] = -1082653679812608.000 + Gradient do_[0] = -140028586823254016.000 +Backward Time Step 3: + Gradient di[0] = -3761004613730304.000, df[0] = -2871215433711616.000, dc_hat[0] = -1553967888728064.000 + Gradient do_[0] = -191451981150158848.000 +Backward Time Step 2: + Gradient di[0] = -4926981397282816.000, df[0] = -3712314985414656.000, dc_hat[0] = -2714798898806784.000 + Gradient do_[0] = -219051715872161792.000 +Backward Time Step 1: + Gradient di[0] = -6216364402409472.000, df[0] = -4447570705252352.000, dc_hat[0] = -3762235658731520.000 + Gradient do_[0] = -196549506035089408.000 +Backward Time Step 0: + Gradient di[0] = -7258362123124736.000, df[0] = -5305139845922816.000, dc_hat[0] = -7337962362634240.000 + Gradient do_[0] = -112605830523650048.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1294445182976.000, df[0] = 949093400576.000, dc_hat[0] = 561962418176.000 + Gradient do_[0] = 80258007040000.000 +Backward Time Step 3: + Gradient di[0] = 2030836776960.000, df[0] = 1465542115328.000, dc_hat[0] = 794997161984.000 + Gradient do_[0] = 107426711863296.000 +Backward Time Step 2: + Gradient di[0] = 2584118558720.000, df[0] = 1856018448384.000, dc_hat[0] = 1320204369920.000 + Gradient do_[0] = 117069014106112.000 +Backward Time Step 1: + Gradient di[0] = 3240877096960.000, df[0] = 2234801192960.000, dc_hat[0] = 1802249568256.000 + Gradient do_[0] = 103163654832128.000 +Backward Time Step 0: + Gradient di[0] = 3923561152512.000, df[0] = 2791591116800.000, dc_hat[0] = 3681623474176.000 + Gradient do_[0] = 60657898618880.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2395798792830976.000, df[0] = -1880918918692864.000, dc_hat[0] = -1082865072734208.000 + Gradient do_[0] = -140055980124667904.000 +Backward Time Step 3: + Gradient di[0] = -3761737442525184.000, df[0] = -2871774853201920.000, dc_hat[0] = -1554270281269248.000 + Gradient do_[0] = -191489244286418944.000 +Backward Time Step 2: + Gradient di[0] = -4927938101248000.000, df[0] = -3713036271484928.000, dc_hat[0] = -2715325300736000.000 + Gradient do_[0] = -219094236048392192.000 +Backward Time Step 1: + Gradient di[0] = -6217575046316032.000, df[0] = -4448436678033408.000, dc_hat[0] = -3762964192559104.000 + Gradient do_[0] = -196587696884285440.000 +Backward Time Step 0: + Gradient di[0] = -7259777851719680.000, df[0] = -5306174396170240.000, dc_hat[0] = -7339393660485632.000 + Gradient do_[0] = -112627777806532608.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1294700511232.000, df[0] = 949280505856.000, dc_hat[0] = 562073108480.000 + Gradient do_[0] = 80273827954688.000 +Backward Time Step 3: + Gradient di[0] = 2031234711552.000, df[0] = 1465829294080.000, dc_hat[0] = 795152613376.000 + Gradient do_[0] = 107447775657984.000 +Backward Time Step 2: + Gradient di[0] = 2584626331648.000, df[0] = 1856382828544.000, dc_hat[0] = 1320462057472.000 + Gradient do_[0] = 117091906617344.000 +Backward Time Step 1: + Gradient di[0] = 3241515941888.000, df[0] = 2235241332736.000, dc_hat[0] = 1802602938368.000 + Gradient do_[0] = 103183955263488.000 +Backward Time Step 0: + Gradient di[0] = 3924343652352.000, df[0] = 2792147910656.000, dc_hat[0] = 3682358001664.000 + Gradient do_[0] = 60669994991616.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2396301304004608.000, df[0] = -1881313116160000.000, dc_hat[0] = -1083091162497024.000 + Gradient do_[0] = -140085254621757440.000 +Backward Time Step 3: + Gradient di[0] = -3762524226846720.000, df[0] = -2872375611752448.000, dc_hat[0] = -1554594417082368.000 + Gradient do_[0] = -191529290561486848.000 +Backward Time Step 2: + Gradient di[0] = -4928972651495424.000, df[0] = -3713815002742784.000, dc_hat[0] = -2715893310160896.000 + Gradient do_[0] = -219140140658851840.000 +Backward Time Step 1: + Gradient di[0] = -6218874810793984.000, df[0] = -4449366270017536.000, dc_hat[0] = -3763745608171520.000 + Gradient do_[0] = -196628756771635200.000 +Backward Time Step 0: + Gradient di[0] = -7261287532724224.000, df[0] = -5307277665894400.000, dc_hat[0] = -7340919447617536.000 + Gradient do_[0] = -112651202558164992.000 +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 +Backward Time Step 4: + Gradient di[0] = 1294983102464.000, df[0] = 949487927296.000, dc_hat[0] = 562195988480.000 + Gradient do_[0] = 80291368534016.000 +Backward Time Step 3: + Gradient di[0] = 2031680094208.000, df[0] = 1466150682624.000, dc_hat[0] = 795326414848.000 + Gradient do_[0] = 107471297314816.000 +Backward Time Step 2: + Gradient di[0] = 2585191251968.000, df[0] = 1856789020672.000, dc_hat[0] = 1320750284800.000 + Gradient do_[0] = 117117542203392.000 +Backward Time Step 1: + Gradient di[0] = 3242225303552.000, df[0] = 2235730493440.000, dc_hat[0] = 1802995499008.000 + Gradient do_[0] = 103206512230400.000 +Backward Time Step 0: + Gradient di[0] = 3925198241792.000, df[0] = 2792756084736.000, dc_hat[0] = 3683159900160.000 + Gradient do_[0] = 60683207049216.000 +Time Step 0: + i_gate[0] = 0.542, f_gate[0] = 0.727, o_gate[0] = 0.159, c_hat[0] = 0.795 + c_state[0] = 0.431, h_state[0] = 0.064 +Time Step 1: + i_gate[0] = 0.410, f_gate[0] = 0.722, o_gate[0] = 0.134, c_hat[0] = 0.837 + c_state[0] = 0.654, h_state[0] = 0.077 +Time Step 2: + i_gate[0] = 0.351, f_gate[0] = 0.732, o_gate[0] = 0.132, c_hat[0] = 0.837 + c_state[0] = 0.772, h_state[0] = 0.086 +Time Step 3: + i_gate[0] = 0.312, f_gate[0] = 0.740, o_gate[0] = 0.137, c_hat[0] = 0.868 + c_state[0] = 0.843, h_state[0] = 0.094 +Time Step 4: + i_gate[0] = 0.293, f_gate[0] = 0.759, o_gate[0] = 0.139, c_hat[0] = 0.853 + c_state[0] = 0.890, h_state[0] = 0.099 +Backward Time Step 4: + Gradient di[0] = -2396771334488064.000, df[0] = -1881681946476544.000, dc_hat[0] = -1083303494942720.000 + Gradient do_[0] = -140112725232582656.000 +Backward Time Step 3: + Gradient di[0] = -3763265108705280.000, df[0] = -2872941205258240.000, dc_hat[0] = -1554899762413568.000 + Gradient do_[0] = -191566931654868992.000 +Backward Time Step 2: + Gradient di[0] = -4929941166620672.000, df[0] = -3714544610312192.000, dc_hat[0] = -2716425080799232.000 + Gradient do_[0] = -219183193411026944.000 +Backward Time Step 1: + Gradient di[0] = -6220101560827904.000, df[0] = -4450243517087744.000, dc_hat[0] = -3764485684723712.000 + Gradient do_[0] = -196667445837037568.000 +Backward Time Step 0: + Gradient di[0] = -7262719367446528.000, df[0] = -5308324564172800.000, dc_hat[0] = -7342366851596288.000 + Gradient do_[0] = -112673416129019904.000 +Epoch 1000, Train Loss=0.011137, Weight Norm=13.278138 +Sample Predictions at Epoch 1000: +------------------------------------------------------------- +| Day | Date | Predicted Close | Actual Close | Error | +------------------------------------------------------------- +| 192 | 2024-10-11 | 57.57 | 63.87 | 6.30 | +| 193 | 2024-10-14 | 56.93 | 66.55 | 9.62 | +| 194 | 2024-10-15 | 57.12 | 66.00 | 8.88 | +| 195 | 2024-10-16 | 58.09 | 67.20 | 9.11 | +| 196 | 2024-10-17 | 57.61 | 66.76 | 9.15 | +------------------------------------------------------------- +Time Step 0: + i_gate[0] = 0.554, f_gate[0] = 0.737, o_gate[0] = 0.166, c_hat[0] = 0.812 + c_state[0] = 0.450, h_state[0] = 0.070 +Time Step 1: + i_gate[0] = 0.421, f_gate[0] = 0.735, o_gate[0] = 0.141, c_hat[0] = 0.852 + c_state[0] = 0.689, h_state[0] = 0.084 +Time Step 2: + i_gate[0] = 0.363, f_gate[0] = 0.747, o_gate[0] = 0.140, c_hat[0] = 0.850 + c_state[0] = 0.823, h_state[0] = 0.095 +Time Step 3: + i_gate[0] = 0.328, f_gate[0] = 0.756, o_gate[0] = 0.147, c_hat[0] = 0.877 + c_state[0] = 0.910, h_state[0] = 0.106 +Time Step 4: + i_gate[0] = 0.311, f_gate[0] = 0.774, o_gate[0] = 0.150, c_hat[0] = 0.862 + c_state[0] = 0.972, h_state[0] = 0.112 + +Validation (Last 30 Days): +------------------------------------------------------------- +| Day | Date | Predicted Close | Actual Close | Error | +------------------------------------------------------------- +| 197 | 2024-10-18 | 58.40 | 61.52 | 3.12 | +| 198 | 2024-10-21 | 58.40 | 60.68 | 2.28 | +| 199 | 2024-10-22 | 56.24 | 61.02 | 4.78 | +| 200 | 2024-10-23 | 55.44 | 58.63 | 3.19 | +| 201 | 2024-10-24 | 56.03 | 59.05 | 3.02 | +| 202 | 2024-10-25 | 55.90 | 59.18 | 3.28 | +| 203 | 2024-10-28 | 56.04 | 57.29 | 1.25 | +| 204 | 2024-10-29 | 56.53 | 55.64 | 0.89 | +| 205 | 2024-10-30 | 56.28 | 59.83 | 3.55 | +| 206 | 2024-10-31 | 55.90 | 60.01 | 4.11 | +| 207 | 2024-11-01 | 57.07 | 60.49 | 3.42 | +| 208 | 2024-11-04 | 57.39 | 59.74 | 2.35 | +| 209 | 2024-11-05 | 57.43 | 58.02 | 0.59 | +| 210 | 2024-11-06 | 56.95 | 56.46 | 0.49 | +| 211 | 2024-11-07 | 56.31 | 56.34 | 0.03 | +| 212 | 2024-11-08 | 55.67 | 56.42 | 0.75 | +| 213 | 2024-11-11 | 55.47 | 59.76 | 4.29 | +| 214 | 2024-11-12 | 55.48 | 61.61 | 6.13 | +| 215 | 2024-11-13 | 56.34 | 60.58 | 4.24 | +| 216 | 2024-11-14 | 56.90 | 61.62 | 4.72 | +| 217 | 2024-11-15 | 56.67 | 59.92 | 3.25 | +| 218 | 2024-11-18 | 56.50 | 57.39 | 0.89 | +| 219 | 2024-11-19 | 55.78 | 61.19 | 5.41 | +| 220 | 2024-11-20 | 54.68 | 62.95 | 8.27 | +| 221 | 2024-11-21 | 55.61 | 64.17 | 8.56 | +| 222 | 2024-11-22 | 56.17 | 63.00 | 6.83 | +| 223 | 2024-11-25 | 56.71 | 65.06 | 8.35 | +| 224 | 2024-11-26 | 57.61 | 63.68 | 6.07 | +| 225 | 2024-11-27 | 57.77 | 63.68 | 5.91 | +------------------------------------------------------------- + +Validation Metrics: +Mean Absolute Error (MAE): 3.79 +Root Mean Squared Error (RMSE): 4.49 + +Detailed Predictions for All Data: +-------------------------------------------------------------------- +| Day | Date | Predicted Close | Actual Close | Error | +-------------------------------------------------------------------- +| 0 | 2024-01-08 | 51.55 | 72.11 | 20.56 | +| 1 | 2024-01-09 | 56.39 | 73.28 | 16.89 | +| 2 | 2024-01-10 | 57.81 | 74.47 | 16.66 | +| 3 | 2024-01-11 | 58.27 | 58.81 | 0.54 | +| 4 | 2024-01-12 | 58.52 | 61.39 | 2.87 | +| 5 | 2024-01-16 | 53.72 | 59.09 | 5.37 | +| 6 | 2024-01-17 | 55.11 | 57.66 | 2.55 | +| 7 | 2024-01-18 | 54.86 | 56.45 | 1.59 | +| 8 | 2024-01-19 | 54.35 | 57.27 | 2.92 | +| 9 | 2024-01-22 | 53.68 | 55.65 | 1.97 | +| 10 | 2024-01-23 | 53.97 | 54.79 | 0.82 | +| 11 | 2024-01-24 | 53.49 | 53.40 | 0.09 | +| 12 | 2024-01-25 | 53.05 | 55.25 | 2.20 | +| 13 | 2024-01-26 | 52.37 | 55.58 | 3.21 | +| 14 | 2024-01-29 | 53.22 | 57.24 | 4.02 | +| 15 | 2024-01-30 | 53.38 | 59.82 | 6.44 | +| 16 | 2024-01-31 | 53.73 | 58.73 | 5.00 | +| 17 | 2024-02-01 | 54.18 | 58.80 | 4.62 | +| 18 | 2024-02-02 | 53.67 | 59.14 | 5.47 | +| 19 | 2024-02-05 | 54.19 | 58.76 | 4.57 | +| 20 | 2024-02-06 | 54.19 | 58.32 | 4.13 | +| 21 | 2024-02-07 | 53.83 | 55.58 | 1.75 | +| 22 | 2024-02-08 | 53.32 | 55.22 | 1.90 | +| 23 | 2024-02-09 | 52.60 | 56.55 | 3.95 | +| 24 | 2024-02-12 | 51.91 | 56.60 | 4.69 | +| 25 | 2024-02-13 | 52.31 | 56.39 | 4.08 | +| 26 | 2024-02-14 | 52.29 | 58.62 | 6.33 | +| 27 | 2024-02-15 | 51.96 | 59.54 | 7.58 | +| 28 | 2024-02-16 | 52.62 | 57.92 | 5.30 | +| 29 | 2024-02-20 | 52.79 | 58.10 | 5.31 | +| 30 | 2024-02-21 | 52.23 | 57.07 | 4.84 | +| 31 | 2024-02-22 | 52.36 | 53.81 | 1.45 | +| 32 | 2024-02-23 | 53.14 | 53.88 | 0.74 | +| 33 | 2024-02-26 | 52.08 | 53.51 | 1.43 | +| 34 | 2024-02-27 | 52.81 | 52.55 | 0.26 | +| 35 | 2024-02-28 | 52.69 | 51.48 | 1.21 | +| 36 | 2024-02-29 | 52.22 | 52.52 | 0.30 | +| 37 | 2024-03-01 | 51.96 | 56.09 | 4.13 | +| 38 | 2024-03-04 | 52.20 | 56.39 | 4.19 | +| 39 | 2024-03-05 | 53.32 | 56.00 | 2.68 | +| 40 | 2024-03-06 | 53.36 | 55.48 | 2.12 | +| 41 | 2024-03-07 | 52.87 | 55.38 | 2.51 | +| 42 | 2024-03-08 | 52.53 | 52.44 | 0.09 | +| 43 | 2024-03-11 | 53.15 | 53.38 | 0.23 | +| 44 | 2024-03-12 | 52.30 | 54.41 | 2.11 | +| 45 | 2024-03-13 | 52.51 | 54.54 | 2.03 | +| 46 | 2024-03-14 | 52.67 | 55.20 | 2.53 | +| 47 | 2024-03-15 | 52.80 | 53.47 | 0.67 | +| 48 | 2024-03-18 | 52.71 | 52.92 | 0.21 | +| 49 | 2024-03-19 | 51.89 | 52.90 | 1.01 | +| 50 | 2024-03-20 | 51.53 | 52.87 | 1.34 | +| 51 | 2024-03-21 | 51.50 | 53.28 | 1.78 | +| 52 | 2024-03-22 | 51.36 | 54.28 | 2.92 | +| 53 | 2024-03-25 | 51.60 | 57.08 | 5.48 | +| 54 | 2024-03-26 | 52.23 | 56.50 | 4.27 | +| 55 | 2024-03-27 | 53.22 | 56.28 | 3.06 | +| 56 | 2024-03-28 | 52.87 | 58.51 | 5.64 | +| 57 | 2024-04-01 | 54.48 | 58.21 | 3.73 | +| 58 | 2024-04-02 | 55.19 | 57.44 | 2.25 | +| 59 | 2024-04-03 | 54.73 | 56.97 | 2.24 | +| 60 | 2024-04-04 | 54.23 | 55.18 | 0.95 | +| 61 | 2024-04-05 | 53.37 | 53.52 | 0.15 | +| 62 | 2024-04-08 | 52.24 | 53.09 | 0.85 | +| 63 | 2024-04-09 | 51.70 | 48.52 | 3.18 | +| 64 | 2024-04-10 | 51.30 | 47.20 | 4.10 | +| 65 | 2024-04-11 | 51.02 | 47.09 | 3.93 | +| 66 | 2024-04-12 | 50.22 | 44.75 | 5.47 | +| 67 | 2024-04-15 | 50.05 | 44.87 | 5.18 | +| 68 | 2024-04-16 | 49.52 | 44.12 | 5.40 | +| 69 | 2024-04-17 | 50.11 | 43.54 | 6.57 | +| 70 | 2024-04-18 | 50.31 | 44.26 | 6.05 | +| 71 | 2024-04-19 | 50.35 | 43.59 | 6.76 | +| 72 | 2024-04-22 | 50.95 | 46.32 | 4.63 | +| 73 | 2024-04-23 | 50.94 | 45.22 | 5.72 | +| 74 | 2024-04-24 | 52.20 | 44.53 | 7.67 | +| 75 | 2024-04-25 | 51.96 | 43.41 | 8.55 | +| 76 | 2024-04-26 | 51.43 | 43.01 | 8.42 | +| 77 | 2024-04-29 | 50.90 | 42.90 | 8.00 | +| 78 | 2024-04-30 | 50.78 | 43.79 | 6.99 | +| 79 | 2024-05-01 | 50.78 | 44.07 | 6.71 | +| 80 | 2024-05-02 | 51.29 | 43.60 | 7.69 | +| 81 | 2024-05-03 | 51.34 | 48.55 | 2.79 | +| 82 | 2024-05-06 | 51.20 | 49.41 | 1.79 | +| 83 | 2024-05-07 | 53.57 | 49.70 | 3.87 | +| 84 | 2024-05-08 | 53.54 | 51.31 | 2.23 | +| 85 | 2024-05-09 | 53.45 | 51.75 | 1.70 | +| 86 | 2024-05-10 | 53.96 | 53.20 | 0.76 | +| 87 | 2024-05-13 | 54.00 | 52.55 | 1.45 | +| 88 | 2024-05-14 | 54.43 | 53.00 | 1.43 | +| 89 | 2024-05-15 | 55.82 | 50.23 | 5.59 | +| 90 | 2024-05-16 | 55.88 | 50.70 | 5.18 | +| 91 | 2024-05-17 | 54.64 | 52.10 | 2.54 | +| 92 | 2024-05-20 | 54.33 | 50.94 | 3.38 | +| 93 | 2024-05-21 | 54.82 | 48.44 | 6.38 | +| 94 | 2024-05-22 | 54.24 | 50.62 | 3.61 | +| 95 | 2024-05-23 | 52.81 | 55.45 | 2.64 | +| 96 | 2024-05-24 | 53.42 | 53.02 | 0.40 | +| 97 | 2024-05-28 | 55.38 | 53.61 | 1.77 | +| 98 | 2024-05-29 | 54.63 | 53.61 | 1.02 | +| 99 | 2024-05-30 | 54.73 | 55.06 | 0.33 | +| 100 | 2024-05-31 | 54.52 | 51.96 | 2.56 | +| 101 | 2024-06-03 | 54.87 | 53.44 | 1.43 | +| 102 | 2024-06-04 | 53.89 | 53.34 | 0.55 | +| 103 | 2024-06-05 | 54.12 | 52.87 | 1.25 | +| 104 | 2024-06-06 | 53.91 | 50.13 | 3.78 | +| 105 | 2024-06-07 | 53.77 | 51.22 | 2.55 | +| 106 | 2024-06-10 | 52.94 | 48.46 | 4.48 | +| 107 | 2024-06-11 | 53.31 | 47.68 | 5.63 | +| 108 | 2024-06-12 | 53.45 | 42.70 | 10.75 | +| 109 | 2024-06-13 | 53.48 | 43.06 | 10.42 | +| 110 | 2024-06-14 | 52.53 | 43.22 | 9.31 | +| 111 | 2024-06-17 | 52.84 | 40.86 | 11.98 | +| 112 | 2024-06-18 | 53.09 | 41.72 | 11.37 | +| 113 | 2024-06-20 | 52.31 | 42.00 | 10.31 | +| 114 | 2024-06-21 | 52.37 | 44.14 | 8.23 | +| 115 | 2024-06-24 | 52.49 | 42.97 | 9.52 | +| 116 | 2024-06-25 | 52.93 | 44.80 | 8.13 | +| 117 | 2024-06-26 | 52.56 | 45.41 | 7.15 | +| 118 | 2024-06-27 | 52.56 | 46.34 | 6.22 | +| 119 | 2024-06-28 | 52.47 | 46.54 | 5.93 | +| 120 | 2024-07-01 | 52.30 | 47.29 | 5.01 | +| 121 | 2024-07-02 | 51.98 | 46.13 | 5.85 | +| 122 | 2024-07-03 | 51.93 | 46.93 | 5.00 | +| 123 | 2024-07-05 | 51.36 | 47.34 | 4.02 | +| 124 | 2024-07-08 | 51.22 | 47.02 | 4.20 | +| 125 | 2024-07-09 | 51.06 | 46.10 | 4.96 | +| 126 | 2024-07-10 | 50.92 | 47.89 | 3.03 | +| 127 | 2024-07-11 | 50.45 | 47.38 | 3.07 | +| 128 | 2024-07-12 | 50.98 | 44.69 | 6.29 | +| 129 | 2024-07-15 | 50.63 | 54.35 | 3.72 | +| 130 | 2024-07-16 | 49.45 | 55.87 | 6.42 | +| 131 | 2024-07-17 | 54.37 | 53.08 | 1.29 | +| 132 | 2024-07-18 | 55.63 | 53.46 | 2.17 | +| 133 | 2024-07-19 | 54.89 | 56.20 | 1.31 | +| 134 | 2024-07-22 | 54.50 | 55.27 | 0.77 | +| 135 | 2024-07-23 | 54.85 | 54.17 | 0.68 | +| 136 | 2024-07-24 | 54.53 | 54.53 | 0.00 | +| 137 | 2024-07-25 | 53.91 | 52.76 | 1.15 | +| 138 | 2024-07-26 | 54.11 | 51.63 | 2.48 | +| 139 | 2024-07-29 | 53.50 | 51.16 | 2.34 | +| 140 | 2024-07-30 | 53.14 | 52.35 | 0.79 | +| 141 | 2024-07-31 | 52.96 | 52.02 | 0.94 | +| 142 | 2024-08-01 | 53.56 | 53.17 | 0.39 | +| 143 | 2024-08-02 | 53.48 | 53.05 | 0.43 | +| 144 | 2024-08-05 | 54.55 | 51.67 | 2.88 | +| 145 | 2024-08-06 | 54.51 | 51.38 | 3.13 | +| 146 | 2024-08-07 | 54.18 | 51.62 | 2.56 | +| 147 | 2024-08-08 | 53.93 | 51.54 | 2.39 | +| 148 | 2024-08-09 | 53.98 | 49.68 | 4.30 | +| 149 | 2024-08-12 | 53.89 | 50.19 | 3.70 | +| 150 | 2024-08-13 | 53.35 | 48.73 | 4.62 | +| 151 | 2024-08-14 | 53.47 | 50.18 | 3.29 | +| 152 | 2024-08-15 | 53.10 | 49.48 | 3.62 | +| 153 | 2024-08-16 | 53.59 | 49.96 | 3.63 | +| 154 | 2024-08-19 | 53.37 | 53.85 | 0.48 | +| 155 | 2024-08-20 | 54.13 | 54.37 | 0.24 | +| 156 | 2024-08-21 | 54.99 | 54.76 | 0.23 | +| 157 | 2024-08-22 | 55.22 | 54.97 | 0.25 | +| 158 | 2024-08-23 | 55.20 | 54.36 | 0.84 | +| 159 | 2024-08-26 | 55.07 | 54.35 | 0.72 | +| 160 | 2024-08-27 | 54.62 | 54.43 | 0.19 | +| 161 | 2024-08-28 | 55.69 | 55.29 | 0.40 | +| 162 | 2024-08-29 | 55.62 | 55.43 | 0.19 | +| 163 | 2024-08-30 | 56.07 | 54.85 | 1.22 | +| 164 | 2024-09-03 | 56.17 | 53.05 | 3.12 | +| 165 | 2024-09-04 | 55.47 | 55.63 | 0.16 | +| 166 | 2024-09-05 | 54.39 | 55.09 | 0.70 | +| 167 | 2024-09-06 | 55.14 | 53.91 | 1.23 | +| 168 | 2024-09-09 | 54.91 | 54.73 | 0.18 | +| 169 | 2024-09-10 | 54.22 | 54.61 | 0.39 | +| 170 | 2024-09-11 | 54.31 | 53.74 | 0.57 | +| 171 | 2024-09-12 | 54.10 | 56.91 | 2.81 | +| 172 | 2024-09-13 | 53.57 | 56.00 | 2.43 | +| 173 | 2024-09-16 | 54.65 | 56.45 | 1.80 | +| 174 | 2024-09-17 | 54.34 | 60.23 | 5.89 | +| 175 | 2024-09-18 | 54.46 | 61.21 | 6.75 | +| 176 | 2024-09-19 | 56.18 | 60.53 | 4.35 | +| 177 | 2024-09-20 | 56.60 | 60.15 | 3.55 | +| 178 | 2024-09-23 | 56.30 | 59.91 | 3.61 | +| 179 | 2024-09-24 | 56.17 | 60.91 | 4.74 | +| 180 | 2024-09-25 | 55.93 | 61.14 | 5.21 | +| 181 | 2024-09-26 | 57.06 | 59.88 | 2.82 | +| 182 | 2024-09-27 | 57.10 | 59.96 | 2.86 | +| 183 | 2024-09-30 | 56.60 | 60.90 | 4.30 | +| 184 | 2024-10-01 | 56.56 | 62.12 | 5.56 | +| 185 | 2024-10-02 | 56.88 | 64.79 | 7.91 | +| 186 | 2024-10-03 | 57.40 | 66.61 | 9.21 | +| 187 | 2024-10-04 | 58.55 | 66.37 | 7.82 | +| 188 | 2024-10-07 | 59.12 | 66.39 | 7.27 | +| 189 | 2024-10-08 | 59.21 | 63.78 | 4.57 | +| 190 | 2024-10-09 | 59.20 | 63.77 | 4.57 | +| 191 | 2024-10-10 | 58.11 | 63.02 | 4.91 | +| 192 | 2024-10-11 | 57.89 | 63.87 | 5.98 | +| 193 | 2024-10-14 | 57.20 | 66.55 | 9.35 | +| 194 | 2024-10-15 | 57.35 | 66.00 | 8.65 | +| 195 | 2024-10-16 | 58.29 | 67.20 | 8.91 | +| 196 | 2024-10-17 | 57.75 | 66.76 | 9.01 | +| 197 | 2024-10-18 | 58.40 | 61.52 | 3.12 | +| 198 | 2024-10-21 | 58.40 | 60.68 | 2.28 | +| 199 | 2024-10-22 | 56.24 | 61.02 | 4.78 | +| 200 | 2024-10-23 | 55.44 | 58.63 | 3.19 | +| 201 | 2024-10-24 | 56.03 | 59.05 | 3.02 | +| 202 | 2024-10-25 | 55.90 | 59.18 | 3.28 | +| 203 | 2024-10-28 | 56.04 | 57.29 | 1.25 | +| 204 | 2024-10-29 | 56.53 | 55.64 | 0.89 | +| 205 | 2024-10-30 | 56.28 | 59.83 | 3.55 | +| 206 | 2024-10-31 | 55.90 | 60.01 | 4.11 | +| 207 | 2024-11-01 | 57.07 | 60.49 | 3.42 | +| 208 | 2024-11-04 | 57.39 | 59.74 | 2.35 | +| 209 | 2024-11-05 | 57.43 | 58.02 | 0.59 | +| 210 | 2024-11-06 | 56.95 | 56.46 | 0.49 | +| 211 | 2024-11-07 | 56.31 | 56.34 | 0.03 | +| 212 | 2024-11-08 | 55.67 | 56.42 | 0.75 | +| 213 | 2024-11-11 | 55.47 | 59.76 | 4.29 | +| 214 | 2024-11-12 | 55.48 | 61.61 | 6.13 | +| 215 | 2024-11-13 | 56.34 | 60.58 | 4.24 | +| 216 | 2024-11-14 | 56.90 | 61.62 | 4.72 | +| 217 | 2024-11-15 | 56.67 | 59.92 | 3.25 | +| 218 | 2024-11-18 | 56.50 | 57.39 | 0.89 | +| 219 | 2024-11-19 | 55.78 | 61.19 | 5.41 | +| 220 | 2024-11-20 | 54.68 | 62.95 | 8.27 | +| 221 | 2024-11-21 | 55.61 | 64.17 | 8.56 | +| 222 | 2024-11-22 | 56.17 | 63.00 | 6.83 | +| 223 | 2024-11-25 | 56.71 | 65.06 | 8.35 | +| 224 | 2024-11-26 | 57.61 | 63.68 | 6.07 | +| 225 | 2024-11-27 | 57.77 | 63.68 | 5.91 | +-------------------------------------------------------------------- diff --git a/src/nn_tests/RNN-LSTM/price_predictor b/src/nn_tests/RNN-LSTM/price_predictor new file mode 100755 index 0000000..a96fb71 Binary files /dev/null and b/src/nn_tests/RNN-LSTM/price_predictor differ diff --git a/src/nn_tests/RNN-LSTM/price_predictor.c b/src/nn_tests/RNN-LSTM/price_predictor.c new file mode 100644 index 0000000..f60b96e --- /dev/null +++ b/src/nn_tests/RNN-LSTM/price_predictor.c @@ -0,0 +1,1269 @@ +/******************************************************************************* + * price_predictor.c (Advanced Implementation) + * + * Description: + * A C program that implements a multi-layer LSTM with Adam optimizer, learning + * rate scheduling, L2 regularization, and early stopping for predicting the + * next day's stock close price based on technical indicators calculated from + * historical stock data. + * + * Key Enhancements: + * 1. Adam Optimizer + * 2. Stacked LSTM Layers + * 3. Learning Rate Scheduling + * 4. L2 Regularization + * 5. Early Stopping + * 6. Improved Output Formatting + * + * Compile: + * gcc price_predictor.c -o price_predictor -lm + * + * Run: + * ./price_predictor path/to/stock_data.csv + *******************************************************************************/ + +#include +#include +#include +#include +#include + +#define MAX_SAMPLES 10000 /* Maximum number of data samples */ +#define INPUT_SIZE 13 /* Technical indicators: 13 features */ +#define HIDDEN_SIZE 16 /* Number of hidden units per LSTM layer */ +#define OUTPUT_SIZE 1 /* Next-day close prediction */ +#define EPOCHS 1000 /* Number of training epochs */ +#define INITIAL_LEARNING_RATE 0.001f /* Initial learning rate */ +#define VALIDATION_SIZE 30 /* Number of days for validation */ +#define CLIP_VALUE 5.0f /* Gradient clipping threshold */ +#define L2_LAMBDA 0.0001f /* L2 Regularization parameter */ +#define EARLY_STOPPING_PATIENCE 50 /* Epochs to wait before early stopping */ +#define NUM_LSTM_LAYERS 2 /* Number of stacked LSTM layers */ + +/*************************************** + * Data Structures + ***************************************/ + +/* LSTM parameters for one layer */ +typedef struct { + /* Input gate */ + float W_ix[HIDDEN_SIZE][INPUT_SIZE]; + float W_ih[HIDDEN_SIZE][HIDDEN_SIZE]; + float b_i[HIDDEN_SIZE]; + /* Forget gate */ + float W_fx[HIDDEN_SIZE][INPUT_SIZE]; + float W_fh[HIDDEN_SIZE][HIDDEN_SIZE]; + float b_f[HIDDEN_SIZE]; + /* Output gate */ + float W_ox[HIDDEN_SIZE][INPUT_SIZE]; + float W_oh[HIDDEN_SIZE][HIDDEN_SIZE]; + float b_o[HIDDEN_SIZE]; + /* Candidate gate */ + float W_cx[HIDDEN_SIZE][INPUT_SIZE]; + float W_ch[HIDDEN_SIZE][HIDDEN_SIZE]; + float b_c[HIDDEN_SIZE]; + /* Output layer */ + float W_hy[OUTPUT_SIZE][HIDDEN_SIZE]; + float b_y[OUTPUT_SIZE]; +} LSTMLayerParams; + +/* Gradient for LSTM parameters */ +typedef struct { + /* Input gate */ + float W_ix[HIDDEN_SIZE][INPUT_SIZE]; + float W_ih[HIDDEN_SIZE][HIDDEN_SIZE]; + float b_i[HIDDEN_SIZE]; + /* Forget gate */ + float W_fx[HIDDEN_SIZE][INPUT_SIZE]; + float W_fh[HIDDEN_SIZE][HIDDEN_SIZE]; + float b_f[HIDDEN_SIZE]; + /* Output gate */ + float W_ox[HIDDEN_SIZE][INPUT_SIZE]; + float W_oh[HIDDEN_SIZE][HIDDEN_SIZE]; + float b_o[HIDDEN_SIZE]; + /* Candidate gate */ + float W_cx[HIDDEN_SIZE][INPUT_SIZE]; + float W_ch[HIDDEN_SIZE][HIDDEN_SIZE]; + float b_c[HIDDEN_SIZE]; + /* Output layer */ + float W_hy[OUTPUT_SIZE][HIDDEN_SIZE]; + float b_y[OUTPUT_SIZE]; +} LSTMLayerGrads; + +/* Adam optimizer parameters for one layer */ +typedef struct { + /* Input gate */ + float m_W_ix[HIDDEN_SIZE][INPUT_SIZE]; + float v_W_ix[HIDDEN_SIZE][INPUT_SIZE]; + float m_W_ih[HIDDEN_SIZE][HIDDEN_SIZE]; + float v_W_ih[HIDDEN_SIZE][HIDDEN_SIZE]; + float m_b_i[HIDDEN_SIZE]; + float v_b_i[HIDDEN_SIZE]; + /* Forget gate */ + float m_W_fx[HIDDEN_SIZE][INPUT_SIZE]; + float v_W_fx[HIDDEN_SIZE][INPUT_SIZE]; + float m_W_fh[HIDDEN_SIZE][HIDDEN_SIZE]; + float v_W_fh[HIDDEN_SIZE][HIDDEN_SIZE]; + float m_b_f[HIDDEN_SIZE]; + float v_b_f[HIDDEN_SIZE]; + /* Output gate */ + float m_W_ox[HIDDEN_SIZE][INPUT_SIZE]; + float v_W_ox[HIDDEN_SIZE][INPUT_SIZE]; + float m_W_oh[HIDDEN_SIZE][HIDDEN_SIZE]; + float v_W_oh[HIDDEN_SIZE][HIDDEN_SIZE]; + float m_b_o[HIDDEN_SIZE]; + float v_b_o[HIDDEN_SIZE]; + /* Candidate gate */ + float m_W_cx[HIDDEN_SIZE][INPUT_SIZE]; + float v_W_cx[HIDDEN_SIZE][INPUT_SIZE]; + float m_W_ch[HIDDEN_SIZE][HIDDEN_SIZE]; + float v_W_ch[HIDDEN_SIZE][HIDDEN_SIZE]; + float m_b_c[HIDDEN_SIZE]; + float v_b_c[HIDDEN_SIZE]; + /* Output layer */ + float m_W_hy[OUTPUT_SIZE][HIDDEN_SIZE]; + float v_W_hy[OUTPUT_SIZE][HIDDEN_SIZE]; + float m_b_y[OUTPUT_SIZE]; + float v_b_y[OUTPUT_SIZE]; +} LSTMLayerAdam; + +/* Forward-pass caches for one layer */ +typedef struct { + float i_gate[MAX_SAMPLES][HIDDEN_SIZE]; + float f_gate[MAX_SAMPLES][HIDDEN_SIZE]; + float o_gate[MAX_SAMPLES][HIDDEN_SIZE]; + float c_hat[MAX_SAMPLES][HIDDEN_SIZE]; + float c_state[MAX_SAMPLES][HIDDEN_SIZE]; + float h_state[MAX_SAMPLES][HIDDEN_SIZE]; + float y_pred[MAX_SAMPLES][OUTPUT_SIZE]; +} LSTMLayerCache; + +/* All layers */ +typedef struct { + LSTMLayerParams layers[NUM_LSTM_LAYERS]; + LSTMLayerGrads grads[NUM_LSTM_LAYERS]; + LSTMLayerAdam adam[NUM_LSTM_LAYERS]; + LSTMLayerCache cache[NUM_LSTM_LAYERS]; +} LSTMModel; + +/* Daily aggregated data */ +typedef struct { + char date[11]; /* "YYYY-MM-DD" */ + float open; + float high; + float low; + float close; + float volume; +} DailyBar; + +/*************************************** + * Technical Indicators Structures + ***************************************/ +typedef struct { + float obv[MAX_SAMPLES]; + float ad[MAX_SAMPLES]; + float adx[MAX_SAMPLES]; + float aroonUp[MAX_SAMPLES]; + float aroonDown[MAX_SAMPLES]; + float macd[MAX_SAMPLES]; + float rsi[MAX_SAMPLES]; +} TechnicalIndicators; + +/*************************************** + * Early Stopping Structure + ***************************************/ +typedef struct { + float best_validation_loss; + int epochs_no_improve; + int stop; +} EarlyStopping; + +/*************************************** + * Utility Functions + ***************************************/ + +/* Generate a random float in [-range, range] */ +static inline float randf(float range) { + float r = (float)rand() / (float)RAND_MAX; + return (r * 2.0f - 1.0f) * range; +} + +/* Sigmoid activation */ +static inline float sigmoid_act(float x) { + return 1.0f / (1.0f + expf(-x)); +} + +/* Derivative of sigmoid */ +static inline float dsigmoid(float s) { + return s * (1.0f - s); +} + +/* Tanh activation */ +static inline float tanh_act(float x) { + return tanhf(x); +} + +/* Derivative of tanh */ +static inline float dtanh_act(float tval) { + return 1.0f - tval * tval; +} + +/* Initialize LSTM parameters with Xavier Initialization */ +static void init_lstm_params(LSTMModel *model, float input_size, float hidden_size) { + for(int layer = 0; layer < NUM_LSTM_LAYERS; layer++) { + float current_input_size = (layer == 0) ? input_size : hidden_size; + float limit = sqrtf(6.0f / (current_input_size + hidden_size)); + for(int i = 0; i < hidden_size; i++) { + for(int j = 0; j < current_input_size; j++) { + model->layers[layer].W_ix[i][j] = randf(limit); + model->layers[layer].W_fx[i][j] = randf(limit); + model->layers[layer].W_ox[i][j] = randf(limit); + model->layers[layer].W_cx[i][j] = randf(limit); + } + for(int j = 0; j < hidden_size; j++) { + model->layers[layer].W_ih[i][j] = randf(limit); + model->layers[layer].W_fh[i][j] = randf(limit); + model->layers[layer].W_oh[i][j] = randf(limit); + model->layers[layer].W_ch[i][j] = randf(limit); + } + model->layers[layer].b_i[i] = 0.0f; + model->layers[layer].b_f[i] = 1.0f; // Initialize forget gate bias to 1.0f + model->layers[layer].b_o[i] = 0.0f; + model->layers[layer].b_c[i] = 0.0f; + } + /* Initialize output layer */ + for(int i = 0; i < OUTPUT_SIZE; i++) { + for(int j = 0; j < hidden_size; j++) { + model->layers[layer].W_hy[i][j] = randf(limit); + } + model->layers[layer].b_y[i] = 0.0f; + } + } +} + +/* Zero out gradients */ +static void zero_grads(LSTMModel *model) { + for(int layer = 0; layer < NUM_LSTM_LAYERS; layer++) { + memset(&model->grads[layer], 0, sizeof(LSTMLayerGrads)); + } +} + +/* Apply gradient clipping */ +static void clip_grads(LSTMModel *model, float clip_value) { + for(int layer = 0; layer < NUM_LSTM_LAYERS; layer++) { + LSTMLayerGrads *g = &model->grads[layer]; + /* Clip weights */ + for(int i = 0; i < HIDDEN_SIZE; i++) { + for(int j = 0; j < INPUT_SIZE; j++) { + /* Input gate */ + if(g->W_ix[i][j] > clip_value) g->W_ix[i][j] = clip_value; + if(g->W_ix[i][j] < -clip_value) g->W_ix[i][j] = -clip_value; + /* Forget gate */ + if(g->W_fx[i][j] > clip_value) g->W_fx[i][j] = clip_value; + if(g->W_fx[i][j] < -clip_value) g->W_fx[i][j] = -clip_value; + /* Output gate */ + if(g->W_ox[i][j] > clip_value) g->W_ox[i][j] = clip_value; + if(g->W_ox[i][j] < -clip_value) g->W_ox[i][j] = -clip_value; + /* Candidate gate */ + if(g->W_cx[i][j] > clip_value) g->W_cx[i][j] = clip_value; + if(g->W_cx[i][j] < -clip_value) g->W_cx[i][j] = -clip_value; + } + for(int j = 0; j < HIDDEN_SIZE; j++) { + /* Input gate */ + if(g->W_ih[i][j] > clip_value) g->W_ih[i][j] = clip_value; + if(g->W_ih[i][j] < -clip_value) g->W_ih[i][j] = -clip_value; + /* Forget gate */ + if(g->W_fh[i][j] > clip_value) g->W_fh[i][j] = clip_value; + if(g->W_fh[i][j] < -clip_value) g->W_fh[i][j] = -clip_value; + /* Output gate */ + if(g->W_oh[i][j] > clip_value) g->W_oh[i][j] = clip_value; + if(g->W_oh[i][j] < -clip_value) g->W_oh[i][j] = -clip_value; + /* Candidate gate */ + if(g->W_ch[i][j] > clip_value) g->W_ch[i][j] = clip_value; + if(g->W_ch[i][j] < -clip_value) g->W_ch[i][j] = -clip_value; + } + /* Biases */ + if(g->b_i[i] > clip_value) g->b_i[i] = clip_value; + if(g->b_i[i] < -clip_value) g->b_i[i] = -clip_value; + if(g->b_f[i] > clip_value) g->b_f[i] = clip_value; + if(g->b_f[i] < -clip_value) g->b_f[i] = -clip_value; + if(g->b_o[i] > clip_value) g->b_o[i] = clip_value; + if(g->b_o[i] < -clip_value) g->b_o[i] = -clip_value; + if(g->b_c[i] > clip_value) g->b_c[i] = clip_value; + if(g->b_c[i] < -clip_value) g->b_c[i] = -clip_value; + } + /* Clip output layer gradients */ + for(int layer = 0; layer < NUM_LSTM_LAYERS; layer++) { + LSTMLayerGrads *g_out = &model->grads[layer]; + for(int i = 0; i < OUTPUT_SIZE; i++) { + for(int j = 0; j < HIDDEN_SIZE; j++) { + if(g_out->W_hy[i][j] > clip_value) g_out->W_hy[i][j] = clip_value; + if(g_out->W_hy[i][j] < -clip_value) g_out->W_hy[i][j] = -clip_value; + } + if(g_out->b_y[i] > clip_value) g_out->b_y[i] = clip_value; + if(g_out->b_y[i] < -clip_value) g_out->b_y[i] = -clip_value; + } + } + } + +/* Initialize LSTM cache */ +static void init_cache(LSTMModel *model) { + for(int layer = 0; layer < NUM_LSTM_LAYERS; layer++) { + memset(&model->cache[layer], 0, sizeof(LSTMLayerCache)); + } +} + +/* Initialize Adam optimizer parameters */ +static void init_adam_parameters(LSTMModel *model) { + for(int layer = 0; layer < NUM_LSTM_LAYERS; layer++) { + memset(&model->adam[layer], 0, sizeof(LSTMLayerAdam)); + } +} + +/* Update parameters using Adam optimizer */ +static void update_parameters_adam(LSTMModel *model, int epoch) { + float beta1 = 0.9f; + float beta2 = 0.999f; + float epsilon = 1e-8f; + + for(int layer = 0; layer < NUM_LSTM_LAYERS; layer++) { + LSTMLayerParams *params = &model->layers[layer]; + LSTMLayerGrads *grads = &model->grads[layer]; + LSTMLayerAdam *adam = &model->adam[layer]; + + /* Update weights and biases */ + for(int i = 0; i < HIDDEN_SIZE; i++) { + for(int j = 0; j < INPUT_SIZE; j++) { + /* Input gate W_ix */ + adam->m_W_ix[i][j] = beta1 * adam->m_W_ix[i][j] + (1 - beta1) * grads->W_ix[i][j]; + adam->v_W_ix[i][j] = beta2 * adam->v_W_ix[i][j] + (1 - beta2) * grads->W_ix[i][j] * grads->W_ix[i][j]; + float m_hat = adam->m_W_ix[i][j] / (1 - powf(beta1, epoch)); + float v_hat = adam->v_W_ix[i][j] / (1 - powf(beta2, epoch)); + params->W_ix[i][j] -= (INITIAL_LEARNING_RATE * m_hat) / (sqrtf(v_hat) + epsilon); + + /* Forget gate W_fx */ + adam->m_W_fx[i][j] = beta1 * adam->m_W_fx[i][j] + (1 - beta1) * grads->W_fx[i][j]; + adam->v_W_fx[i][j] = beta2 * adam->v_W_fx[i][j] + (1 - beta2) * grads->W_fx[i][j] * grads->W_fx[i][j]; + m_hat = adam->m_W_fx[i][j] / (1 - powf(beta1, epoch)); + v_hat = adam->v_W_fx[i][j] / (1 - powf(beta2, epoch)); + params->W_fx[i][j] -= (INITIAL_LEARNING_RATE * m_hat) / (sqrtf(v_hat) + epsilon); + + /* Output gate W_ox */ + adam->m_W_ox[i][j] = beta1 * adam->m_W_ox[i][j] + (1 - beta1) * grads->W_ox[i][j]; + adam->v_W_ox[i][j] = beta2 * adam->v_W_ox[i][j] + (1 - beta2) * grads->W_ox[i][j] * grads->W_ox[i][j]; + m_hat = adam->m_W_ox[i][j] / (1 - powf(beta1, epoch)); + v_hat = adam->v_W_ox[i][j] / (1 - powf(beta2, epoch)); + params->W_ox[i][j] -= (INITIAL_LEARNING_RATE * m_hat) / (sqrtf(v_hat) + epsilon); + + /* Candidate gate W_cx */ + adam->m_W_cx[i][j] = beta1 * adam->m_W_cx[i][j] + (1 - beta1) * grads->W_cx[i][j]; + adam->v_W_cx[i][j] = beta2 * adam->v_W_cx[i][j] + (1 - beta2) * grads->W_cx[i][j] * grads->W_cx[i][j]; + m_hat = adam->m_W_cx[i][j] / (1 - powf(beta1, epoch)); + v_hat = adam->v_W_cx[i][j] / (1 - powf(beta2, epoch)); + params->W_cx[i][j] -= (INITIAL_LEARNING_RATE * m_hat) / (sqrtf(v_hat) + epsilon); + } + for(int j = 0; j < HIDDEN_SIZE; j++) { + /* Input gate W_ih */ + adam->m_W_ih[i][j] = beta1 * adam->m_W_ih[i][j] + (1 - beta1) * grads->W_ih[i][j]; + adam->v_W_ih[i][j] = beta2 * adam->v_W_ih[i][j] + (1 - beta2) * grads->W_ih[i][j] * grads->W_ih[i][j]; + float m_hat_ih = adam->m_W_ih[i][j] / (1 - powf(beta1, epoch)); + float v_hat_ih = adam->v_W_ih[i][j] / (1 - powf(beta2, epoch)); + params->W_ih[i][j] -= (INITIAL_LEARNING_RATE * m_hat_ih) / (sqrtf(v_hat_ih) + epsilon); + + /* Forget gate W_fh */ + adam->m_W_fh[i][j] = beta1 * adam->m_W_fh[i][j] + (1 - beta1) * grads->W_fh[i][j]; + adam->v_W_fh[i][j] = beta2 * adam->v_W_fh[i][j] + (1 - beta2) * grads->W_fh[i][j] * grads->W_fh[i][j]; + float m_hat_fh = adam->m_W_fh[i][j] / (1 - powf(beta1, epoch)); + float v_hat_fh = adam->v_W_fh[i][j] / (1 - powf(beta2, epoch)); + params->W_fh[i][j] -= (INITIAL_LEARNING_RATE * m_hat_fh) / (sqrtf(v_hat_fh) + epsilon); + + /* Output gate W_oh */ + adam->m_W_oh[i][j] = beta1 * adam->m_W_oh[i][j] + (1 - beta1) * grads->W_oh[i][j]; + adam->v_W_oh[i][j] = beta2 * adam->v_W_oh[i][j] + (1 - beta2) * grads->W_oh[i][j] * grads->W_oh[i][j]; + float m_hat_oh = adam->m_W_oh[i][j] / (1 - powf(beta1, epoch)); + float v_hat_oh = adam->v_W_oh[i][j] / (1 - powf(beta2, epoch)); + params->W_oh[i][j] -= (INITIAL_LEARNING_RATE * m_hat_oh) / (sqrtf(v_hat_oh) + epsilon); + + /* Candidate gate W_ch */ + adam->m_W_ch[i][j] = beta1 * adam->m_W_ch[i][j] + (1 - beta1) * grads->W_ch[i][j]; + adam->v_W_ch[i][j] = beta2 * adam->v_W_ch[i][j] + (1 - beta2) * grads->W_ch[i][j] * grads->W_ch[i][j]; + float m_hat_ch = adam->m_W_ch[i][j] / (1 - powf(beta1, epoch)); + float v_hat_ch = adam->v_W_ch[i][j] / (1 - powf(beta2, epoch)); + params->W_ch[i][j] -= (INITIAL_LEARNING_RATE * m_hat_ch) / (sqrtf(v_hat_ch) + epsilon); + } + /* Biases */ + for(int j = 0; j < HIDDEN_SIZE; j++) { + /* Input gate b_i */ + adam->m_b_i[j] = beta1 * adam->m_b_i[j] + (1 - beta1) * grads->b_i[j]; + adam->v_b_i[j] = beta2 * adam->v_b_i[j] + (1 - beta2) * grads->b_i[j] * grads->b_i[j]; + float m_hat_bi = adam->m_b_i[j] / (1 - powf(beta1, epoch)); + float v_hat_bi = adam->v_b_i[j] / (1 - powf(beta2, epoch)); + params->b_i[j] -= (INITIAL_LEARNING_RATE * m_hat_bi) / (sqrtf(v_hat_bi) + epsilon); + + /* Forget gate b_f */ + adam->m_b_f[j] = beta1 * adam->m_b_f[j] + (1 - beta1) * grads->b_f[j]; + adam->v_b_f[j] = beta2 * adam->v_b_f[j] + (1 - beta2) * grads->b_f[j] * grads->b_f[j]; + float m_hat_bf = adam->m_b_f[j] / (1 - powf(beta1, epoch)); + float v_hat_bf = adam->v_b_f[j] / (1 - powf(beta2, epoch)); + params->b_f[j] -= (INITIAL_LEARNING_RATE * m_hat_bf) / (sqrtf(v_hat_bf) + epsilon); + + /* Output gate b_o */ + adam->m_b_o[j] = beta1 * adam->m_b_o[j] + (1 - beta1) * grads->b_o[j]; + adam->v_b_o[j] = beta2 * adam->v_b_o[j] + (1 - beta2) * grads->b_o[j] * grads->b_o[j]; + float m_hat_bo = adam->m_b_o[j] / (1 - powf(beta1, epoch)); + float v_hat_bo = adam->v_b_o[j] / (1 - powf(beta2, epoch)); + params->b_o[j] -= (INITIAL_LEARNING_RATE * m_hat_bo) / (sqrtf(v_hat_bo) + epsilon); + + /* Candidate gate b_c */ + adam->m_b_c[j] = beta1 * adam->m_b_c[j] + (1 - beta1) * grads->b_c[j]; + adam->v_b_c[j] = beta2 * adam->v_b_c[j] + (1 - beta2) * grads->b_c[j] * grads->b_c[j]; + float m_hat_bc = adam->m_b_c[j] / (1 - powf(beta1, epoch)); + float v_hat_bc = adam->v_b_c[j] / (1 - powf(beta2, epoch)); + params->b_c[j] -= (INITIAL_LEARNING_RATE * m_hat_bc) / (sqrtf(v_hat_bc) + epsilon); + } + } + } + +/* Apply L2 regularization to gradients */ +static void apply_l2_regularization(LSTMModel *model, float lambda) { + for(int layer = 0; layer < NUM_LSTM_LAYERS; layer++) { + LSTMLayerGrads *g = &model->grads[layer]; + LSTMLayerParams *p = &model->layers[layer]; + /* Input gate weights */ + for(int i = 0; i < HIDDEN_SIZE; i++) { + for(int j = 0; j < INPUT_SIZE; j++) { + g->W_ix[i][j] += lambda * p->W_ix[i][j]; + g->W_fx[i][j] += lambda * p->W_fx[i][j]; + g->W_ox[i][j] += lambda * p->W_ox[i][j]; + g->W_cx[i][j] += lambda * p->W_cx[i][j]; + } + for(int j = 0; j < HIDDEN_SIZE; j++) { + g->W_ih[i][j] += lambda * p->W_ih[i][j]; + g->W_fh[i][j] += lambda * p->W_fh[i][j]; + g->W_oh[i][j] += lambda * p->W_oh[i][j]; + g->W_ch[i][j] += lambda * p->W_ch[i][j]; + } + /* Biases */ + g->b_i[i] += lambda * p->b_i[i]; + g->b_f[i] += lambda * p->b_f[i]; + g->b_o[i] += lambda * p->b_o[i]; + g->b_c[i] += lambda * p->b_c[i]; + } + /* Output layer weights */ + for(int i = 0; i < OUTPUT_SIZE; i++) { + for(int j = 0; j < HIDDEN_SIZE; j++) { + g->W_hy[i][j] += lambda * p->W_hy[i][j]; + } + g->b_y[i] += lambda * p->b_y[i]; + } + } +} + +/* Implement learning rate schedule (e.g., step decay) */ +static float get_learning_rate(int epoch) { + float initial_lr = INITIAL_LEARNING_RATE; + float decay_rate = 0.5f; + int decay_step = 200; + return initial_lr * powf(decay_rate, floorf((float)epoch / decay_step)); +} + +/* Initialize early stopping */ +static void init_early_stopping(EarlyStopping *es) { + es->best_validation_loss = INFINITY; + es->epochs_no_improve = 0; + es->stop = 0; +} + +/* Update early stopping */ +static void update_early_stopping(EarlyStopping *es, float current_loss) { + if(current_loss < es->best_validation_loss) { + es->best_validation_loss = current_loss; + es->epochs_no_improve = 0; + } + else { + es->epochs_no_improve += 1; + if(es->epochs_no_improve >= EARLY_STOPPING_PATIENCE) { + es->stop = 1; + } + } +} + +/*************************************** + * CSV Parsing and Data Aggregation + ***************************************/ + +/* Load intraday CSV, aggregate into daily bars */ +static int load_and_aggregate_daily( + const char *csv_file, + DailyBar *daily, + int max_days +){ + FILE *fp = fopen(csv_file, "r"); + if(!fp){ + fprintf(stderr, "Could not open file: %s\n", csv_file); + return -1; + } + + char line[1024]; + int firstRow = 1; + char currentDate[11] = {0}; + DailyBar curDay; + int haveCurrent = 0; + int dayCount = 0; + + while(fgets(line, sizeof(line), fp)){ + if(firstRow){ + /* Detect and skip header */ + if(strstr(line, "time") || strstr(line, "Timestamp")){ + firstRow = 0; + continue; + } + firstRow = 0; /* No header present */ + } + + /* Tokenize the CSV line */ + char *ts = strtok(line, ","); + char *oStr = strtok(NULL, ","); + char *hStr = strtok(NULL, ","); + char *lStr = strtok(NULL, ","); + char *cStr = strtok(NULL, ","); + char *vStr = strtok(NULL, ","); + if(!ts || !oStr || !hStr || !lStr || !cStr || !vStr) { + /* Skip malformed line */ + continue; + } + + /* Extract date (YYYY-MM-DD) from Timestamp */ + char dateBuf[11]; + strncpy(dateBuf, ts, 10); + dateBuf[10] = '\0'; + + char *endptr; + float o_val = strtof(oStr, &endptr); + if(endptr == oStr) continue; /* parse fail */ + float h_val = strtof(hStr, &endptr); + if(endptr == hStr) continue; + float l_val = strtof(lStr, &endptr); + if(endptr == lStr) continue; + float c_val = strtof(cStr, &endptr); + if(endptr == cStr) continue; + float v_val = strtof(vStr, &endptr); + if(endptr == vStr) continue; + + /* If it's a new date, finalize the old day (if any) */ + if(!haveCurrent || strcmp(dateBuf, currentDate) != 0){ + /* Finalize the previous day */ + if(haveCurrent){ + /* Check if curDay is valid: open>0, high>=open, etc. */ + if(curDay.open > 0 && curDay.high >= curDay.open && curDay.low <= curDay.open + && curDay.close > 0 && curDay.volume >= 0 + && dayCount < max_days) + { + daily[dayCount++] = curDay; + } + } + /* Start new day */ + strncpy(currentDate, dateBuf, 11); + curDay.open = o_val; + curDay.high = h_val; + curDay.low = l_val; + curDay.close = c_val; + curDay.volume = v_val; + strncpy(curDay.date, dateBuf, 11); + haveCurrent = 1; + } else { + /* Same day => update H, L, C, Vol */ + if(h_val > curDay.high) curDay.high = h_val; + if(l_val < curDay.low) curDay.low = l_val; + curDay.close = c_val; + curDay.volume += v_val; + } + } + /* Finalize last day if valid */ + if(haveCurrent && dayCount < max_days) { + if(curDay.open > 0 && curDay.high >= curDay.open && curDay.low <= curDay.open + && curDay.close > 0 && curDay.volume >= 0) + { + daily[dayCount++] = curDay; + } + } + + fclose(fp); + return dayCount; +} + +/*************************************** + * Technical Indicators Calculation + ***************************************/ + +/* On-Balance Volume (OBV) */ +static void calculate_obv(DailyBar data[], int count, float obvArr[]) { + obvArr[0] = 0.0f; + for(int i = 1; i < count; i++) { + if(data[i].close > data[i-1].close) + obvArr[i] = obvArr[i-1] + data[i].volume; + else if(data[i].close < data[i-1].close) + obvArr[i] = obvArr[i-1] - data[i].volume; + else + obvArr[i] = obvArr[i-1]; + } +} + +/* Accumulation/Distribution (AD) */ +static void calculate_ad(DailyBar data[], int count, float adArr[]) { + for(int i = 0; i < count; i++) { + float money_flow_multiplier = 0.0f; + if(data[i].high != data[i].low) { + money_flow_multiplier = ((data[i].close - data[i].low) - (data[i].high - data[i].close)) / (data[i].high - data[i].low); + } + float money_flow_volume = money_flow_multiplier * data[i].volume; + if(i == 0) + adArr[i] = money_flow_volume; + else + adArr[i] = adArr[i-1] + money_flow_volume; + } +} + +/* Relative Strength Index (RSI) */ +static void calculate_rsi(DailyBar data[], int count, float rsiArr[]) { + int period = 14; + float gains = 0.0f, losses = 0.0f; + + /* Initial calculation */ + for(int i = 1; i <= period; i++) { + float change = data[i].close - data[i-1].close; + if(change > 0) + gains += change; + else + losses -= change; + } + + float average_gain = gains / period; + float average_loss = losses / period; + rsiArr[period] = (average_loss == 0) ? 100.0f : 100.0f - (100.0f / (1.0f + (average_gain / average_loss))); + + /* Subsequent calculations */ + for(int i = period + 1; i < count; i++) { + float change = data[i].close - data[i-1].close; + if(change > 0) { + average_gain = ((average_gain * (period - 1)) + change) / period; + average_loss = (average_loss * (period - 1)) / period; + } else { + average_gain = (average_gain * (period - 1)) / period; + average_loss = ((average_loss * (period - 1)) - change) / period; + } + rsiArr[i] = (average_loss == 0) ? 100.0f : 100.0f - (100.0f / (1.0f + (average_gain / average_loss))); + } + + /* Fill the initial periods with 50 (neutral RSI) */ + for(int i = 0; i < period; i++) { + rsiArr[i] = 50.0f; + } +} + +/* Aroon Up and Aroon Down */ +static void calculate_aroon(DailyBar data[], int count, float aroonUpArr[], float aroonDownArr[]) { + int period = 25; + for(int i = 0; i < count; i++) { + if(i < period - 1){ + aroonUpArr[i] = 50.0f; + aroonDownArr[i] = 50.0f; + continue; + } + int highest = 0, lowest = 0; + for(int j = 0; j < period; j++) { + if(data[i - j].high > data[i - highest].high) + highest = j; + if(data[i - j].low < data[i - lowest].low) + lowest = j; + } + aroonUpArr[i] = ((float)(period - highest) / period) * 100.0f; + aroonDownArr[i] = ((float)(period - lowest) / period) * 100.0f; + } +} + +/* Moving Average Convergence Divergence (MACD) */ +static void calculate_macd(DailyBar data[], int count, float macdArr[]) { + int short_period = 12; + int long_period = 26; + int signal_period = 9; + float ema_short = 0.0f, ema_long = 0.0f; + float multiplier_short = 2.0f / (short_period + 1); + float multiplier_long = 2.0f / (long_period + 1); + float macd_signal = 0.0f; + float multiplier_signal = 2.0f / (signal_period + 1); + + /* Initialize EMA_short and EMA_long */ + for(int i = 0; i < count; i++) { + if(i == short_period - 1) { + float sum = 0.0f; + for(int j = 0; j < short_period; j++) { + sum += data[i - j].close; + } + ema_short = sum / short_period; + macdArr[i] = 0.0f; /* MACD undefined for first calculation */ + } + else if(i >= short_period) { + ema_short = (data[i].close - ema_short) * multiplier_short + ema_short; + if(i == long_period - 1) { + float sum = 0.0f; + for(int j = 0; j < long_period; j++) { + sum += data[i - j].close; + } + ema_long = sum / long_period; + macdArr[i] = ema_short - ema_long; + } + else if(i >= long_period) { + ema_long = (data[i].close - ema_long) * multiplier_long + ema_long; + float macd = ema_short - ema_long; + macdArr[i] = macd; + } + else { + macdArr[i] = 0.0f; + } + } + else { + macdArr[i] = 0.0f; + } + } + + /* Signal line (EMA of MACD) */ + float signal_line = 0.0f; + for(int i = 0; i < count; i++) { + if(macdArr[i] == 0.0f){ + /* Do nothing */ + } + else { + if(i < long_period + signal_period - 1){ + /* Not enough data for signal line */ + macdArr[i] = 0.0f; + } + else if(i == long_period + signal_period - 1){ + /* Initialize signal line */ + float sum = 0.0f; + for(int j = 0; j < signal_period; j++) { + sum += macdArr[i - j]; + } + signal_line = sum / signal_period; + macdArr[i] = signal_line; + } + else { + /* Update signal line */ + signal_line = (macdArr[i] - signal_line) * multiplier_signal + signal_line; + macdArr[i] = signal_line; + } + } + } +} + +/* Average Directional Index (ADX) - Simplified version */ +static void calculate_adx(DailyBar data[], int count, float adxArr[]) { + int period = 14; + float tr = 0.0f, plus_dm = 0.0f, minus_dm = 0.0f; + float atr = 0.0f, plus_di = 0.0f, minus_di = 0.0f, dx = 0.0f, adx = 0.0f; + + for(int i = 1; i < count; i++) { + float high_diff = data[i].high - data[i-1].high; + float low_diff = data[i-1].low - data[i].low; + float plus_dm_val = (high_diff > low_diff && high_diff > 0) ? high_diff : 0.0f; + float minus_dm_val = (low_diff > high_diff && low_diff > 0) ? low_diff : 0.0f; + float tr_val = fmaxf(data[i].high - data[i].low, fmaxf(fabsf(data[i].high - data[i-1].close), fabsf(data[i].low - data[i-1].close))); + + plus_dm += plus_dm_val; + minus_dm += minus_dm_val; + tr += tr_val; + + if(i >= period) { + if(i > period){ + plus_dm = plus_dm - (plus_dm / period) + plus_dm_val; + minus_dm = minus_dm - (minus_dm / period) + minus_dm_val; + tr = tr - (tr / period) + tr_val; + } + + atr = tr / period; + plus_di = (atr == 0.0f) ? 0.0f : (plus_dm / atr) * 100.0f; + minus_di = (atr == 0.0f) ? 0.0f : (minus_dm / atr) * 100.0f; + float di_diff = fabsf(plus_di - minus_di); + float di_sum = plus_di + minus_di; + dx = (di_sum == 0.0f) ? 0.0f : (di_diff / di_sum) * 100.0f; + + if(i == period){ + adx = dx; + } + else { + adx = ((adx * (period - 1)) + dx) / period; + } + + adxArr[i] = adx; + } + else { + adxArr[i] = 0.0f; + } + } + + /* Fill the initial periods with 0 */ + for(int i = 0; i < period; i++) { + adxArr[i] = 0.0f; + } +} + +/* Normalize Data */ +static void normalize_data(float inputs[][INPUT_SIZE], float targets[][OUTPUT_SIZE], int count, + float minVal[], float maxVal[], + float *min_target, float *max_target, + float normalized_inputs[][INPUT_SIZE], + float normalized_targets[][OUTPUT_SIZE]) { + /* Find min and max for inputs */ + for(int j = 0; j < INPUT_SIZE; j++){ + minVal[j] = 1e9f; + maxVal[j] = -1e9f; + } + for(int i = 0; i < count; i++){ + for(int j = 0; j < INPUT_SIZE; j++){ + float v = inputs[i][j]; + if(v < minVal[j]) minVal[j] = v; + if(v > maxVal[j]) maxVal[j] = v; + } + } + + /* Find min and max for targets */ + *min_target = 1e9f; + *max_target = -1e9f; + for(int i = 0; i < count; i++){ + float target = targets[i][0]; + if(target < *min_target) *min_target = target; + if(target > *max_target) *max_target = target; + } + + /* Normalize inputs */ + for(int i = 0; i < count; i++){ + for(int j = 0; j < INPUT_SIZE; j++){ + float denom = (maxVal[j] - minVal[j]); + if(denom < 1e-6f) denom = 1.0f; /* Prevent division by zero */ + normalized_inputs[i][j] = (inputs[i][j] - minVal[j]) / denom; + if(isnan(normalized_inputs[i][j]) || isinf(normalized_inputs[i][j])){ + fprintf(stderr, "Normalization resulted in invalid value at sample %d, feature %d.\n", i, j); + exit(1); + } + } + } + + /* Normalize targets */ + for(int i = 0; i < count; i++){ + normalized_targets[i][0] = (targets[i][0] - *min_target) / (*max_target - *min_target); + if(isnan(normalized_targets[i][0]) || isinf(normalized_targets[i][0])){ + fprintf(stderr, "Normalization resulted in invalid target at sample %d.\n", i); + exit(1); + } + } +} + +/*************************************** + * LSTM Forward Pass + ***************************************/ +static void lstm_forward_pass(LSTMModel *model, float normalized_inputs[][INPUT_SIZE], int seq_len) { + for(int layer = 0; layer < NUM_LSTM_LAYERS; layer++) { + LSTMLayerParams *params = &model->layers[layer]; + LSTMLayerCache *cache = &model->cache[layer]; + int current_input_size = (layer == 0) ? INPUT_SIZE : HIDDEN_SIZE; /* Previous layer's hidden size */ + + float h_prev[HIDDEN_SIZE]; + float c_prev[HIDDEN_SIZE]; + memset(h_prev, 0, sizeof(h_prev)); + memset(c_prev, 0, sizeof(c_prev)); + + for(int t = 0; t < seq_len; t++) { + /* Get input for this layer */ + float *current_input_ptr; + if(layer == 0) + current_input_ptr = normalized_inputs[t]; + else + current_input_ptr = model->cache[layer - 1].h_state[t]; + + /* Compute gate inputs */ + float i_in[HIDDEN_SIZE], f_in[HIDDEN_SIZE], o_in[HIDDEN_SIZE], c_in[HIDDEN_SIZE]; + for(int i = 0; i < HIDDEN_SIZE; i++) { + i_in[i] = params->b_i[i]; + f_in[i] = params->b_f[i]; + o_in[i] = params->b_o[i]; + c_in[i] = params->b_c[i]; + } + + /* Input and hidden contributions */ + for(int i = 0; i < HIDDEN_SIZE; i++) { + for(int j = 0; j < current_input_size; j++) { + i_in[i] += params->W_ix[i][j] * current_input_ptr[j]; + f_in[i] += params->W_fx[i][j] * current_input_ptr[j]; + o_in[i] += params->W_ox[i][j] * current_input_ptr[j]; + c_in[i] += params->W_cx[i][j] * current_input_ptr[j]; + } + for(int j = 0; j < HIDDEN_SIZE; j++) { + i_in[i] += params->W_ih[i][j] * h_prev[j]; + f_in[i] += params->W_fh[i][j] * h_prev[j]; + o_in[i] += params->W_oh[i][j] * h_prev[j]; + c_in[i] += params->W_ch[i][j] * h_prev[j]; + } + } + + /* Activation functions */ + for(int i = 0; i < HIDDEN_SIZE; i++) { + cache->i_gate[t][i] = sigmoid_act(i_in[i]); + cache->f_gate[t][i] = sigmoid_act(f_in[i]); + cache->o_gate[t][i] = sigmoid_act(o_in[i]); + cache->c_hat[t][i] = tanh_act(c_in[i]); + } + + /* Cell and hidden states */ + for(int i = 0; i < HIDDEN_SIZE; i++) { + cache->c_state[t][i] = cache->f_gate[t][i] * c_prev[i] + cache->i_gate[t][i] * cache->c_hat[t][i]; + cache->h_state[t][i] = tanh_act(cache->c_state[t][i]) * cache->o_gate[t][i]; + } + + /* Output layer */ + for(int i = 0; i < OUTPUT_SIZE; i++) { + float sum = params->b_y[i]; + for(int j = 0; j < HIDDEN_SIZE; j++) { + sum += params->W_hy[i][j] * cache->h_state[t][j]; + } + cache->y_pred[t][i] = sum; /* Linear activation */ + } + + /* Update previous states */ + for(int i = 0; i < HIDDEN_SIZE; i++) { + h_prev[i] = cache->h_state[t][i]; + c_prev[i] = cache->c_state[t][i]; + } + + /* Debug: Print activations for first few time steps and layers */ + if(t < 5 && layer == 0){ + printf("Layer %d, Time Step %d:\n", layer+1, t); + printf(" i_gate[0]=%.3f, f_gate[0]=%.3f, o_gate[0]=%.3f, c_hat[0]=%.3f\n", + cache->i_gate[t][0], cache->f_gate[t][0], cache->o_gate[t][0], cache->c_hat[t][0]); + printf(" c_state[0]=%.3f, h_state[0]=%.3f\n", + cache->c_state[t][0], cache->h_state[t][0]); + } + } + } +} + +/*************************************** + * LSTM Backward Pass + ***************************************/ +static float lstm_backward_pass(LSTMModel *model, float targets[][OUTPUT_SIZE], int seq_len) { + float total_loss = 0.0f; + /* Initialize gradients for all layers */ + zero_grads(model); + + /* Initialize variables for backpropagation */ + float dh_next[NUM_LSTM_LAYERS][HIDDEN_SIZE]; + float dc_next[NUM_LSTM_LAYERS][HIDDEN_SIZE]; + memset(dh_next, 0, sizeof(dh_next)); + memset(dc_next, 0, sizeof(dc_next)); + + /* Iterate over time steps in reverse */ + for(int t = seq_len -1; t >=0; t--) { + /* Calculate loss */ + float y_pred = model->cache[NUM_LSTM_LAYERS-1].y_pred[t][0]; + float y_true = targets[t][0]; + float error = y_pred - y_true; + float loss = 0.5f * error * error; + total_loss += loss; + + /* Output layer gradients */ + for(int layer = NUM_LSTM_LAYERS -1; layer >=0; layer--) { + LSTMLayerParams *params = &model->layers[layer]; + LSTMLayerGrads *grads = &model->grads[layer]; + LSTMLayerCache *cache = &model->cache[layer]; + + /* Gradient of loss w.r.t y_pred */ + float dy = error; + + /* Gradient w.r.t W_hy and b_y */ + for(int i = 0; i < OUTPUT_SIZE; i++) { + for(int j = 0; j < HIDDEN_SIZE; j++) { + grads->W_hy[i][j] += dy * cache->h_state[t][j]; + } + grads->b_y[i] += dy; + } + + /* Gradient w.r.t hidden state */ + float dh[HIDDEN_SIZE]; + for(int j = 0; j < HIDDEN_SIZE; j++) { + dh[j] = params->W_hy[0][j] * dy + dh_next[layer][j]; + } + + /* Backprop through output gate */ + float do_[HIDDEN_SIZE]; + for(int j = 0; j < HIDDEN_SIZE; j++) { + do_[j] = tanh_act(cache->c_state[t][j]) * dh[j]; + } + + /* Backprop through cell state */ + float dc[HIDDEN_SIZE]; + for(int j = 0; j < HIDDEN_SIZE; j++) { + float tanhc = tanh_act(cache->c_state[t][j]); + dc[j] = params->W_hy[0][j] * dy * cache->o_gate[t][j] * dtanh_act(tanhc); + dc[j] += dh[j] * cache->o_gate[t][j] * dtanh_act(tanhc); + dc[j] += dc_next[layer][j]; + } + + /* Backprop through gates */ + float di[HIDDEN_SIZE], df[HIDDEN_SIZE], dc_hat[HIDDEN_SIZE]; + for(int j = 0; j < HIDDEN_SIZE; j++) { + di[j] = dc[j] * cache->c_hat[t][j] * dsigmoid(cache->i_gate[t][j]); + df[j] = dc[j] * cache->f_gate[t][j] * dsigmoid(cache->f_gate[t][j]); + dc_hat[j] = dc[j] * cache->i_gate[t][j] * dtanh_act(cache->c_hat[t][j]); + } + + /* Accumulate gradients */ + for(int j = 0; j < HIDDEN_SIZE; j++) { + for(int k = 0; k < INPUT_SIZE; k++) { + grads->W_ix[j][k] += di[j] * ((layer == 0) ? 0 : model->cache[layer-1].h_state[t][k]); + grads->W_fx[j][k] += df[j] * ((layer == 0) ? 0 : model->cache[layer-1].h_state[t][k]); + grads->W_ox[j][k] += do_[j] * ((layer == 0) ? 0 : model->cache[layer-1].h_state[t][k]); + grads->W_cx[j][k] += dc_hat[j] * ((layer == 0) ? 0 : model->cache[layer-1].h_state[t][k]); + } + for(int k = 0; k < HIDDEN_SIZE; k++) { + grads->W_ih[j][k] += di[j] * ((t > 0) ? model->cache[layer].h_state[t-1][k] : 0.0f); + grads->W_fh[j][k] += df[j] * ((t > 0) ? model->cache[layer].h_state[t-1][k] : 0.0f); + grads->W_oh[j][k] += do_[j] * ((t > 0) ? model->cache[layer].h_state[t-1][k] : 0.0f); + grads->W_ch[j][k] += dc_hat[j] * ((t > 0) ? model->cache[layer].h_state[t-1][k] : 0.0f); + } + grads->b_i[j] += di[j]; + grads->b_f[j] += df[j]; + grads->b_o[j] += do_[j]; + grads->b_c[j] += dc_hat[j]; + } + + /* Update dh_next and dc_next */ + for(int j = 0; j < HIDDEN_SIZE; j++) { + dh_next[layer][j] = 0.0f; + dc_next[layer][j] = 0.0f; + for(int k = 0; k < HIDDEN_SIZE; k++) { + dh_next[layer][j] += model->layers[layer].W_ih[j][k] * di[k]; + dh_next[layer][j] += model->layers[layer].W_fh[j][k] * df[k]; + dh_next[layer][j] += model->layers[layer].W_oh[j][k] * do_[k]; + dh_next[layer][j] += model->layers[layer].W_ch[j][k] * dc_hat[k]; + } + } + + /* Debug: Print gradients for first few time steps and layers */ + if(t < 5 && layer == 0){ + printf("Backward Layer %d, Time Step %d:\n", layer+1, t); + printf(" Gradient di[0]=%.3f, df[0]=%.3f, dc_hat[0]=%.3f\n", + di[0], df[0], dc_hat[j]); + printf(" Gradient do_[0]=%.3f\n", do_[0]); + } + } + } + + return total_loss; +} + +/*************************************** + * LSTM Forward Pass for Validation + ***************************************/ +static void lstm_forward_validation(LSTMModel *model, float normalized_inputs[][INPUT_SIZE], int seq_len) { + lstm_forward_pass(model, normalized_inputs, seq_len); +} + +/*************************************** + * Main Function + ***************************************/ +int main(int argc, char *argv[]){ + if(argc < 2){ + fprintf(stderr, "Usage: %s path/to/stock_data.csv\n", argv[0]); + return 1; + } + srand((unsigned)time(NULL)); + + /* 1) Load intraday CSV, aggregate daily bars */ + DailyBar dailyData[MAX_SAMPLES]; + int rawCount = load_and_aggregate_daily(argv[1], dailyData, MAX_SAMPLES); + if(rawCount <= 1){ + fprintf(stderr, "No valid daily bars found in CSV.\n"); + return 1; + } + + /* 2) Calculate Technical Indicators */ + TechnicalIndicators indicators; + calculate_obv(dailyData, rawCount, indicators.obv); + calculate_ad(dailyData, rawCount, indicators.ad); + calculate_adx(dailyData, rawCount, indicators.adx); + calculate_aroon(dailyData, rawCount, indicators.aroonUp, indicators.aroonDown); + calculate_macd(dailyData, rawCount, indicators.macd); + calculate_rsi(dailyData, rawCount, indicators.rsi); + + /* 3) Build input features and targets */ + float inputs[MAX_SAMPLES][INPUT_SIZE]; + float targets_raw[MAX_SAMPLES][OUTPUT_SIZE]; // Raw targets + int validCount = 0; + for(int i = 0; i < rawCount; i++){ + /* Basic check: ensure all indicators are calculated */ + if(i < 26 || indicators.adx[i] == 0.0f || indicators.macd[i] == 0.0f || indicators.rsi[i] == 0.0f){ + continue; + } + inputs[validCount][0] = dailyData[i].open; + inputs[validCount][1] = dailyData[i].high; + inputs[validCount][2] = dailyData[i].low; + inputs[validCount][3] = dailyData[i].close; + inputs[validCount][4] = dailyData[i].volume; + inputs[validCount][5] = dailyData[i].high - dailyData[i].low; /* Range */ + inputs[validCount][6] = indicators.obv[i]; + inputs[validCount][7] = indicators.ad[i]; + inputs[validCount][8] = indicators.adx[i]; + inputs[validCount][9] = indicators.aroonUp[i]; + inputs[validCount][10] = indicators.aroonDown[i]; + inputs[validCount][11] = indicators.macd[i]; + inputs[validCount][12] = indicators.rsi[i]; + + /* Target is the next day's close */ + if(i < rawCount - 1){ + targets_raw[validCount][0] = dailyData[i+1].close; + } + else{ + targets_raw[validCount][0] = dailyData[i].close; /* Last target */ + } + validCount++; + } + + if(validCount <= VALIDATION_SIZE){ + fprintf(stderr, "Not enough valid data after processing.\n"); + return 1; + } + + printf("Total valid daily bars used: %d\n", validCount); + printf("First day: %s O=%.2f H=%.2f L=%.2f C=%.2f V=%.0f\n", + dailyData[0].date, dailyData[0].open, dailyData[0].high, dailyData[0].low, + dailyData[0].close, dailyData[0].volume); + printf("Last day: %s O=%.2f H=%.2f L=%.2f C=%.2f V=%.0f\n", + dailyData[validCount-1].date, dailyData[validCount-1].open, + dailyData[validCount-1].high, dailyData[validCount-1].low, + dailyData[validCount-1].close, dailyData[validCount-1].volume); + + /* 4) Normalize Inputs and Targets */ + float minVal[INPUT_SIZE], maxVal[INPUT_SIZE]; + float min_target, max_target; + float normalized_inputs[MAX_SAMPLES][INPUT_SIZE]; + float normalized_targets[MAX_SAMPLES][OUTPUT_SIZE]; + normalize_data(inputs, targets_raw, validCount, minVal, maxVal, &min_target, &max_target, + normalized_inputs, normalized_targets); + + /* Debug: Print min and max targets */ + printf("\nTarget Min: %.2f, Target Max: %.2f\n", min_target, max_target); + + /* Debug: Print first 5 normalized targets */ + printf("\nNormalized Targets (First 5 Samples):\n"); + for(int i = 0; i < 5 && i < validCount; i++) { + printf("Sample %d: %.3f\n", i, normalized_targets[i][0]); + } + + /* 5) Initialize LSTM Model */ + LSTMModel model; + init_lstm_params(&model, INPUT_SIZE, HIDDEN_SIZE); + init_adam_parameters(&model); + init_cache(&model); + + /* 6) Split data into training and validation */ + int trainLen = validCount - VALIDATION_SIZE; + if(trainLen < 2){ + trainLen = validCount; /* Fallback */ + } + + /* 7) Training Loop with Advanced Features */ + EarlyStopping es; + init_early_stopping(&es); + + for(int epoch = 1; epoch <= EPOCHS; epoch++){ + /* Forward Pass */ + lstm_forward_pass(&model, normalized_inputs, trainLen); + + /* Compute Loss and Backward Pass */ + float epoch_loss = lstm_backward_pass(&model, normalized_targets, trainLen); + + /* Apply L2 Regularization */ + apply_l2_regularization(&model, L2_LAMBDA); + + /* Update Parameters with Adam */ + float current_lr = get_learning_rate(epoch); + update_parameters_adam(&model, epoch); + + /* Apply Gradient Clipping */ + clip_grads(&model, CLIP_VALUE); + + /* Calculate Average Loss */ + float avg_loss = epoch_loss / trainLen; + + /* Early Stopping based on Validation Loss */ + /* Forward pass on validation set */ + lstm_forward_validation(&model, normalized_inputs, validCount); + float validation_loss = 0.0f; + for(int t = trainLen; t < validCount; t++) { + float y_pred_norm = model.cache[NUM_LSTM_LAYERS-1].y_pred[t][0]; + float y_true_norm = normalized_targets[t][0]; + float diff = y_pred_norm - y_true_norm; + validation_loss += 0.5f * diff * diff; + } + float avg_val_loss = validation_loss / VALIDATION_SIZE; + update_early_stopping(&es, avg_val_loss); + + /* Print training progress */ + if(epoch % 100 == 0 || epoch == 1){ + printf("Epoch %4d, Train Loss=%.6f, Val Loss=%.6f, LR=%.6f\n", epoch, avg_loss, avg_val_loss, current_lr); + } + + /* Early stopping condition */ + if(es.stop){ + printf("Early stopping triggered at epoch %d\n", epoch); + break; + } + } + + /* 8) Validation */ + lstm_forward_validation(&model, normalized_inputs, validCount); + printf("\nValidation (Last %d Days):\n", VALIDATION_SIZE); + printf("-------------------------------------------------------------\n"); + printf("| Day | Date | Predicted Close | Actual Close | Error |\n"); + printf("-------------------------------------------------------------\n"); + int valStart = trainLen; + float total_mae = 0.0f; + float total_rmse = 0.0f; + for(int t = valStart; t < validCount; t++){ + float y_pred_norm = model.cache[NUM_LSTM_LAYERS-1].y_pred[t][0]; + /* Denormalize prediction */ + float y_pred_raw = y_pred_norm * (max_target - min_target) + min_target; + float y_true = dailyData[t].close; + float error = fabsf(y_pred_raw - y_true); + total_mae += error; + total_rmse += error * error; + printf("| %3d | %10s | %8.2f | %8.2f | %5.2f |\n", + t, dailyData[t].date, y_pred_raw, y_true, error); + } + printf("-------------------------------------------------------------\n"); + + /* 9) Calculate Validation Metrics */ + float mae = total_mae / VALIDATION_SIZE; + float rmse = sqrtf(total_rmse / VALIDATION_SIZE); + printf("\nValidation Metrics:\n"); + printf("Mean Absolute Error (MAE): %.2f\n", mae); + printf("Root Mean Squared Error (RMSE): %.2f\n", rmse); + + /* 10) Pretty Output for All Data */ + printf("\nDetailed Predictions for All Data:\n"); + printf("--------------------------------------------------------------------\n"); + printf("| Day | Date | Predicted Close | Actual Close | Error |\n"); + printf("--------------------------------------------------------------------\n"); + for(int t = 0; t < validCount; t++){ + float y_pred_norm = model.cache[NUM_LSTM_LAYERS-1].y_pred[t][0]; + /* Denormalize prediction */ + float y_pred_raw = y_pred_norm * (max_target - min_target) + min_target; + float y_true = dailyData[t].close; + float error = fabsf(y_pred_raw - y_true); + printf("| %3d | %10s | %8.2f | %8.2f | %5.2f |\n", + t, dailyData[t].date, y_pred_raw, y_true, error); + } + printf("--------------------------------------------------------------------\n"); + + return 0; +} +