{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import sys\n", "import warnings\n", "\n", "if not sys.warnoptions:\n", " warnings.simplefilter('ignore')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import pandas as pd\n", "from sklearn.preprocessing import MinMaxScaler\n", "from datetime import datetime\n", "from datetime import timedelta\n", "from tqdm import tqdm\n", "sns.set()\n", "tf.compat.v1.random.set_random_seed(1234)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DateOpenHighLowCloseAdj CloseVolume
02016-11-02778.200012781.650024763.450012768.700012768.7000121872400
12016-11-03767.250000769.950012759.030029762.130005762.1300051943200
22016-11-04750.659973770.359985750.560974762.020020762.0200202134800
32016-11-07774.500000785.190002772.549988782.520020782.5200201585100
42016-11-08783.400024795.632996780.190002790.510010790.5100101350800
\n", "
" ], "text/plain": [ " Date Open High Low Close Adj Close \\\n", "0 2016-11-02 778.200012 781.650024 763.450012 768.700012 768.700012 \n", "1 2016-11-03 767.250000 769.950012 759.030029 762.130005 762.130005 \n", "2 2016-11-04 750.659973 770.359985 750.560974 762.020020 762.020020 \n", "3 2016-11-07 774.500000 785.190002 772.549988 782.520020 782.520020 \n", "4 2016-11-08 783.400024 795.632996 780.190002 790.510010 790.510010 \n", "\n", " Volume \n", "0 1872400 \n", "1 1943200 \n", "2 2134800 \n", "3 1585100 \n", "4 1350800 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Load the GOOG price table; the path is relative to this notebook's directory.\n",
"df = pd.read_csv(filepath_or_buffer = '../dataset/GOOG-year.csv')\n",
"df.head()"
] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
00.112708
10.090008
20.089628
30.160459
40.188066
\n", "
" ], "text/plain": [ " 0\n", "0 0.112708\n", "1 0.090008\n", "2 0.089628\n", "3 0.160459\n", "4 0.188066" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Min-max scale the Close column (positional column 4).\n",
"# NOTE: the scaler is fitted on the whole series, including the rows later held out as test data.\n",
"close = df.iloc[:, 4:5].astype('float32') # Close index\n",
"minmax = MinMaxScaler().fit(close)\n",
"df_log = pd.DataFrame(minmax.transform(close))\n",
"df_log.head()"
] }, { "cell_type": "markdown", "metadata": {}, "source": [
"## Split train and test\n",
"\n",
"I will cut the dataset into train and test datasets:\n",
"\n",
"1. Train dataset: from the starting timestamp until the last 30 days\n",
"2. Test dataset: from the last 30 days until the end of the dataset\n",
"\n",
"So we will let the model forecast over the last 30 days, and we are going to repeat the experiment 10 times. You can increase it locally if you want, and tuning the parameters will help a lot."
] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((252, 7), (222, 1), (30, 1))" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
"test_size = 30\n",
"simulation_size = 10\n",
"\n",
"df_train = df_log.iloc[:-test_size]\n",
"df_test = df_log.iloc[-test_size:]\n",
"df.shape, df_train.shape, df_test.shape"
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [
"class Model:\n",
"    \"\"\"Encoder/decoder LSTM regressor built as a TF1 graph.\n",
"\n",
"    An LSTM stack reads `X` starting from the state fed through\n",
"    `hidden_layer`; its final state initialises a second stack (variable\n",
"    scope 'decoder') that reads the same `X`. A dense layer turns the\n",
"    decoder outputs into `output_size` values per timestep, trained with\n",
"    mean squared error against `Y` using Adam.\n",
"\n",
"    Args:\n",
"        learning_rate: step size passed to AdamOptimizer.\n",
"        num_layers: number of LSTM cells stacked in each of the two stacks.\n",
"        size: number of input features per timestep.\n",
"        size_layer: number of units in every LSTM cell.\n",
"        output_size: number of values predicted per timestep.\n",
"        forget_bias: despite the name, the dropout *keep* probability of\n",
"            the LSTM outputs (name kept so existing callers still work).\n",
"\n",
"    Attributes:\n",
"        keep_prob: scalar keep probability. Defaults to `forget_bias`;\n",
"            feed 1.0 to switch dropout off when predicting.\n",
"    \"\"\"\n",
"\n",
"    def __init__(\n",
"        self,\n",
"        learning_rate,\n",
"        num_layers,\n",
"        size,\n",
"        size_layer,\n",
"        output_size,\n",
"        forget_bias = 0.1,\n",
"    ):\n",
"        def lstm_cell(size_layer):\n",
"            return tf.nn.rnn_cell.LSTMCell(size_layer, state_is_tuple = False)\n",
"\n",
"        def stacked_cells():\n",
"            return tf.nn.rnn_cell.MultiRNNCell(\n",
"                [lstm_cell(size_layer) for _ in range(num_layers)],\n",
"                state_is_tuple = False,\n",
"            )\n",
"\n",
"        self.X = tf.placeholder(tf.float32, (None, None, size))\n",
"        self.Y = tf.placeholder(tf.float32, (None, output_size))\n",
"        # Concatenated (non-tuple) LSTM state: 2 * size_layer values per layer.\n",
"        self.hidden_layer = tf.placeholder(\n",
"            tf.float32, (None, num_layers * 2 * size_layer)\n",
"        )\n",
"        # A baked-in constant would keep dropout active while forecasting;\n",
"        # a placeholder with a default lets inference feed 1.0 instead.\n",
"        self.keep_prob = tf.placeholder_with_default(\n",
"            tf.constant(forget_bias, dtype = tf.float32), shape = ()\n",
"        )\n",
"\n",
"        drop = tf.contrib.rnn.DropoutWrapper(\n",
"            stacked_cells(), output_keep_prob = self.keep_prob\n",
"        )\n",
"        _, last_state = tf.nn.dynamic_rnn(\n",
"            drop, self.X, initial_state = self.hidden_layer, dtype = tf.float32\n",
"        )\n",
"\n",
"        with tf.variable_scope('decoder', reuse = False):\n",
"            drop_dec = tf.contrib.rnn.DropoutWrapper(\n",
"                stacked_cells(), output_keep_prob = self.keep_prob\n",
"            )\n",
"            self.outputs, self.last_state = tf.nn.dynamic_rnn(\n",
"                drop_dec, self.X, initial_state = last_state, dtype = tf.float32\n",
"            )\n",
"\n",
"        # Callers feed a batch holding one sequence, so [-1] picks that\n",
"        # sequence and leaves one row of decoder output per timestep.\n",
"        self.logits = tf.layers.dense(self.outputs[-1], output_size)\n",
"        self.cost = tf.reduce_mean(tf.square(self.Y - self.logits))\n",
"        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(\n",
"            self.cost\n",
"        )\n",
"\n",
"def calculate_accuracy(real, predict):\n",
"    \"\"\"Return 100 * (1 - root-mean-square relative error) of `predict` vs `real`.\n",
"\n",
"    Both inputs are shifted by +1 before the ratio is taken, which keeps\n",
"    the denominator away from zero for values scaled into [0, 1].\n",
"    \"\"\"\n",
"    real = np.array(real) + 1\n",
"    predict = np.array(predict) + 1\n",
"    percentage = 1 - np.sqrt(np.mean(np.square((real - predict) / real)))\n",
"    return percentage * 100\n",
"\n",
"def anchor(signal, weight):\n",
"    \"\"\"Exponentially smooth `signal` and return the result as a list.\n",
"\n",
"    Every output is `weight * previous_output + (1 - weight) * value`,\n",
"    seeded with the first element. An empty signal yields an empty list.\n",
"    \"\"\"\n",
"    if len(signal) == 0:\n",
"        return []\n",
"    smoothed = []\n",
"    last = signal[0]\n",
"    for value in signal:\n",
"        last = last * weight + (1 - weight) * value\n",
"        smoothed.append(last)\n",
"    return smoothed"
] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
"num_layers = 1\n",
"size_layer = 128\n",
"timestamp = 5 # rows fed to the model per window\n",
"epoch = 300\n",
"dropout_rate = 0.8 # handed to Model as the dropout keep probability\n",
"future_day = test_size\n",
"learning_rate = 0.01"
] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [
"def forecast(seed = None):\n",
"    \"\"\"Train a fresh model on `df_train` and predict `test_size` steps past it.\n",
"\n",
"    Args:\n",
"        seed: optional graph-level TensorFlow seed. A seed set before\n",
"            `tf.reset_default_graph()` belongs to the discarded graph, so\n",
"            it has to be applied here to take effect.\n",
"\n",
"    Returns:\n",
"        The last `test_size` values of the de-normalised, smoothed\n",
"        prediction series, as a list.\n",
"    \"\"\"\n",
"    tf.reset_default_graph()\n",
"    if seed is not None:\n",
"        tf.compat.v1.random.set_random_seed(seed)\n",
"    modelnn = Model(\n",
"        learning_rate, num_layers, df_log.shape[1], size_layer, df_log.shape[1], dropout_rate\n",
"    )\n",
"    train_len = df_train.shape[0]\n",
"    zero_state = np.zeros((1, num_layers * 2 * size_layer))\n",
"\n",
"    # The context manager closes the session even if training fails, so the\n",
"    # repeated simulations do not pile up open sessions.\n",
"    with tf.Session() as sess:\n",
"        sess.run(tf.global_variables_initializer())\n",
"\n",
"        def predict(window, state):\n",
"            # One dropout-free forward pass over a single window.\n",
"            return sess.run(\n",
"                [modelnn.logits, modelnn.last_state],\n",
"                feed_dict = {\n",
"                    modelnn.X: np.expand_dims(window, axis = 0),\n",
"                    modelnn.hidden_layer: state,\n",
"                    modelnn.keep_prob: 1.0,\n",
"                },\n",
"            )\n",
"\n",
"        pbar = tqdm(range(epoch), desc = 'train loop')\n",
"        for _epoch in pbar:\n",
"            init_value = zero_state\n",
"            total_loss, total_acc = [], []\n",
"            for k in range(0, train_len - 1, timestamp):\n",
"                index = min(k + timestamp, train_len - 1)\n",
"                batch_x = np.expand_dims(\n",
"                    df_train.iloc[k : index, :].values, axis = 0\n",
"                )\n",
"                # Targets are the inputs shifted one row ahead.\n",
"                batch_y = df_train.iloc[k + 1 : index + 1, :].values\n",
"                # The state returned for this window seeds the next one.\n",
"                logits, init_value, _, loss = sess.run(\n",
"                    [modelnn.logits, modelnn.last_state, modelnn.optimizer, modelnn.cost],\n",
"                    feed_dict = {\n",
"                        modelnn.X: batch_x,\n",
"                        modelnn.Y: batch_y,\n",
"                        modelnn.hidden_layer: init_value,\n",
"                    },\n",
"                )\n",
"                total_loss.append(loss)\n",
"                total_acc.append(calculate_accuracy(batch_y[:, 0], logits[:, 0]))\n",
"            pbar.set_postfix(cost = np.mean(total_loss), acc = np.mean(total_acc))\n",
"\n",
"        future_day = test_size\n",
"        output_predict = np.zeros((train_len + future_day, df_train.shape[1]))\n",
"        output_predict[0] = df_train.iloc[0]\n",
"        upper_b = (train_len // timestamp) * timestamp\n",
"        init_value = zero_state\n",
"\n",
"        # Replay the training series in full windows; the prediction made\n",
"        # from row t is stored at row t + 1.\n",
"        for k in range(0, upper_b, timestamp):\n",
"            out_logits, init_value = predict(\n",
"                df_train.iloc[k : k + timestamp].values, init_value\n",
"            )\n",
"            output_predict[k + 1 : k + timestamp + 1] = out_logits\n",
"\n",
"        # Leftover rows that do not fill a whole window; their last\n",
"        # prediction already lands on the first future row.\n",
"        if upper_b != train_len:\n",
"            out_logits, init_value = predict(df_train.iloc[upper_b:].values, init_value)\n",
"            output_predict[upper_b + 1 : train_len + 1] = out_logits\n",
"            future_day -= 1\n",
"\n",
"        # Roll forward, feeding the model its own most recent predictions.\n",
"        for i in range(future_day):\n",
"            window = output_predict[-future_day - timestamp + i : -future_day + i]\n",
"            out_logits, init_value = predict(window, init_value)\n",
"            output_predict[-future_day + i] = out_logits[-1]\n",
"\n",
"    output_predict = minmax.inverse_transform(output_predict)\n",
"    deep_future = anchor(output_predict[:, 0], 0.3)\n",
"\n",
"    return deep_future[-test_size:]"
] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "W0813 21:47:16.666563 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n", "W0813 21:47:16.753933 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n", "W0813 21:47:16.834197 140095600830272 deprecation.py:323] From :41: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use keras.layers.dense instead.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "simulation 1\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "train loop: 100%|██████████| 300/300 [01:36<00:00, 3.11it/s, acc=97.9, cost=0.00101] \n", "W0813 21:48:54.353741 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. 
Use state_is_tuple=True.\n", "W0813 21:48:54.437589 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "simulation 2\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "train loop: 100%|██████████| 300/300 [01:38<00:00, 3.05it/s, acc=98.3, cost=0.00069] \n", "W0813 21:50:34.225154 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n", "W0813 21:50:34.305581 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "simulation 3\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "train loop: 100%|██████████| 300/300 [01:38<00:00, 3.06it/s, acc=97.7, cost=0.00117] \n", "W0813 21:52:13.825603 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n", "W0813 21:52:13.908980 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "simulation 4\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "train loop: 100%|██████████| 300/300 [01:37<00:00, 3.08it/s, acc=98.4, cost=0.000614]\n", "W0813 21:53:52.767824 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n", "W0813 21:53:52.849310 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. 
Use state_is_tuple=True.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "simulation 5\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "train loop: 100%|██████████| 300/300 [01:38<00:00, 3.03it/s, acc=98.2, cost=0.000755]\n", "W0813 21:55:32.572073 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n", "W0813 21:55:32.654169 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "simulation 6\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "train loop: 100%|██████████| 300/300 [01:38<00:00, 3.07it/s, acc=98.3, cost=0.000681]\n", "W0813 21:57:12.073868 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n", "W0813 21:57:12.156364 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "simulation 7\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "train loop: 100%|██████████| 300/300 [01:38<00:00, 3.01it/s, acc=97.7, cost=0.00126] \n", "W0813 21:58:51.933507 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "simulation 8\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "W0813 21:58:52.153095 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. 
Use state_is_tuple=True.\n", "train loop: 100%|██████████| 300/300 [01:38<00:00, 3.04it/s, acc=98.5, cost=0.000589]\n", "W0813 22:00:31.650501 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n", "W0813 22:00:31.732362 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "simulation 9\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "train loop: 100%|██████████| 300/300 [01:38<00:00, 3.01it/s, acc=98.4, cost=0.000625]\n", "W0813 22:02:11.445839 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n", "W0813 22:02:11.528598 140095600830272 rnn_cell_impl.py:893] : Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "simulation 10\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "train loop: 100%|██████████| 300/300 [01:39<00:00, 3.08it/s, acc=96.8, cost=0.0027] \n" ] } ], "source": [
"# Every simulation builds and trains a fresh model, then keeps its forecast.\n",
"results = []\n",
"for run_no in range(1, simulation_size + 1):\n",
"    print('simulation %d'%(run_no))\n",
"    results.append(forecast())"
] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
"# Score each simulated forecast against the last `test_size` closing prices.\n",
"true_close = df['Close'].iloc[-test_size:].values\n",
"accuracies = [calculate_accuracy(true_close, forecast_run) for forecast_run in results]\n",
"\n",
"fig, ax = plt.subplots(figsize = (15, 5))\n",
"for run_idx, forecast_run in enumerate(results, start = 1):\n",
"    ax.plot(forecast_run, label = 'forecast %d'%(run_idx))\n",
"ax.plot(true_close, label = 'true trend', c = 'black')\n",
"ax.legend()\n",
"ax.set_title('average accuracy: %.4f'%(np.mean(accuracies)))\n",
"plt.show()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }