diff --git a/README.md b/README.md
index e7336ab..e4e8c73 100644
--- a/README.md
+++ b/README.md
@@ -110,6 +110,6 @@ INITIAL_EPSILON = 0.1
 
 ## Disclaimer
 This work is highly based on the following repos:
 
-1. [sourabhv/FlapPyBird] (https://github.com/sourabhv/FlapPyBird)
+1. [sourabhv/FlapPyBird](https://github.com/sourabhv/FlapPyBird)
 2. [asrivat1/DeepLearningVideoGames](https://github.com/asrivat1/DeepLearningVideoGames)
diff --git a/deep_q_network.py b/deep_q_network.py
index 1294f96..66b763a 100755
--- a/deep_q_network.py
+++ b/deep_q_network.py
@@ -79,7 +79,7 @@ def trainNetwork(s, readout, h_fc1, sess):
     # define the cost function
     a = tf.placeholder("float", [None, ACTIONS])
     y = tf.placeholder("float", [None])
-    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
+    readout_action = tf.reduce_sum(tf.multiply(readout, a), axis=1)
     cost = tf.reduce_mean(tf.square(y - readout_action))
     train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)
 
@@ -150,7 +150,7 @@ def trainNetwork(s, readout, h_fc1, sess):
         # only train if done observing
         if t > OBSERVE:
             # sample a minibatch to train on
-            minibatch = random.sample(D, BATCH)
+            minibatch = random.sample(list(D), BATCH)
 
             # get the batch variables
             s_j_batch = [d[0] for d in minibatch]
diff --git a/deep_q_network_opt_v1.py b/deep_q_network_opt_v1.py
new file mode 100755
index 0000000..0200822
--- /dev/null
+++ b/deep_q_network_opt_v1.py
@@ -0,0 +1,293 @@
+#!/usr/bin/env python
+#*
+
+# Distinction between the vanilla DQN and this script:
+# We want to find an addition to the state s, called "delta s" (ds), that makes the Q value of s+ds for a
+# target action a^t higher than the Q value of s+ds for any other action. The implication is that adding ds
+# to the input will cause the target action to be drawn; when ds is not added, the controller behaves normally.
+# To retain the natural behaviour of the DQN controller, we do not change its weights (or the controller at all).
+# Instead we minimize a hinge loss between Q(s+ds, a^t) and Q(s+ds, a), where a^t is the target action and a is
+# any non-target action: l(a, b) = max(b - a + eps, 0), which enforces the condition a >= b + eps
+# (here a = Q(s+ds, a^t) and b = Q(s+ds, a)).
+# An informal argument for why this can work: the controller only learns to behave well on the set of states it
+# has been trained on, called (here) the game-possible pixel space, i.e. the set of frames the game can generate
+# under any scenario. The game-possible pixel space is small relative to the full pixel space, the space of all
+# possible combinations of pixel intensities a screen can generate. Since ds belongs to the pixel space and s
+# belongs to the game-possible pixel space, s+ds belongs to the pixel space, which the controller may not know
+# how to handle; exploited properly, this can yield a simple adversarial attack.
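+#
+# A minimal sketch of the objective described above (illustrative only; q_values, q_target,
+# q_other and attack_step are not names used elsewhere in this file):
+#
+#   q_values = readout                                # frozen network evaluated on s + ds, shape [batch, ACTIONS]
+#   q_target = q_values[:, 1]                         # Q(s + ds, a^t), the target action (flap)
+#   q_other  = q_values[:, 0]                         # Q(s + ds, a), the non-target action
+#   hinge    = tf.nn.relu(q_other - q_target + eps)   # > 0 until a^t wins by a margin of at least eps
+#   attack_step = tf.train.AdamOptimizer(LR).minimize(tf.reduce_mean(hinge), var_list=[delta_s])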
+
+from __future__ import print_function
+import tensorflow as tf
+import cv2
+import sys
+sys.path.append("game/")
+import wrapped_flappy_bird as game
+import random
+import numpy as np
+from collections import deque
+
+# constants for the adversarial optimization:
+# flap = [0, 1]
+# noaction = [1, 0]
+action_target = [0, 1]
+LR = 0.01 # learning rate for optimizing ds
+# number of time steps over which to calculate ds (the first 10 frames of the game)
+INTERVAL = 10
+
+GAME = 'bird' # the name of the game being played for log files
+ACTIONS = 2 # number of valid actions
+GAMMA = 0.99 # decay rate of past observations
+OBSERVE = 100000. # timesteps to observe before training
+EXPLORE = 2000000. # frames over which to anneal epsilon
+FINAL_EPSILON = 0.0001 # final value of epsilon
+INITIAL_EPSILON = 0.0001 # starting value of epsilon
+REPLAY_MEMORY = 50000 # number of previous transitions to remember
+BATCH = 32 # size of minibatch
+FRAME_PER_ACTION = 1
+
+def weight_variable(shape):
+    initial = tf.truncated_normal(shape, stddev = 0.01)
+    return tf.Variable(initial)
+
+def bias_variable(shape):
+    initial = tf.constant(0.01, shape = shape)
+    return tf.Variable(initial)
+
+def conv2d(x, W, stride):
+    return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME")
+
+def max_pool_2x2(x):
+    return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME")
+
+def createNetwork():
+    # network weights
+    W_conv1 = weight_variable([8, 8, 4, 32])
+    b_conv1 = bias_variable([32])
+
+    W_conv2 = weight_variable([4, 4, 32, 64])
+    b_conv2 = bias_variable([64])
+
+    W_conv3 = weight_variable([3, 3, 64, 64])
+    b_conv3 = bias_variable([64])
+
+    W_fc1 = weight_variable([1600, 512])
+    b_fc1 = bias_variable([512])
+
+    W_fc2 = weight_variable([512, ACTIONS])
+    b_fc2 = bias_variable([ACTIONS])
+
+    # input layer
+    s = tf.placeholder("float", [None, 80, 80, 4])
+
+    # hidden layers
+    h_conv1 = tf.nn.relu(conv2d(s, W_conv1, 4) + b_conv1)
+    h_pool1 = max_pool_2x2(h_conv1)
+
+    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2, 2) + b_conv2)
+    #h_pool2 = max_pool_2x2(h_conv2)
+
+    h_conv3 = tf.nn.relu(conv2d(h_conv2, W_conv3, 1) + b_conv3)
+    #h_pool3 = max_pool_2x2(h_conv3)
+
+    #h_pool3_flat = tf.reshape(h_pool3, [-1, 256])
+    h_conv3_flat = tf.reshape(h_conv3, [-1, 1600])
+
+    h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)
+
+    # readout layer
+    readout = tf.matmul(h_fc1, W_fc2) + b_fc2
+
+    return s, readout, h_fc1
+
+def trainNetwork(s, readout, h_fc1, delta_s, sess):
+    # define the cost function
+    a = tf.placeholder("float", [None, ACTIONS])
+    y = tf.placeholder("float", [None])
+    readout_action = tf.reduce_sum(tf.multiply(readout, a), axis=1)
+    cost = tf.reduce_mean(tf.square(y - readout_action))
+    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)
+
+    # open up a game state to communicate with emulator
+    game_state = game.GameState()
+
+    # store the previous observations in replay memory
+    D = deque()
+
+    # printing
+    a_file = open("logs_" + GAME + "/readout.txt", 'w')
+    h_file = open("logs_" + GAME + "/hidden.txt", 'w')
+
+    # get the first state by doing nothing and preprocess the image to 80x80x4
+    do_nothing = np.zeros(ACTIONS)
+    do_nothing[0] = 1
+    x_t, r_0, terminal = game_state.frame_step(do_nothing)
+    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
+    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
+    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
+
+    # saving and loading networks
+    saver = tf.train.Saver()
+    sess.run(tf.initialize_all_variables())
+
+    checkpoint = tf.train.get_checkpoint_state("saved_networks")
+    if checkpoint and checkpoint.model_checkpoint_path:
+        saver.restore(sess, checkpoint.model_checkpoint_path)
+        print("Successfully loaded:", checkpoint.model_checkpoint_path)
+    else:
+        print("Could not find old network weights")
+
+    # start training
+    epsilon = INITIAL_EPSILON
+    t = 0
+    while "flappy bird" != "angry bird":
+        # choose an action epsilon greedily
+        readout_t = readout.eval(feed_dict={s : [s_t]})[0]
+        a_t = np.zeros([ACTIONS])
+        action_index = 0
+        if t % FRAME_PER_ACTION == 0:
+            if random.random() <= epsilon:
+                print("----------Random Action----------")
+                action_index = random.randrange(ACTIONS)
+                a_t[action_index] = 1
+            else:
+                action_index = np.argmax(readout_t)
+                a_t[action_index] = 1
+        else:
+            a_t[0] = 1 # do nothing
+
+        # scale down epsilon
+        if epsilon > FINAL_EPSILON and t > OBSERVE:
+            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
+
+        # run the selected action and observe next state and reward
+        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
+        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
+        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
+        x_t1 = np.reshape(x_t1, (80, 80, 1))
+        #s_t1 = np.append(x_t1, s_t[:,:,1:], axis = 2)
+        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)
+
+        # store the transition in D
+        D.append((s_t, a_t, r_t, s_t1, terminal))
+        if len(D) > REPLAY_MEMORY:
+            D.popleft()
+
+        # only start optimizing ds once enough transitions have been collected
+        if t > INTERVAL:
+            # sample a batch to optimize ds on
+            opt_batch = random.sample(list(D), INTERVAL)
+
+            # get the batch variables
+            s_opt_batch = [d[0] for d in opt_batch]
+
+            s_ds = np.ndarray((INTERVAL, 80, 80, 4), dtype=float) # forward init.
+
+            # a_batch = [d[1] for d in minibatch]
+            # r_batch = [d[2] for d in minibatch]
+            # s_j1_batch = [d[3] for d in minibatch]
+
+            # y_batch = []
+            # readout_j_batch = readout.eval(feed_dict = {s : s_j_batch})
+            # readout_j1_batch = readout.eval(feed_dict = {s : s_j1_batch})
+            # for i in range(0, len(minibatch)):
+            #     terminal = minibatch[i][4]
+            #     # if terminal, only equals reward
+            #     if terminal:
+            #         y_batch.append(r_batch[i])
+            #     else:
+            #         y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))
+
+            # NOT NECESSARY IN THE ATTACK FORMULATION: WE WANT TO RETAIN THE WEIGHTS AND ONLY TAKE
+            # THE PARAMETERS INTO THE ATTACK OPERATION
+            # perform gradient step
+            # train_step.run(feed_dict = {
+            #     y : y_batch,
+            #     a : a_batch,
+            #     s : s_j_batch}
+            # )
+
+            # taking the parameters into the attack operation:
+            # take a batch of (s_t, a_t, r_t, s_t1) produced by the normally behaving controller and, by
+            # modulating ds across the batch, reduce the sum (expected) loss between Q(s+ds, a^t) and
+            # Q(s+ds, a), where a^t != a.
+            # subjects: a batch of (s_t, a_t, r_t, s_t1) of size BATCH.
+            # Note: training is already complete, so in theory we shouldn't be using this as an SGD batch anymore.
+            #
+            # theory/idea:
+            # RL is less susceptible to a stationary attack, since reproducing a setting is challenging. We can
+            # go through an interaction, record it, and then produce an optimization which abuses that original
+            # interaction, but when will that come in handy? In CV, repeated inputs are usually easy to produce,
+            # but in an RL setting there is little opportunity for repeated input.
+            # If we find that one adversarial input transfers well to other similar images, maybe we can make a
+            # case here.
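+            #
+            # (Illustrative sketch of such a transfer check, not executed here: apply the learned
+            # perturbation to fresh states and count how often the target action wins, e.g.
+            #     ds_value = delta_s.eval()
+            #     success = np.mean(np.argmax(readout.eval(feed_dict={s: new_states + ds_value}), axis=1) == 1)
+            # where new_states is a numpy array of shape [INTERVAL, 80, 80, 4] not seen during the optimization.)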
+
+            # input plus noise, which should result in the target action being drawn
+            s_ds = np.reshape(s_opt_batch, [INTERVAL, 80, 80, 4]) + delta_s
+
+            # Q values for both actions at state s + ds for the entire batch
+            # (note from meeting: only s needs to be fed; ds is generated inside the graph so that
+            # s + ds becomes the effective network input; open questions remain)
+            readout_s_ds = readout.eval(feed_dict={s: s_opt_batch})
+
+            # readout(s) = [Q(no flap), Q(flap)]; the target action a^t is flap (index 1)
+            a = readout_s_ds[:, 1]   # a = readout[target_action]
+            b = readout_s_ds[:, 0]   # b = readout[non-target action]
+
+            eps = 1 # Q values are typically 10-30
+            loss = tf.nn.relu(b - a + eps)
+            opt = tf.train.AdamOptimizer(LR).minimize(loss, var_list=[delta_s])
+            # opt.run(feed_dict={})
+
+        # update the old values
+        s_t = s_t1
+        t += 1
+
+        # save progress every 10000 iterations
+        if t % 10000 == 0:
+            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step = t)
+
+        # print info
+        state = ""
+        if t <= OBSERVE:
+            state = "observe"
+        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
+            state = "explore"
+        else:
+            state = "train"
+
+        print("TIMESTEP", t, "/ STATE", state, \
+            "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, \
+            "/ Q_MAX %e" % np.max(readout_t))
+        # write info to files
+        '''
+        if t % 10000 <= 100:
+            a_file.write(",".join([str(x) for x in readout_t]) + '\n')
+            h_file.write(",".join([str(x) for x in h_fc1.eval(feed_dict={s:[s_t]})[0]]) + '\n')
+            cv2.imwrite("logs_tetris/frame" + str(t) + ".png", x_t1)
+        '''
+
+def playGame():
+    sess = tf.InteractiveSession()
+    s, readout, h_fc1 = createNetwork()
+
+    # subject of the optimization:
+    # Gaussian noise, one set of perturbations which will be added to the 4 stacked frames
+    # (the same noise across every frame within a stack, across the data set; see the sketch
+    # at the end of this file)
+    # image data is not normalized, so the Gaussian distribution must be scaled to [0, 255]
+    # tensor = np.random.normal(loc=(255/2), scale=(255*(0.01**0.5)), size=(INTERVAL, 80, 80, 4))
+    tensor = tf.random.normal([INTERVAL, 80, 80, 4], mean=(255.0 / 2), stddev=(255 * (0.01 ** 0.5)))
+    delta_s = tf.Variable(initial_value=tensor, trainable=True)
+
+    trainNetwork(s, readout, h_fc1, delta_s, sess)
+
+def main():
+    playGame()
+
+if __name__ == "__main__":
+    main()
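+
+# Illustrative sketch (not wired into the script above): the comment in playGame() describes a single
+# perturbation shared by the 4 stacked frames and by every state in the batch, while the tensor above
+# draws independent noise per frame and per state. One way to build the shared variant, still on the
+# unnormalized [0, 255] scale (base and shared_delta_s are illustrative names):
+#
+#   base = tf.random.normal([1, 80, 80, 1], mean=255.0 / 2, stddev=255 * (0.01 ** 0.5))
+#   shared_delta_s = tf.Variable(tf.tile(base, [INTERVAL, 1, 1, 4]), trainable=True)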