diff --git a/README.md b/README.md
index e37bb2a..aee828c 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,12 @@
 # openai-tests
 
-My OpenAI playground
\ No newline at end of file
+My OpenAI playground
+
+# Q-Learning
+
+```
+import ql
+frames = ql.train(10000) # run 10k training sessions
+ql.review(frames) # review the training process
+ql.play() # see the trained algorithm in action
+```
\ No newline at end of file
diff --git a/Taxi-v3.dat b/Taxi-v3.dat
new file mode 100644
index 0000000..141eb2e
Binary files /dev/null and b/Taxi-v3.dat differ
diff --git a/ql/__init__.py b/ql/__init__.py
new file mode 100644
index 0000000..2bc8ebd
--- /dev/null
+++ b/ql/__init__.py
@@ -0,0 +1,14 @@
+import gym
+from time import sleep
+import numpy as np
+import pickle
+import random
+import os
+
+from .settings import *
+
+env = gym.make(gym_name).env
+
+from .train import *
+from .play import *
+from .review import *
diff --git a/ql/play.py b/ql/play.py
new file mode 100644
index 0000000..6fa2998
--- /dev/null
+++ b/ql/play.py
@@ -0,0 +1,59 @@
+from . import *
+
+def play(player_episodes=1):
+    q_table = pickle.load( open( gym_name+".dat", "rb" ))
+
+    total_epochs = 0
+    episodes = player_episodes
+
+    print(f"Evaluation: 0%")
+
+    total_epochs, total_rewards = 0, 0
+
+    try:
+        for ep in range(episodes):
+            state = env.reset()
+            epochs, reward = 0, 0
+
+            done = False
+
+            i = 0
+            while not done:
+                action = np.argmax(q_table[state])
+                state, reward, done, info = env.step(action)
+
+                if i > max_iterations:
+                    done = True
+
+                ftext = ""
+                if use_ansi:
+                    ftext = env.render(mode="ansi")
+                else:
+                    ftext = str(info)
+
+                i += 1
+                print (u"{}[2J{}[;H".format(chr(27), chr(27)))
+                print(f"Evaluation: {100 * ep / episodes}%")
+                print(f"{ftext}")
+                sleep(.1)
+
+                epochs += 1
+
+            total_epochs += epochs
+            total_rewards += reward
+
+
+            sleep(1)
+    except KeyboardInterrupt:
+        print(f"Results after {episodes} episodes:")
+        print(f"Average timesteps per episode: {total_epochs / episodes}")
+        print(f"Average rewards per episode: {total_rewards / episodes}")
+
+        exit()
+
+    print (u"{}[2J{}[;H".format(chr(27), chr(27)))
+    print("Evaluation: finished.\n")
+
+    print(f"Results after {episodes} episodes:")
+    print(f"Average timesteps per episode: {total_epochs / episodes}")
+    print(f"Average rewards per episode: {total_rewards / episodes}")
diff --git a/ql/review.py b/ql/review.py
new file mode 100644
index 0000000..3c3fde7
--- /dev/null
+++ b/ql/review.py
@@ -0,0 +1,32 @@
+from . import *
+
+def review(frames):
+    sucs = ""
+    prevSess = -1
+    rew = 0
+    cnt = 1
+
+    for i, frame in enumerate(frames):
+        print (u"{}[2J{}[;H".format(chr(27), chr(27)))
+        print(f"Session: {frame['session']}")
+        print(frame['frame'])
+        print(f"Timestep: {i + 1}")
+        print(f"State: {frame['state']}")
+        print(f"Action: {frame['action']}")
+        print(f"Reward: {frame['reward']}")
+
+        sess = frame['session']
+        if sess != prevSess:
+            if rew > 0:
+                sucs += "+"
+                sleep(1)
+            elif rew < 0:
+                sucs += "-"
+            else:
+                sucs += "."
+            prevSess = frame['session']
+            cnt += 1
+        rew = frame['reward']
+
+        print(f"\nSuccesses: [{sucs}]")
+        sleep(.1)
diff --git a/ql/settings.py b/ql/settings.py
new file mode 100644
index 0000000..137a2d7
--- /dev/null
+++ b/ql/settings.py
@@ -0,0 +1,17 @@
+# OpenAI Gym settings
+
+gym_name = "Taxi-v3"
+
+# Q-Learning training settings
+
+alpha = 0.1
+gamma = 0.8
+epsilon = 0.1
+
+# Q-learning player settings
+
+max_iterations = 1000
+
+# Render settings
+
+use_ansi = True
diff --git a/ql/train.py b/ql/train.py
new file mode 100644
index 0000000..cb0de74
--- /dev/null
+++ b/ql/train.py
@@ -0,0 +1,77 @@
+from . import *
+
+def train(training_episodes=10000, resume=True):
+    if resume and os.path.exists(gym_name+".dat"):
+        q_table = pickle.load( open( gym_name+".dat", "rb" ))
+    else:
+        q_table = np.zeros([env.observation_space.n, env.action_space.n])
+
+    episodes = training_episodes
+    percentage = episodes / 100
+
+    frames = []
+    suc_cnt = 0
+
+    try:
+        for i in range(1, episodes + 1):
+            state = env.reset()
+
+            epochs, reward = 0, 0
+            done = False
+
+            while not done:
+                if random.uniform(0, 1) < epsilon:
+                    action = env.action_space.sample()
+                else:
+                    action = np.argmax(q_table[state])
+
+                next_state, reward, done, info = env.step(action)
+
+                old_value = q_table[state, action]
+                next_max = np.max(q_table[next_state])
+
+                new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
+                q_table[state, action] = new_value
+
+                if reward > 0:
+                    suc_cnt += 1
+
+                state = next_state
+                epochs += 1
+
+                if i % percentage == 0:
+                    ftext = ""
+                    if use_ansi:
+                        ftext = env.render(mode="ansi")
+                    else:
+                        ftext = str(info)
+
+                    frames.append({
+                        'frame': ftext,
+                        'state': state,
+                        'action': action,
+                        'reward': reward,
+                        'session': i
+                        }
+                    )
+
+            if i % percentage == 0:
+                print (u"{}[2J{}[;H".format(chr(27), chr(27)))
+                print(f"Training: {i/percentage}%")
+                print(f"Successes so far: {suc_cnt}")
+                sleep(.1)
+        print (u"{}[2J{}[;H".format(chr(27), chr(27)))
+        print("Training: finished.\n")
+        print(f"Total successes: {suc_cnt}")
+        pickle.dump(q_table , open( gym_name+".dat", "wb" ) )
+        print(f"Q-table saved: {gym_name}.dat")
+
+    except KeyboardInterrupt:
+        print (u"{}[2J{}[;H".format(chr(27), chr(27)))
+        print("Training: stopped.\n")
+        print(f"Total successes: {suc_cnt}")
+        pickle.dump(q_table , open( gym_name+".dat", "wb" ) )
+        print(f"Q-table saved: {gym_name}.dat")
+        exit()
+
+    return frames
\ No newline at end of file
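
For reference, the update applied inside `ql/train.py` is the standard tabular Q-learning rule, with `alpha`, `gamma`, and `epsilon` coming from `ql/settings.py`. Below is a minimal standalone sketch of that update, using the Taxi-v3 table shape (500 states, 6 actions) and a hypothetical transition; it is an illustration, not part of the package.

```
import numpy as np

alpha, gamma = 0.1, 0.8       # learning rate and discount factor, as in ql/settings.py
q_table = np.zeros([500, 6])  # Taxi-v3: 500 discrete states, 6 actions

def q_update(state, action, reward, next_state):
    # Same rule as ql/train.py: blend the old estimate with the
    # one-step target: reward + gamma * max_a Q(next_state, a).
    old_value = q_table[state, action]
    next_max = np.max(q_table[next_state])
    q_table[state, action] = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

# Hypothetical transition: action 3 in state 42 gave reward -1 and led to state 142.
q_update(42, 3, -1, 142)
```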