README.md
@@ -1,3 +1,12 @@
# openai-tests

My OpenAI playground

# Q-Learning

```python
import ql

frames = ql.train(10000)  # run 10k training sessions
ql.review(frames)         # review the training process
ql.play()                 # see the trained algorithm in action
```
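
`train` fills in a tabular Q-table with an epsilon-greedy policy and saves it as `<gym_name>.dat` (by default `Taxi-v3.dat`), so a later call with `resume=True` continues from the saved table; pass `resume=False` to start from a zero-initialized one. The value update applied after every environment step, with `alpha` as the learning rate and `gamma` as the discount factor (both set in the settings module), is:

```
Q(state, action) ← (1 - alpha) * Q(state, action)
                   + alpha * (reward + gamma * max(Q(next_state, a')))
```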

ql/__init__.py
@@ -0,0 +1,14 @@
import gym
from time import sleep
import numpy as np
import pickle
import random
import os

# Shared settings: gym_name, alpha, gamma, epsilon, max_iterations, use_ansi.
from .settings import *

# Create the environment once; .env unwraps gym's TimeLimit wrapper so episodes
# only end when the task itself reports done.
env = gym.make(gym_name).env

# Public API: train(), play(), review().
from .train import *
from .play import *
from .review import *

ql/play.py
@@ -0,0 +1,59 @@
from . import *


def play(player_episodes=1):
    # Load the Q-table produced by train().
    with open(gym_name + ".dat", "rb") as f:
        q_table = pickle.load(f)

    episodes = player_episodes
    total_epochs, total_rewards = 0, 0
    print("Evaluation: 0%")
    try:
        for ep in range(episodes):
            state = env.reset()
            epochs, reward = 0, 0
            done = False
            i = 0
            while not done:
                # Greedy policy: always take the best-known action.
                action = np.argmax(q_table[state])
                state, reward, done, info = env.step(action)
                # Safety cap so a bad policy cannot loop forever.
                if i > max_iterations:
                    done = True
                if use_ansi:
                    ftext = env.render(mode="ansi")
                else:
                    ftext = str(info)
                i += 1
                # Clear the terminal and redraw the current frame.
                print(u"{}[2J{}[;H".format(chr(27), chr(27)))
                print(f"Evaluation: {100 * ep / episodes}%")
                print(f"{ftext}")
                sleep(.1)
                epochs += 1
            total_epochs += epochs
            total_rewards += reward
            sleep(1)
    except KeyboardInterrupt:
        print(f"Results after {episodes} episodes:")
        print(f"Average timesteps per episode: {total_epochs / episodes}")
        print(f"Average rewards per episode: {total_rewards / episodes}")
        exit()

    print(u"{}[2J{}[;H".format(chr(27), chr(27)))
    print("Evaluation: finished.\n")
    print(f"Results after {episodes} episodes:")
    print(f"Average timesteps per episode: {total_epochs / episodes}")
    print(f"Average rewards per episode: {total_rewards / episodes}")

ql/review.py
@@ -0,0 +1,32 @@
from . import *


def review(frames):
    sucs = ""
    prevSess = -1
    rew = 0
    cnt = 1
    for i, frame in enumerate(frames):
        # Clear the terminal and show the recorded frame.
        print(u"{}[2J{}[;H".format(chr(27), chr(27)))
        print(f"Session: {frame['session']}")
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")

        sess = frame['session']
        if sess != prevSess:
            # A new session begins: score the previous one by its final reward.
            if rew > 0:
                sucs += "+"
                sleep(1)
            elif rew < 0:
                sucs += "-"
            else:
                sucs += "."
            prevSess = frame['session']
            cnt += 1
        rew = frame['reward']

        print(f"\nSuccesses: [{sucs}]")
        sleep(.1)

ql/settings.py
@@ -0,0 +1,17 @@
# OpenAI Gym settings
gym_name = "Taxi-v3"

# Q-learning training settings
alpha = 0.1    # learning rate
gamma = 0.8    # discount factor
epsilon = 0.1  # exploration rate (probability of taking a random action)

# Q-learning player settings
max_iterations = 1000  # hard cap on timesteps per evaluation episode

# Render settings
use_ansi = True  # render frames as ANSI text; otherwise print the step info dict

ql/train.py
@@ -0,0 +1,77 @@
from . import *


def train(training_episodes=10000, resume=True):
    # Resume from a previously saved Q-table if one exists, otherwise start from zeros.
    if resume and os.path.exists(gym_name + ".dat"):
        with open(gym_name + ".dat", "rb") as f:
            q_table = pickle.load(f)
    else:
        q_table = np.zeros([env.observation_space.n, env.action_space.n])

    episodes = training_episodes
    percentage = episodes / 100  # one progress update (and frame capture) per 1% of episodes
    frames = []
    suc_cnt = 0
    try:
        for i in range(1, episodes + 1):
            state = env.reset()
            epochs, reward = 0, 0
            done = False
            while not done:
                # Epsilon-greedy action selection: explore with probability epsilon.
                if random.uniform(0, 1) < epsilon:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(q_table[state])
                next_state, reward, done, info = env.step(action)

                # Q-learning update.
                old_value = q_table[state, action]
                next_max = np.max(q_table[next_state])
                new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
                q_table[state, action] = new_value

                if reward > 0:
                    suc_cnt += 1
                state = next_state
                epochs += 1

                # During every 1%-milestone episode, record the frames for review().
                if i % percentage == 0:
                    if use_ansi:
                        ftext = env.render(mode="ansi")
                    else:
                        ftext = str(info)
                    frames.append({
                        'frame': ftext,
                        'state': state,
                        'action': action,
                        'reward': reward,
                        'session': i
                    })
            if i % percentage == 0:
                print(u"{}[2J{}[;H".format(chr(27), chr(27)))
                print(f"Training: {i / percentage}%")
                print(f"Successes so far: {suc_cnt}")
                sleep(.1)
        print(u"{}[2J{}[;H".format(chr(27), chr(27)))
        print("Training: finished.\n")
        print(f"Total successes: {suc_cnt}")
        with open(gym_name + ".dat", "wb") as f:
            pickle.dump(q_table, f)
        print(f"Q-table saved: {gym_name}.dat")
    except KeyboardInterrupt:
        print(u"{}[2J{}[;H".format(chr(27), chr(27)))
        print("Training: stopped.\n")
        print(f"Total successes: {suc_cnt}")
        with open(gym_name + ".dat", "wb") as f:
            pickle.dump(q_table, f)
        print(f"Q-table saved: {gym_name}.dat")
        exit()
    return frames