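"""Dice-game MDP: evaluate a fixed policy, then solve with policy iteration."""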
from mdp import MDP, ValueIteration, PolicyIteration
# The dice game: while "In", you can "stay" (reward 4, probability 0.9 of
# remaining "In", 0.1 of falling "Out") or "quit" (reward 10, and the game
# ends). "Out" is terminal: it has no rewards and no outgoing probability
# mass, so every transition or reward not listed below defaults to 0.
dice_game = MDP(gamma=0.9,
                states=['In', 'Out'],
                actions_space=['stay', 'quit'],
                t_prob=lambda *key: {('In', 'stay', 'In'): 0.9,
                                     ('In', 'stay', 'Out'): 0.1,
                                     ('In', 'quit', 'In'): 0,
                                     ('In', 'quit', 'Out'): 1,
                                     }.get(key, 0),
                rewards=lambda *key: {('In', 'stay'): 4,
                                      ('In', 'quit'): 10,
                                      }.get(key, 0))
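
# Sanity check: for the fixed policy "always stay", the Bellman equation gives
#   V(Out) = 0
#   V(In)  = 4 + 0.9 * (0.9 * V(In) + 0.1 * V(Out))
# so V(In) = 4 / (1 - 0.81) ≈ 21.05, which the evaluation below should approach.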
# Evaluate the fixed policy "always stay" with iterative policy evaluation.
val = dice_game.iterative_policy_evaluation(policy=lambda s: 'stay', k_max=1000)
print(val)
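
# For reference, a minimal sketch of what iterative policy evaluation computes,
# written independently of the mdp library (the library's internals may differ).
# Passing it the same states, transition, and reward functions as above should
# reproduce the closed-form value V(In) ≈ 21.05.
def evaluate_policy(states, t_prob, rewards, gamma, policy, k_max):
    # Start from V = 0 and repeatedly apply the Bellman expectation backup:
    #   V(s) <- R(s, pi(s)) + gamma * sum_s' T(s, pi(s), s') * V(s')
    V = {s: 0.0 for s in states}
    for _ in range(k_max):
        V = {s: rewards(s, policy(s))
                + gamma * sum(t_prob(s, policy(s), s2) * V[s2] for s2 in states)
             for s in states}
    return V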
# Solve the MDP with policy iteration, starting from a random policy.
pi = PolicyIteration(k_max=100, initial_policy=dice_game.random_policy())
p = pi.solve(problem=dice_game)
print([p(s) for s in dice_game.states])

# Alternatively, solve with value iteration and inspect the resulting utilities:
# vi = ValueIteration(k_max=10)
# solution = vi.solve(P=dice_game)
# print("U(s_In) =", solution.U[0])
# print("U(s_Out) =", solution.U[1])