强化学习qlearning-小安子历险记代码实现

原创

安城安编程 2024-02-21 12:26:43 博主文章分类：多智能体深度强化学习 ©著作权

文章标签 python 开发语言 强化学习 迭代 sed 文章分类 copilot AIGC

©著作权归作者所有：来自51CTO博客作者安城安编程的原创作品，请联系作者获取转载授权，否则将追究法律责任

一.qlearning算法介绍

二.效果展示

三.源代码

一.qlearning算法介绍

Q-learning是一种强化学习算法，用于解决马尔可夫决策过程（Markov Decision Process，MDP）问题。它通过学习一个Q函数来选择并执行最优的动作。

Q函数表示在给定状态下选择特定动作的预期回报。Q-learning的目标是找到每个状态-动作对的最优Q值。它通过迭代更新Q值来实现，在每个时间步骤上，它根据当前状态选择一个动作，并执行该动作。接着，它观察环境反馈的奖励值和新状态，并使用这些信息来更新Q值。

Q值的更新基于贝尔曼最优方程：状态-动作对(s, a)的最优Q值等于执行动作a后获得的即时奖励，加上折扣后的新状态s'下所有可选动作中最大的Q值。具体而言，Q值的更新公式为：

Q(s, a) ← Q(s, a) + α [r + γ max_{a'} Q(s', a') - Q(s, a)]

其中，Q(s, a)表示在状态s下选择动作a的Q值，α是学习率，r是当前状态下选择动作a后获得的奖励，γ是折扣因子（用于平衡当前奖励和未来奖励的重要性），s'是执行动作a后观察到的新状态，a'是在新状态s'下可选的动作，max表示对所有a'取最大的Q值。方括号内的部分称为时序差分误差（TD error），即"目标值"与"当前估计值Q(s, a)"之差。

Q-learning通过不断迭代执行动作、更新Q值的过程，逐步收敛于最优的Q值函数。最终，它可以使用最优的Q值函数来选择最优的动作，从而解决MDP问题。Q-learning是一种基础且经典的强化学习算法，被广泛应用于各种领域，如机器人控制、游戏智能等。

二.效果展示

强化学习qlearning模型演示 tkinter开发

三.源代码

import numpy as np
import pandas as pd
import time
from tkinter import *
from threading import Timer
from tkinter import messagebox
import tkinter as tk
np.random.seed(2)  # seed NumPy's global RNG so the random draws are repeatable
N_STATES = 25 # number of states (rows of the Q-table); the grid is addressed as 5 columns x 5 rows
ACTIONS = ['left','right','up','down'] # actions the explorer can choose from
EPSIONS = 0.0 # greediness of the policy; choose_action raises it by 0.05 per call, capped at 1.0
ALPHA = 0.1 # learning rate
LAMBDA = 0.9 # discount factor applied to future rewards
MAX_EPISODES = 13 # maximum number of episodes -- NOTE(review): not referenced anywhere in the visible code
FRESH_TIME = 0.001 # time per move / refresh interval -- NOTE(review): not referenced in the visible code (rl sleeps a fixed 0.1 s)
class basedesk():
    """Base window: configures the root window and mounts the main view."""

    def __init__(self, master):
        """Set title, background and size of *master*, then build ``mainwindow`` in it."""
        root = master
        self.master = root
        root.title("小安子历险记")
        root.configure(bg='#E2C07C')
        root.geometry("1000x560")
        # The main view attaches its widgets to the root, so the instance
        # itself is not stored here.
        mainwindow(root)
class mainwindow():#主界面
    def __init__(self, master):
        """Build the whole UI inside *master* and show the intro dialog.

        Creates, in order: the title label, the 500x500 maze canvas with its
        grid lines, the pit rectangles and the goal oval, the attempt-counter
        list box, the scrollable log panel, the start button and the agent
        marker. Finishes with a modal message box that introduces the game.

        Args:
            master: the Tk container (root window) the frame is placed in.
        """
        self.master = master
        self.point_num=1  # attempt number displayed by refresh_times
        self.window = tk.Frame(self.master, bg='#E2C07C')
        self.window.place(x=0,y=0,width=1000,height=560)
        # place() returns None, so create the label first and position it in a
        # separate call; otherwise the attribute would hold None, not the widget.
        self.window.showmap_label = tk.Label(self.window,text="迷宫地图",fg='#000000',bg='#B1EBC2',font=("Helvetic",20,"bold"),relief=RAISED)
        self.window.showmap_label.place(x=10, y=10,width=500, height=30)
        self.map_canvas = tk.Canvas(self.window,bg='#EAF4C0',relief=SUNKEN)
        self.map_canvas.place(x=10,y=50, width=500, height=500)

        UNIT = 100  # side length of one grid cell in pixels
        HALF = 45   # half side of the shapes drawn inside a cell
        # Four horizontal and four vertical lines split the canvas into 5x5 cells.
        for i in range(1, 5):
            offset = i * UNIT
            self.map_canvas.create_line(0, offset, 500, offset, width=1, capstyle=BUTT, fill='black')
            self.map_canvas.create_line(offset, 0, offset, 500, width=1, capstyle=BUTT, fill='black')

        # (attribute number, column, row) of every pit. With state = row * 5 +
        # column these are the states 1, 4, 7, 8, 11, 18 and 20 that check_dead
        # reports as 'hell'. There is no hell6; the numbering is kept as-is.
        hell_cells = (
            (1, 1, 0),
            (2, 4, 0),
            (3, 2, 1),
            (4, 3, 1),
            (5, 1, 2),
            (7, 3, 3),
            (8, 0, 4),
        )
        for number, column, row in hell_cells:
            center_x = 50 + UNIT * column
            center_y = 50 + UNIT * row
            item = self.map_canvas.create_rectangle(
                center_x - HALF, center_y - HALF,
                center_x + HALF, center_y + HALF, fill='black')
            setattr(self, 'hell%d' % number, item)

        # Goal marker in the centre cell (column 2, row 2), i.e. state 12.
        goal_x = 50 + UNIT * 2
        goal_y = 50 + UNIT * 2
        self.oval = self.map_canvas.create_oval(
            goal_x - HALF, goal_y - HALF,
            goal_x + HALF, goal_y + HALF, fill='green')

        self.window.showtimes_listbox = tk.Listbox(self.window,font=("Helvetica",20))
        self.window.showtimes_listbox.place(x=510, y=8, width=480, height=30)
        self.refresh_times()

        self.window2 = tk.Frame(self.window, bg='#FFD966')
        self.window2.place(x=510, y=48, width=480, height=500)
        self.window2.submitted_scrollbar = tk.Scrollbar(self.window2)
        self.window2.submitted_scrollbar.pack(side=RIGHT, fill=Y)
        self.window2.submitted_listbox = tk.Listbox(self.window2,font=("Helvetic",15,"bold"), yscrollcommand=self.window2.submitted_scrollbar.set)
        self.window2.submitted_listbox.place(x=0, y=0, width=463, height=500)
        # Wire the other direction too: without a command the scrollbar only
        # mirrors the list position and dragging it does not scroll the log.
        self.window2.submitted_scrollbar.config(command=self.window2.submitted_listbox.yview)

        # The start button covers the lower part of the log panel until rl()
        # destroys it.
        self.window.calculate_btn=tk.Button(self.window,text="冲鸭！",bg='#1CBF4A',fg='#FFFFFF',font=("Helvetic",40, "bold"),command=self.rl_middle)
        self.window.calculate_btn.place(x=510,y=400,width=480, height=150)
        # The agent is drawn as a button placed on the canvas, starting in cell (0, 0).
        self.anan=tk.Button(self.map_canvas,text="安",bg='#7CE2C9',fg="#000000",font=("Helvetic",40, "bold"))
        self.anan.place(x=10,y=10,width=80, height=80)
        messagebox.showinfo('小安子历险记','小安子醒来后发现自己被关在一个黑暗的房间，房间里布满了无底洞，掉下去必死无疑，而小安子具备复活的技能，他能否逃出去呢？\n开发人员qq：584565617')


    def check_dead(self,S):
        """Return 'hell' when state *S* is a pit cell, otherwise 'live'."""
        pit_states = (1, 4, 7, 8, 11, 18, 20)
        return 'hell' if S in pit_states else 'live'
    def get_env_feedback(self,S, A):
        if A=='right':
            if S%5==4:
                S_=S
                R = -10000000
            else:
                S_=S+1
                R = -0.1
                if self.check_dead(S_) == 'hell':
                    R = -100
                    S_ = 'terminal'
                    self.window2.submitted_listbox.insert(END, '掉入了深渊，正在复活...')
                    return S_, R
                elif S_ == 12:
                    R = 100
                    S_ = 'terminal'
                    self.window2.submitted_listbox.insert(END, '逃离成功~~~')
                    return S_, R
        elif A=='left':
            if S%5==0:
                S_=S
                R = -10000000
            else:
                S_=S-1
                R=-0.1
                if self.check_dead(S_) == 'hell':
                    R = -100
                    S_ = 'terminal'
                    self.window2.submitted_listbox.insert(END, '掉入了深渊，正在复活...')
                    return S_, R
                elif S_ == 12:
                    R = 100
                    S_ = 'terminal'
                    self.window2.submitted_listbox.insert(END, '逃离成功~~~')
                    return S_, R
        elif A=='up':
            if int(S/5)==0:
                S_ = S
                R = -10000000
            else:
                S_ = S - 5
                R = -0.1
                if self.check_dead(S_) == 'hell':
                    R = -100
                    S_ = 'terminal'
                    self.window2.submitted_listbox.insert(END, '掉入了深渊，正在复活...')
                    return S_, R
                elif S_ == 12:
                    R = 100
                    S_ = 'terminal'
                    self.window2.submitted_listbox.insert(END, '逃离成功~~~')
                    return S_, R
        else:
            if int(S/5)==4:

                S_ = S
                R = -10000000
            else:
                S_ = S + 5
                R = -0.1
                if self.check_dead(S_) == 'hell':
                    R = -100
                    S_ = 'terminal'
                    self.window2.submitted_listbox.insert(END, '掉入了深渊，正在复活...')
                    return S_, R
                elif S_ == 12:
                    R = 100
                    S_ = 'terminal'
                    self.window2.submitted_listbox.insert(END, '逃离成功~~~')
                    return S_, R
        return S_, R
    def rl_middle(self):
        t = Timer(0.1, self.rl)
        t.start()

    def rl(self):
        """Run tabular Q-learning episodes in an endless loop, animating each step.

        Removes the start button, builds a fresh Q-table and then repeats
        episodes forever. Every episode starts in state 0 and ends when
        ``get_env_feedback`` reports ``'terminal'`` (pit or goal). After each
        step the Q-table entry of the visited state/action pair is moved
        towards the TD target:

            Q(s, a) += ALPHA * (R + LAMBDA * max_a' Q(s', a') - Q(s, a))

        This method never returns.
        """
        self.window2.submitted_listbox.insert(END,'开始探索...')
        self.window.calculate_btn.destroy()
        q_table=self.build_q_table(N_STATES,ACTIONS)
        timeshappened=0
        # NOTE: there is deliberately no episode limit here; the loop only
        # stops when the thread running it is torn down.
        while True:
            timeshappened+=1
            self.window2.submitted_listbox.insert(END, '第'+str(timeshappened)+'次探索...')
            step_counter=0
            S=0
            is_terminated=False
            self.refresh_anan(0,0)
            while not is_terminated:
                time.sleep(0.1)  # pause so each move is visible
                A=self.choose_action(S,q_table)
                S_,R=self.get_env_feedback(S,A)
                q_predict = q_table.loc[S, A]  # current estimate for (S, A)
                if S_ != 'terminal':
                    # Bootstrapped target: reward plus discounted best next value.
                    q_target = R + LAMBDA * q_table.iloc[S_, :].max()
                else:
                    # Episode over: there is no next state to bootstrap from.
                    q_target = R
                    is_terminated = True
                step_counter += 1
                # Move the estimate by the TD error (target minus estimate).
                # Adding ALPHA * q_target alone would let the values grow
                # without bound instead of converging.
                q_table.loc[S, A] += ALPHA * (q_target - q_predict)
                S = S_
                if S_ != 'terminal':
                    self.refresh_anan(S % 5, int(S / 5))

            self.window2.submitted_listbox.insert(END,'本次尝试步数：'+str(step_counter)+'步')
            self.point_num+=1
            self.refresh_times()

    def build_q_table(self,n_states,actions):
        """Return an all-zero Q-table with one row per state and one column per action."""
        zeros = np.zeros(shape=(n_states, len(actions)))
        return pd.DataFrame(data=zeros, columns=actions)
    def choose_action(self,state, q_table):
        global EPSIONS
        EPSIONS+=0.05
        if EPSIONS>1.0:
            EPSIONS=1.0
        state_actions = q_table.iloc[state, :]
        if np.random.uniform() > EPSIONS :  # action non-greedy或者全为零的情况
            action_name = np.random.choice(ACTIONS)  # 随机选择action里面的动作
        else:  # action greedy贪婪模式
            action_name = state_actions.tolist()
            action_name = action_name.index(max(action_name))  # 返回最大数的索引
            action_name=ACTIONS[action_name]

        return action_name
    def refresh_anan(self,a,b):
        self.anan.place(x=(50+a*100)-40,y=(50+b*100)-40,width=80, height=80)
    def refresh_times(self):
        self.window.showtimes_listbox.delete(0, END)
        self.window.showtimes_listbox.insert(END,'第'+str(self.point_num)+'次尝试')


if __name__ == '__main__':
    # Script entry point: create a fixed-size root window, build the UI in it
    # and hand control to the Tk event loop.
    app_root = tk.Tk()
    app_root.resizable(width=False, height=False)
    basedesk(app_root)
    app_root.mainloop()