业务方的SQL: UPDATE agent_commission_statistics set settled =1 where `date` ='2024-08'

这个SQL要改429699条数据,直接执行会影响业务稳定性,并且造成主从复制延迟。

第一种思路:

需要通过脚本,循环分批次批量更改数据N行记录,比如2000行

实验开始

1.创建表结构

CREATE TABLE agent_commission_statistics (
  id bigint NOT NULL AUTO_INCREMENT,
  name varchar(255) DEFAULT NULL,
  date char(7) DEFAULT NULL,
  agent_id bigint NOT NULL,
  settled int NOT NULL DEFAULT '0',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=500001 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

2.创建python脚本,构造数据,批量插入50万行数据

cat insert_piliang.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import pymysql
import random
import time

# Database connection settings.
# NOTE(review): credentials are hard-coded in plaintext — fine for a one-off
# DBA experiment, but move them to env vars / a config file before wider use.
db_config = {
    'host': '127.0.0.1',
    'user': 'dba_tools',
    'password': 'hoSk3778ClguTfCK',
    'database': 'agentdata',
    'port': 3366,
    'charset': 'utf8'
}

# Name of the table the synthetic rows are inserted into.
table_name = 'agent_commission_statistics'


def generate_random_data(batch_size=5000):
    """Build one batch of synthetic rows: (name, date, agent_id, settled).

    Every row is created with settled = 0 (a plain int) so the later
    batched-UPDATE experiment has unsettled records to work on.
    """
    month_choices = ['2024-08', '2024-07', '2024-06']
    return [
        (
            f"agent_{random.randint(1000, 9999)}",   # synthetic agent name
            random.choice(month_choices),            # one of three months
            random.randint(100000, 999999),          # synthetic agent id
            0,                                       # settled: explicitly int 0
        )
        for _ in range(batch_size)
    ]


# Bulk-insert one batch of generated rows.
def batch_insert_data(conn, data):
    """Insert every tuple in `data` into the target table with one executemany, then commit."""
    sql = (
        f"INSERT INTO {table_name} "
        "(name, date, agent_id, settled) VALUES (%s, %s, %s, %s)"
    )
    with conn.cursor() as cursor:
        cursor.executemany(sql, data)
    conn.commit()


def main():
    """Drive the bulk load: insert 500k rows in 5k-row batches and report timing."""
    total_records = 500000  # total rows to insert
    batch_size = 5000       # rows per INSERT batch
    inserted_records = 0    # running count of inserted rows

    # Bug fix: `conn` must exist before the try block — if pymysql.connect()
    # raises, the original code hit `NameError: conn` inside `finally`,
    # masking the real connection error.
    conn = None
    try:
        conn = pymysql.connect(**db_config)

        start_time = time.time()

        while inserted_records < total_records:
            # Generate one batch of synthetic rows and insert it.
            data = generate_random_data(batch_size)
            batch_insert_data(conn, data)

            inserted_records += len(data)
            print(f"已插入 {inserted_records} 条数据")

        end_time = time.time()
        print(f"插入完成,耗时 {end_time - start_time:.2f} 秒,共插入 {inserted_records} 条数据")

    except pymysql.MySQLError as e:
        print(f"数据库操作失败:{e}")
    finally:
        if conn:
            conn.close()
        print("数据库连接已关闭")


if __name__ == "__main__":
    main()

3.执行插入脚本

python3 insert_piliang.py

4.查询数据

mysql> select count(*) from agent_commission_statistics where date='2024-08' and settled=0;

+----------+

| count(*) |

+----------+

|   166013 |

+----------+

1 row in set (0.17 sec)




mysql> select count(*) as count,date from agent_commission_statistics group by date order by count desc;

+--------+---------+

| count  | date    |

+--------+---------+

| 167218 | 2024-07 |

| 166769 | 2024-06 |

| 166013 | 2024-08 |

+--------+---------+

3 rows in set (0.16 sec)

5.使用批量更新脚本更新, 这里 batch_size=2000,即每批按 id 范围更新 2000 行,可以根据实际情况修改

cat update.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import pymysql
import time

# Database connection settings.
# NOTE(review): plaintext credentials — acceptable for a throwaway DBA script only.
db_config = {
    'host': '127.0.0.1',
    'user': 'dba_tools',
    'password': 'hoSk3778ClguTfCK',
    'database': 'agentdata',
    'port': 3366,
    'charset': 'utf8'
}

# Table whose rows are updated in batches.
table_name = 'agent_commission_statistics'

# WHERE filter selecting the rows to update (edit here).
# NOTE(review): interpolated into SQL via f-strings below — must stay trusted,
# operator-supplied text, never end-user input.
where_condition = "date = '2024-08' AND settled = 0"

# SET clause applied to the matching rows (edit here).
set_condition = "settled = 1"

# Width of each id window, i.e. the maximum rows touched per UPDATE.
batch_size = 2000

def get_min_max_id(cursor, table_name, where_condition):
    """Return (min_id, max_id) of rows matching `where_condition`, or (None, None).

    MIN()/MAX() come back as SQL NULLs when nothing matches; that case is
    normalised to (None, None) so callers can bail out cleanly.
    """
    cursor.execute(
        f"SELECT MIN(id), MAX(id) FROM {table_name} WHERE {where_condition}"
    )
    row = cursor.fetchone()
    if row is None or row[0] is None or row[1] is None:
        return None, None
    return row[0], row[1]


def count_settled_records(cursor, table_name, where_condition):
    """Return COUNT(*) of rows in `table_name` matching `where_condition`."""
    cursor.execute(f"SELECT COUNT(*) FROM {table_name} WHERE {where_condition}")
    return cursor.fetchone()[0]


def update_table_data(conn, table_name, where_condition, set_condition, batch_size):
    """Apply `set_condition` to matching rows in fixed-width id windows.

    Walks the id range [min_id, max_id] in windows of `batch_size` ids so each
    UPDATE touches a bounded number of rows, keeping transactions short and
    limiting primary/replica lag. Commits after every window and throttles 1s
    between windows that actually changed rows.

    NOTE(review): the SQL is assembled with f-strings from module-level
    config — acceptable for an operator-run DBA script, but never feed these
    conditions from untrusted input.
    """
    with conn.cursor() as cursor:
        # Determine the id range that needs updating.
        min_id, max_id = get_min_max_id(cursor, table_name, where_condition)

        if min_id is None or max_id is None:
            print(f"未找到符合条件的记录,表: {table_name}")
            return

        # Fix: this is a session-scoped setting, so executing it once is
        # enough — the original re-ran it on every batch iteration.
        # REPEATABLE-READ matches the MySQL 8.0 default.
        cursor.execute("SET SESSION transaction_isolation = 'REPEATABLE-READ'")

        begin_id = min_id
        total_updated_rows = 0  # running total of changed rows

        while begin_id <= max_id:
            # Bounded-range UPDATE: the id window (plus LIMIT as a belt-and-
            # braces cap) keeps each transaction small.
            update_sql = f"""
            UPDATE {table_name} 
            SET {set_condition} 
            WHERE {where_condition} 
            AND id >= {begin_id} 
            AND id < {begin_id + batch_size} 
            LIMIT {batch_size}
            """
            print(update_sql)

            cursor.execute(update_sql)
            conn.commit()

            affected_rows = cursor.rowcount
            total_updated_rows += affected_rows

            if affected_rows > 0:
                print(f"{table_name} 表本次更改的行数是: {affected_rows}")
                # Fix: throttle only after real work — the original slept 1s
                # even for empty id-range gaps, making sparse ranges very slow.
                time.sleep(1)
            else:
                print(f"{table_name} 表当前范围内没有数据需要更新")

            # Advance to the next id window.
            begin_id += batch_size

        print(f"{table_name} 表数据更改成功,总的更改行数是: {total_updated_rows}")


def main():
    """Count matching rows, run the batched update, then report the delta."""
    # Bug fix: `conn` must exist before the try block — if pymysql.connect()
    # raises, the original code hit `NameError: conn` inside `finally`,
    # masking the real connection error.
    conn = None
    try:
        conn = pymysql.connect(**db_config)
        with conn.cursor() as cursor:
            # Snapshot the matching-row count before updating.
            total_before = count_settled_records(cursor, table_name, where_condition)
            print(f"开始时符合条件的记录数为: {total_before}")

        # Run the batched update.
        update_table_data(conn, table_name, where_condition, set_condition, batch_size)

        with conn.cursor() as cursor:
            # Rows still matching afterwards; the difference is what changed.
            total_after = count_settled_records(cursor, table_name, where_condition)
            total_updated = total_before - total_after
            print(f"{table_name} 表最终更改的总行数是: {total_updated}")

    except pymysql.MySQLError as e:
        print(f"数据库操作失败: {e}")
    finally:
        # Always close the connection, even on failure.
        if conn:
            conn.close()
        print(f"操作完成,对应的表名是: {table_name}")


if __name__ == "__main__":
    main()

6.执行更新脚本

python3 update.py

7.观测一下执行的速度,数据的变化

mysql> select count(*) from agent_commission_statistics where date='2024-08' and settled=0;
 
+----------+
 
| count(*) |
 
+----------+
 
|   146108 |
 
+----------+
 
1 row in set (0.13 sec)
 
 
 
 
mysql> select count(*) from agent_commission_statistics where date='2024-08' and settled=0;
 
+----------+
 
| count(*) |
 
+----------+
 
|   146108 |
 
+----------+
 
1 row in set (0.14 sec)
 
 
 
 
mysql> select count(*) from agent_commission_statistics where date='2024-08' and settled=0;
 
+----------+
 
| count(*) |
 
+----------+
 
|   145430 |
 
+----------+
 
1 row in set (0.13 sec)
 
 
 
 
mysql> select count(*) from agent_commission_statistics where date='2024-08' and settled=0;
 
+----------+
 
| count(*) |
 
+----------+
 
|   144098 |
 
+----------+
 
1 row in set (0.11 sec)
 
 
 
 
mysql>

脚本结束之后会展示总更新的行数

基于Python实现循环分批次批量更改数据N行记录_sql

总结:

  • get_min_max_id 函数优化:避免没有符合条件的 id 返回 None 时出错,改为返回 None, None,并在后续逻辑中进行检查。
  • 灵活性保持:where_condition 和 set_condition 仍然是可配置的,确保代码的灵活性和易维护性。