# Approach 1: create the task queue in the main process and consume it asynchronously with a process pool.

# Approach 2: use a producer-consumer pattern so that tasks are produced and consumed at the same time;
# this avoids approach 1's need to enumerate the full task list before any work starts.
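
# To make approach 1 concrete before the full script below, here is a minimal sketch of the
# pool pattern, assuming a trivial square() task in place of the real PDF conversion (all
# names in the sketch are illustrative and not part of the original scripts):
import multiprocessing


def sketch_square(x):
    return x * x


def sketch_approach_1(tasks):
    # all tasks are known up front; a pool of workers consumes them asynchronously
    pool = multiprocessing.Pool(processes=4)
    results = [pool.apply_async(sketch_square, (t,)) for t in tasks]
    pool.close()
    pool.join()
    return [r.get() for r in results]

# e.g. sketch_approach_1(range(10)) returns [0, 1, 4, ..., 81]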

# Approach 1 code

# Converting a PDF to plain text is heavy on both CPU and IO,
# so multiple processes are used to speed up the conversion.
# coding=utf-8
import multiprocessing
import time
import pdfplumber
import os
import sys

min_pdf_dir, max_pdf_dir = "2017-01-01", "2017-02-31"
base_dir = '/home/chunfeng/notice/'
target_path = '/home/chunfeng/tmp/'

import logging
logger = logging.getLogger('main')
# map pdf file name (e.g. 600416_1206479426.pdf, a fixed 21-character name) -> title
pdf_name_title_dict = {}
with open('../data/pdf_name_title.csv') as f:
    for line in f.readlines():
        try:
            pdf_name = line[0:21]
            title = line[22:]
            pdf_name_title_dict[pdf_name] = title
        except Exception as e:
            print(e, line)


def get_pdf_context(pdf_pth, save_path):
    with pdfplumber.open(pdf_pth) as pdf:
        texts = [page.extract_text() for page in pdf.pages]
    content = ""
    for text in texts:
        if text is not None:
            lines = text.splitlines()
            for line in lines:
                if len(line.strip()) < 2:
                    continue
                if line.strip().endswith('。'):
                    # a line ending with a full stop closes a paragraph
                    content += line + "\n"
                elif len(line) < 30:
                    # short lines are treated as headings and kept on their own line
                    content += '\n' + line + "\n"
                else:
                    # otherwise the line is joined with the following one
                    content += line

    if len(content.strip()) > 10:
        title_ = pdf_name_title_dict[os.path.split(pdf_pth)[-1]]
        with open(save_path, 'a') as f:
            f.write(f'{pdf_pth},{title_}\n')
            f.write(content + "\n")
        print(f'work pid:{os.getpid()}, parent pid:{os.getppid()} save file : {pdf_pth} , {save_path}, key:{os.path.split(pdf_pth)[-1]} ')


def write_data(queue):
    global logger
    dirs = os.listdir(base_dir)
    dirs.sort()
    for dir in dirs:
        if os.path.isdir(os.path.join(base_dir, dir)) and (min_pdf_dir <= dir <= max_pdf_dir):
            logger.warning(f'processing dir:{os.path.join(base_dir, dir)}')
        else:
            # skip directories that have already been processed, and skip plain files
            logger.warning(f'skip file or dir :{os.path.join(base_dir, dir)}')
            continue

        # walk the files inside the directory
        files = os.listdir(os.path.join(base_dir, dir))
        files.sort()

        for file in files:
            if os.path.isfile(os.path.join(base_dir, dir, file)):
                # keep filling the queue as long as it is not full
                while not queue.full():
                    # print(f'queue put {queue.qsize()}:', os.path.join(base_dir, dir, file))
                    queue.put(os.path.join(base_dir, dir, file))
                    break
    # end marker
    # queue.put('finished')


def read_data(queue, process_index):
    global logger
    if queue.qsize() == 0:
        logger.warning("queue is empty")
    else:
        # get one message from the queue and remove it; block defaults to True
        pdf_path = queue.get()
        file_name = os.path.split(pdf_path)[-1]
        get_pdf_context(pdf_path, target_path + file_name[0:-3] + 'txt')
        logger.warning(f"work pid:{os.getpid()}, parent pid:{os.getppid()}, total:{process_index+queue.qsize()}, process_index:{process_index}, qsize:{queue.qsize()}, finished: {pdf_path}")


def get_size(obj, seen=None):
    """Recursively finds size of objects"""
    size = sys.getsizeof(obj)
    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        return 0
    # Important: mark as seen *before* entering recursion to gracefully handle
    # self-referential objects
    seen.add(obj_id)
    if isinstance(obj, dict):
        size += sum([get_size(v, seen) for v in obj.values()])
        size += sum([get_size(k, seen) for k in obj.keys()])
    elif hasattr(obj, '__dict__'):
        size += get_size(obj.__dict__, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        size += sum([get_size(i, seen) for i in obj])
    return size
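
# Note: the queue created in the main block below is a Manager proxy, so sys.getsizeof(queue)
# and get_size(queue) only measure the proxy object in this process, not the queued paths held
# by the manager process; the logged sizes are therefore just a rough lower bound.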


if __name__ == '__main__':

    # create a shared message queue
    m = multiprocessing.Manager()
    queue = m.Queue()
    # fill the queue in the main process to avoid "Broken pipe" errors
    write_data(queue)
    logger.warning(f'queue.qsize:{queue.qsize()}, get_size(queue):{get_size(queue)}, {sys.getsizeof(queue)}')
    current_process = multiprocessing.current_process()

    # log the ids of the current process
    logger.warning(f"work pid:{os.getpid()}, parent pid:{os.getppid()} process group id: {os.getpgid(os.getpid())}")

    # DO NOT USE THIS QUEUE: a plain multiprocessing.Queue cannot be passed to pool workers as an argument
    # queue = multiprocessing.Queue()

    # create the process pool
    processes = 7  # number of worker processes (cpu cores) to use
    pool = multiprocessing.Pool(processes=processes)

    # synchronous alternative: fill the queue through the pool
    # pool.apply(func=write_data, args=(queue,))

    # the number of tasks is known, so submit one job per queued item
    for i in range(queue.qsize()):
        pool.apply_async(func=read_data, args=(queue, i, ))
    pool.close()
    pool.join()

    logger.warning('##################finished##################')


# Approach 2: when production runs much faster than consumption, the queue can grow and use a
# lot of memory, so the queue size is capped at 1000; if the queue is full the producer waits
# in a loop, otherwise it keeps producing.
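
# Before the full script, a minimal sketch of the protocol used below, assuming a trivial
# squaring task in place of the real PDF conversion (all names in the sketch are illustrative,
# not part of the original scripts): a bounded queue gives the producer back-pressure, and a
# shared Value flipped to -1 tells the consumers that production is over.
import multiprocessing
import queue as std_queue


def sketch_producer(q, done):
    for i in range(100):
        q.put(i)             # put() blocks once maxsize items are waiting, throttling the producer
    done.value = -1          # end-of-production sentinel visible to all consumers


def sketch_consumer(q, done):
    while not (done.value < 0 and q.qsize() == 0):
        try:
            item = q.get(timeout=1)   # timeout so a consumer cannot block forever after production ends
        except std_queue.Empty:
            continue
        _ = item * item               # stand-in for the real per-PDF work


def sketch_approach_2():
    manager = multiprocessing.Manager()
    q = manager.Queue(maxsize=10)
    done = multiprocessing.Value('i', 0)
    procs = [multiprocessing.Process(target=sketch_producer, args=(q, done))]
    procs += [multiprocessing.Process(target=sketch_consumer, args=(q, done)) for _ in range(3)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()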


# coding=utf-8
import multiprocessing
import time
import pdfplumber
import os
import logging
import logging.config
logging.config.dictConfig({
    'version': 1,
    'disable_existing_loggers': True,
    'formatters': {
        'verbose': {
            'format': "[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s",
            'datefmt': "%Y-%m-%d %H:%M:%S"
        },
        'simple': {
            'format': '%(levelname)s %(message)s'
        },
    },
    'handlers': {
        'null': {
            'level': 'INFO',
            'class': 'logging.NullHandler',
        },
        'console': {
            'level': 'INFO',
            'class': 'logging.StreamHandler',
            'formatter': 'verbose'
        },
        'file': {
            'level': 'INFO',
            # without a concurrency-aware handler, log records can go missing
            # when several processes write to the same file
            'class': 'cloghandler.ConcurrentRotatingFileHandler',
            # rotate the log once it reaches 10 MB
            'maxBytes': 1024 * 1024 * 10,
            # keep at most 50 rotated files
            'backupCount': 50,
            # If delay is true,
            # then file opening is deferred until the first call to emit().
            'delay': True,
            'filename': 'logs/mylog.log',
            'formatter': 'verbose'
        }
    },
    'loggers': {
        '': {
            'handlers': ['file'],
            'level': 'INFO',
        },
    }
})
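
# Note: cloghandler.ConcurrentRotatingFileHandler is not in the standard library; it is
# provided by the third-party ConcurrentLogHandler package (pip install ConcurrentLogHandler).
# Because 'delay' is True the file is only opened on the first emit(), but the logs/
# directory still has to exist by that point.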
logger = logging.getLogger('main')

min_pdf_dir, max_pdf_dir = "2017-08-11", "2017-10-31"
base_dir = '/home/chunfeng/notice/'
target_path = '/home/chunfeng/tmp/'


# dictionary mapping pdf file name -> title
pdf_name_title_dict = {}
with open('../data/pdf_name_title.csv') as f:
    for line in f.readlines():
        try:
            pdf_name = line[0:21]
            title = line[22:]
            pdf_name_title_dict[pdf_name] = title
        except Exception as e:
            print(e, line)


def get_pdf_context(pdf_pth, save_path):
    with pdfplumber.open(pdf_pth) as pdf:
        try:
            texts = [page.extract_text() for page in pdf.pages]
        except Exception as e:
            logger.info(f"pdf_pth:{pdf_pth} pdf.pages:{len(pdf.pages)}")
            texts = []
    content = ""
    for text in texts:
        if text is not None:
            lines = text.splitlines()
            for line in lines:
                if len(line.strip()) < 2:
                    continue
                if line.strip().endswith('。'):
                    content += line + "\n"
                elif len(line) < 30:
                    content += '\n' + line + "\n"
                else:
                    content += line

    if len(content.strip()) > 10:
        title_ = pdf_name_title_dict[os.path.split(pdf_pth)[-1]]
        with open(save_path, 'a') as f:
            f.write(f'{pdf_pth},{title_}\n')
            f.write(content + "\n")
        # logger.info(f" work pid:{os.getpid()}, parent pid:{os.getppid()} save file : {pdf_pth} , {save_path}, key:{os.path.split(pdf_pth)[-1]}")


def write_data(queue, process_name, signal):
    logger.info(f" enter into process_name: {process_name} signal.value:{signal.value} ")
    dirs = os.listdir(base_dir)
    dirs.sort()
    for dir in dirs:
        if os.path.isdir(os.path.join(base_dir, dir)) and (min_pdf_dir <= dir <= max_pdf_dir):
            logger.info(f"process_name: {process_name} processing dir:{os.path.join(base_dir, dir)}")
        else:
            # skip directories that have already been processed, and skip plain files
            logger.info(f"process_name: {process_name} skip file or dir :{os.path.join(base_dir, dir)}")
            continue

        # walk the files inside the directory
        files = os.listdir(os.path.join(base_dir, dir))
        files.sort()

        for file in files:
            if os.path.isfile(os.path.join(base_dir, dir, file)):
                # if the queue is full, wait a little before producing more
                while queue.full():
                    time.sleep(1)
                # the queue has free slots again, keep filling it
                queue.put(os.path.join(base_dir, dir, file))
                signal.value += 1
                logger.info(f'queue put {queue.qsize()}, signal.value:{signal.value},{os.path.join(base_dir, dir, file)}')

    # end marker: instead of queue.put('finished'), flip the shared value to -1
    signal.value = -1
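
# Design note: with one producer and several consumers, a single 'finished' marker on the
# queue would stop only the consumer that happened to read it; flipping the shared Value to
# -1 instead lets every consumer observe the same end-of-production signal without putting
# per-consumer sentinels on the queue.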


def read_data(queue, process_name, signal):
    import random
    # random start-up delay so the producer has time to queue some work first
    time.sleep(random.randint(1, 3))
    logger.info(f"enter into process_name: {process_name} signal.value:{signal.value} ")
    # time.sleep(1)
    while True:
        if (signal.value < 0) and (queue.qsize() == 0):
            logger.info(f"process_name:{process_name} queue.qsize == 0 queue is empty, signal.value:{signal.value}")
            break

        # get one message from the queue and remove it; block defaults to True
        pdf_path = queue.get()
        file_name = os.path.split(pdf_path)[-1]
        try:
            get_pdf_context(pdf_path, target_path + file_name[0:-3] + 'txt')
        except Exception as e:
            logger.info(f"work pid:{os.getpid()}, process_name:{process_name}, failed:{pdf_path}")
            logger.info(f'pdf_path:{pdf_path}, exception:{e}')
        logger.info(f"work pid:{os.getpid()}, parent pid:{os.getppid()}, queue:{queue.qsize()},total:{signal.value},{process_name},finished:{pdf_path}")


def main_process():

    # producer-consumer pattern: a single producer and multiple consumers to increase parallelism
    manager = multiprocessing.Manager()
    # create the shared message queue with a bounded size
    queue = manager.Queue(maxsize=1000)
    # signal = manager.Value('signal', 1)
    process_step = multiprocessing.Value('i', 0)  # shared counter, starts at 0
    # arr = multiprocessing.Array('i', range(10))  # arr = range(10)

    # list of worker processes
    process_list = []
    logger.info(f'main process queue.qsize():{queue.qsize()} process_step:{process_step.value} !')

    # create the producer process first; it keeps the queue fed so that the consumers,
    # which exit once the queue is empty after the end signal, do not quit early
    write_process_name = "write_process"
    write_process = multiprocessing.Process(target=write_data, args=(queue, write_process_name, process_step))
    process_list.append(write_process)  # write_process.start()

    logger.info(f'main process queue.qsize():{queue.qsize()} process_step:{process_step.value} !')

    # create the consumer processes
    for i in range(multiprocessing.cpu_count() - 1):
        process_name = f"read_process_{i} "
        logger.info(f'add process to list: {process_name}')
        tmp_process_ = multiprocessing.Process(target=read_data, args=(queue, process_name, process_step))
        process_list.append(tmp_process_)
    logger.info(f'add process finished.')

    # start the producer and all consumers
    for process in process_list:
        logger.info(f'start process : {process.name} ')
        process.start()
    logger.info(f'all process start ')
    # wait for completion
    for process in process_list:
        # join() blocks the current process until the joined process has finished
        process.join()
    logger.info(f'all process join ')


if __name__ == '__main__':
    logger.info(f'########## start ##################')
    main_process()
    logger.info(f'##########finished##################')