# Approach 1: create the task queue in the main process and consume it asynchronously with a process pool.

# Approach 2: use a producer-consumer pattern so that tasks are produced and consumed at the same time;
# this avoids approach 1's need to enumerate the full task list before any work starts.
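
# To make approach 1 concrete before the full script below, here is a minimal sketch of the
# pool pattern, assuming a trivial square() task in place of the real PDF conversion (all
# names in the sketch are illustrative and not part of the original scripts):
import multiprocessing


def sketch_square(x):
    return x * x


def sketch_approach_1(tasks):
    # all tasks are known up front; a pool of workers consumes them asynchronously
    pool = multiprocessing.Pool(processes=4)
    results = [pool.apply_async(sketch_square, (t,)) for t in tasks]
    pool.close()
    pool.join()
    return [r.get() for r in results]

# e.g. sketch_approach_1(range(10)) returns [0, 1, 4, ..., 81]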

# Approach 1 code

# Converting a PDF to plain text is heavy on both CPU and IO,
# so multiple processes are used to speed up the conversion.
# coding=utf-8
import multiprocessing
import time
import pdfplumber
import os
import sys

min_pdf_dir, max_pdf_dir = "2017-01-01", "2017-02-31"
base_dir = '/home/chunfeng/notice/'
target_path = '/home/chunfeng/tmp/'

import logging
logger = logging.getLogger('main')
# map pdf file name (e.g. 600416_1206479426.pdf, a fixed 21-character name) -> title
pdf_name_title_dict = {}
with open('../data/pdf_name_title.csv') as f:
    for line in f.readlines():
        try:
            pdf_name = line[0:21]
            title = line[22:]
            pdf_name_title_dict[pdf_name] = title
        except Exception as e:
            print(e, line)


def get_pdf_context(pdf_pth, save_path):
    with pdfplumber.open(pdf_pth) as pdf:
        texts = [page.extract_text() for page in pdf.pages]
    content = ""
    for text in texts:
        if text is not None:
            lines = text.splitlines()
            for line in lines:
                if len(line.strip()) < 2:
                    continue
                if line.strip().endswith('。'):
                    # a line ending with a full stop closes a paragraph
                    content += line + "\n"
                elif len(line) < 30:
                    # short lines are treated as headings and kept on their own line
                    content += '\n' + line + "\n"
                else:
                    # otherwise the line is joined with the following one
                    content += line

    if len(content.strip()) > 10:
        title_ = pdf_name_title_dict[os.path.split(pdf_pth)[-1]]
        with open(save_path, 'a') as f:
            f.write(f'{pdf_pth},{title_}\n')
            f.write(content + "\n")
        print(f'work pid:{os.getpid()}, parent pid:{os.getppid()} save file : {pdf_pth} , {save_path}, key:{os.path.split(pdf_pth)[-1]} ')


def write_data(queue):
    global logger
    dirs = os.listdir(base_dir)
    dirs.sort()
    for dir in dirs:
        if os.path.isdir(os.path.join(base_dir, dir)) and (min_pdf_dir <= dir <= max_pdf_dir):
            logger.warning(f'processing dir:{os.path.join(base_dir, dir)}')
        else:
            # skip directories that have already been processed, and skip plain files
            logger.warning(f'skip file or dir :{os.path.join(base_dir, dir)}')
            continue

        # walk the files inside the directory
        files = os.listdir(os.path.join(base_dir, dir))
        files.sort()

        for file in files:
            if os.path.isfile(os.path.join(base_dir, dir, file)):
                # keep filling the queue as long as it is not full
                while not queue.full():
                    # print(f'queue put {queue.qsize()}:', os.path.join(base_dir, dir, file))
                    queue.put(os.path.join(base_dir, dir, file))
                    break
    # end marker
    # queue.put('finished')


def read_data(queue, process_index):
    global logger
    if queue.qsize() == 0:
        logger.warning("queue is empty")
    else:
        # get one message from the queue and remove it; block defaults to True
        pdf_path = queue.get()
        file_name = os.path.split(pdf_path)[-1]
        get_pdf_context(pdf_path, target_path + file_name[0:-3] + 'txt')
        logger.warning(f"work pid:{os.getpid()}, parent pid:{os.getppid()}, total:{process_index+queue.qsize()}, process_index:{process_index}, qsize:{queue.qsize()}, finished: {pdf_path}")


def get_size(obj, seen=None):
    """Recursively finds size of objects"""
    size = sys.getsizeof(obj)
    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        return 0
    # Important: mark as seen *before* entering recursion to gracefully handle
    # self-referential objects
    seen.add(obj_id)
    if isinstance(obj, dict):
        size += sum([get_size(v, seen) for v in obj.values()])
        size += sum([get_size(k, seen) for k in obj.keys()])
    elif hasattr(obj, '__dict__'):
        size += get_size(obj.__dict__, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        size += sum([get_size(i, seen) for i in obj])
    return size
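
# Note: the queue created in the main block below is a Manager proxy, so sys.getsizeof(queue)
# and get_size(queue) only measure the proxy object in this process, not the queued paths held
# by the manager process; the logged sizes are therefore just a rough lower bound.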


if __name__ == '__main__':

    # create a shared message queue
    m = multiprocessing.Manager()
    queue = m.Queue()
    # fill the queue in the main process to avoid "Broken pipe" errors
    write_data(queue)
    logger.warning(f'queue.qsize:{queue.qsize()}, get_size(queue):{get_size(queue)}, {sys.getsizeof(queue)}')
    current_process = multiprocessing.current_process()

    # log the ids of the current process
    logger.warning(f"work pid:{os.getpid()}, parent pid:{os.getppid()} process group id: {os.getpgid(os.getpid())}")

    # DO NOT USE THIS QUEUE: a plain multiprocessing.Queue cannot be passed to pool workers as an argument
    # queue = multiprocessing.Queue()

    # create the process pool
    processes = 7  # number of worker processes (cpu cores) to use
    pool = multiprocessing.Pool(processes=processes)

    # synchronous alternative: fill the queue through the pool
    # pool.apply(func=write_data, args=(queue,))

    # the number of tasks is known, so submit one job per queued item
    for i in range(queue.qsize()):
        pool.apply_async(func=read_data, args=(queue, i, ))
    pool.close()
    pool.join()

    logger.warning('##################finished##################')


# Approach 2: when production runs much faster than consumption, the queue can grow and use a
# lot of memory, so the queue size is capped at 1000; if the queue is full the producer waits
# in a loop, otherwise it keeps producing.
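
# Before the full script, a minimal sketch of the protocol used below, assuming a trivial
# squaring task in place of the real PDF conversion (all names in the sketch are illustrative,
# not part of the original scripts): a bounded queue gives the producer back-pressure, and a
# shared Value flipped to -1 tells the consumers that production is over.
import multiprocessing
import queue as std_queue


def sketch_producer(q, done):
    for i in range(100):
        q.put(i)             # put() blocks once maxsize items are waiting, throttling the producer
    done.value = -1          # end-of-production sentinel visible to all consumers


def sketch_consumer(q, done):
    while not (done.value < 0 and q.qsize() == 0):
        try:
            item = q.get(timeout=1)   # timeout so a consumer cannot block forever after production ends
        except std_queue.Empty:
            continue
        _ = item * item               # stand-in for the real per-PDF work


def sketch_approach_2():
    manager = multiprocessing.Manager()
    q = manager.Queue(maxsize=10)
    done = multiprocessing.Value('i', 0)
    procs = [multiprocessing.Process(target=sketch_producer, args=(q, done))]
    procs += [multiprocessing.Process(target=sketch_consumer, args=(q, done)) for _ in range(3)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()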


# coding=utf-8
import multiprocessing
import time
import pdfplumber
import os
import logging
import logging.config
logging.config.dictConfig({
    'version': 1,
    'disable_existing_loggers': True,
    'formatters': {
        'verbose': {
            'format': "[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s",
            'datefmt': "%Y-%m-%d %H:%M:%S"
        },
        'simple': {
            'format': '%(levelname)s %(message)s'
        },
    },
    'handlers': {
        'null': {
            'level': 'INFO',
            'class': 'logging.NullHandler',
        },
        'console': {
            'level': 'INFO',
            'class': 'logging.StreamHandler',
            'formatter': 'verbose'
        },
        'file': {
            'level': 'INFO',
            # without a concurrency-aware handler, log records can go missing
            # when several processes write to the same file
            'class': 'cloghandler.ConcurrentRotatingFileHandler',
            # rotate the log once it reaches 10 MB
            'maxBytes': 1024 * 1024 * 10,
            # keep at most 50 rotated files
            'backupCount': 50,
            # If delay is true,
            # then file opening is deferred until the first call to emit().
            'delay': True,
            'filename': 'logs/mylog.log',
            'formatter': 'verbose'
        }
    },
    'loggers': {
        '': {
            'handlers': ['file'],
            'level': 'INFO',
        },
    }
})
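
# Note: cloghandler.ConcurrentRotatingFileHandler is not in the standard library; it is
# provided by the third-party ConcurrentLogHandler package (pip install ConcurrentLogHandler).
# Because 'delay' is True the file is only opened on the first emit(), but the logs/
# directory still has to exist by that point.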
logger = logging.getLogger('main')

min_pdf_dir, max_pdf_dir = "2017-08-11", "2017-10-31"
base_dir = '/home/chunfeng/notice/'
target_path = '/home/chunfeng/tmp/'


# dictionary mapping pdf file name -> title
pdf_name_title_dict = {}
with open('../data/pdf_name_title.csv') as f:
    for line in f.readlines():
        try:
            pdf_name = line[0:21]
            title = line[22:]
            pdf_name_title_dict[pdf_name] = title
        except Exception as e:
            print(e, line)


def get_pdf_context(pdf_pth, save_path):
    with pdfplumber.open(pdf_pth) as pdf:
        try:
            texts = [page.extract_text() for page in pdf.pages]
        except Exception as e:
            logger.info(f"pdf_pth:{pdf_pth} pdf.pages:{len(pdf.pages)}")
            texts = []
    content = ""
    for text in texts:
        if text is not None:
            lines = text.splitlines()
            for line in lines:
                if len(line.strip()) < 2:
                    continue
                if line.strip().endswith('。'):
                    content += line + "\n"
                elif len(line) < 30:
                    content += '\n' + line + "\n"
                else:
                    content += line

    if len(content.strip()) > 10:
        title_ = pdf_name_title_dict[os.path.split(pdf_pth)[-1]]
        with open(save_path, 'a') as f:
            f.write(f'{pdf_pth},{title_}\n')
            f.write(content + "\n")
        # logger.info(f" work pid:{os.getpid()}, parent pid:{os.getppid()} save file : {pdf_pth} , {save_path}, key:{os.path.split(pdf_pth)[-1]}")


def write_data(queue, process_name, signal):
    logger.info(f" enter into process_name: {process_name} signal.value:{signal.value} ")
    dirs = os.listdir(base_dir)
    dirs.sort()
    for dir in dirs:
        if os.path.isdir(os.path.join(base_dir, dir)) and (min_pdf_dir <= dir <= max_pdf_dir):
            logger.info(f"process_name: {process_name} processing dir:{os.path.join(base_dir, dir)}")
        else:
            # skip directories that have already been processed, and skip plain files
            logger.info(f"process_name: {process_name} skip file or dir :{os.path.join(base_dir, dir)}")
            continue

        # walk the files inside the directory
        files = os.listdir(os.path.join(base_dir, dir))
        files.sort()

        for file in files:
            if os.path.isfile(os.path.join(base_dir, dir, file)):
                # if the queue is full, wait a little before producing more
                while queue.full():
                    time.sleep(1)
                # the queue has free slots again, keep filling it
                queue.put(os.path.join(base_dir, dir, file))
                signal.value += 1
                logger.info(f'queue put {queue.qsize()}, signal.value:{signal.value},{os.path.join(base_dir, dir, file)}')

    # end marker: instead of queue.put('finished'), flip the shared value to -1
    signal.value = -1
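
# Design note: with one producer and several consumers, a single 'finished' marker on the
# queue would stop only the consumer that happened to read it; flipping the shared Value to
# -1 instead lets every consumer observe the same end-of-production signal without putting
# per-consumer sentinels on the queue.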


def read_data(queue, process_name, signal):
    import random
    # random start-up delay so the producer has time to queue some work first
    time.sleep(random.randint(1, 3))
    logger.info(f"enter into process_name: {process_name} signal.value:{signal.value} ")
    # time.sleep(1)
    while True:
        if (signal.value < 0) and (queue.qsize() == 0):
            logger.info(f"process_name:{process_name} queue.qsize == 0 queue is empty, signal.value:{signal.value}")
            break

        # get one message from the queue and remove it; block defaults to True
        pdf_path = queue.get()
        file_name = os.path.split(pdf_path)[-1]
        try:
            get_pdf_context(pdf_path, target_path + file_name[0:-3] + 'txt')
        except Exception as e:
            logger.info(f"work pid:{os.getpid()}, process_name:{process_name}, failed:{pdf_path}")
            logger.info(f'pdf_path:{pdf_path}, exception:{e}')
        logger.info(f"work pid:{os.getpid()}, parent pid:{os.getppid()}, queue:{queue.qsize()},total:{signal.value},{process_name},finished:{pdf_path}")


def main_process():

    # producer-consumer pattern: a single producer and multiple consumers to increase parallelism
    manager = multiprocessing.Manager()
    # create the shared message queue with a bounded size
    queue = manager.Queue(maxsize=1000)
    # signal = manager.Value('signal', 1)
    process_step = multiprocessing.Value('i', 0)  # shared counter, starts at 0
    # arr = multiprocessing.Array('i', range(10))  # arr = range(10)

    # list of worker processes
    process_list = []
    logger.info(f'main process queue.qsize():{queue.qsize()} process_step:{process_step.value} !')

    # create the producer process first; it keeps the queue fed so that the consumers,
    # which exit once the queue is empty after the end signal, do not quit early
    write_process_name = "write_process"
    write_process = multiprocessing.Process(target=write_data, args=(queue, write_process_name, process_step))
    process_list.append(write_process)  # write_process.start()

    logger.info(f'main process queue.qsize():{queue.qsize()} process_step:{process_step.value} !')

    # create the consumer processes
    for i in range(multiprocessing.cpu_count() - 1):
        process_name = f"read_process_{i} "
        logger.info(f'add process to list: {process_name}')
        tmp_process_ = multiprocessing.Process(target=read_data, args=(queue, process_name, process_step))
        process_list.append(tmp_process_)
    logger.info(f'add process finished.')

    # start the producer and all consumers
    for process in process_list:
        logger.info(f'start process : {process.name} ')
        process.start()
    logger.info(f'all process start ')
    # wait for completion
    for process in process_list:
        # join() blocks the current process until the joined process has finished
        process.join()
    logger.info(f'all process join ')


if __name__ == '__main__':
    logger.info(f'########## start ##################')
    main_process()
    logger.info(f'##########finished##################')