1.设计图

实现pdf跳页解析功能_github

PDF 解析支持优先解析指定页码:用户跳转到某一页时,通过调整解析队列的顺序,让该页所在的解析任务先被执行。

2.处理流程图

实现pdf跳页解析功能_github_02

实现pdf跳页解析功能_json_03

3.数据库、redis设计

实现pdf跳页解析功能_json_04

3.代码实现

pdf解析工具使用pdfium,仓库地址:https://github.com/klippa-app/pdfium-cli

实现pdf跳页解析功能_github_05

3.1.client_pdfium.go(底层代码)

package pdf_util

import (
	"fmt"
	"os/exec"
	"path"
	"strconv"
	"support/logger"
)

// pdfium holds the settings used to invoke the pdfium command-line renderer.
type pdfium struct {
	// parseTool is the value returned by getPdfiumTool() when the struct is built.
	parseTool string
	// dpi is the DPI passed to newPdfium, stored in decimal string form.
	dpi       string
	// maxHeight and maxWidth are not set or read anywhere in the visible code.
	maxHeight int
	maxWidth  int
}

// newPdfium builds a pdfium wrapper for the given DPI. The tool name is
// resolved once via getPdfiumTool and the DPI is kept in string form.
func newPdfium(dpi int) *pdfium {
	tool := getPdfiumTool()
	p := &pdfium{parseTool: tool, dpi: strconv.Itoa(dpi)}
	logger.Debug("new pdfium, dpi: %d, programName: %s", dpi, p.parseTool)
	return p
}

// parse renders pages firstPage..lastPage of filePath into imgDir as
// "<page>.jpg" files by running the pdfium CLI through the shell returned by
// getBashTool. It returns the combined stdout/stderr of the command together
// with the command's error, if any.
//
// NOTE(review): filePath and imgDir are interpolated unquoted into a shell
// command line, so paths containing spaces or shell metacharacters will not
// be passed through intact — confirm callers only supply safe local paths.
func (p *pdfium) parse(log logger.ILog, filePath string, imgDir string, firstPage, lastPage int, dpi int) ([]byte, error) {
	shell := getBashTool()
	outPattern := path.Join(imgDir, "%d.jpg")
	script := fmt.Sprintf("%s render %s --dpi %d --pages %d-%d %s",
		getPdfiumTool(), filePath, dpi, firstPage, lastPage, outPattern)

	cmd := exec.Command(shell, "-c", script)
	log.Debug("pdfium cmd is: %s", cmd)

	out, runErr := cmd.CombinedOutput()
	log.Debug("pdfium exec result:%s", out)
	return out, runErr
}

实现pdf跳页解析功能_json_06

3.2 业务代码

实现pdf跳页解析功能_宽高_07

dto.go
package pdf_parse_v2

import (
	"encoding/json"
	"github.com/gin-gonic/gin"
	"path"
	"pps/config"
	"pps/dao"
	pfc "pps/helper/pdf_cache"
	"strconv"
	"support/collection/_set"
	"support/util"
	"support/web/mw"
	"time"
)

const (
	// metaFileName is the name of the meta file stored inside the image directory.
	metaFileName = "meta.json"

	// maxTaskExpireParam is the multiplier applied to a task's estimated
	// duration when computing its expire time (see getExpireTime).
	maxTaskExpireParam = 5
)

// requestParam is the query payload bound by DealParse.
type requestParam struct {
	// FileId and FileOssPath are mandatory (binding:"required").
	FileId             string `json:"fileId" form:"fileId" binding:"required"`
	FileOssPath        string `json:"fileOssPath" form:"fileOssPath" binding:"required"`
	// PrePage is the number of pages to pre-parse; preParse substitutes the
	// configured default when it is 0.
	PrePage            int    `json:"prePage" form:"prePage"`
	// imgOssPath and metaFileKey are derived from FileOssPath by init().
	imgOssPath         string
	metaFileKey        string
	// DisablePicCompress skips the image compression step when true.
	DisablePicCompress bool `json:"disablePicCompress" form:"disablePicCompress"`
}

// requestJumpParam is the query payload bound by DealJump; both fields are
// mandatory (binding:"required").
type requestJumpParam struct {
	FileId string `json:"fileId" form:"fileId" binding:"required"`
	// PageNo is the page the caller wants parsed ahead of the queue.
	PageNo int    `json:"pageNo" form:"pageNo" binding:"required"`
}

// init fills the derived fields: the OSS image directory for the file and the
// key of the meta file inside that directory.
func (p *requestParam) init() {
	imgDir := getImageOssPath(p.FileOssPath)
	p.imgOssPath = imgDir
	p.metaFileKey = path.Join(imgDir, metaFileName)
}

// getImageOssPath returns the OSS image directory for a PDF: the PDF's OSS
// path with "_i" appended.
func getImageOssPath(fileOssPath string) string {
	const imageDirSuffix = "_i"
	return fileOssPath + imageDirSuffix
}
// getMetaFileKey returns the OSS key of the meta file that lives inside the
// image directory derived from fileOssPath.
func getMetaFileKey(fileOssPath string) string {
	return path.Join(getImageOssPath(fileOssPath), metaFileName)
}

// ParseResult is the response of DealParse and also the content uploaded as
// the meta file once pre-parsing has finished.
type ParseResult struct {
	TotalPage   int     `json:"totalPage"`
	// CoverWidth and CoverHeight are the pixel dimensions of the rendered first page.
	CoverWidth  float64 `json:"coverWidth"`
	CoverHeight float64 `json:"coverHeight"`
	// ParseStatus is dao.StParseSuccess when pre-parsing covered every page,
	// otherwise dao.StParsing.
	ParseStatus int     `json:"parseStatus"`
}

// String renders the result through util.ConvertToJsonStr, for logging.
func (r *ParseResult) String() string {
	return util.ConvertToJsonStr(r)
}

// parseMsg carries everything one parse task needs: where the PDF and its
// images live locally and on OSS, plus the rendering options.
type parseMsg struct {
	localPdfPath       string
	remotePdfPath      string
	localImgDirPrefix  string
	remoteImgDir       string
	totalPage          int
	width              float64 // width/height of the PDF document
	height             float64
	metaFileKey        string
	reqTime            time.Time
	ctx                *gin.Context
	prePage            int
	imageDpi           int
	imageWidth         float64 // width/height of the rendered cover image
	imageHeight        float64
	disablePicCompress bool // whether image compression is disabled
	fileId             string
	fileOssPath        string
}

// toParseMsg assembles the parse message for a fresh DealParse request from
// the cached PDF info, the request parameters and the configured image DPI.
func toParseMsg(pdfInfo *pfc.PdfInfo, param *requestParam, ctx *gin.Context) *parseMsg {
	msg := new(parseMsg)
	msg.localImgDirPrefix = getLocalImgDirPrefix(param.FileId)

	// Values taken from the PDF cache entry.
	msg.localPdfPath = pdfInfo.LocalPath
	msg.remotePdfPath = pdfInfo.RemotePath
	msg.totalPage = pdfInfo.TotalPage
	msg.width = pdfInfo.Width
	msg.height = pdfInfo.Height

	// Values taken from the request.
	msg.remoteImgDir = param.imgOssPath
	msg.metaFileKey = param.metaFileKey
	msg.prePage = param.PrePage
	msg.disablePicCompress = param.DisablePicCompress
	msg.fileId = param.FileId
	msg.fileOssPath = param.FileOssPath

	msg.ctx = ctx
	msg.imageDpi = config.Config.ImageDpi
	return msg
}

// getLocalImgDir returns the local working directory for the task that starts
// at firstPage.
func (m *parseMsg) getLocalImgDir(firstPage int) string {
	return getLocalImageDir(m.localImgDirPrefix, firstPage)
}

// getCost returns mw.GetCost for the request context stored in the message.
func (m *parseMsg) getCost() time.Duration {
	return mw.GetCost(m.ctx)
}

// buildPdfParseRecord creates the database record written after pre-parsing.
// The record starts in StParsing state, lists every remaining task start page
// in Pending, and marks firstNum as the page currently being parsed.
func buildPdfParseRecord(msg *parseMsg, result *ParseResult,
	taskPage int, taskEstimateMs int64) *dao.TblPdfParseRecord {
	cfg := config.Config
	remaining := calPendingList(msg.prePage, taskPage, result.TotalPage)

	record := &dao.TblPdfParseRecord{
		PrePage:   msg.prePage,
		TotalPage: result.TotalPage,
		Parsing:   firstNum,
	}
	record.BasePdfParseRecord = dao.BasePdfParseRecord{
		Status:             dao.StParsing,
		FileId:             msg.fileId,
		FileOssPath:        msg.fileOssPath,
		Dpi:                msg.imageDpi,
		Tool:               cfg.ParseTool,
		Width:              result.CoverWidth,
		Height:             result.CoverHeight,
		TaskPage:           taskPage,
		TaskEstimateMs:     taskEstimateMs,
		ExpireTime:         getExpireTime(taskEstimateMs),
		RetryCount:         0,
		DisablePicCompress: msg.disablePicCompress,
	}
	record.Pending = util.ConvertToJsonStr(remaining)
	return record
}

// calPendingList returns the first page of every task needed to cover pages
// prePage+1..totalPage when each task handles taskPage pages.
//
// The result is always non-nil so it serializes to "[]" rather than "null".
// A non-positive taskPage yields an empty list: the step would never advance,
// so looping on it would append forever.
func calPendingList(prePage, taskPage, totalPage int) []int {
	first := prePage + 1
	if taskPage <= 0 || first > totalPage {
		return []int{}
	}
	pending := make([]int, 0, (totalPage-first)/taskPage+1)
	for start := first; start <= totalPage; start += taskPage {
		pending = append(pending, start)
	}
	return pending
}

// getExpireTime returns the current time in milliseconds (util.NowMs) plus
// maxTaskExpireParam times the estimated task duration.
func getExpireTime(taskEstimateMs int64) int64 {
	return util.NowMs() + maxTaskExpireParam*taskEstimateMs
}

// getLocalImgDirPrefix returns <BaseDir>/img/<fileId>, the parent of every
// per-task local image directory for the file.
func getLocalImgDirPrefix(fileId string) string {
	return path.Join(config.Config.BaseDir, "img", fileId)
}

// getLocalImageDir returns the per-task directory: the prefix joined with the
// decimal first page number of the task.
func getLocalImageDir(localImgDirPrefix string, firstPage int) string {
	pageDir := strconv.FormatInt(int64(firstPage), 10)
	return path.Join(localImgDirPrefix, pageDir)
}

// getImageOssCompressDir returns the given OSS path with "_z" appended.
func getImageOssCompressDir(imageOssPath string) string {
	const compressSuffix = "_z"
	return imageOssPath + compressSuffix
}

// JumpHis is the per-file jump history cached in redis. PrePage, TotalPage
// and TaskPage are copied from the parse record so task start pages can be
// recomputed without a database read.
type JumpHis struct {
	PrePage   int   `json:"prePage"`
	TotalPage int   `json:"totalPage"`
	TaskPage  int   `json:"taskPage"`
	// History holds task start pages already handled or claimed; hasParse
	// uses 0 to stand for the pre-parse range.
	History   []int `json:"history"`
}

// String renders the history through util.ConvertToJsonStr, for logging.
func (j *JumpHis) String() string {
	return util.ConvertToJsonStr(j)
}

// buildJumpHis rebuilds the jump history from the database record: every
// task start page that is no longer in record.Pending counts as handled, and
// the result is merged with the previously cached history his.
func buildJumpHis(record *dao.TblPdfParseRecord, his []int) *JumpHis {
	allStarts := _set.NewBySlice(calPendingList(record.PrePage, record.TaskPage, record.TotalPage))

	var stillPending []int
	_ = json.Unmarshal([]byte(record.Pending), &stillPending)

	// Start pages that have already left the pending queue.
	done := _set.Difference(allStarts, _set.NewBySlice(stillPending)).Slice()
	merged := _set.Union(_set.NewBySlice(his), _set.NewBySlice(done)).Slice()

	return &JumpHis{
		PrePage:   record.PrePage,
		TaskPage:  record.TaskPage,
		TotalPage: record.TotalPage,
		History:   merged,
	}
}

// baseRecord2ParseMsg rebuilds a parse message from a stored record, so a
// task can run without the original request. OSS paths are re-derived from
// the record's FileOssPath.
func baseRecord2ParseMsg(pdfInfo *pfc.PdfInfo, record dao.BasePdfParseRecord) *parseMsg {
	msg := new(parseMsg)
	msg.localImgDirPrefix = getLocalImgDirPrefix(record.FileId)

	msg.localPdfPath = pdfInfo.LocalPath
	msg.remotePdfPath = pdfInfo.RemotePath

	msg.remoteImgDir = getImageOssPath(record.FileOssPath)
	msg.width = record.Width
	msg.height = record.Height
	msg.metaFileKey = getMetaFileKey(record.FileOssPath)
	msg.imageDpi = record.Dpi
	msg.disablePicCompress = record.DisablePicCompress
	msg.fileId = record.FileId
	msg.fileOssPath = record.FileOssPath
	return msg
}
deal_parse.go
package pdf_parse_v2

import (
	"github.com/gin-gonic/gin"
	"github.com/pkg/errors"
	math2 "math"
	"os"
	"path"
	"pps/application/common"
	"pps/application/compress"
	util2 "pps/application/util"
	"pps/config"
	"pps/dao"
	"pps/helper/img_util"
	pfc "pps/helper/pdf_cache"
	"pps/helper/pdf_util"
	"strings"
	"support/concurrent"
	"support/http_util"
	"support/logger"
	"support/math"
	"support/oss"
	"support/safe"
	"support/util"
	db2 "support/web/db"
	"support/web/mw"
)

// firstNum is the number of the first page; pre-parsing starts here.
const firstNum = 1

// BigImageDpi is the DPI that checkImageParseErr falls back to before
// retrying a render that failed.
const BigImageDpi = 96

// BestParseTime is the target duration of one parse task, in milliseconds (15s).
const BestParseTime = 15000

func DealParse(c *gin.Context) {
	common.CallBegin()
	defer common.CallEnd()

	log := mw.GetLogger(c)
	// 参数校验
	var param requestParam
	if err := c.ShouldBindQuery(¶m); err != nil {
		log.Error("DealParse request param failed as: %s", err)
		mw.RetFail(c, mw.ErrBadParam)
		return
	}
	param.init()

	// 检查是否解析过
	if pr := getParseResult(log, ¶m); pr != nil {
		log.Warn("skip parse as already parsed, meta: %s", pr)
		mw.RetJSON(c, pr)
		return
	}

	// 获取pdf信息(文件本地路径,页数,宽高)
	pdfInfo, err := pfc.GetPdf(log, param.FileOssPath)
	if err != nil {
		log.Error("GetPdf failed as: %s", err)
		mw.RetFail(c, mw.ErrInner)
		return
	}
	log.Info("get pdf info finished, cost: %s, pdfInfo: %s", mw.GetCost(c), pdfInfo)
	// 最多解析1000页
	maxParseCount := config.Config.TotalParseCount
	if pdfInfo.TotalPage > maxParseCount {
		log.Error("pdf totalPage more than maxParseCount, totalPage %d, maxParseCount: %d",
			pdfInfo.TotalPage, maxParseCount)
		pdfInfo.TotalPage = maxParseCount
	}
	msg := toParseMsg(pdfInfo, ¶m, c)
	// 预解析
	pr, httpErr := preParse(c, msg)
	if httpErr != nil {
		mw.RetFail(c, httpErr)
		return
	}
	// 启动协程,继续解析剩下的
	go safe.Safego(func() {
		leftParse(c.Copy(), msg)
	}, "leftParse")
	// 返回
	mw.RetJSON(c, pr)
	return

}

// getParseResult returns the stored parse result when the file was already
// parsed, which is signalled by a meta file in the image directory. It
// returns nil when the meta file is absent or cannot be downloaded.
func getParseResult(log logger.ILog, param *requestParam) *ParseResult {
	exists, _ := oss.Helper.IsObjectExist(param.metaFileKey)
	if !exists {
		return nil
	}

	meta := new(ParseResult)
	if err := util2.DownloadData(param.metaFileKey, meta); err != nil {
		log.Error("error while download meta file: %s", err)
		return nil
	}
	return meta
}

// preParse renders and uploads the first msg.prePage pages, uploads the meta
// file, and stores a parse record whose task size is derived from the
// measured per-page cost.
//
// It returns the parse result and a nil error on success. When only saving
// the record fails, the result is returned together with mw.ErrDb.
func preParse(c *gin.Context, msg *parseMsg) (r *ParseResult, he *http_util.HttpError) {

	log := mw.GetLogger(c)
	db := db2.Db(c)
	// Fall back to the configured default for a missing or invalid page count.
	if msg.prePage <= 0 {
		msg.prePage = config.Config.PreParseCount
	}

	lastPage := math.Min(msg.prePage, msg.totalPage)
	if err := parseAndUpload(c, msg, firstNum, lastPage); err != nil {
		log.Error("pre parse failed while parse as: %s", err)
		return nil, mw.ErrInner
	}
	log.Info("pre parse upload image finished")
	parseStatus := util.If(msg.totalPage <= msg.prePage, dao.StParseSuccess, dao.StParsing)
	pr := &ParseResult{
		TotalPage:   msg.totalPage,
		CoverWidth:  msg.imageWidth,
		CoverHeight: msg.imageHeight,
		ParseStatus: parseStatus,
	}
	log.Info("pre parse set size finished, result: %s", pr)
	// Upload the meta file.
	if err := util2.UploadData(msg.metaFileKey, pr); err != nil {
		log.Error("pre parse failed while upload meta file as: %s", err)
		return nil, mw.ErrInner
	}

	// Compute the average cost per page.
	preParseCost := msg.getCost()
	log.Info("pre parse upload meta file finished cost: %s", preParseCost)
	pages := int64(msg.prePage)
	if pages < 1 {
		pages = 1 // the configured default may itself be 0
	}
	avgPageCost := preParseCost.Milliseconds() / pages
	if avgPageCost < 1 {
		// Sub-millisecond averages truncate to 0 and would make the division
		// below panic; treat them as 1ms per page.
		avgPageCost = 1
	}
	taskPage := math.Max(config.Config.PreParseCount, int(BestParseTime/avgPageCost))
	taskEstimateMs := int64(taskPage) * avgPageCost
	e := buildPdfParseRecord(msg, pr, taskPage, taskEstimateMs)
	if err := dao.SavePdfParseRecord(db, e); err != nil {
		log.Error("SavePdfParseRecord failed as: %s", err)
		return pr, mw.ErrDb
	}
	return pr, nil
}

// parseAndUpload renders pages firstPage..lastPage into a local working
// directory, uploads the images to msg.remoteImgDir and, unless compression
// is disabled, compresses them. The working directory is removed on return.
func parseAndUpload(c *gin.Context, msg *parseMsg, firstPage, lastPage int) error {

	log := mw.GetLogger(c)
	localImgDir := msg.getLocalImgDir(firstPage)
	if err := os.MkdirAll(localImgDir, os.ModePerm); err != nil {
		return errors.Wrap(err, "mkdir")
	}
	// Always clean up the working directory, whatever the outcome.
	defer func() {
		if err := os.RemoveAll(localImgDir); err != nil {
			log.Error("remove dir failed as: %s", err)
		}
	}()

	// Render the pages to images.
	res, err := pdf_util.ParsePdf(log, msg.localPdfPath, localImgDir,
		firstPage, lastPage, msg.imageDpi)
	// If the render failed because the image was too large, retry once.
	res, err = checkImageParseErr(log, res, err, msg, firstPage, lastPage, localImgDir)
	if err != nil {
		return errors.Wrap(err, "parse pdf")
	}
	wrappedRes := wrapOssPath(string(res), msg.remotePdfPath)
	log.Info("parse pdf finished, firstPage: %d, lastPage: %d, res: \n%s",
		firstPage, lastPage, wrappedRes)

	// DPI check: only for the task containing the cover page; lowers the DPI
	// when the cover image is too large.
	if firstPage == firstNum {
		if err := checkDpi(c, msg, firstPage, lastPage); err != nil {
			return errors.Wrap(err, "check dpi")
		}
	}

	// Upload the images.
	if err := uploadImage(localImgDir, msg.remoteImgDir, firstPage, lastPage); err != nil {
		return errors.Wrap(err, "upload image")
	}

	// Compress the images unless the request disabled it.
	if !msg.disablePicCompress {
		if err := compressImage(c, msg.remoteImgDir, firstPage, lastPage); err != nil {
			return errors.Wrap(err, "compressImage")
		}
	}

	return nil
}

// uploadImage uploads the images of pages firstPage..lastPage from
// localImgDir to remoteImgDir with public-read ACL, at most 5 at a time.
// It returns the first upload error reported by the limiter, or nil.
//
// Errors are collected by the limiter (RunError/FirstError) rather than
// written to a shared variable from several goroutines, which was a data race.
func uploadImage(localImgDir, remoteImgDir string, firstPage, lastPage int) error {
	limit := concurrent.NewGoLimit(5)
	for page := firstPage; page <= lastPage; page++ {
		imgName := pdf_util.GetImgName(page)
		// Declared per iteration so each closure sees its own paths.
		localImgPath := path.Join(localImgDir, imgName)
		remoteImgPath := path.Join(remoteImgDir, imgName)
		limit.RunError(func() error {
			return oss.Helper.PutFile(remoteImgPath, localImgPath, oss.AclPublicRead)
		})
	}
	limit.Wait()

	return limit.FirstError()
}

// compressImage runs compress.CompressImage on the uploaded image of every
// page in firstPage..lastPage, at most 10 at a time, and returns the first
// error reported by the limiter.
func compressImage(c *gin.Context, remoteImgDir string, firstPage, lastPage int) error {
	workers := concurrent.NewGoLimit(10)
	for page := firstPage; page <= lastPage; page++ {
		target := path.Join(remoteImgDir, pdf_util.GetImgName(page))
		workers.RunError(func() error {
			return compress.CompressImage(c, target)
		})
	}
	workers.Wait()
	return workers.FirstError()
}

// checkDpi measures the rendered cover image, records its dimensions on msg,
// and, when it exceeds the configured width/height/area limits, lowers
// msg.imageDpi proportionally, re-renders the page range and records the new
// cover dimensions.
func checkDpi(c *gin.Context, msg *parseMsg, firstPage, lastPage int) error {
	localImgDir := msg.getLocalImgDir(firstPage)
	log := mw.GetLogger(c)
	localImgPath := path.Join(localImgDir, pdf_util.GetImgName(firstNum))
	imgDim, err := img_util.GetLocalImageDim(localImgPath)
	if err != nil {
		return errors.Wrap(err, "get local image")
	}
	// Record the cover dimensions.
	msg.imageWidth = float64(imgDim.Width)
	msg.imageHeight = float64(imgDim.Height)

	// ImgAttrMaxValue is read as [maxWidth, maxHeight, maxSize].
	// NOTE(review): assumes the slice has at least three entries — confirm in config.
	maxWidth, maxHeight, maxSize := config.Config.ImgAttrMaxValue[0], config.Config.ImgAttrMaxValue[1], config.Config.ImgAttrMaxValue[2]
	if imgDim.Width == 0 || imgDim.Height == 0 {
		return errors.New("get imageDim failed Width or Height is 0")
	}
	// Ratios of actual to allowed width, height and (square-rooted) area;
	// the largest one decides how much the DPI must shrink.
	rw, rh, rs := float64(imgDim.Width)/maxWidth, float64(imgDim.Height)/maxHeight,
		math2.Sqrt(float64(imgDim.Width*imgDim.Height)/maxSize)
	rr := math.Max(rw, rh, rs)
	// The DPI needs to be lowered.
	if rr > 1.0 {
		dpi := float64(msg.imageDpi) / rr
		// NOTE(review): truncation can yield 0 for a very large ratio — confirm
		// the renderer tolerates that.
		msg.imageDpi = int(dpi)
		log.Error("checkDpi image is too large,dpi change to %d", msg.imageDpi)
		// Render again with the lowered DPI.
		res, err := pdf_util.ParsePdf(log, msg.localPdfPath, localImgDir,
			firstPage, lastPage, msg.imageDpi)
		if err != nil {
			return errors.Wrap(err, "parse pdf")
		}

		// Read the cover dimensions produced by the new DPI.
		wrappedRes := wrapOssPath(string(res), msg.remotePdfPath)
		log.Info("parse pdf finished, firstPage: %d, lastPage: %d, res: \n%s",
			firstPage, lastPage, wrappedRes)
		imgDim, err := img_util.GetLocalImageDim(localImgPath)
		if err != nil {
			return errors.Wrap(err, "get local image")
		}
		// Record the updated cover dimensions.
		msg.imageWidth = float64(imgDim.Width)
		msg.imageHeight = float64(imgDim.Height)
	}
	return nil
}
// wrapOssPath appends " (<ossPath>)" to every line of res so that log output
// can be traced back to the PDF. CRLF is normalized to LF, and leading and
// trailing spaces and newlines are trimmed first.
func wrapOssPath(res string, ossPath string) string {
	normalized := strings.Trim(strings.ReplaceAll(res, "\r\n", "\n"), " \n")
	suffix := " (" + ossPath + ")"

	var b strings.Builder
	for i, line := range strings.Split(normalized, "\n") {
		if i > 0 {
			b.WriteByte('\n')
		}
		b.WriteString(line)
		b.WriteString(suffix)
	}
	return b.String()
}

// checkImageParseErr inspects the outcome of a render. When the output
// contains the pdfium "big image" error text, or the error mentions "killed",
// it switches msg.imageDpi to BigImageDpi and renders once more, returning
// the retry's outcome. Otherwise res and err are returned unchanged.
func checkImageParseErr(log logger.ILog, res []byte, err error, msg *parseMsg, firstPage, lastPage int, localImgDir string) ([]byte, error) {
	var reason string
	switch {
	case strings.Contains(string(res), pdf_util.PdfiumBigImageError.Error()):
		reason = "image is too large,dpi change to %d"
	case err != nil && strings.Contains(err.Error(), "killed"):
		reason = "pdf parse is failed as killed by signal,dpi change to %d"
	default:
		return res, err
	}

	// Lower the DPI and render one more time.
	msg.imageDpi = BigImageDpi
	log.Error(reason, msg.imageDpi)
	return pdf_util.ParsePdf(log, msg.localPdfPath, localImgDir,
		firstPage, lastPage, msg.imageDpi)
}
deal_parse_left.go
package pdf_parse_v2

import (
	"encoding/json"
	"github.com/gin-gonic/gin"
	"github.com/pkg/errors"
	"os"
	"path"
	"pps/dao"
	"pps/helper/img_util"
	"pps/helper/pdf_util"
	"support/math"
	"support/oss"
	"support/util"
	db2 "support/web/db"
	"support/web/mw"
)

// leftParse works through the pending task queue stored in the database
// record of msg.fileId. Each round reloads the record, claims the first
// pending start page by writing back the shortened queue, and parses that
// task. When parsing fails, placeholder "failed" images are uploaded for the
// task's pages. The loop ends when the queue is empty (the record is then
// marked StParseSuccess) or on any database or decode error.
func leftParse(c *gin.Context, msg *parseMsg) {
	log := mw.GetLogger(c)
	db := db2.Db(c)
	for {
		// Reload every round: DealJump may have reordered the queue meanwhile.
		record, err := dao.GetPdfParseRecordByFileId(db, msg.fileId)
		if err != nil {
			log.Error("GetPdfParseRecordByFileId failed as err:%v", err)
			return
		}
		var pending []int
		if err = json.Unmarshal([]byte(record.Pending), &pending); err != nil {
			log.Error("Unmarshal pending failed as err:%v", err)
			return
		}
		if len(pending) == 0 {
			if err := dao.UpdatePdfParseRecordStatus(db, msg.fileId, dao.StParseSuccess); err != nil {
				log.Error("UpdatePdfParseRecordStatus failed as err:%v", err)
			}
			return
		}
		// Take the first entry as the next task.
		firstPage := pending[0]
		lastPage := math.Min(record.TotalPage, firstPage+record.TaskPage-1)

		newPending := util.ConvertToJsonStr(pending[1:])
		expireTime := getExpireTime(record.TaskEstimateMs)
		if err := dao.UpdatePdfParseRecord(db, msg.fileId, newPending, firstPage, msg.imageDpi, expireTime); err != nil {
			log.Error("UpdatePdfParseRecordPending failed as err:%v", err)
			return
		}

		// Run the parse; on failure log the cause and fall back to placeholders.
		if parseErr := parseAndUpload(c, msg, firstPage, lastPage); parseErr != nil {
			log.Error("parseAndUpload failed, firstPage:%d, lastPage:%d, err:%v",
				firstPage, lastPage, parseErr)
			// Upload the "failed" placeholder image to OSS instead.
			if err := GenFailedImageUpload(c, record.BasePdfParseRecord, firstPage, lastPage); err != nil {
				log.Error("GenFailedImageUpload failed as err:%v", err)
			}
		}
	}
}

// GenFailedImageUpload publishes a placeholder "failed" image for every page
// in firstPage..lastPage. The image is generated and uploaded (and optionally
// compressed) once for firstPage, then copied on OSS for the remaining pages.
// Copy failures are logged and do not abort the loop.
//
// The local working directory is removed on return so failed tasks do not
// leave files behind.
func GenFailedImageUpload(c *gin.Context, record dao.BasePdfParseRecord, firstPage, lastPage int) error {
	log := mw.GetLogger(c)

	localImgDirPrefix := getLocalImgDirPrefix(record.FileId)
	localImgDir := getLocalImageDir(localImgDirPrefix, firstPage)
	if err := os.MkdirAll(localImgDir, os.ModePerm); err != nil {
		return errors.Wrap(err, "mkdir")
	}
	defer func() {
		if err := os.RemoveAll(localImgDir); err != nil {
			log.Error("remove dir failed as: %s", err)
		}
	}()

	imgName := pdf_util.GetImgName(firstPage)
	localImgPath := path.Join(localImgDir, imgName)
	if err := GenFailedImage(localImgPath, record.Width, record.Height); err != nil {
		return err
	}
	remoteImgOssPath := getImageOssPath(record.FileOssPath)
	// Upload the placeholder for the first page.
	if err := uploadImage(localImgDir, remoteImgOssPath, firstPage, firstPage); err != nil {
		return errors.Wrap(err, "upload image")
	}
	// Compress it unless the record disabled compression.
	if !record.DisablePicCompress {
		if err := compressImage(c, remoteImgOssPath, firstPage, firstPage); err != nil {
			return errors.Wrap(err, "compressImage")
		}
	}

	// The remaining pages are plain OSS copies of the first page's objects.
	srcImg := path.Join(remoteImgOssPath, imgName)
	srcImgCompress := getImageOssCompressDir(srcImg)
	for page := firstPage + 1; page <= lastPage; page++ {
		dstImg := path.Join(remoteImgOssPath, pdf_util.GetImgName(page))
		log.Debug("oss copy src:%s, dst:%s", srcImg, dstImg)
		if err := oss.Helper.CopyObject(srcImg, dstImg); err != nil {
			log.Error(" srcImg:%s, dstImg:%s oss copy failed as err:%v", srcImg, dstImg, err)
		}
		if !record.DisablePicCompress {
			dstImgCompress := getImageOssCompressDir(dstImg)
			if err := oss.Helper.CopyFolder(srcImgCompress, dstImgCompress); err != nil {
				log.Error(" srcImgCompress:%s, dstImgCompress:%s oss copy failed as err:%v",
					srcImgCompress, dstImgCompress, err)
			}
		}
	}
	return nil
}

// GenFailedImage generates the placeholder "failed" image with the given
// dimensions and writes it to filePath, creating or truncating the file.
// Generation and write errors are both returned, so a short or failed write
// is no longer reported as success.
func GenFailedImage(filePath string, width, height float64) error {
	data, err := img_util.GenFailImage(width, height)
	if err != nil {
		return err
	}
	// 0o666 matches the mode os.Create uses (before umask).
	return os.WriteFile(filePath, data, 0o666)
}
deal_parse_jump.go
package pdf_parse_v2

import (
	"encoding/json"
	"github.com/gin-gonic/gin"
	"pps/dao"
	pfc "pps/helper/pdf_cache"
	"support/collection/_set"
	"support/database/redis"
	"support/logger"
	"support/safe"
	"support/util"
	db2 "support/web/db"
	"support/web/mw"
	"time"
)

// maxJumpCount is the limit DealJump compares against the jump count
// returned by dao.GetPdfParseJumpCount before starting another jump task.
const maxJumpCount = 5

// PpsJumpHisCachePrefix is the redis key prefix of the per-file jump
// history; the file id is appended to it.
const PpsJumpHisCachePrefix = "PPS:PDF:JUMP_FILEID_"

func DealJump(c *gin.Context) {
	log := mw.GetLogger(c)
	db := db2.Db(c)
	// 参数校验
	var param requestJumpParam
	if err := c.ShouldBindQuery(¶m); err != nil {
		log.Error("DealJump request param failed as: %s", err)
		mw.RetFail(c, mw.ErrBadParam)
		return
	}
	fileId := param.FileId
	jumpHis, _ := CacheGetJumpHis(log, fileId)
	// redis有缓存
	if len(jumpHis.History) > 0 {
		if hasParse(jumpHis, param.PageNo, false) {
			log.Debug("pageNo:%d hasParse task", param.PageNo)
			mw.RetJSON(c, "")
			return
		}
	}
	record, err := dao.GetPdfParseRecordByFileId(db, fileId)
	if err != nil {
		log.Error("DealJump GetPdfParseRecordByFileId failed as: %s", err)
		mw.RetFail(c, mw.ErrDb)
		return
	}
	jumpHis = buildJumpHis(record, jumpHis.History)
	log.Debug("jumpHis:%s", jumpHis)

	if hasParse(jumpHis, param.PageNo, true) {
		log.Debug("pageNo:%d hasParse task", param.PageNo)
		mw.RetJSON(c, "")
		return
	}
	// 更新his
	_ = CacheSetJumpHis(log, fileId, jumpHis)

	// 调整任务顺序
	pending := adjustPendingList(record, param.PageNo)
	defer func() {
		// 更新pending
		if err = dao.UpdatePdfParseRecordPending(db, fileId, util.ConvertToJsonStr(pending)); err != nil {
			log.Error("DealJump UpdatePdfParseRecordPending failed as: %s", err)
		}
	}()

	// 更新解析记录
	jumpCount, err := dao.GetPdfParseJumpCount(db, fileId)
	if err != nil {
		log.Error("DealJump GetPdfParseJumpCount failed as: %s", err)
		mw.RetFail(c, mw.ErrDb)
		return
	}
	if jumpCount >= maxJumpCount {
		log.Warn("currJumpCount:%d gt maxJumCount:%d", jumpCount, maxJumpCount)
		mw.RetJSON(c, "")
		return
	}
	fromPage := pending[0]
	expireTime := getExpireTime(record.TaskEstimateMs)
	jumpRecord := dao.BuildPdfParseJumpFromParseRecord(record, fromPage, expireTime)
	// 创建跳页记录
	if err = dao.SavePdfParseJump(db, jumpRecord); err != nil {
		log.Error("DealJump SavePdfParseJump failed as: %s", err)
		mw.RetFail(c, mw.ErrDb)
		return
	}

	pending = pending[1:]
	mw.RetJSON(c, "")
	// 处理跳页
	go safe.Safego(func() {
		dealJump(c.Copy(), jumpRecord)
	}, "dealJump")
}

// CacheGetJumpHis loads the jump history of fileId from redis. The returned
// pointer is never nil: on a miss or error it points to an empty JumpHis.
// Errors other than redis.ErrGetNil are logged; every error is returned.
func CacheGetJumpHis(log logger.ILog, fileId string) (*JumpHis, error) {
	his := new(JumpHis)
	raw, err := redis.Get(PpsJumpHisCachePrefix + fileId)
	switch {
	case err == nil:
		return his, json.Unmarshal([]byte(raw), his)
	case err != redis.ErrGetNil:
		log.Error("CacheGetJumpHis, fileId: %s, err: %s", fileId, err)
	}
	return his, err
}

// CacheSetJumpHis stores the jump history of fileId in redis as JSON with a
// one-hour expiry. Any encoding or redis error is logged and returned.
func CacheSetJumpHis(log logger.ILog, fileId string, his *JumpHis) error {
	payload, err := json.Marshal(his)
	if err == nil {
		err = redis.Set(PpsJumpHisCachePrefix+fileId, string(payload), time.Hour)
	}
	if err != nil {
		log.Error("CacheSetJumpHis, fileId: %s, err: %s", fileId, err)
	}
	return err
}

// hasParse reports whether the task containing pageNo is already recorded in
// the history. The pre-parse range is represented by start page 0, which is
// always treated as handled. When the task is not recorded, his.History is
// rewritten from the set (so it now contains 0) and, if insert is true, also
// receives the task's start page.
func hasParse(his *JumpHis, pageNo int, insert bool) bool {
	// Prepend 0 so pages inside the pre-parse range map to start page 0.
	starts := append([]int{0}, calPendingList(his.PrePage, his.TaskPage, his.TotalPage)...)
	taskStart := calPageStart(starts, pageNo)

	seen := _set.NewBySlice(his.History)
	seen.Add(0)
	if seen.Has(taskStart) {
		return true
	}

	if insert {
		// Record this request's task in the history.
		seen.Add(taskStart)
	}
	his.History = seen.Slice()
	return false
}

// calPageStart returns the largest element of arr that is not greater than
// target, using binary search. arr must be sorted in ascending order.
// Targets below the first element yield the first element, targets above the
// last element yield the last element, and an empty arr yields 0.
func calPageStart(arr []int, target int) int {
	n := len(arr)
	switch {
	case n == 0:
		return 0 // not expected in practice
	case target < arr[0]:
		return arr[0]
	case target > arr[n-1]:
		return arr[n-1]
	}

	// Find the first index whose element exceeds target; the element right
	// before it is the answer. arr[0] <= target guarantees that index is >= 1.
	lo, hi := 0, n
	for lo < hi {
		mid := lo + (hi-lo)/2
		if arr[mid] > target {
			hi = mid
		} else {
			lo = mid + 1
		}
	}
	return arr[lo-1]
}

// adjustPendingList decodes record.Pending and rotates it so the task that
// contains pageNo comes first. The list is returned unchanged when it is
// empty or when that task is already at the front.
func adjustPendingList(record *dao.TblPdfParseRecord, pageNo int) (pending []int) {
	_ = json.Unmarshal([]byte(record.Pending), &pending)
	if len(pending) == 0 {
		return pending
	}

	allStarts := calPendingList(record.PrePage, record.TaskPage, record.TotalPage)
	if target := calPageStart(allStarts, pageNo); target != pending[0] {
		pending = rotateSlice(pending, target)
	}
	return pending
}

// rotateSlice rotates arr so that the first occurrence of value becomes the
// first element, keeping the cyclic order of the others. arr itself is
// returned when value is absent or already first.
//
// The rotated result is a fresh slice. Appending onto arr[index:] would write
// into the spare capacity of arr's backing array and leave the result
// aliasing the input.
func rotateSlice(arr []int, value int) []int {
	// Locate the element.
	index := -1
	for i, v := range arr {
		if v == value {
			index = i
			break
		}
	}

	// Absent or already in front: nothing to rotate.
	if index <= 0 {
		return arr
	}

	result := make([]int, 0, len(arr))
	result = append(result, arr[index:]...)
	result = append(result, arr[:index]...)
	return result
}

// dealJump runs one jump task in the background: it parses pages
// record.FromPage..record.ToPage and falls back to placeholder "failed"
// images when parsing fails. The jump is marked successful afterwards in
// either case.
func dealJump(c *gin.Context, record *dao.TblParseJump) {
	log := mw.GetLogger(c)
	db := db2.Db(c)

	// Fetch the PDF info (local path, page count, dimensions).
	pdfInfo, err := pfc.GetPdf(log, record.FileOssPath)
	if err != nil {
		log.Error("GetPdf failed as: %s", err)
		return
	}
	msg := baseRecord2ParseMsg(pdfInfo, record.BasePdfParseRecord)
	// Run the parse; on failure log the cause and fall back to placeholders.
	if parseErr := parseAndUpload(c, msg, record.FromPage, record.ToPage); parseErr != nil {
		log.Error("parseAndUpload failed, fromPage:%d, toPage:%d, err:%v",
			record.FromPage, record.ToPage, parseErr)
		// Upload the "failed" placeholder image to OSS instead.
		if err := GenFailedImageUpload(c, record.BasePdfParseRecord, record.FromPage, record.ToPage); err != nil {
			log.Error("GenFailedImageUpload failed as err:%v", err)
		}
	}
	if err := dao.UpdateJumpSuccess(db, record.FileId); err != nil {
		log.Error("UpdateJumpSuccess failed as err:%v", err)
	}
}

4.代码解读

实现pdf跳页解析功能_github_08

跳页主要是调整解析队列pending的顺序,如果有可用协程就直接启动协程执行跳页解析

实现pdf跳页解析功能_github_09

left解析主要就是从数据库的pending字段取出一个start 然后开始解析


5.番外篇pdfium介绍

PDFium 是一个开源的 PDF 渲染引擎,用于解析和呈现 PDF 文档。它最初由 Foxit Software 开发,随后由 Google 作为 Chromium 项目的一部分维护和发布。PDFium 被广泛应用于浏览器(如 Google Chrome)的内置 PDF 查看器,以及其他需要处理 PDF 文档的应用程序中。

PDFium 的主要功能

  1. PDF 渲染:PDFium 可以将 PDF 页面渲染为不同格式的图像(如 Bitmaps),并支持高效的缩放和旋转操作。
  2. 文本提取:可以提取 PDF 文档中的文本内容,便于文本检索和搜索引擎的索引。
  3. 表单处理:支持 PDF 表单的填充、提交和提取操作。
  4. 注释处理:支持读取和管理 PDF 注释,如高亮、注释和签名。
  5. 图像和图形处理:能够解析和渲染 PDF 文件中的图像和矢量图形。
  6. 安全性:支持解析和处理加密的 PDF 文档。

主要特性

  • 跨平台支持:PDFium 可以在 Windows、macOS、Linux 等多个平台上编译和运行。
  • 高性能:针对性能进行了优化,可以高效处理大型和复杂的 PDF 文档。
  • 模块化:具有模块化架构,可以选择性编译和使用需要的功能模块。
  • 丰富的 API:提供了丰富的 API,可以进行复杂的 PDF 操作和自定义扩展。

如何使用 PDFium

1. 下载和构建

PDFium 是一个开源项目,可以从其官方 Git 仓库(pdfium.googlesource.com)下载源码:

git clone https://pdfium.googlesource.com/pdfium

你可以按照官方文档进行配置和编译,以适应不同平台和需求。官方构建流程使用 depot_tools 提供的 GN 和 Ninja。

2. 基本用例

构建完成后,你可以在你的 C/C++ 项目中使用 PDFium。以下是一个简单的示例,展示如何加载一个 PDF 文档并渲染第一页:

#include "public/fpdfview.h"

// Loads example.pdf, renders its first page onto a white bitmap at one pixel
// per point, and releases every PDFium resource. Returns 0 on success and -1
// when the document, the page or the bitmap cannot be created.
int main(int argc, char** argv) {
    // Initialize PDFium.
    FPDF_InitLibrary();

    // Load the PDF document (no password).
    FPDF_DOCUMENT doc = FPDF_LoadDocument("example.pdf", nullptr);
    if (!doc) {
        FPDF_DestroyLibrary();
        return -1;
    }

    // Load the first page (page indices are zero-based).
    FPDF_PAGE page = FPDF_LoadPage(doc, 0);
    if (!page) {
        FPDF_CloseDocument(doc);
        FPDF_DestroyLibrary();
        return -1;
    }

    // The page size is reported as a floating-point value in points;
    // truncate it explicitly to whole pixels.
    int width = static_cast<int>(FPDF_GetPageWidth(page));
    int height = static_cast<int>(FPDF_GetPageHeight(page));

    // Bitmap creation can fail (e.g. zero size or out of memory), so check
    // the handle before drawing into it.
    FPDF_BITMAP bitmap = FPDFBitmap_Create(width, height, 0);
    if (!bitmap) {
        FPDF_ClosePage(page);
        FPDF_CloseDocument(doc);
        FPDF_DestroyLibrary();
        return -1;
    }
    FPDFBitmap_FillRect(bitmap, 0, 0, width, height, 0xFFFFFFFF); // white background
    FPDF_RenderPageBitmap(bitmap, page, 0, 0, width, height, 0, 0);

    // Save the bitmap to a file or process it further.
    // ...

    // Release resources in reverse order of acquisition.
    FPDFBitmap_Destroy(bitmap);
    FPDF_ClosePage(page);
    FPDF_CloseDocument(doc);
    FPDF_DestroyLibrary();

    return 0;
}

常见应用场景

  1. 嵌入式 PDF 查看器:你可以在桌面或移动应用中嵌入一个自定义的 PDF 查看组件,为用户提供查看和互动功能。
  2. 服务器端 PDF 处理:在服务器端应用中使用 PDFium 批量处理 PDF 文档,实现自动化的文档管理和处理流程。
  3. PDF 数据提取:通过 PDFium 提取 PDF 文档中的文本、图像和其他数据,用于数据分析、数据挖掘等。

资源和社区

PDFium 作为一个强大且灵活的 PDF 引擎,广泛应用于各种需处理 PDF 文档的场景。如果你有特定的需求,深入研究 PDFium 的文档和示例,结合实际情况进行开发,可以达到最佳效果。