golang使用chromedp生成滚动更新页面的PDF

缘起

目前有一个Python项目，作用是打开一个报表页面，截图后生成PDF，作为邮件附件发送到指定邮箱。当前这个Python项目性能较差，目前项目组没有人会Python，于是决定使用golang重写一下

模拟滚动更新

这个页面是滚动更新的，滚动到某个地方才会加载下面的内容，光在这个滚动更新上就踩了很多坑。

踩坑 window.scrollTo(x,y)：这个函数在静态页面是可以的，x代表横向偏移，y代表纵向偏移。x我设定为0，y设定为2400（通过页面某个元素可以获取到我当前页面的最大高度是2400）。按F12打开控制台，将这段代码放进去，根本不起作用
踩坑 document.body.scrollTo(x,y) 使用方式同上，同样不起作用
踩坑 document.documentElement.scrollTo(x,y) 同样不起作用

[]byte转int总是0 就是下面这个代码

 // Pitfall shown by the article: binary.Read decodes fixed-size binary data
 // (here 4 big-endian bytes into an int32); it does not parse decimal text.
 // The error returned by binary.Read is discarded here, so a failed read is
 // not visible to the caller.
 bytesBuffer := bytes.NewBuffer(b)
 var x int32
 binary.Read(bytesBuffer, binary.BigEndian, &x)

真理 document.getElementById('xxxx-head').scrollTop=2400：'xxxx-head' 是我这个页面最上面的可显示的<div>标签的id，这个可以将滚动条拉到最下面。我的代码里面根据可视高度一点点地逼近最大值，其实就是每次滚动一屏，直到滚动到底为止，请看代码

核心代码

/**
fileNameUrlMap: key为文件名 value为url
topDivId: 页面上最上面显示第一行字的div 用来定位页面的最高处 这里传入这个div的Id
waitVisibleExpr: 页面加载到这个选择器说明页面加载完成了
maxHighId: 获取页面最高的大小的div的id
chromeCtx: 谷歌浏览器实例
goTraceId: 日志中的traceId

返回值： map的key为文件名，value是byte数组，返回生成的pdf的文件字节

注意：我这个是动态页面是滚动加载的页面，需要模拟人手动去滚动滑块，然后等待加载数据，一直到底部为止我当前的页面可以通过某个div获取到高度，如果你无法获取到高度，就往下滚动，监测本次滚动所能到达的高度和上一次的是否一致，是的话就说明滚动到底了 */ func ScreenPdf(fileNameUrlMap map[string]string, topDivId string, waitVisibleExpr string, maxHighId string, chromeCtx context.Context, goTraceId string) map[string][]byte { var ( err error resultMap = make(map[string][]byte) ) config.LogEntry.WithFields(logrus.Fields{config.GoTraceId: goTraceId}).WithFields(logrus.Fields{config.GoTraceId: goTraceId}).Infof("开始进行截图... param=%s ", fileNameUrlMap)

config.LogEntry.WithFields(logrus.Fields{config.GoTraceId: goTraceId}).Infof("初始化chrome完成...")

for fileName, url := range fileNameUrlMap {
    config.LogEntry.WithFields(logrus.Fields{config.GoTraceId: goTraceId}).Infof("开始截图，当前处理文件名=%s url=%s", fileName, url)
    chromeTabCtx, cancelFunc := chromedp.NewContext(chromeCtx, chromedp.WithLogf(config.LogEntry.WithFields(logrus.Fields{config.GoTraceId: goTraceId}).Infof))
    //空任务触发初始化
    err = chromedp.Run(chromeTabCtx, make([]chromedp.Action, 0, 1)...)
    chromedp.Sleep(time.Second * 2)
    if err != nil {
        config.LogEntry.WithFields(logrus.Fields{config.GoTraceId: goTraceId}).Infof("初始化chrome并执行第一个Task失败跳过此截图 fileName=%s", fileName)
        continue
    }
    buf := make([]byte, 0)
    err = chromedp.Run(chromeTabCtx, chromedp.Tasks{
        chromedp.Navigate(url),
        chromedp.Sleep(time.Second * 10),
        chromedp.ActionFunc(func(ctx context.Context) error {
            config.LogEntry.WithFields(logrus.Fields{config.GoTraceId: goTraceId}).Infof("开始等待页面加载 检测点=%s fileName=%s", waitVisibleExpr, fileName)
            return nil
        }),
        chromedp.WaitVisible(waitVisibleExpr, chromedp.ByID),
        chromedp.ActionFunc(func(ctx context.Context) error {
            config.LogEntry.WithFields(logrus.Fields{config.GoTraceId: goTraceId}).Infof("页面加载完成 检测点=%s fileName=%s", waitVisibleExpr, fileName)
            var html string
            chromedp.InnerHTML(waitVisibleExpr, &html, chromedp.ByID)
            config.LogEntry.WithFields(logrus.Fields{config.GoTraceId: goTraceId}).Infof("获取到的页面html=%s", html)
            return nil
        }),
        chromedp.Sleep(time.Second * 15),
        chromedp.ActionFunc(func(ctx context.Context) error {
            //获取可视界面的高度
            var jsGetClientHigh = "document.body.clientHeight"
            clientHigh := getHighByJs(jsGetClientHigh, ctx)
            config.LogEntry.WithFields(logrus.Fields{config.GoTraceId: goTraceId}).Infof("可视高度为%d ", clientHigh)
            //获取最高的
            var jsGetMaxHigh = "document.getElementById('" + maxHighId + "').offsetHeight"
            maxHigh := getHighByJs(jsGetMaxHigh, ctx)
            config.LogEntry.WithFields(logrus.Fields{config.GoTraceId: goTraceId}).Infof("最大高度为%d ", maxHigh)
            var currentHigh = clientHigh
            //滚动
            for {
                if currentHigh < maxHigh {
                    jsScroll := "document.getElementById('" + topDivId + "').scrollTop=" + strconv.Itoa(currentHigh)
                    chromedp.EvalAsValue(&runtime.EvaluateParams{
                        Expression:    jsScroll,
                        ReturnByValue: false,
                    }).Do(ctx)
                    time.Sleep(time.Second * 15)
                    currentHigh += clientHigh
                } else {
                    config.LogEntry.WithFields(logrus.Fields{config.GoTraceId: goTraceId}).Infof("跳出高度%d fileName=%s", currentHigh, fileName)
                    break
                }
            }
            //滚动完成后滚回第一屏
            jsScroll0 := "document.getElementById('" + topDivId + "').scrollTop=0"
            chromedp.EvalAsValue(&runtime.EvaluateParams{
                Expression:    jsScroll0,
                ReturnByValue: false,
            }).Do(ctx)
            time.Sleep(time.Second * 1)
            //纸张设置为A0
            buf, _, err = page.PrintToPDF().WithPaperWidth(33.1).WithPaperHeight(46.8).WithPrintBackground(true).Do(ctx)
            return err
        }),
    })

    if err != nil {
        config.LogEntry.WithFields(logrus.Fields{config.GoTraceId: goTraceId}).Errorf("截图出现报错 跳过当前PDF fileName=%s err=%v ", fileName, err)
        continue
    }
    config.LogEntry.WithFields(logrus.Fields{config.GoTraceId: goTraceId}).
        Infof("截图生成bytes完成 当前fileName=%s byteLength=%d", fileName, len(buf))
    resultMap[fileName] = buf
    cancelFunc()
}

return resultMap

}

func getHighByJs(jsGetHigh string, ctx context.Context) int { result, _, _ := chromedp.EvalAsValue(&runtime.EvaluateParams{ Expression: jsGetHigh, ReturnByValue: true, }).Do(ctx) json, _ := result.Value.MarshalJSON() clientHigh := bytesToInt(json) return clientHigh }

// bytesToInt parses bys as the text of a decimal number (for example the JSON
// encoding of a JavaScript number) and returns it truncated toward zero.
//
// It returns 0 when bys is not a finite number, e.g. an empty slice or "null".
// Signs and fractional parts are handled, unlike a plain digit-by-digit decode.
func bytesToInt(bys []byte) int {
	parsed, err := strconv.ParseFloat(string(bys), 64)
	if err != nil || math.IsNaN(parsed) || math.IsInf(parsed, 0) {
		return 0
	}
	return int(math.Trunc(parsed))
}

 上面用到了chrome的实例，实例初始化如下：
 ```go
 import (
    "context"
    "github.com/chromedp/chromedp"
    "os"
)

// ChromeCtx is the package-level context that init assigns from
// chromedp.NewExecAllocator.
var ChromeCtx context.Context

// init builds the Chrome exec allocator context once and stores it in
// ChromeCtx so the whole program can share it.
//
// The browser runs headless unless the environment variable "headless" is set
// to exactly "false", which makes it possible to watch the browser locally
// while debugging. The cancel function returned by NewExecAllocator is
// discarded.
func init() {
	runHeadless := os.Getenv("headless") != "false"

	extraOpts := []chromedp.ExecAllocatorOption{
		// Skip the default-browser check.
		chromedp.NoDefaultBrowserCheck,
		// Headless mode, see above.
		chromedp.Flag("headless", runHeadless),
		// Ignore certificate errors.
		chromedp.IgnoreCertErrors,
		// NOTE(review): the original comment said images are not loaded
		// because they may hang, but the value passed here is
		// imagesEnabled=true — confirm which one is intended.
		chromedp.Flag("blink-settings", "imagesEnabled=true"),
		// Disable GPU rendering.
		chromedp.DisableGPU,
		// Do not run Chrome in sandbox mode.
		chromedp.NoSandbox,
		// Treat this as not being the first run.
		chromedp.NoFirstRun,
		// Disable the web security flag.
		chromedp.Flag("disable-web-security", true),
		// Disable extension support.
		chromedp.Flag("disable-extensions", true),
		// Passes Chrome's disable-default-apps switch.
		chromedp.Flag("disable-default-apps", true),
		// Initial window size.
		chromedp.WindowSize(1920, 1080),
		// Intended to keep the PDF from being created before all data is rendered.
		chromedp.Flag("run-all-compositor-stages-before-draw", true),
		// Explicit user agent; per the author, pages with anti-crawler checks
		// may otherwise reject the browser.
		chromedp.UserAgent(`Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36`),
	}

	allocatorOpts := append(chromedp.DefaultExecAllocatorOptions[:], extraOpts...)

	ChromeCtx, _ = chromedp.NewExecAllocator(context.Background(), allocatorOpts...)
}

```

完整项目请参考https://github.com/BLF2/go-screenshot

缘起

模拟滚动更新

核心代码

android内涵段子项目-(架构第一期)

Andriod第三方源码分析

热门文章