Published on

记一次PHP数据导出优化经历

Authors
  • avatar
    Name
    peterlee
    Twitter

事情背景

前段时间接手了广西健康素养测评系统的项目优化,项目需求是导出数据,之前使用的是 PHPExcel,但是导出的数据量太大了(>100w),导致导出时间过长,并且他们之前的操作逻辑是直接点击导出按钮后,同步执行数据查询再导出,要导出的数据还不是单表数据,需要联合多表查询,100w+的数据量,直接导出会报OOM,即使数据量小一点不OOM,也会导在导出期间,服务器CPU直接飙升到100%,导致服务器宕机。

优化思路

  1. 缩小数据量,将数据量缩小到10万左右,然后进行导出。但是甲方显然不满意,因为他们明确要求就是要导出100w+的数据,所以继续考虑其他方案
  2. 查询方案优化,采用分页查询的方式,一次查询5000条数据(这个可以根据服务器配置来定),然后写入excel,继续查询再写入,类似分片写入的方式,直到查询完所有数据。经过测试,这个方案不会在引发OOM的问题,而且效率也比直接查询高。但是还是解决不了导出时间过长的问题,所以要在这个方案的基础上继续优化
  3. 同步改异步,点击导出按钮,后台生成一个导出的task,这个task会记录下要导出的数据库的详细信息,在系统资源比较空闲的时候,运行这个task,按第2步的方式去查询导出数据,完成任务后将任务标记为已完成,并通知到导出任务的人员,相关人员在后台可以看到自己的导出任务状态,并提供了下载链接,这样导出任务就完成了
  4. 上述方案已经比较好的解决了遇到的问题,但是我心里对导出时间过长的问题还是有点不满意,希望能进一步缩短异步任务的时长,直到遇到 https://xlswriter-docs.viest.me/ 这个三方库,简直碾压其他库无敌的存在

实际运行效果:任务运行期间,内存CPU几乎毫无波动,并且之前100w+的导出时间需要差不多半个小时,现在只需要不到2分钟不到

为什么内存优化模式内存占用少?看官方说明:当开启内存优化模式时,单元格将根据行落地磁盘,内存中只保留最新一行数据,所以内存优化模式最大内存占用等于数据最多一行的内存用量。内存只需要占用一行的量,所以内存占用少。

来说说如何使用

安装:

pecl install xlswriter

使用很简单:

初始化:

$fileName = $name . '_' . RecycleExportTask::TASK_TYPE_EXPORTRAW . '_' . date('Ymd_') . time() . '.xlsx';
$base_dir = Env::get('root_path') . 'public' . '/export_files/';
$config = [
    'path' => $base_dir  // xlsx文件保存路径
];
$excel = new new \Vtiful\Kernel\Excel($config);
// 第三个参数 False 即为关闭 ZIP64
$excel->constMemory($fileName, NULL, false);

然后$excel->data往里面丢数据就行了

$excel->data($excel_data);
$outfilepath = $excel->output(); //  返回导出文件的路径

贴上实际项目中优化后的代码:

<?php

namespace app\common\command;

use app\common\CommonException;
use app\common\model\project\SurveyResult;
use app\common\model\RecycleExportTask;
use PhpOffice\PhpSpreadsheet\Exception;
use SPSS\Sav\Writer;
use think\console\Command;
use think\console\Input;
use think\console\Output;
use think\Db;
use think\db\exception\DataNotFoundException;
use think\db\exception\ModelNotFoundException;
use think\exception\DbException;
use think\exception\PDOException;
use think\facade\Cache;
use think\facade\Env;
use think\facade\Log;
use util\ProgressBar;
use Vtiful\Kernel\Excel;

class RecycleExportRawTaskV1 extends Command
{
    const TaskType = RecycleExportTask::TASK_TYPE_EXPORTRAW;

    const LOCK_NAME = 'lock_RecycleExportTask';

    protected function configure()
    {
        $this->setName('sys:RecycleExportRawTaskV1')
            ->setDescription('RecycleExportRawTaskV1');
    }

    protected function execute(Input $input, Output $output)
    {
        set_time_limit(0);
        ini_set('memory_limit', -1); // 不限内存

        if (Cache::get(self::LOCK_NAME) == 1) {
            outputLog(self::LOCK_NAME . ' is running');
            $output->info('RecycleExportRawTask is running');
            return;
        }

        Cache::set(self::LOCK_NAME, 1, 0);
        try {
            $task = RecycleExportTask::where('task_type', self::TaskType)
                ->where('task_status', 0)->find();
            $output->info("currentTask =>" . json_encode($task));
            if (!$task) {
                outputLog('没有需要执行的任务');
                return;
            }
            $this->export($task, $output);
            outputLog('RecycleExportRawTask 完成');
        } catch (Exception|CommonException|DataNotFoundException|ModelNotFoundException|PDOException|DbException|\think\Exception $e) {
            outputLog('RecycleExportRawTask导出异常', $e->getTrace());
        } finally {
            Cache::rm(self::LOCK_NAME);
        }
    }

    /**
     * @throws DataNotFoundException
     * @throws CommonException
     * @throws Exception
     * @throws ModelNotFoundException
     * @throws PDOException
     * @throws DbException
     * @throws \PhpOffice\PhpSpreadsheet\Writer\Exception
     * @throws \think\Exception
     */
    public function export($export_task, Output $output)
    {
        outputLog('进入RecycleExportRawTask导出方法');

        Db::startTrans();
        try {
            RecycleExportTask::where('id', $export_task['id'])->update([
                'task_status' => 1,
            ]);

            $task_info = json_decode($export_task['task_info'], true);
            if (!$task_info) {
                outputLog('task_info error', $export_task);
                throw new CommonException("task_info error");
            }

            $where = $task_info['where'];
            $params = $task_info['params'];
            $type = strtolower($params['type']);

            $title_list = [
                'WJBM' => ['code'],
                'Finishcode' => ['finish_code'],
                'Sex' => ['gender'],
                'Age' => ['age'],
                'UsedTime(Min)' => ['answer_time_min'],
                'JCD' => ['jcd'],
                'XZ' => ['xz'],
                'CUN' => ['cun'],
                'JTH' => ['jth'],
                'city' => ['city'],
                'county' => ['zone'],
                'town' => ['town'],
                'village' => ['village'],
                'style' => ['finish_status'],
                'style1' => ['finish_reason_text'],
                'G02' => ['kish'],
                '调查对象' => ['answer_realname'],
                '家庭成员编号' => ['answer_id'],
                '户主姓名' => ['home_realname'],
                '调查对象联系方式' => ['local_addr_surveytel'],
                '开始答题时间' => ['answer_time_start'],
                '结束答题时间' => ['answer_time_end'],
                '家庭地址' => ['home_address'],
                '调查地址' => ['address'],
                '提交时间' => ['updated'],
                '备注' => ['remark'],
                '监测点' => ['point_name'],
                '监测点年份' => ['year'],
                '是否为国家监测点' => ['is_country_text'],
                '调查员姓名' => ['interviewer_realname'],
                '所属单位' => ['org_name'],
                '家庭完成情况' => ['family_status_text'],
                '个人完成情况' => ['answer_status_text'],
                '问卷完成情况' => ['finish_status_text'],
            ];
            outputLog('step_03');
            $project_id = 0;
            foreach ($where as $vo) {
                outputLog('step_foreach =>', $vo);
                if ($vo[0] == 'r.project_id') {
                    $project_id = $vo[2];
                }
            }

            $column = $this->_getAnswerColumn($project_id);
            foreach ($column as $vo) {
                $title_list[$vo] = [$vo];
            }
            outputLog('开始sql查询:');
            // 获取数据
            // 获取项目名
            $project = Db::name('project')->where('id', $project_id)->find();
            if (!$project) {
                throw new CommonException("此项目不存在:" . $project_id);
            }
            $queryField = 'r.created,c.commit_time updated,s.code,s.id,r.status,family_status,answer_status,s.point_code,city,zone,town,village,
    home_realname,answer_info,r.gender,age,answer_time_start,answer_time_end,answer_time,r.finish_reason,r.finish_status,r.export_data,
    s.kish,s.is_country,r.local_addr,s.home_address,s.remark,s.year,u.realname interviewer_realname,o.name org_name,c.status check_status,c.status_level,c.rollback_status';

            /* 分页查询并汇总 */
            $pageSize = 2000;
            $allData = [];

            $output->info("query step_where =>" . json_encode($where));
            // 先查询总记录数,用来计算总页数
            $totalCount = $this->listTb()->where($where)->count();

            outputLog("本次导出 =>", ['type' => $type, 'totalCount' => $totalCount]);

            $totalPages = ceil($totalCount / $pageSize);

            $output->info('准备导出=>' . $type . ',共' . $totalCount . '条数据');
            $beginTime = time();
            $output->info('开始导出,耗时较长,请耐心等待,开始时间:' . date('Y-m-d H:i:s'));
            if ($type == 'xls') {
                $progressBar = new ProgressBar($totalCount);
                $excel = $this->prepare_xls($project['title'])->header(array_keys($title_list));
                // 循环获取每一页的数据并汇总
                for ($i = 1; $i <= $totalPages; $i++) {
                    $pageData = $this->listTb()->field($queryField)
                        ->where($where)
                        ->order('s.code')
                        ->limit(($i - 1) * $pageSize, $pageSize)
                        ->select();
                    outputLog('handle page =>', $i);
                    $this->handle_page_data($pageData, $type, $excel, $title_list, $progressBar);
                }
                $progressBar->finish();
                $outfilepath = $excel->output();
                $output->info("export_filename =>" . $outfilepath);

                $base_dir = Env::get('root_path') . 'public';
                $download_url = str_replace($base_dir, '', $outfilepath);

                RecycleExportTask::where('id', $export_task['id'])->update([
                    'download_url' => $download_url,
                    'task_status' => 2,
                    'finished_time' => date('Y-m-d H:i:s')
                ]);
                outputLog('导出xls完成:', $download_url);
            }

            Db::commit();;

            $endTime = time();
            $output->info('导出xls完成,结束时间:' . date('Y-m-d H:i:s') . ',耗时:' . ($endTime - $beginTime) . '秒');
        } catch (\Exception $e) {
            outputLog('RecycleExportRawTask_Exception => ', $e->getMessage());
            Db::rollback();
            throw $e;
        }
        outputLog('未知导出格式:' . $type);
    }

    protected function prepare_xls($name): Excel
    {
        $fileName = $name . '_' . RecycleExportTask::TASK_TYPE_EXPORTRAW . '_' . date('Ymd_') . time() . '.xlsx';

        $base_dir = Env::get('root_path') . 'public' . '/export_files/';
        $config = [
            'path' => $base_dir  // xlsx文件保存路径
        ];
        $excel = new Excel($config);
        // 第三个参数 False 即为关闭 ZIP64
        return $excel->constMemory($fileName, NULL, false);
    }

    protected function handle_page_data($pageData, $type, Excel &$excel, $title_list, ProgressBar $progressBar)
    {
        $this->_parseList($pageData);
        //处理每一页的数据
        $excel_data = [];
        foreach ($pageData as $current) {
            $this->_parseInfo($current);

            $row_data = [];
            foreach ($title_list as $config) {
                $column = $current[$config[0]] ?? '';
                if ($config[0] == 'code') {
                    $column = "'" . $column;
                }
                $row_data[] = $column;
            }

            $excel_data[] = $row_data;
            $progressBar->updateBy(1);
        }
        /** @var TYPE_NAME $excel */
        $excel->data($excel_data);
    }

    protected function listTb()
    {
        return Db::name('project_survey_result')->alias('r')
            ->join('project_survey s', 's.id=r.survey_id')
            ->join('project_survey_check c', 'c.survey_id=r.survey_id')
            ->join('user u', 'u.uid=r.uid', 'left')
            ->join('org o', 'o.id=u.org_id', 'left');
    }

    protected function _parseInfo(&$info)
    {
        int_to_string($info, SurveyResult::getResultStatus());

        $info['family_status_text'] = SurveyResult::getStatusText('family', $info['family_status']);
        $info['answer_status_text'] = SurveyResult::getStatusText('answer', $info['answer_status']);
        $info['finish_status_text'] = SurveyResult::getStatusText('finish', $info['finish_status']);
        if ($info['finish_status'] == 3) {
            $info['finish_status_text'] .= '(具体请注明:' . $info['finish_reason'] . ')';
        }

        // 调查地址(准确则显示定位信息,否则显示手动输入)
        $local_addr = json_decode($info['local_addr'], true);
        $info['exact'] = $local_addr['exact'];
        if ($local_addr['exact']) {
            $info['address'] = $local_addr['addr'];
        } else {
            $info['address'] = $local_addr['surveyCity'] . $local_addr['surveyDistrict'] . $local_addr['surveyStreet'] . $local_addr['surveyAddr'];
        }
    }

    protected function _parseList(&$list)
    {
        // 根据point_code计算出point_name
        Log::write('第一步骤开始:', 'debug');
        $db = Db::name('project_survey');
        $point_codes = array_unique(array_column($list, 'point_code'));
        foreach ($point_codes as $vo) {
            if ($vo) {
                $db->whereOr('code', 'like', $vo . '%');
            } else {
                $db->whereOr('point_code', 0);
            }
        }
        $point_rows = $db->column('code,province,city,zone,town,village,home_realname', 'code');
        Log::write('第一步骤结束:', 'debug');
        int_to_string($list, [
            'gender' => ['--', '男', '女'],
            'is_country' => ['否', '是']
        ]);
        Log::write('第二步骤开始:', 'debug');
        // 15-69的人口数
        // TODO: 这里先注释掉
        /*
        $age_rows = Db::name('project_survey_family')->where('survey_id', 'in', array_column($list, 'id'))
            ->field('survey_id,count(1) total')->where('age', 'between', '15,69')->group('survey_id')->select();
        $age_rows = array_column($age_rows, 'total', 'survey_id');
        */
        Log::write('第二步骤结束:', 'debug');
        array_walk($list, function (&$item) use ($point_rows) {
            // 计算监测点
            $point = $point_rows[$item['code']];
            $point_code_len = strlen($item['point_code']);
            $point_name = [$point['province']];
            if ($point_code_len >= 4) {
                $point_name[] = $point['city'];
            }
            if ($point_code_len >= 6) {
                $point_name[] = $point['zone'];
            }
            if ($point_code_len >= 8) {
                $point_name[] = $point['town'];
            }
            if ($point_code_len >= 10) {
                $point_name[] = $point['village'];
            }
            if ($point_code_len >= 14) {
                $point_name[] = $point['home_realname'];
            }
            $item['point_name'] = implode($point_name);

            // 15-69的人口数
//            $item['total_1569'] = intval($age_rows[$item['id']] ?? 0);
            $item['total_1569'] = mt_rand(1, 4);
//            outputLog("total_1569=>",$item['total_1569']);

            /**未开始
             *
             * 12拒绝,13.无能力回答,14.不在家,15.无符合条件调查对象,16.无人居住/空房/已无此家庭/不是家庭,17.其他
             *
             * 21.完成,22.部分完成,,23.拒绝,24.无能力回答,25.不在家,26.其他*/
            // 入户状态:11同意/完成;12拒绝;13无能力回答;14不在家;15无符合条件调查对象;16无人居住/空房/已无此家庭/不是家庭;17其它。
            // 知情同意登记代码:21同意/完成;22部分完成;23拒绝;24无能力回答;25不在家;26其它。

            if ($item['status'] >= 3) {
                if ($item['answer_status'] >= 21) {
                    $map = [
                        '21' => '完成',
                        '22' => '部分完成',
                        '23' => '拒绝',
                        '24' => '无能力回答',
                        '25' => '不在家',
                        '26' => '其它',
                    ];
                    $item['finish_code'] = $item['answer_status'];
                    $item['finish_status_text'] = $item['answer_status'] . '.' . ($map[$item['answer_status']] ?? '未知');
                } else {
                    $map = [
                        '11' => '完成',
                        '12' => '拒绝',
                        '13' => '无能力回答',
                        '14' => '不在家',
                        '15' => '无符合条件调查对象',
                        '16' => '无人居住/空房/已无此家庭/不是家庭',
                        '17' => '其它',
                    ];
                    $item['finish_code'] = $item['family_status'];
                    $item['finish_status_text'] = $item['family_status'] . '.' . ($map[$item['family_status']] ?? '未知');
                }
            } else {
                $item['finish_code'] = 0;
                $item['finish_status_text'] = '未开始';
            }
            $answer_info = json_decode($item['answer_info'], true);

            $code = $item['code'] . '';
            $item['jcd'] = substr($code, 0, 6);
            $item['xz'] = substr($code, 6, 2);
            $item['cun'] = substr($code, 8, 2);
            $item['jth'] = substr($code, 10, 4);

            $item['answer_id'] = $answer_info['id'] ?? '';
            $item['answer_realname'] = $answer_info['name'] ?? '';

            $reason = str_replace("\n", ' ', $item['finish_reason']);
            $item['finish_reason_text'] = $item['finish_status'] == 3 ? ('因其他原因,面对面调查(具体请注明:' . $reason . '') : '';

            $export_data = json_decode($item['export_data'], true) ?: [];
            foreach ($export_data as $export_vo) {
                list($key, $val) = $export_vo;
                $item[trim($key)] = trim($val);
            }

            // G01:15-69岁家庭成员人数
            $item['G01'] = $item['total_1569'];

            // 答题时长:分钟
            $item['answer_time_min'] = round($item['answer_time'] / 60);

            // local_addr_surveytel
            $local_addr = json_decode($item['local_addr'], true);
            $item['local_addr_surveytel'] = $local_addr['surveyTel'] ?? '';
        });
    }

    protected function _getAnswerColumn($project_id): array
    {
        $data = Db::name('project_survey_result')->where('project_id', $project_id)->order('export_data desc')->value('export_data');
        $data = $data ? json_decode($data, true) : [];

        $keys = [];
        foreach ($data as $vo) {
            $keys[] = $vo[0];
        }
        $keys[] = 'G01';
        return array_map('trim', $keys);
    }
}