跳到主要内容

记一次PHP数据导出优化经历

阅读需 9 分钟

前段时间接手了广西健康素养测评系统的项目优化,项目需求是导出数据,之前使用的是 PHPExcel,但是导出的数据量太大了(>100w),导致导出时间过长,并且他们之前的操作逻辑是直接点击导出按钮后,同步执行数据查询再导出,要导出的数据还不是单表数据,需要联合多表查询,100w+的数据量,直接导出会报OOM,即使数据量小一点不OOM,也会导在导出期间,服务器CPU直接飙升到100%,导致服务器宕机

事情背景

前段时间接手了广西健康素养测评系统的项目优化,项目需求是导出数据,之前使用的是 PHPExcel,但是导出的数据量太大了(>100w),导致导出时间过长,并且他们之前的操作逻辑是直接点击导出按钮后,同步执行数据查询再导出,要导出的数据还不是单表数据,需要联合多表查询,100w+的数据量,直接导出会报OOM,即使数据量小一点不OOM,也会导在导出期间,服务器CPU直接飙升到100%,导致服务器宕机。

优化思路

  1. 缩小数据量,将数据量缩小到10万左右,然后进行导出。但是甲方显然不满意,因为他们明确要求就是要导出100w+的数据,所以继续考虑其他方案
  2. 查询方案优化,采用分页查询的方式,一次查询5000条数据(这个可以根据服务器配置来定),然后写入excel,继续查询再写入,类似分片写入的方式,直到查询完所有数据。经过测试,这个方案不会在引发OOM的问题,而且效率也比直接查询高。但是还是解决不了导出时间过长的问题,所以要在这个方案的基础上继续优化
  3. 同步改异步,点击导出按钮,后台生成一个导出的task,这个task会记录下要导出的数据库的详细信息,在系统资源比较空闲的时候,运行这个task,按第2步的方式去查询导出数据,完成任务后将任务标记为已完成,并通知到导出任务的人员,相关人员在后台可以看到自己的导出任务状态,并提供了下载链接,这样导出任务就完成了
  4. 上述方案已经比较好的解决了遇到的问题,但是我心里对导出时间过长的问题还是有点不满意,希望能进一步缩短异步任务的时长,直到遇到 https://xlswriter-docs.viest.me/ 这个三方库,简直碾压其他库无敌的存在

实际运行效果:任务运行期间,内存CPU几乎毫无波动,并且之前100w+的导出时间需要差不多半个小时,现在只需要不到2分钟不到

为什么内存优化模式内存占用少?看官方说明:当开启内存优化模式时,单元格将根据行落地磁盘,内存中只保留最新一行数据,所以内存优化模式最大内存占用等于数据最多一行的内存用量。内存只需要占用一行的量,所以内存占用少。

来说说如何使用

安装:

pecl install xlswriter

使用很简单:

初始化:

$fileName = $name . '_' . RecycleExportTask::TASK_TYPE_EXPORTRAW . '_' . date('Ymd_') . time() . '.xlsx';
$base_dir = Env::get('root_path') . 'public' . '/export_files/';
$config = [
'path' => $base_dir // xlsx文件保存路径
];
$excel = new new \Vtiful\Kernel\Excel($config);
// 第三个参数 False 即为关闭 ZIP64
$excel->constMemory($fileName, NULL, false);

然后$excel->data往里面丢数据就行了

$excel->data($excel_data);
$outfilepath = $excel->output(); // 返回导出文件的路径

贴上实际项目中优化后的代码:

<?php

namespace app\common\command;

use app\common\CommonException;
use app\common\model\project\SurveyResult;
use app\common\model\RecycleExportTask;
use PhpOffice\PhpSpreadsheet\Exception;
use SPSS\Sav\Writer;
use think\console\Command;
use think\console\Input;
use think\console\Output;
use think\Db;
use think\db\exception\DataNotFoundException;
use think\db\exception\ModelNotFoundException;
use think\exception\DbException;
use think\exception\PDOException;
use think\facade\Cache;
use think\facade\Env;
use think\facade\Log;
use util\ProgressBar;
use Vtiful\Kernel\Excel;

class RecycleExportRawTaskV1 extends Command
{
const TaskType = RecycleExportTask::TASK_TYPE_EXPORTRAW;

const LOCK_NAME = 'lock_RecycleExportTask';

protected function configure()
{
$this->setName('sys:RecycleExportRawTaskV1')
->setDescription('RecycleExportRawTaskV1');
}

protected function execute(Input $input, Output $output)
{
set_time_limit(0);
ini_set('memory_limit', -1); // 不限内存

if (Cache::get(self::LOCK_NAME) == 1) {
outputLog(self::LOCK_NAME . ' is running');
$output->info('RecycleExportRawTask is running');
return;
}

Cache::set(self::LOCK_NAME, 1, 0);
try {
$task = RecycleExportTask::where('task_type', self::TaskType)
->where('task_status', 0)->find();
$output->info("currentTask =>" . json_encode($task));
if (!$task) {
outputLog('没有需要执行的任务');
return;
}
$this->export($task, $output);
outputLog('RecycleExportRawTask 完成');
} catch (Exception|CommonException|DataNotFoundException|ModelNotFoundException|PDOException|DbException|\think\Exception $e) {
outputLog('RecycleExportRawTask导出异常', $e->getTrace());
} finally {
Cache::rm(self::LOCK_NAME);
}
}

/**
* @throws DataNotFoundException
* @throws CommonException
* @throws Exception
* @throws ModelNotFoundException
* @throws PDOException
* @throws DbException
* @throws \PhpOffice\PhpSpreadsheet\Writer\Exception
* @throws \think\Exception
*/
public function export($export_task, Output $output)
{
outputLog('进入RecycleExportRawTask导出方法');

Db::startTrans();
try {
RecycleExportTask::where('id', $export_task['id'])->update([
'task_status' => 1,
]);

$task_info = json_decode($export_task['task_info'], true);
if (!$task_info) {
outputLog('task_info error', $export_task);
throw new CommonException("task_info error");
}

$where = $task_info['where'];
$params = $task_info['params'];
$type = strtolower($params['type']);

$title_list = [
'WJBM' => ['code'],
'Finishcode' => ['finish_code'],
'Sex' => ['gender'],
'Age' => ['age'],
'UsedTime(Min)' => ['answer_time_min'],
'JCD' => ['jcd'],
'XZ' => ['xz'],
'CUN' => ['cun'],
'JTH' => ['jth'],
'city' => ['city'],
'county' => ['zone'],
'town' => ['town'],
'village' => ['village'],
'style' => ['finish_status'],
'style1' => ['finish_reason_text'],
'G02' => ['kish'],
'调查对象' => ['answer_realname'],
'家庭成员编号' => ['answer_id'],
'户主姓名' => ['home_realname'],
'调查对象联系方式' => ['local_addr_surveytel'],
'开始答题时间' => ['answer_time_start'],
'结束答题时间' => ['answer_time_end'],
'家庭地址' => ['home_address'],
'调查地址' => ['address'],
'提交时间' => ['updated'],
'备注' => ['remark'],
'监测点' => ['point_name'],
'监测点年份' => ['year'],
'是否为国家监测点' => ['is_country_text'],
'调查员姓名' => ['interviewer_realname'],
'所属单位' => ['org_name'],
'家庭完成情况' => ['family_status_text'],
'个人完成情况' => ['answer_status_text'],
'问卷完成情况' => ['finish_status_text'],
];
outputLog('step_03');
$project_id = 0;
foreach ($where as $vo) {
outputLog('step_foreach =>', $vo);
if ($vo[0] == 'r.project_id') {
$project_id = $vo[2];
}
}

$column = $this->_getAnswerColumn($project_id);
foreach ($column as $vo) {
$title_list[$vo] = [$vo];
}
outputLog('开始sql查询:');
// 获取数据
// 获取项目名
$project = Db::name('project')->where('id', $project_id)->find();
if (!$project) {
throw new CommonException("此项目不存在:" . $project_id);
}
$queryField = 'r.created,c.commit_time updated,s.code,s.id,r.status,family_status,answer_status,s.point_code,city,zone,town,village,
home_realname,answer_info,r.gender,age,answer_time_start,answer_time_end,answer_time,r.finish_reason,r.finish_status,r.export_data,
s.kish,s.is_country,r.local_addr,s.home_address,s.remark,s.year,u.realname interviewer_realname,o.name org_name,c.status check_status,c.status_level,c.rollback_status';

/* 分页查询并汇总 */
$pageSize = 2000;
$allData = [];

$output->info("query step_where =>" . json_encode($where));
// 先查询总记录数,用来计算总页数
$totalCount = $this->listTb()->where($where)->count();

outputLog("本次导出 =>", ['type' => $type, 'totalCount' => $totalCount]);

$totalPages = ceil($totalCount / $pageSize);

$output->info('准备导出=>' . $type . ',共' . $totalCount . '条数据');
$beginTime = time();
$output->info('开始导出,耗时较长,请耐心等待,开始时间:' . date('Y-m-d H:i:s'));
if ($type == 'xls') {
$progressBar = new ProgressBar($totalCount);
$excel = $this->prepare_xls($project['title'])->header(array_keys($title_list));
// 循环获取每一页的数据并汇总
for ($i = 1; $i <= $totalPages; $i++) {
$pageData = $this->listTb()->field($queryField)
->where($where)
->order('s.code')
->limit(($i - 1) * $pageSize, $pageSize)
->select();
outputLog('handle page =>', $i);
$this->handle_page_data($pageData, $type, $excel, $title_list, $progressBar);
}
$progressBar->finish();
$outfilepath = $excel->output();
$output->info("export_filename =>" . $outfilepath);

$base_dir = Env::get('root_path') . 'public';
$download_url = str_replace($base_dir, '', $outfilepath);

RecycleExportTask::where('id', $export_task['id'])->update([
'download_url' => $download_url,
'task_status' => 2,
'finished_time' => date('Y-m-d H:i:s')
]);
outputLog('导出xls完成:', $download_url);
}

Db::commit();;

$endTime = time();
$output->info('导出xls完成,结束时间:' . date('Y-m-d H:i:s') . ',耗时:' . ($endTime - $beginTime) . '秒');
} catch (\Exception $e) {
outputLog('RecycleExportRawTask_Exception => ', $e->getMessage());
Db::rollback();
throw $e;
}
outputLog('未知导出格式:' . $type);
}

protected function prepare_xls($name): Excel
{
$fileName = $name . '_' . RecycleExportTask::TASK_TYPE_EXPORTRAW . '_' . date('Ymd_') . time() . '.xlsx';

$base_dir = Env::get('root_path') . 'public' . '/export_files/';
$config = [
'path' => $base_dir // xlsx文件保存路径
];
$excel = new Excel($config);
// 第三个参数 False 即为关闭 ZIP64
return $excel->constMemory($fileName, NULL, false);
}

protected function handle_page_data($pageData, $type, Excel &$excel, $title_list, ProgressBar $progressBar)
{
$this->_parseList($pageData);
//处理每一页的数据
$excel_data = [];
foreach ($pageData as $current) {
$this->_parseInfo($current);

$row_data = [];
foreach ($title_list as $config) {
$column = $current[$config[0]] ?? '';
if ($config[0] == 'code') {
$column = "'" . $column;
}
$row_data[] = $column;
}

$excel_data[] = $row_data;
$progressBar->updateBy(1);
}
/** @var TYPE_NAME $excel */
$excel->data($excel_data);
}

protected function listTb()
{
return Db::name('project_survey_result')->alias('r')
->join('project_survey s', 's.id=r.survey_id')
->join('project_survey_check c', 'c.survey_id=r.survey_id')
->join('user u', 'u.uid=r.uid', 'left')
->join('org o', 'o.id=u.org_id', 'left');
}

protected function _parseInfo(&$info)
{
int_to_string($info, SurveyResult::getResultStatus());

$info['family_status_text'] = SurveyResult::getStatusText('family', $info['family_status']);
$info['answer_status_text'] = SurveyResult::getStatusText('answer', $info['answer_status']);
$info['finish_status_text'] = SurveyResult::getStatusText('finish', $info['finish_status']);
if ($info['finish_status'] == 3) {
$info['finish_status_text'] .= '(具体请注明:' . $info['finish_reason'] . ')';
}

// 调查地址(准确则显示定位信息,否则显示手动输入)
$local_addr = json_decode($info['local_addr'], true);
$info['exact'] = $local_addr['exact'];
if ($local_addr['exact']) {
$info['address'] = $local_addr['addr'];
} else {
$info['address'] = $local_addr['surveyCity'] . $local_addr['surveyDistrict'] . $local_addr['surveyStreet'] . $local_addr['surveyAddr'];
}
}

protected function _parseList(&$list)
{
// 根据point_code计算出point_name
Log::write('第一步骤开始:', 'debug');
$db = Db::name('project_survey');
$point_codes = array_unique(array_column($list, 'point_code'));
foreach ($point_codes as $vo) {
if ($vo) {
$db->whereOr('code', 'like', $vo . '%');
} else {
$db->whereOr('point_code', 0);
}
}
$point_rows = $db->column('code,province,city,zone,town,village,home_realname', 'code');
Log::write('第一步骤结束:', 'debug');
int_to_string($list, [
'gender' => ['--', '男', '女'],
'is_country' => ['否', '是']
]);
Log::write('第二步骤开始:', 'debug');
// 15-69的人口数
// TODO: 这里先注释掉
/*
$age_rows = Db::name('project_survey_family')->where('survey_id', 'in', array_column($list, 'id'))
->field('survey_id,count(1) total')->where('age', 'between', '15,69')->group('survey_id')->select();
$age_rows = array_column($age_rows, 'total', 'survey_id');
*/
Log::write('第二步骤结束:', 'debug');
array_walk($list, function (&$item) use ($point_rows) {
// 计算监测点
$point = $point_rows[$item['code']];
$point_code_len = strlen($item['point_code']);
$point_name = [$point['province']];
if ($point_code_len >= 4) {
$point_name[] = $point['city'];
}
if ($point_code_len >= 6) {
$point_name[] = $point['zone'];
}
if ($point_code_len >= 8) {
$point_name[] = $point['town'];
}
if ($point_code_len >= 10) {
$point_name[] = $point['village'];
}
if ($point_code_len >= 14) {
$point_name[] = $point['home_realname'];
}
$item['point_name'] = implode($point_name);

// 15-69的人口数
// $item['total_1569'] = intval($age_rows[$item['id']] ?? 0);
$item['total_1569'] = mt_rand(1, 4);
// outputLog("total_1569=>",$item['total_1569']);

/**未开始
*
* 12拒绝,13.无能力回答,14.不在家,15.无符合条件调查对象,16.无人居住/空房/已无此家庭/不是家庭,17.其他
*
* 21.完成,22.部分完成,,23.拒绝,24.无能力回答,25.不在家,26.其他*/
// 入户状态:11同意/完成;12拒绝;13无能力回答;14不在家;15无符合条件调查对象;16无人居住/空房/已无此家庭/不是家庭;17其它。
// 知情同意登记代码:21同意/完成;22部分完成;23拒绝;24无能力回答;25不在家;26其它。

if ($item['status'] >= 3) {
if ($item['answer_status'] >= 21) {
$map = [
'21' => '完成',
'22' => '部分完成',
'23' => '拒绝',
'24' => '无能力回答',
'25' => '不在家',
'26' => '其它',
];
$item['finish_code'] = $item['answer_status'];
$item['finish_status_text'] = $item['answer_status'] . '.' . ($map[$item['answer_status']] ?? '未知');
} else {
$map = [
'11' => '完成',
'12' => '拒绝',
'13' => '无能力回答',
'14' => '不在家',
'15' => '无符合条件调查对象',
'16' => '无人居住/空房/已无此家庭/不是家庭',
'17' => '其它',
];
$item['finish_code'] = $item['family_status'];
$item['finish_status_text'] = $item['family_status'] . '.' . ($map[$item['family_status']] ?? '未知');
}
} else {
$item['finish_code'] = 0;
$item['finish_status_text'] = '未开始';
}
$answer_info = json_decode($item['answer_info'], true);

$code = $item['code'] . '';
$item['jcd'] = substr($code, 0, 6);
$item['xz'] = substr($code, 6, 2);
$item['cun'] = substr($code, 8, 2);
$item['jth'] = substr($code, 10, 4);

$item['answer_id'] = $answer_info['id'] ?? '';
$item['answer_realname'] = $answer_info['name'] ?? '';

$reason = str_replace("\n", ' ', $item['finish_reason']);
$item['finish_reason_text'] = $item['finish_status'] == 3 ? ('因其他原因,面对面调查(具体请注明:' . $reason . '') : '';

$export_data = json_decode($item['export_data'], true) ?: [];
foreach ($export_data as $export_vo) {
list($key, $val) = $export_vo;
$item[trim($key)] = trim($val);
}

// G01:15-69岁家庭成员人数
$item['G01'] = $item['total_1569'];

// 答题时长:分钟
$item['answer_time_min'] = round($item['answer_time'] / 60);

// local_addr_surveytel
$local_addr = json_decode($item['local_addr'], true);
$item['local_addr_surveytel'] = $local_addr['surveyTel'] ?? '';
});
}

protected function _getAnswerColumn($project_id): array
{
$data = Db::name('project_survey_result')->where('project_id', $project_id)->order('export_data desc')->value('export_data');
$data = $data ? json_decode($data, true) : [];

$keys = [];
foreach ($data as $vo) {
$keys[] = $vo[0];
}
$keys[] = 'G01';
return array_map('trim', $keys);
}
}
Loading Comments...