本系统为"同推诉源立案系统"的第一期工程,专门针对法院立案环节中大量存在的纸质文档数字化问题而设计。通过整合先进的OCR技术和AI图像处理算法,实现对扫描文档、照片文档的高精度识别和智能化矫正。
- 支持PDF、JPG、PNG等多种格式
- 自动纠正倾斜、畸变等问题
- 多引擎融合提升准确率
- 自动识别关键字段信息
-- Document source master table (MySQL / InnoDB, utf8mb4).
-- Referenced by the foreign keys of `ey_ocr_result` and `ey_image_preprocess`.
-- NOTE(review): the unique key below references `md5_hash`, which is not among
-- the columns shown here; presumably it is one of the elided columns. Confirm
-- before running this statement as-is, otherwise it fails on the missing column.
CREATE TABLE `ey_document_source` (
`id` int(11) unsigned NOT NULL AUTO_INCREMENT COMMENT '自增主键',
`user_id` int(11) unsigned NOT NULL DEFAULT '0' COMMENT '上传用户ID',
`batch_no` varchar(32) NOT NULL DEFAULT '' COMMENT '批次号,同一批上传的文档相同',
-- more columns elided...
PRIMARY KEY (`id`),
-- At most one row per `md5_hash` value.
UNIQUE KEY `uk_md5_hash` (`md5_hash`) USING BTREE,
-- Composite index on (user_id, batch_no); also usable for lookups by user_id alone.
KEY `idx_user_batch` (`user_id`,`batch_no`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
COMMENT='文档来源主表';
-- OCR recognition result detail table (MySQL / InnoDB, utf8mb4).
-- Rows are unique per (document_id, page_seq, block_seq).
-- NOTE(review): `block_seq` is used by the unique key but is not among the
-- columns shown here; presumably it is one of the elided columns. Confirm.
CREATE TABLE `ey_ocr_result` (
`id` bigint(20) unsigned NOT NULL AUTO_INCREMENT COMMENT '自增主键',
`document_id` int(11) unsigned NOT NULL DEFAULT '0' COMMENT '文档ID',
`page_seq` smallint(5) unsigned NOT NULL DEFAULT '0' COMMENT '页码序列号(从0开始)',
-- more columns elided...
PRIMARY KEY (`id`),
-- `document_id` is the leftmost column, so this key also backs the foreign key below.
UNIQUE KEY `uk_doc_page_block` (`document_id`,`page_seq`,`block_seq`),
-- Deleting a row in `ey_document_source` deletes its OCR rows; changes to its id are propagated.
CONSTRAINT `fk_ocr_document` FOREIGN KEY (`document_id`) REFERENCES `ey_document_source` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
COMMENT='OCR识别结果详情表';
-- Image preprocessing step record table (MySQL / InnoDB, utf8mb4).
-- Each row belongs to one document in `ey_document_source` via `document_id`.
CREATE TABLE `ey_image_preprocess` (
`id` bigint(20) unsigned NOT NULL AUTO_INCREMENT COMMENT '自增主键',
`document_id` int(11) unsigned NOT NULL DEFAULT '0' COMMENT '文档ID',
`step_sequence` tinyint(3) unsigned NOT NULL DEFAULT '0' COMMENT '步骤顺序号',
-- more columns elided...
PRIMARY KEY (`id`),
-- Non-unique index on (document_id, step_sequence); it also backs the foreign key below.
KEY `idx_document_step` (`document_id`,`step_sequence`) USING BTREE,
-- Deleting a row in `ey_document_source` deletes its preprocessing rows; changes to its id are propagated.
CONSTRAINT `fk_preprocess_document` FOREIGN KEY (`document_id`) REFERENCES `ey_document_source` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
COMMENT='图像预处理步骤记录表';
/**
 * Contract implemented by document input sources (ScannerAdapter is the
 * implementation visible in this file).
 *
 * Every implementing class must define all three methods, otherwise PHP
 * refuses to declare the class.
 */
interface DocumentSourceInterface {
/**
 * Checks whether the source can be used.
 *
 * @return bool
 */
public function validate(): bool;
/**
 * Runs the source-specific preprocessing step.
 *
 * @return string NOTE(review): presumably a path or identifier of the
 *                preprocessed output; not shown here, confirm against an
 *                implementation.
 */
public function preprocess(): string;
/**
 * Extracts metadata describing the document.
 *
 * @return array NOTE(review): key schema is not shown here; confirm.
 */
public function extractMetadata(): array;
}
/**
 * Document source backed by a scanner.
 *
 * The interface requires validate(), preprocess() and extractMetadata().
 * Only validate() has a real implementation here; the other two are explicit
 * placeholders so that the class can be declared at all (a class that omits
 * interface methods is a fatal error in PHP).
 */
class ScannerAdapter implements DocumentSourceInterface {
    /** @var array Adapter configuration as passed to the constructor. */
    private $config;

    /**
     * @param array $config Adapter configuration; stored as-is.
     */
    public function __construct(array $config) {
        $this->config = $config;
    }

    /**
     * Verifies that the scanner can be reached.
     *
     * @return bool Always true when the check passes.
     * @throws \RuntimeException When the `twain` PHP extension is not loaded.
     */
    public function validate(): bool {
        // Check the scanner connection state: the TWAIN extension must be loaded.
        if (!extension_loaded('twain')) {
            throw new \RuntimeException("TWAIN扩展未加载");
        }
        return true;
    }

    /**
     * Placeholder required by DocumentSourceInterface.
     *
     * NOTE(review): the real preprocessing logic is not shown; replace this stub.
     *
     * @return string
     * @throws \LogicException Always, until implemented.
     */
    public function preprocess(): string {
        throw new \LogicException(__METHOD__ . ' is not implemented');
    }

    /**
     * Placeholder required by DocumentSourceInterface.
     *
     * NOTE(review): the real metadata extraction is not shown; replace this stub.
     *
     * @return array
     * @throws \LogicException Always, until implemented.
     */
    public function extractMetadata(): array {
        throw new \LogicException(__METHOD__ . ' is not implemented');
    }
}
/**
 * Finds the outline of a document inside an image so that geometric
 * distortion can be corrected afterwards.
 */
class GeometricDistortionCorrector {
    /**
     * @var object|null Not used by the code visible here.
     *                  NOTE(review): presumably the perspective transformer; confirm.
     */
    private $transformer;

    /**
     * @var object|null Edge-detection backend; must provide findContours(),
     *                  arcLength() and approxPolyDP() as called below.
     */
    private $edgeDetector;

    /**
     * Both collaborators are optional so that existing `new GeometricDistortionCorrector()`
     * calls keep working.
     *
     * @param object|null $edgeDetector Edge-detection backend.
     * @param object|null $transformer  Stored only; see the property note.
     */
    public function __construct($edgeDetector = null, $transformer = null) {
        $this->edgeDetector = $edgeDetector;
        $this->transformer = $transformer;
    }

    /**
     * Detects the document boundary in the given image.
     *
     * @param string $imagePath Image path handed to the edge detector.
     * @return array The first four-vertex polygon found, in whatever vertex
     *               format the edge detector returns, or an empty array when
     *               no quadrilateral is detected.
     * @throws \LogicException When no edge detector has been injected.
     */
    public function detectDocumentBoundary(string $imagePath): array {
        if ($this->edgeDetector === null) {
            throw new \LogicException('GeometricDistortionCorrector requires an edge detector');
        }

        // 1. Edge detection
        $contours = $this->edgeDetector->findContours($imagePath);

        // 2. Polygon approximation; the tolerance is 2% of the contour perimeter.
        $approxPolygons = [];
        foreach ($contours as $contour) {
            $epsilon = 0.02 * $this->edgeDetector->arcLength($contour, true);
            $approx = $this->edgeDetector->approxPolyDP($contour, $epsilon, true);
            // Only four-vertex polygons are kept as boundary candidates.
            if (count($approx) === 4) {
                $approxPolygons[] = $approx;
            }
        }

        // 3. Pick the boundary. The original returned an undefined variable, which
        //    violated the array return type.
        // NOTE(review): the intended ranking (e.g. largest area) is not shown; the
        // first candidate is used until that is confirmed.
        $bestBoundary = $approxPolygons[0] ?? [];

        return $bestBoundary;
    }
}
/**
 * Runs OCR with a primary engine and falls back to the other registered
 * engines when the primary one fails or produces a low-quality result.
 *
 * NOTE(review): routeByContentType() and validateRecognitionQuality() are
 * called below but are not defined in the code shown here; confirm they exist.
 */
class OCREngineManager {
    /** @var array Registered engines, keyed by the name that getName() reports. */
    private $engines = [];

    /**
     * Recognizes a document image, trying backup engines when needed.
     *
     * @param string $imagePath Image handed to each engine's recognize().
     * @return array The first recognition result that passes the quality check.
     * @throws \RuntimeException When no engine yields an acceptable result.
     */
    public function processWithFallback(string $imagePath): array {
        $primaryEngine = $this->routeByContentType($imagePath);

        try {
            $primaryResult = $primaryEngine->recognize($imagePath);
            // Quality gate: only an acceptable result is returned directly.
            if ($this->validateRecognitionQuality($primaryResult)) {
                return $primaryResult;
            }
        } catch (\Exception $e) {
            // Log and fall through to the backup engines.
            Log::error("Primary OCR engine failed: " . $e->getMessage());
        }

        // Try every other registered engine in registration order.
        foreach ($this->engines as $backupName => $backupEngine) {
            if ($backupName === $primaryEngine->getName()) {
                // The primary engine has already been tried above.
                continue;
            }
            try {
                $backupResult = $backupEngine->recognize($imagePath);
                if ($this->validateRecognitionQuality($backupResult)) {
                    return $backupResult;
                }
            } catch (\Exception $e) {
                // Previously swallowed silently; log it like the primary failure so
                // that a total failure can be diagnosed, then try the next engine.
                Log::error("Backup OCR engine [" . $backupName . "] failed: " . $e->getMessage());
            }
        }

        throw new \RuntimeException("所有OCR引擎均无法处理该文档");
    }
}
/**
 * Thin wrapper around a Dropzone instance used for document uploads.
 *
 * NOTE(review): onUploadSuccess() and onUploadError() are invoked by the
 * Dropzone callbacks but are not defined in the code shown here; confirm they
 * exist, otherwise the callbacks throw a TypeError.
 */
class DocumentUploader {
    /**
     * @param {Object} [options] Overrides for the defaults below. Previously
     *   this argument was accepted but ignored.
     * @param {string} [options.container] Selector passed to Dropzone.
     * @param {number} [options.maxFiles] Maximum number of files.
     * @param {number} [options.maxFileSize] Maximum file size in bytes.
     * @param {string} [options.acceptedFiles] Comma-separated accepted extensions.
     * @param {boolean} [options.autoQueue] Forwarded to Dropzone.
     * @param {number} [options.parallelUploads] Forwarded to Dropzone.
     */
    constructor(options) {
        const defaults = {
            container: '#uploadContainer',
            maxFiles: 10,
            maxFileSize: 52428800, // 50MB
            acceptedFiles: '.pdf,.jpg,.jpeg,.png,.tiff,.tif',
            autoQueue: true,
            parallelUploads: 3
        };
        // Caller-supplied values win over the defaults; `undefined` is tolerated.
        this.options = Object.assign({}, defaults, options);
        this.init();
    }

    /**
     * Creates the Dropzone instance and wires the success/error callbacks.
     */
    init() {
        const opts = this.options;
        this.dropzone = new Dropzone(opts.container, {
            url: '/plugins/document_ai/Recognition/upload',
            maxFiles: opts.maxFiles,
            // Dropzone expects `maxFilesize` in MiB, while this class stores bytes.
            maxFilesize: opts.maxFileSize / (1024 * 1024),
            acceptedFiles: opts.acceptedFiles,
            autoQueue: opts.autoQueue,
            parallelUploads: opts.parallelUploads,
            success: (file, response) => {
                this.onUploadSuccess(file, response);
            },
            error: (file, errorMsg) => {
                this.onUploadError(file, errorMsg);
            }
        });
    }
}
# Verify the PHP version
php --version # >= 7.4.0
# Required extensions
php --modules | grep gd # GD image processing
php --modules | grep imagick # ImageMagick support
; Recommended PHP configuration (php.ini).
; INI comments must start with ";": "#" is not a comment marker on PHP 7+.
; post_max_size is kept above upload_max_filesize so that a 50M file plus the
; rest of the request body still fits.
memory_limit = 512M
upload_max_filesize = 50M
post_max_size = 55M
max_execution_time = 300
#!/bin/bash
# install_document_ai.sh
#
# Installs the document_ai plugin: creates the plugin directory layout, copies
# the sources, sets ownership/permissions and imports the database schema.
# Run it from the directory that contains ./plugins and ./database.

# Stop at the first failing step so that the success message is only printed
# when every step actually succeeded.
set -euo pipefail

echo "正在安装纸张文档识别和AI矫正系统..."

# 1. Create the plugin directory layout
PLUGIN_DIR="./plugins/document_ai"
mkdir -p "$PLUGIN_DIR"/{controller,service,util,vendor,view,static}

# 2. Copy the plugin files
cp -r ../source/* "$PLUGIN_DIR"/

# 3. Set directory permissions and ownership
chmod -R 755 "$PLUGIN_DIR"/
chown -R www-data:www-data "$PLUGIN_DIR"/

# 4. Import the database schema (-p prompts for the MySQL root password)
mysql -u root -p your_database < ./database/schema.sql

echo "安装完成!"
| 风险项 | 可能性 | 影响程度 | 缓解措施 |
|---|---|---|---|
| OCR识别准确率不足 | 中 | 高 | 采用多引擎融合,结合人工校对 |
| 大文件处理内存溢出 | 中 | 中 | 分块处理,流式传输 |
| 网络延迟导致API超时 | 高 | 中 | 本地缓存,异步重试 |