WX
2024-06-01 e1cd06bcd9860c1a8172f5779bc6bc52a5fd8e84
pdf ocr

工具库
PDF TO IMAGE: Ghostscript.NET
OCR: PaddleOCRSharp

注意:
1. 项目需要设置为x64
2. Ghostscript.NET 依赖本机 Ghostscript 运行库,需要先安装 Ghostscript(https://ghostscript.com/releases/gsdnld.html),下载 Ghostscript 10.03.1 for Windows (64 bit) -> Ghostscript AGPL Release
3. PDF路径尽量避免中文,可能会有问题

使用示例:
var pdfPath = @"D:\Document\Desktop\1.pdf";
var txt = OcrHelper.Pdf2Txt(pdfPath);
5个文件已修改
3个文件已添加
165 ■■■■■ 已修改文件
CommonHelper/CommonHelper.csproj 21 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
CommonHelper/File/OcrHelper.cs 83 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
CommonHelper/File/PdfHelper.cs 50 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
CommonHelper/packages.config 4 ●●● 补丁 | 查看 | 原始文档 | blame | 历史
GasolineBlend/Controllers/RRController.cs 1 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
GasolineBlend/Depends/gsdll64.dll 补丁 | 查看 | 原始文档 | blame | 历史
GasolineBlend/GasolineBlend.csproj 3 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
GasolineBlend/Global.asax.cs 3 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
CommonHelper/CommonHelper.csproj
@@ -11,6 +11,8 @@
    <AssemblyName>CommonHelper</AssemblyName>
    <TargetFrameworkVersion>v4.6.1</TargetFrameworkVersion>
    <FileAlignment>512</FileAlignment>
    <NuGetPackageImportStamp>
    </NuGetPackageImportStamp>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
    <DebugSymbols>true</DebugSymbols>
@@ -21,7 +23,7 @@
    <ErrorReport>prompt</ErrorReport>
    <WarningLevel>4</WarningLevel>
    <AllowUnsafeBlocks>false</AllowUnsafeBlocks>
    <PlatformTarget>AnyCPU</PlatformTarget>
    <PlatformTarget>x64</PlatformTarget>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
    <DebugType>pdbonly</DebugType>
@@ -63,6 +65,9 @@
    <Reference Include="CSRedisCore, Version=3.0.52.0, Culture=neutral, processorArchitecture=MSIL">
      <HintPath>..\packages\CSRedisCore.3.0.52\lib\netstandard2.0\CSRedisCore.dll</HintPath>
    </Reference>
    <Reference Include="Ghostscript.NET, Version=1.2.3.0, Culture=neutral, PublicKeyToken=f85051de34525b59, processorArchitecture=MSIL">
      <HintPath>..\packages\Ghostscript.NET.1.2.3.1\lib\net40\Ghostscript.NET.dll</HintPath>
    </Reference>
    <Reference Include="ICSharpCode.SharpZipLib">
      <HintPath>..\packages\SharpZipLib.0.86.0\lib\11\ICSharpCode.SharpZipLib.dll</HintPath>
    </Reference>
@@ -74,7 +79,7 @@
      <EmbedInteropTypes>False</EmbedInteropTypes>
    </Reference>
    <Reference Include="Newtonsoft.Json, Version=13.0.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
      <HintPath>..\packages\Newtonsoft.Json.13.0.2\lib\net45\Newtonsoft.Json.dll</HintPath>
      <HintPath>..\packages\Newtonsoft.Json.13.0.3\lib\net45\Newtonsoft.Json.dll</HintPath>
    </Reference>
    <Reference Include="NLog, Version=4.0.0.0, Culture=neutral, PublicKeyToken=5120e14c03d0593c, processorArchitecture=MSIL">
      <HintPath>..\packages\NLog.4.5.3\lib\net45\NLog.dll</HintPath>
@@ -90,6 +95,9 @@
    </Reference>
    <Reference Include="NPOI.OpenXmlFormats, Version=2.3.0.0, Culture=neutral, PublicKeyToken=0df73ec7942b34e1, processorArchitecture=MSIL">
      <HintPath>..\packages\NPOI.2.3.0\lib\net40\NPOI.OpenXmlFormats.dll</HintPath>
    </Reference>
    <Reference Include="PaddleOCRSharp, Version=4.3.0.0, Culture=neutral, processorArchitecture=AMD64">
      <HintPath>..\packages\PaddleOCRSharp.4.3.0\lib\net461\PaddleOCRSharp.dll</HintPath>
    </Reference>
    <Reference Include="SafeObjectPool, Version=2.0.1.0, Culture=neutral, processorArchitecture=MSIL">
      <HintPath>..\packages\SafeObjectPool.2.0.1\lib\netstandard2.0\SafeObjectPool.dll</HintPath>
@@ -157,6 +165,8 @@
    <Compile Include="File\Configs.cs" />
    <Compile Include="File\DownLoadHelper.cs" />
    <Compile Include="File\FileUtil.cs" />
    <Compile Include="File\OcrHelper.cs" />
    <Compile Include="File\PdfHelper.cs" />
    <Compile Include="Format\AutoMapperHelper.cs" />
    <Compile Include="Format\ConvertHelper.cs" />
    <Compile Include="Format\EnumDescriptionJsonConvert.cs" />
@@ -192,4 +202,11 @@
  </ItemGroup>
  <ItemGroup />
  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
  <Import Project="..\packages\PaddleOCRSharp.4.3.0\build\PaddleOCRSharp.targets" Condition="Exists('..\packages\PaddleOCRSharp.4.3.0\build\PaddleOCRSharp.targets')" />
  <Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
    <PropertyGroup>
      <ErrorText>这台计算机上缺少此项目引用的 NuGet 程序包。使用“NuGet 程序包还原”可下载这些程序包。有关更多信息,请参见 http://go.microsoft.com/fwlink/?LinkID=322105。缺少的文件是 {0}。</ErrorText>
    </PropertyGroup>
    <Error Condition="!Exists('..\packages\PaddleOCRSharp.4.3.0\build\PaddleOCRSharp.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\PaddleOCRSharp.4.3.0\build\PaddleOCRSharp.targets'))" />
  </Target>
</Project>
CommonHelper/File/OcrHelper.cs
New file
@@ -0,0 +1,83 @@
using System;
using System.IO;
using System.Text;
using PaddleOCRSharp;
namespace CommonHelper
{
    public static class OcrHelper
    {
        private static PaddleOCREngine engine;
        /// <summary>
        /// 初始化
        /// </summary>
        public static void Init()
        {
            //自带轻量版中英文模型V4模型
            OCRModelConfig config = null;
#if DEBUG
            config = new OCRModelConfig();
            string str1 = EngineBase.GetRootDirectory().TrimEnd('\\');
            config = new OCRModelConfig();
            string str2 = str1 + "\\bin\\inference";
            config.det_infer = str2 + "\\ch_PP-OCRv4_det_infer";
            config.cls_infer = str2 + "\\ch_ppocr_mobile_v2.0_cls_infer";
            config.rec_infer = str2 + "\\ch_PP-OCRv4_rec_infer";
            config.keys = str2 + "\\ppocr_keys.txt";
#endif
            //OCR参数
            OCRParameter oCRParameter = new OCRParameter();
            oCRParameter.cpu_math_library_num_threads = 10;//预测并发线程数
            oCRParameter.enable_mkldnn = true;
            oCRParameter.cls = false; //是否执行文字方向分类;默认false
            oCRParameter.det = true;//是否开启文本框检测,用于检测文本块
            oCRParameter.use_angle_cls = false;//是否开启方向检测,用于检测识别180旋转
            oCRParameter.det_db_score_mode = true;//是否使用多段线,即文字区域是用多段线还是用矩形,
            oCRParameter.max_side_len = 960;
            oCRParameter.rec_img_h = 48;
            oCRParameter.rec_img_w = 320;
            oCRParameter.det_db_thresh = 0.3f;
            oCRParameter.det_db_box_thresh = 0.618f;
            //初始化OCR引擎
            engine = new PaddleOCREngine(config, oCRParameter);
        }
        /// <summary>
        /// 图片ocr
        /// </summary>
        /// <param name="path">图片地址</param>
        /// <returns></returns>
        public static string Img2Txt(string path)
        {
            var imgByte = File.ReadAllBytes(path);
            var ocrRes = engine.DetectText(imgByte);
            return ocrRes.Text;
        }
        /// <summary>
        /// pdf ocr
        /// </summary>
        /// <param name="path">pdf文件地址</param>
        /// <returns></returns>
        public static string Pdf2Txt(string path)
        {
            var sb = new StringBuilder();
            foreach (var imgPath in PdfHelper.Pdf2Imgs(path))
            {
                var txt = Img2Txt(imgPath);
                sb.Append(txt);
                File.Delete(imgPath);
            }
            return sb.ToString();
        }
    }
}
CommonHelper/File/PdfHelper.cs
New file
@@ -0,0 +1,50 @@
using Ghostscript.NET.Rasterizer;
using Ghostscript.NET;
using System.Collections.Generic;
using System.Drawing.Imaging;
using System.IO;
using System;
using System.Runtime.Remoting.Channels;
using System.Web;
namespace CommonHelper
{
    public static class PdfHelper
    {
        /// <summary>
        /// pdf转图片
        /// </summary>
        /// <param name="inputPdfPath">pdf路径,尽量避免中文</param>
        /// <returns></returns>
        public static IEnumerable<string> Pdf2Imgs(string inputPdfPath)
        {
            int desired_dpi = 96;
            var pdfName = Path.GetFileNameWithoutExtension(inputPdfPath);
            string outputPath = HttpContext.Current.Server.MapPath("~/CachePdfImages");
            Directory.CreateDirectory(outputPath);
            var dllPath = HttpContext.Current.Server.MapPath("~/Depends/gsdll64.dll");
#if DEBUG
            dllPath = HttpContext.Current.Server.MapPath("~/bin/Depends/gsdll64.dll");
#endif
            GhostscriptVersionInfo gvi = new GhostscriptVersionInfo(dllPath);
            using (var rasterizer = new GhostscriptRasterizer())
            {
                rasterizer.Open(inputPdfPath, gvi, false);
                for (var pageNumber = 1; pageNumber <= rasterizer.PageCount; pageNumber++)
                {
                    var pageFilePath = Path.Combine(outputPath, string.Format("p{0}_{1}_{2}.png", pageNumber,pdfName,DateTime.Now.ToString("HHmmssfff")));
                    var img = rasterizer.GetPage(desired_dpi, pageNumber);
                    img.Save(pageFilePath, ImageFormat.Png);
                    yield return pageFilePath;
                }
            }
        }
    }
}
CommonHelper/packages.config
@@ -2,11 +2,13 @@
<packages>
  <package id="AutoMapper" version="9.0.0" targetFramework="net461" />
  <package id="CSRedisCore" version="3.0.52" targetFramework="net461" />
  <package id="Ghostscript.NET" version="1.2.3.1" targetFramework="net461" />
  <package id="Microsoft.CSharp" version="4.5.0" targetFramework="net461" />
  <package id="Microsoft.Office.Interop.Word" version="15.0.4797.1003" targetFramework="net461" />
  <package id="Newtonsoft.Json" version="13.0.2" targetFramework="net461" />
  <package id="Newtonsoft.Json" version="13.0.3" targetFramework="net461" />
  <package id="NLog" version="4.5.3" targetFramework="net461" />
  <package id="NPOI" version="2.3.0" targetFramework="net461" />
  <package id="PaddleOCRSharp" version="4.3.0" targetFramework="net461" />
  <package id="Portable.BouncyCastle" version="1.8.6" targetFramework="net461" />
  <package id="SafeObjectPool" version="2.0.1" targetFramework="net461" />
  <package id="ServiceStack.Common" version="5.11.0" targetFramework="net461" />
GasolineBlend/Controllers/RRController.cs
@@ -1,6 +1,7 @@
using System.Linq;
using System.Threading.Tasks;
using System.Web.Mvc;
using CommonHelper;
using RiskControl.NewService.Service;
namespace GasolineBlend.Controllers
GasolineBlend/Depends/gsdll64.dll
Binary files differ
GasolineBlend/GasolineBlend.csproj
@@ -403,6 +403,9 @@
    <Compile Include="Controllers\RecoverPlanController.cs" />
  </ItemGroup>
  <ItemGroup>
    <Content Include="Depends\gsdll64.dll">
      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
    </Content>
    <Content Include="favicon.ico" />
    <Content Include="Global.asax" />
    <Content Include="ApplicationInsights.config">
GasolineBlend/Global.asax.cs
@@ -5,6 +5,7 @@
using System.Web.Mvc;
using System.Web.Optimization;
using System.Web.Routing;
using CommonHelper;
namespace GasolineBlend
{
@@ -15,6 +16,8 @@
            AreaRegistration.RegisterAllAreas();
            FilterConfig.RegisterGlobalFilters(GlobalFilters.Filters);
            RouteConfig.RegisterRoutes(RouteTable.Routes);
            OcrHelper.Init();
        }
    }
}