Skip to content

Commit

Permalink
chore: Change PDF text extraction logics (#10420)
Browse files Browse the repository at this point in the history
* chore: change PDF text extraction logics

* test(snapshot): update snapshots c2df7e0

* test(snapshot): update snapshots a124734

---------

Co-authored-by: filzrev <[email protected]>
Co-authored-by: Yufei Huang <[email protected]>
  • Loading branch information
3 people authored Nov 27, 2024
1 parent 1a6038c commit 8704b50
Show file tree
Hide file tree
Showing 8 changed files with 468 additions and 446 deletions.
168 changes: 84 additions & 84 deletions test/docfx.Snapshot.Tests/SamplesTest.Seed/api/toc.pdf.verified.json

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

160 changes: 80 additions & 80 deletions test/docfx.Snapshot.Tests/SamplesTest.Seed/md/toc.pdf.verified.json

Large diffs are not rendered by default.

254 changes: 127 additions & 127 deletions test/docfx.Snapshot.Tests/SamplesTest.Seed/pdf/toc.pdf.verified.json

Large diffs are not rendered by default.

Large diffs are not rendered by default.

44 changes: 22 additions & 22 deletions test/docfx.Snapshot.Tests/SamplesTest.Seed/toc.pdf.verified.json

Large diffs are not rendered by default.

24 changes: 23 additions & 1 deletion test/docfx.Snapshot.Tests/SamplesTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
using UglyToad.PdfPig;
using UglyToad.PdfPig.Actions;
using UglyToad.PdfPig.Annotations;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
using UglyToad.PdfPig.Outline;

namespace Docfx.Tests;
Expand Down Expand Up @@ -80,7 +82,7 @@ void PdfToJson(string path)
{
p.Number,
p.NumberOfImages,
p.Text,
Text = ExtractText(p),
Links = p.ExperimentalAccess.GetAnnotations().Select(ToLink).ToArray(),
}).ToArray(),
Bookmarks = document.TryGetBookmarks(out var bookmarks) ? ToBookmarks(bookmarks.Roots) : null,
Expand Down Expand Up @@ -207,4 +209,24 @@ private void ScrubFile(string path, StringBuilder builder)
}));
}
}

/// <summary>
/// Extracts the text of a PDF page using <see cref="ContentOrderTextExtractor"/>
/// and normalizes known ligature characters and newline sequences.
/// </summary>
/// <param name="page">The PDF page to extract text from.</param>
/// <returns>
/// The page text with the Latin ligatures U+FB00..U+FB03 expanded to plain
/// letters and every "\r\n" replaced by "\n".
/// </returns>
private string ExtractText(Page page)
{
    // Gets PDF text content.
    var text = ContentOrderTextExtractor.GetText(page, new ContentOrderTextExtractor.Options { ReplaceWhitespaceWithSpace = true });

    StringBuilder sb = new(text);

    // Normalize known ligature chars by hand, because `string.Normalize` does not work
    // when using `Globalization Invariant Mode`.
    // The ligatures are written as Unicode escapes so they cannot be silently dropped by
    // encoding-lossy tooling; an empty search string would make StringBuilder.Replace throw.
    sb.Replace("\uFB00", "ff");  // LATIN SMALL LIGATURE FF
    sb.Replace("\uFB03", "ffi"); // LATIN SMALL LIGATURE FFI
    sb.Replace("\uFB02", "fl");  // LATIN SMALL LIGATURE FL
    sb.Replace("\uFB01", "fi");  // LATIN SMALL LIGATURE FI

    // Normalize newline char.
    sb.Replace("\r\n", "\n");

    return sb.ToString();
}
}

0 comments on commit 8704b50

Please sign in to comment.