diff --git a/CMakeLists.txt b/CMakeLists.txt
index d0067b11..242b0c62 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -191,6 +191,7 @@ set(ODR_SOURCE_FILES
         "src/odr/internal/pdf/pdf_file.cpp"
         "src/odr/internal/pdf/pdf_file_object.cpp"
         "src/odr/internal/pdf/pdf_file_parser.cpp"
+        "src/odr/internal/pdf/pdf_filesystem.cpp"
         "src/odr/internal/pdf/pdf_filter.cpp"
         "src/odr/internal/pdf/pdf_graphics_operator_parser.cpp"
         "src/odr/internal/pdf/pdf_graphics_state.cpp"
diff --git a/src/odr/internal/pdf/pdf_filesystem.cpp b/src/odr/internal/pdf/pdf_filesystem.cpp
new file mode 100644
index 00000000..1d9dac59
--- /dev/null
+++ b/src/odr/internal/pdf/pdf_filesystem.cpp
@@ -0,0 +1,189 @@
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+namespace odr::internal::pdf {
+
+namespace {
+
+// Names an object inside `/objects` and `/streams`: "<id>_<gen>".
+std::string object_name(const ObjectReference &reference) {
+  return std::to_string(reference.id) + "_" + std::to_string(reference.gen);
+}
+
+// Walks a precomputed set of paths. Mirrors the virtual-filesystem walker: the
+// entry map carries only path + kind (file vs directory); content is produced
+// by `open`, so the walk itself never touches the parser.
+//
+// The filtered entry map is immutable after construction and shared between a
+// walker and its clones. A plain member map would be deep-copied by `clone()`
+// while the copied iterator kept pointing into the source walker's map.
+class PdfFileWalker final : public abstract::FileWalker {
+public:
+  using Entries = std::map<AbsPath, bool>; // path -> is_file
+
+  // Keeps the entries selected by `root` and positions the walker on the
+  // first of them (or at the end when nothing was selected).
+  PdfFileWalker(const AbsPath &root, const Entries &entries) {
+    Entries selected;
+    for (const auto &[path, is_file] : entries) {
+      // NOTE(review): confirm the direction of `ancestor_of`; the intent is to
+      // keep the entries located under `root`.
+      if (path.ancestor_of(root)) {
+        selected[path] = is_file;
+      }
+    }
+    m_entries = std::make_shared<const Entries>(std::move(selected));
+    m_iterator = std::begin(*m_entries);
+  }
+
+  // The clone shares the entry map, so its iterator stays valid and does not
+  // depend on the lifetime of the walker it was copied from.
+  [[nodiscard]] std::unique_ptr<abstract::FileWalker> clone() const override {
+    return std::make_unique<PdfFileWalker>(*this);
+  }
+
+  // Equal when both walkers share one entry map and stand on the same entry.
+  // A walker of another type compares unequal.
+  [[nodiscard]] bool equals(const FileWalker &rhs_) const override {
+    const auto *rhs = dynamic_cast<const PdfFileWalker *>(&rhs_);
+    // Iterators are only comparable within one container, so check the shared
+    // map first.
+    return rhs != nullptr && m_entries == rhs->m_entries &&
+           m_iterator == rhs->m_iterator;
+  }
+
+  [[nodiscard]] bool end() const override {
+    return m_iterator == std::end(*m_entries);
+  }
+
+  // The listing is reported flat: every entry is at depth 0.
+  [[nodiscard]] std::uint32_t depth() const override { return 0; }
+
+  // `path`, `is_file` and `is_directory` dereference the iterator: `!end()`.
+  [[nodiscard]] AbsPath path() const override { return m_iterator->first; }
+
+  [[nodiscard]] bool is_file() const override { return m_iterator->second; }
+
+  [[nodiscard]] bool is_directory() const override { return !is_file(); }
+
+  // Nothing to pop: depth is always 0.
+  void pop() override {}
+
+  void next() override { ++m_iterator; }
+
+  // With a flat listing there is no subtree to skip, so this advances exactly
+  // like `next`. It has to advance: as a no-op, a `flat_next` loop would stay
+  // on the first entry forever.
+  void flat_next() override { ++m_iterator; }
+
+private:
+  std::shared_ptr<const Entries> m_entries;
+  Entries::const_iterator m_iterator;
+};
+
+} // namespace
+
+// Builds the listing from the cross-reference table. Every in-use object is
+// read once here to find out whether it carries a stream.
+PdfFilesystem::PdfFilesystem(const PdfFile &pdf_file, const Logger &logger)
+    : m_parser{pdf_file.create_parser(logger)} {
+  m_entries[AbsPath("/")] = {Kind::directory, {}};
+  m_entries[AbsPath("/objects")] = {Kind::directory, {}};
+  m_entries[AbsPath("/trailer")] = {Kind::trailer, {}};
+
+  bool any_stream = false;
+  for (const auto &[reference, entry] : m_parser.xref().table) {
+    if (entry.is_free()) {
+      continue;
+    }
+
+    bool has_stream = false;
+    try {
+      has_stream = m_parser.read_object(reference).has_stream;
+    } catch (...) {
+      continue; // unreadable object: leave it out of the listing
+    }
+
+    const RelPath name(object_name(reference));
+    m_entries[AbsPath("/objects").join(name)] = {Kind::object, reference};
+    if (has_stream) {
+      m_entries[AbsPath("/streams").join(name)] = {Kind::stream, reference};
+      any_stream = true;
+    }
+  }
+
+  if (any_stream) {
+    m_entries[AbsPath("/streams")] = {Kind::directory, {}};
+  }
+}
+
+bool PdfFilesystem::exists(const AbsPath &path) const {
+  return m_entries.contains(path);
+}
+
+bool PdfFilesystem::is_file(const AbsPath &path) const {
+  const auto it = m_entries.find(path);
+  return it != std::end(m_entries) && it->second.kind != Kind::directory;
+}
+
+bool PdfFilesystem::is_directory(const AbsPath &path) const {
+  const auto it = m_entries.find(path);
+  return it != std::end(m_entries) && it->second.kind == Kind::directory;
+}
+
+// Hands the walker a path -> is_file view of the listing; the walker itself
+// selects the entries for `path`.
+std::unique_ptr<abstract::FileWalker>
+PdfFilesystem::file_walker(const AbsPath &path) const {
+  PdfFileWalker::Entries entries;
+  for (const auto &[entry_path, entry] : m_entries) {
+    entries[entry_path] = entry.kind != Kind::directory;
+  }
+  return std::make_unique<PdfFileWalker>(path, entries);
+}
+
+// Returns the content behind `path` as an in-memory file, or null for
+// directories and unknown paths.
+std::shared_ptr<abstract::File> PdfFilesystem::open(const AbsPath &path) const {
+  const auto it = m_entries.find(path);
+  if (it == std::end(m_entries)) {
+    return {};
+  }
+
+  std::string content;
+  switch (const Entry &entry = it->second; entry.kind) {
+  case Kind::directory:
+    return {};
+  case Kind::trailer:
+    content = m_parser.trailer().to_string();
+    break;
+  case Kind::object:
+    content = m_parser.read_object(entry.reference).object.to_string();
+    break;
+  case Kind::stream:
+    try {
+      content = m_parser.read_decoded_stream(entry.reference);
+    } catch (...) {
+      // image codecs and unsupported filters cannot be decoded: fall back to
+      // the raw stream bytes so the object is still inspectable.
+      content = m_parser.read_object_stream(entry.reference);
+    }
+    break;
+  }
+
+  return std::make_shared<MemoryFile>(std::move(content));
+}
+
+odr::Filesystem create_object_filesystem(const PdfFile &pdf_file,
+                                         const Logger &logger) {
+  return odr::Filesystem(std::make_shared<PdfFilesystem>(pdf_file, logger));
+}
+
+} // namespace odr::internal::pdf
diff --git a/src/odr/internal/pdf/pdf_filesystem.hpp b/src/odr/internal/pdf/pdf_filesystem.hpp
new file mode 100644
index 00000000..785225b1
--- /dev/null
+++ b/src/odr/internal/pdf/pdf_filesystem.hpp
@@ -0,0 +1,68 @@
+#pragma once
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace odr::internal::pdf {
+
+class PdfFile;
+
+/// Exposes the low-level object structure of a PDF as a read-only filesystem,
+/// so it can be browsed (and rendered) with the generic filesystem HTML viewer
+/// instead of needing a dedicated PDF inspector.
+///
+/// Layout:
+///   /trailer              the trailer dictionary
+///   /objects/<id>_<gen>   each indirect object's value (the stream dictionary
+///                         for stream objects)
+///   /streams/<id>_<gen>   the stream bytes of each stream object, decoded
+///                         through the `/Filter` chain when possible and the
+///                         raw bytes otherwise (e.g. image codecs)
+///
+/// The object set is taken from the cross-reference table at construction;
+/// content is serialized lazily on `open`. The backing `DocumentParser` is held
+/// for the lifetime of the filesystem, so treat an instance as transient — one
+/// per browse — matching the parser's own usage contract.
+class PdfFilesystem final : public abstract::ReadableFilesystem {
+public:
+  explicit PdfFilesystem(const PdfFile &pdf_file,
+                         const Logger &logger = Logger::null());
+
+  [[nodiscard]] bool exists(const AbsPath &path) const override;
+  [[nodiscard]] bool is_file(const AbsPath &path) const override;
+  [[nodiscard]] bool is_directory(const AbsPath &path) const override;
+
+  [[nodiscard]] std::unique_ptr<abstract::FileWalker>
+  file_walker(const AbsPath &path) const override;
+
+  [[nodiscard]] std::shared_ptr<abstract::File>
+  open(const AbsPath &path) const override;
+
+private:
+  enum class Kind { directory, trailer, object, stream };
+  struct Entry {
+    Kind kind{};
+    ObjectReference reference; // left defaulted for directory/trailer entries
+  };
+
+  // `mutable`: reads memoize in the parser, and the `ReadableFilesystem`
+  // interface is `const`.
+  mutable DocumentParser m_parser;
+  std::map<AbsPath, Entry> m_entries; // absolute path -> listing entry
+};
+
+/// Wraps a PDF's object structure as a browsable `Filesystem`, ready to hand to
+/// `html::translate` for the filesystem HTML viewer.
+[[nodiscard]] odr::Filesystem
+create_object_filesystem(const PdfFile &pdf_file,
+                         const Logger &logger = Logger::null());
+
+} // namespace odr::internal::pdf
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 975180f4..ff69e30a 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -46,6 +46,7 @@ add_executable(odr_test
         "src/internal/pdf/pdf_encryption.cpp"
         "src/internal/pdf/pdf_file_object.cpp"
         "src/internal/pdf/pdf_file_parser.cpp"
+        "src/internal/pdf/pdf_filesystem.cpp"
         "src/internal/pdf/pdf_filter.cpp"
         "src/internal/pdf/pdf_font.cpp"
         "src/internal/util/math_util_test.cpp"
diff --git a/test/src/internal/pdf/pdf_filesystem.cpp b/test/src/internal/pdf/pdf_filesystem.cpp
new file mode 100644
index 00000000..621737df
--- /dev/null
+++ b/test/src/internal/pdf/pdf_filesystem.cpp
@@ -0,0 +1,92 @@
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+using odr::File;
+using odr::Filesystem;
+using odr::HtmlConfig;
+using odr::HtmlService;
+using namespace odr::internal;      // MemoryFile, util
+using namespace odr::internal::pdf; // PdfFile, create_object_filesystem
+using PdfFileBuilder = odr::test::pdf::PdfFileBuilder;
+
+namespace {
+
+// A minimal four-object PDF: catalog, page tree, page, and one stream object
+// (the page content). Object ids are 1..4 in insertion order.
+PdfFile make_pdf() {
+  PdfFileBuilder builder;
+  builder.object("<< /Type /Catalog /Pages 2 0 R >>")
+      .object("<< /Type /Pages /Kids [3 0 R] /Count 1 >>")
+      .object("<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
+              "/Resources << >> /Contents 4 0 R >>")
+      .stream_object("", "BT ET") // object 4: the page content stream
+      .trailer("/Root 1 0 R");
+  return PdfFile(std::make_shared<MemoryFile>(builder.build_classic()));
+}
+
+std::string read(const File &file) {
+  const std::unique_ptr<std::istream> stream = file.stream();
+  return util::stream::read(*stream);
+}
+
+} // namespace
+
+TEST(PdfFilesystem, lists_objects_and_streams) {
+  const Filesystem filesystem = create_object_filesystem(make_pdf());
+
+  EXPECT_TRUE(filesystem.is_directory("/objects"));
+  EXPECT_TRUE(filesystem.is_directory("/streams"));
+  EXPECT_TRUE(filesystem.is_file("/trailer"));
+
+  for (const char *object :
+       {"/objects/1_0", "/objects/2_0", "/objects/3_0", "/objects/4_0"}) {
+    EXPECT_TRUE(filesystem.is_file(object)) << object;
+  }
+
+  // only the content object (4) carries a stream
+  EXPECT_TRUE(filesystem.is_file("/streams/4_0"));
+  EXPECT_FALSE(filesystem.exists("/streams/1_0"));
+}
+
+TEST(PdfFilesystem, object_content_is_the_serialized_value) {
+  const Filesystem filesystem = create_object_filesystem(make_pdf());
+
+  const std::string catalog = read(filesystem.open("/objects/1_0"));
+  EXPECT_NE(catalog.find("/Catalog"), std::string::npos);
+
+  const std::string trailer = read(filesystem.open("/trailer"));
+  EXPECT_NE(trailer.find("/Root"), std::string::npos);
+}
+
+TEST(PdfFilesystem, stream_content_is_decoded_bytes) {
+  const Filesystem filesystem = create_object_filesystem(make_pdf());
+
+  EXPECT_EQ(read(filesystem.open("/streams/4_0")), "BT ET");
+}
+
+TEST(PdfFilesystem, renders_through_the_filesystem_viewer) {
+  const Filesystem filesystem = create_object_filesystem(make_pdf());
+
+  HtmlConfig config;
+  const std::string cache = std::filesystem::temp_directory_path().string();
+  const HtmlService service = odr::html::translate(filesystem, cache, config);
+
+  std::ostringstream out;
+  service.write("files.html", out);
+  EXPECT_NE(out.str().find("/objects/1_0"), std::string::npos);
+}