Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ set(ODR_SOURCE_FILES
"src/odr/internal/pdf/pdf_file.cpp"
"src/odr/internal/pdf/pdf_file_object.cpp"
"src/odr/internal/pdf/pdf_file_parser.cpp"
"src/odr/internal/pdf/pdf_filesystem.cpp"
"src/odr/internal/pdf/pdf_filter.cpp"
"src/odr/internal/pdf/pdf_graphics_operator_parser.cpp"
"src/odr/internal/pdf/pdf_graphics_state.cpp"
Expand Down
159 changes: 159 additions & 0 deletions src/odr/internal/pdf/pdf_filesystem.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
#include <odr/internal/pdf/pdf_filesystem.hpp>

#include <odr/internal/common/file.hpp>
#include <odr/internal/pdf/pdf_file.hpp>
#include <odr/internal/pdf/pdf_file_object.hpp>

#include <cstdint>
#include <string>
#include <utility>

namespace odr::internal::pdf {

namespace {

// Builds the file name used for an indirect object: "<id>_<gen>".
std::string object_name(const ObjectReference &reference) {
  std::string name = std::to_string(reference.id);
  name += '_';
  name += std::to_string(reference.gen);
  return name;
}

// Walks a precomputed set of paths. Mirrors the virtual-filesystem walker: the
// entry map carries only path + kind (file vs directory); content is produced
// by `open`, so the walk itself never touches the parser.
class PdfFileWalker final : public abstract::FileWalker {
public:
using Entries = std::map<AbsPath, bool>; // path -> is_file

PdfFileWalker(const AbsPath &root, const Entries &entries) {
for (const auto &[path, is_file] : entries) {
if (path.ancestor_of(root)) {
m_entries[path] = is_file;
}
}
m_iterator = std::begin(m_entries);
}

[[nodiscard]] std::unique_ptr<FileWalker> clone() const override {
return std::make_unique<PdfFileWalker>(*this);
}

[[nodiscard]] bool equals(const FileWalker &rhs_) const override {
const auto &rhs = dynamic_cast<const PdfFileWalker &>(rhs_);
return m_iterator == rhs.m_iterator;
}

[[nodiscard]] bool end() const override {
return m_iterator == std::end(m_entries);
}

[[nodiscard]] std::uint32_t depth() const override { return 0; }

[[nodiscard]] AbsPath path() const override { return m_iterator->first; }

[[nodiscard]] bool is_file() const override { return m_iterator->second; }

[[nodiscard]] bool is_directory() const override { return !is_file(); }

void pop() override {}

void next() override { ++m_iterator; }

void flat_next() override {}

private:
Entries m_entries;
Entries::iterator m_iterator;
};

} // namespace

// Indexes the PDF: the fixed entries ("/", "/objects", "/trailer") plus one
// "/objects/<id>_<gen>" file per readable, non-free cross-reference entry and
// one "/streams/<id>_<gen>" file per object that carries a stream. The
// "/streams" directory is only listed when at least one stream was found.
PdfFilesystem::PdfFilesystem(const PdfFile &pdf_file, const Logger &logger)
    : m_parser{pdf_file.create_parser(logger)} {
  const AbsPath objects_dir("/objects");
  const AbsPath streams_dir("/streams");

  m_entries[AbsPath("/")] = {Kind::directory, {}};
  m_entries[objects_dir] = {Kind::directory, {}};
  m_entries[AbsPath("/trailer")] = {Kind::trailer, {}};

  bool streams_seen = false;
  for (const auto &[reference, xref_entry] : m_parser.xref().table) {
    if (xref_entry.is_free()) {
      continue;
    }

    // Every object is read once here only to learn whether it has a stream.
    bool carries_stream = false;
    try {
      carries_stream = m_parser.read_object(reference).has_stream;
    } catch (...) {
      continue; // unreadable object: leave it out of the listing
    }

    const RelPath file_name(object_name(reference));
    m_entries[objects_dir.join(file_name)] = {Kind::object, reference};
    if (!carries_stream) {
      continue;
    }
    m_entries[streams_dir.join(file_name)] = {Kind::stream, reference};
    streams_seen = true;
  }

  if (streams_seen) {
    m_entries[streams_dir] = {Kind::directory, {}};
  }
}

// True when `path` names any indexed entry, file or directory.
bool PdfFilesystem::exists(const AbsPath &path) const {
  return m_entries.find(path) != std::end(m_entries);
}

// True when `path` is indexed and is anything other than a directory
// (trailer, object or stream entry).
bool PdfFilesystem::is_file(const AbsPath &path) const {
  const auto found = m_entries.find(path);
  if (found == std::end(m_entries)) {
    return false;
  }
  return found->second.kind != Kind::directory;
}

// True when `path` is indexed as a directory entry.
bool PdfFilesystem::is_directory(const AbsPath &path) const {
  const auto found = m_entries.find(path);
  if (found == std::end(m_entries)) {
    return false;
  }
  return found->second.kind == Kind::directory;
}

std::unique_ptr<abstract::FileWalker>
PdfFilesystem::file_walker(const AbsPath &path) const {
PdfFileWalker::Entries entries;
for (const auto &[entry_path, entry] : m_entries) {
entries[entry_path] = entry.kind != Kind::directory;
}
return std::make_unique<PdfFileWalker>(path, entries);
}

// Serializes the entry at `path` into an in-memory file. Returns a null pointer
// for unknown paths and for directories.
std::shared_ptr<abstract::File> PdfFilesystem::open(const AbsPath &path) const {
  const auto found = m_entries.find(path);
  if (found == std::end(m_entries)) {
    return {};
  }

  const Entry &entry = found->second;
  if (entry.kind == Kind::directory) {
    return {};
  }

  std::string bytes;
  if (entry.kind == Kind::trailer) {
    bytes = m_parser.trailer().to_string();
  } else if (entry.kind == Kind::object) {
    bytes = m_parser.read_object(entry.reference).object.to_string();
  } else if (entry.kind == Kind::stream) {
    try {
      bytes = m_parser.read_decoded_stream(entry.reference);
    } catch (...) {
      // image codecs and unsupported filters cannot be decoded: fall back to
      // the raw stream bytes so the object is still inspectable.
      bytes = m_parser.read_object_stream(entry.reference);
    }
  }

  return std::make_shared<MemoryFile>(std::move(bytes));
}

// Builds a `PdfFilesystem` for `pdf_file` and wraps it in the public
// `odr::Filesystem` handle.
odr::Filesystem create_object_filesystem(const PdfFile &pdf_file,
                                         const Logger &logger) {
  auto implementation = std::make_shared<PdfFilesystem>(pdf_file, logger);
  return odr::Filesystem(std::move(implementation));
}

} // namespace odr::internal::pdf
68 changes: 68 additions & 0 deletions src/odr/internal/pdf/pdf_filesystem.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#pragma once

#include <odr/filesystem.hpp>
#include <odr/logger.hpp>

#include <odr/internal/abstract/filesystem.hpp>
#include <odr/internal/common/path.hpp>
#include <odr/internal/pdf/pdf_document_parser.hpp>
#include <odr/internal/pdf/pdf_object.hpp>

#include <map>
#include <memory>

namespace odr::internal::pdf {

class PdfFile;

/// Exposes the low-level object structure of a PDF as a read-only filesystem,
/// so it can be browsed (and rendered) with the generic filesystem HTML viewer
/// instead of needing a dedicated PDF inspector.
///
/// Layout:
///   /trailer             the trailer dictionary
///   /objects/<id>_<gen>  each indirect object's value (the stream dictionary
///                        for stream objects)
///   /streams/<id>_<gen>  the stream bytes of each stream object, decoded
///                        through the `/Filter` chain when possible and the
///                        raw bytes otherwise (e.g. image codecs)
///
/// The object set is taken from the cross-reference table at construction;
/// every non-free entry is read once at that point to find out whether it
/// carries a stream, and entries that cannot be read are left out of the
/// listing. `/streams` is only present when at least one stream object exists.
/// Content is serialized lazily on `open`. The backing `DocumentParser` is held
/// for the lifetime of the filesystem, so treat an instance as transient — one
/// per browse — matching the parser's own usage contract.
class PdfFilesystem final : public abstract::ReadableFilesystem {
public:
/// Creates the parser for `pdf_file` (passing it `logger`) and builds the
/// path index described above.
explicit PdfFilesystem(const PdfFile &pdf_file,
const Logger &logger = Logger::null());

/// True when `path` names any indexed entry (file or directory).
[[nodiscard]] bool exists(const AbsPath &path) const override;
/// True when `path` is indexed and is not a directory.
[[nodiscard]] bool is_file(const AbsPath &path) const override;
/// True when `path` is indexed as a directory.
[[nodiscard]] bool is_directory(const AbsPath &path) const override;

/// Returns a walker over a snapshot of the index, restricted by `path`.
[[nodiscard]] std::unique_ptr<abstract::FileWalker>
file_walker(const AbsPath &path) const override;

/// Returns the serialized content of the entry at `path` as an in-memory
/// file, or a null pointer when `path` is unknown or a directory.
[[nodiscard]] std::shared_ptr<abstract::File>
open(const AbsPath &path) const override;

private:
// What an indexed path stands for; everything except `directory` is a file.
enum class Kind { directory, trailer, object, stream };
struct Entry {
Kind kind{};
// The indirect object behind an `object`/`stream` entry; left
// default-constructed for `directory` and `trailer` entries.
ObjectReference reference;
};

// `mutable`: reads memoize in the parser, and the `ReadableFilesystem`
// interface is `const`.
// NOTE(review): `open` therefore mutates shared state from a const method;
// presumably not safe to call concurrently — confirm before sharing an
// instance across threads.
mutable DocumentParser m_parser;
// Path index built once in the constructor and only read afterwards.
std::map<AbsPath, Entry> m_entries;
};

/// Wraps a PDF's object structure as a browsable `Filesystem`, ready to hand to
/// `html::translate` for the filesystem HTML viewer.
///
/// The returned handle owns a freshly constructed `PdfFilesystem`; `logger` is
/// forwarded to it and defaults to the null logger.
[[nodiscard]] odr::Filesystem
create_object_filesystem(const PdfFile &pdf_file,
const Logger &logger = Logger::null());

} // namespace odr::internal::pdf
1 change: 1 addition & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ add_executable(odr_test
"src/internal/pdf/pdf_encryption.cpp"
"src/internal/pdf/pdf_file_object.cpp"
"src/internal/pdf/pdf_file_parser.cpp"
"src/internal/pdf/pdf_filesystem.cpp"
"src/internal/pdf/pdf_filter.cpp"
"src/internal/pdf/pdf_font.cpp"
"src/internal/util/math_util_test.cpp"
Expand Down
92 changes: 92 additions & 0 deletions test/src/internal/pdf/pdf_filesystem.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#include <odr/file.hpp>
#include <odr/filesystem.hpp>
#include <odr/html.hpp>

#include <odr/internal/common/file.hpp>
#include <odr/internal/pdf/pdf_file.hpp>
#include <odr/internal/pdf/pdf_filesystem.hpp>
#include <odr/internal/util/stream_util.hpp>

#include <internal/pdf/pdf_test_file_builder.hpp>

#include <filesystem>
#include <memory>
#include <sstream>
#include <string>

#include <gtest/gtest.h>

using odr::File;
using odr::Filesystem;
using odr::HtmlConfig;
using odr::HtmlService;
using namespace odr::internal; // MemoryFile, util
using namespace odr::internal::pdf; // PdfFile, create_object_filesystem
using PdfFileBuilder = odr::test::pdf::PdfFileBuilder;

namespace {

// Builds the fixture used by every test below: a classic-xref PDF with four
// objects added in this order — catalog (1), page tree (2), page (3) and the
// page's content stream (4), the only stream object.
PdfFile make_pdf() {
  PdfFileBuilder pdf;
  pdf.object("<< /Type /Catalog /Pages 2 0 R >>")
      .object("<< /Type /Pages /Kids [3 0 R] /Count 1 >>")
      .object("<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
              "/Resources << >> /Contents 4 0 R >>")
      .stream_object("", "BT ET")
      .trailer("/Root 1 0 R");

  auto backing_file = std::make_shared<MemoryFile>(pdf.build_classic());
  return PdfFile(backing_file);
}

// Reads the whole content of `file` into a string.
std::string read(const File &file) {
  // The stream temporary lives until the end of the full expression.
  return util::stream::read(*file.stream());
}

} // namespace

// The fixed layout entries exist, every object is listed as a file, and only
// stream objects appear under /streams.
TEST(PdfFilesystem, lists_objects_and_streams) {
  const Filesystem fs = create_object_filesystem(make_pdf());

  EXPECT_TRUE(fs.is_file("/trailer"));
  EXPECT_TRUE(fs.is_directory("/objects"));
  EXPECT_TRUE(fs.is_directory("/streams"));

  const char *const object_paths[] = {"/objects/1_0", "/objects/2_0",
                                      "/objects/3_0", "/objects/4_0"};
  for (const char *object_path : object_paths) {
    EXPECT_TRUE(fs.is_file(object_path)) << object_path;
  }

  // object 4 (the page content) is the only one with a stream
  EXPECT_FALSE(fs.exists("/streams/1_0"));
  EXPECT_TRUE(fs.is_file("/streams/4_0"));
}

// Object and trailer files contain the serialized dictionaries.
TEST(PdfFilesystem, object_content_is_the_serialized_value) {
  const Filesystem fs = create_object_filesystem(make_pdf());

  const std::string catalog_text = read(fs.open("/objects/1_0"));
  const bool catalog_has_type = catalog_text.find("/Catalog") != std::string::npos;
  EXPECT_TRUE(catalog_has_type);

  const std::string trailer_text = read(fs.open("/trailer"));
  const bool trailer_has_root = trailer_text.find("/Root") != std::string::npos;
  EXPECT_TRUE(trailer_has_root);
}

// The unfiltered content stream is served verbatim under /streams.
TEST(PdfFilesystem, stream_content_is_decoded_bytes) {
  const Filesystem fs = create_object_filesystem(make_pdf());

  const std::string stream_bytes = read(fs.open("/streams/4_0"));
  EXPECT_EQ(stream_bytes, "BT ET");
}

// The generic filesystem HTML viewer accepts the object filesystem and lists
// its entries.
TEST(PdfFilesystem, renders_through_the_filesystem_viewer) {
  const Filesystem fs = create_object_filesystem(make_pdf());

  const std::string cache_dir = std::filesystem::temp_directory_path().string();
  HtmlConfig html_config;
  const HtmlService html = odr::html::translate(fs, cache_dir, html_config);

  std::ostringstream rendered;
  html.write("files.html", rendered);

  const std::string page = rendered.str();
  EXPECT_NE(page.find("/objects/1_0"), std::string::npos);
}
Loading