From 5a7cdcc2cb1226162dc1b5863d5009c7bf06f924 Mon Sep 17 00:00:00 2001 From: Luca Toniolo <10792599+grandixximo@users.noreply.github.com> Date: Fri, 19 Jun 2026 11:00:57 +0800 Subject: [PATCH] docs: deduplicate per-language images into a shared build-time pool The HTML build copied every referenced image into all 8 language trees, so the built tree was ~324MB, mostly duplicate image bytes. The image resolver, previously used only for the PDF build, now runs for HTML too and rewrites every image src (and click-to-enlarge link) to a shared pool: a generic image to <cssrel>image/<srcrel> and a localized foo_<lang> variant to <cssrel><lang>/image/<srcrel>, where <srcrel> is the path relative to docs/src. .html-images-stamp then materialises the pool as real files, one copy per unique image. No post-build pass and no symlinks (a server may lack FollowSymLinks). The 8-language tree drops to ~120MB. The page language is passed explicitly as lcnc-lang to both the HTML and PDF builds: the Asciidoctor 2.0 CLI has no sourcemap option and the render-tree path does not encode the language, so the previous path-based detection was unreliable. This replaces the CI-only dedup-docs.py pass, which is removed. --- docs/src/Submakefile | 75 +++++---- docs/src/extensions/image_resolver.rb | 234 ++++++++++++++++++++------ 2 files changed, 223 insertions(+), 86 deletions(-) diff --git a/docs/src/Submakefile b/docs/src/Submakefile index 547b389290e..d5760671a81 100644 --- a/docs/src/Submakefile +++ b/docs/src/Submakefile @@ -967,6 +967,7 @@ $(4)/%.pdf: $(1)/%.adoc .adoc-images-stamp $$(DOC_FONTS) | svgs_made_from_dots --sourcemap \ -a compat-mode \ -a "doc-languages=$$(LANGUAGES)" \ + -a "lcnc-lang=$3" \ -a xref-root=$$(dir $$<) \ -a "xref-exclude=$(2)" \ -a "scriptdir=$$(DOC_SRCDIR)/" \ @@ -1102,36 +1103,40 @@ $(foreach L,$(LANGUAGES),$(eval $(call HTML_COPY_RULE,$(L)))) .images-stamp: .adoc-images-stamp .html-images-stamp touch $@ -# Image copy from sources into the output tree.
HTML files for English -# live at $(DOC_OUT_HTML)/en//X.html; for translations at -# $(DOC_OUT_HTML)///X.html. The source images are in -# $(DOC_SRCDIR)// regardless of language (translations are -# image-symlinked to English originals via the sed below). Generated SVGs are -# not in src; fall back to the English build tree where they render. -# -# src="..." covers displayed images; href="...png" covers click-to-enlarge -# targets from image:thumb[link=...] (which would otherwise 404). The -# extension filter keeps page/anchor hrefs out. +# Materialise the shared image pool. image_resolver has already rewritten +# every HTML src/href to a pool path (<cssrel>image/<srcrel> or <cssrel><lang>/image/<srcrel>); +# here we copy each referenced image to the location its link points at. All +# languages' generic refs resolve to the one shared image/ tree, so the first +# copy wins and the existence guard skips the rest: no per-language duplication, +# no symlinks (a server may lack FollowSymLinks), and a no-op second build. +# DEST comes from the page dir + the ref; <srcrel> finds the source under +# $(DOC_SRCDIR) (or the English build tree for generated SVGs). # # Depend on .lang-switcher-stamp so this runs after the post-processor has # rewritten the HTML in place; otherwise that later rewrite leaves this stamp # older than its inputs and the image copy re-fires on every subsequent make. .html-images-stamp: $(DOC_TARGETS_HTML) .lang-switcher-stamp - set -e; for HTML_FILE in $^; do \ - HTML_REL=$$(echo $$HTML_FILE | sed 's%^$(DOC_OUT_HTML)/%%'); \ - LANG=$$(echo $$HTML_REL | cut -d/ -f1); \ - REST=$$(echo $$HTML_REL | cut -d/ -f2-); \ - HTML_DIR=$$(dirname $$REST | cut -d/ -f1); \ - for IMAGE_FILE in $$(grep -oE '(src|href)="[^"]+"' $$HTML_FILE | sed -E 's/^(src|href)="//;s/"$$//' | grep -vE '^https?:|^data:|^/|lcnc-docs\.svg' | grep -iE '\.(png|jpe?g|gif|svg|webp)$$'); do \ - IMAGE_DIR=$$(dirname $$IMAGE_FILE); \ - IMAGE_PATH=$(DOC_SRCDIR)/$$HTML_DIR/$$IMAGE_FILE; \ - if [ !
-e $$IMAGE_PATH ] ; then \ - IMAGE_PATH=$(DOC_OUT_ADOC)/en/$$HTML_DIR/$$IMAGE_FILE; \ - fi; \ - mkdir -p $(DOC_OUT_HTML)/$$LANG/$$HTML_DIR/$$IMAGE_DIR; \ - cp -f $$IMAGE_PATH $(DOC_OUT_HTML)/$$LANG/$$HTML_DIR/$$IMAGE_FILE; \ + set -e; \ + HTMLROOT=$$(realpath $(DOC_OUT_HTML)); \ + place() { \ + PAGE_DIR=$$(dirname $$1); \ + for REF in $$(grep -oE '(src|href)="[^"]+"' $$1 | sed -E 's/^(src|href)="//;s/"$$//' | grep -vE '^https?:|^data:|^#|^/|lcnc-docs\.svg' | grep -iE '\.(png|jpe?g|gif|svg|webp)$$'); do \ + DEST=$$(realpath -m "$$PAGE_DIR/$$REF"); \ + case $$DEST in $$HTMLROOT/*) ;; *) echo "html-images: out-of-tree $$DEST (page $$1)" >&2; exit 1;; esac; \ + [ -e "$$DEST" ] && continue; \ + REL=$${DEST#$$HTMLROOT/}; \ + case $$REL in \ + image/*) SRCREL=$${REL#image/}; SRC=$(DOC_SRCDIR)/$$SRCREL; [ -e "$$SRC" ] || SRC=$(DOC_OUT_ADOC)/en/$$SRCREL;; \ + */image/*) L=$${REL%%/*}; SRCREL=$${REL#*/image/}; SRC=$(DOC_SRCDIR)/$$SRCREL; [ -e "$$SRC" ] || SRC=$(DOC_OUT_ADOC)/$$L/$$SRCREL;; \ + *) echo "html-images: unexpected pool dest $$REL (page $$1)" >&2; exit 1;; \ + esac; \ + if [ ! -e "$$SRC" ] ; then echo "html-images: no source for $$REL (page $$1)" >&2; exit 1; fi; \ + mkdir -p "$$(dirname "$$DEST")"; \ + cp -f "$$SRC" "$$DEST"; \ done; \ - done > $@.new && mv $@.new $@ + }; \ + for HTML_FILE in $(DOC_TARGETS_HTML); do place $$HTML_FILE; done \ + > $@.new && mv $@.new $@ # mb2hal_HOWTO.ini lives in the docs source tree next to mb2hal.adoc, so the # English build includes it directly. Copy it into the per-language build @@ -1157,10 +1162,12 @@ $(foreach L,$(LANGUAGES),$(eval $(call HTML_COPY_RULE,$(L)))) cp -f emc/usr_intf/gmoccapy/release_notes.txt $(DOC_OUT_HTML)/$$lang/gui/gmoccapy_release_notes.txt ; \ done) > $@.new && mv $@.new $@ -# Copy all images used by translated adoc files into the directories -# with translated adoc files. 
The .translateddocs-stamp prerequisite -# only exists when BUILD_DOCS_TRANSLATED=yes; without it the rule has -# no work to do (filter-out yields empty) so we skip the dep too. +# Stage the images used by translated adoc files into the translated adoc +# tree, so asciidoctor-pdf can read them at render time. The .translateddocs-stamp +# prerequisite only exists when BUILD_DOCS_TRANSLATED=yes; without it the rule +# has no work to do (filter-out yields empty) so we skip the dep too. HTML +# image placement is no longer done here: the image_resolver rewrites HTML src +# to the shared pool and .html-images-stamp materialises it. ifeq ($(BUILD_DOCS_TRANSLATED),yes) ADOC_IMAGES_STAMP_DEPS := $(DOC_DIR)/.translateddocs-stamp endif @@ -1176,13 +1183,10 @@ endif IMAGE_PATH=$(DOC_OUT_ADOC)/en/$$EN_DIR/$$IMAGE_FILE; \ fi; \ TIMAGE_PATH=$(DOC_OUT_ADOC)/$$ADOC_DIR/$$IMAGE_FILE; \ - HIMAGE_PATH=$(DOC_OUT_HTML)/$$ADOC_DIR/$$IMAGE_FILE; \ mkdir -p $(DOC_OUT_ADOC)/$$ADOC_DIR/$$IMAGE_DIR; \ - mkdir -p $(DOC_OUT_HTML)/$$ADOC_DIR/$$IMAGE_DIR; \ if [ ! -e $$TIMAGE_PATH ] ; then \ echo "Generating $$TIMAGE_PATH for $$ADOC_FILE"; \ cp -f $$IMAGE_PATH $$TIMAGE_PATH; \ - cp -f $$IMAGE_PATH $$HIMAGE_PATH; \ fi ; \ done; \ done > $@.new && mv $@.new $@ @@ -1207,13 +1211,20 @@ $(DOC_OUT_ADOC)/en/%.html: LCNC_CSSREL=$(shell python3 -c "print('../' * (1 + '$ # $4 xref-exclude pattern (English filters all lang subdirs; translated # trees are rooted inside their own lang dir so the exclude is empty). define ASCIIDOCTOR_HTML_RULE -$$(patsubst %.adoc,$2/%.html,$$(DOC_SRCS_$(call toUC,$1)_SMALL)): $2/%.html: $2/%.adoc $$(DOC_SRCDIR)/docinfo.html $$(DOC_SRCDIR)/docinfo-header.html +# Order-only dep on .adoc-images-stamp so translated images are staged before +# the resolver probes for them at render (it also falls back to docs/src). 
+$$(patsubst %.adoc,$2/%.html,$$(DOC_SRCS_$(call toUC,$1)_SMALL)): $2/%.html: $2/%.adoc $$(DOC_SRCDIR)/docinfo.html $$(DOC_SRCDIR)/docinfo-header.html | .adoc-images-stamp $$(ECHO) "Building '$1' adoc to html: " $$< $$(Q)asciidoctor -r $$(realpath $$(DOC_SRCDIR))/extensions/xref_resolver.rb \ + -r $$(realpath $$(DOC_SRCDIR))/extensions/image_resolver.rb \ -r $$(realpath $$(DOC_SRCDIR))/extensions/rouge_hal.rb \ -r $$(realpath $$(DOC_SRCDIR))/extensions/rouge_ngc.rb \ -r $$(realpath $$(DOC_SRCDIR))/extensions/rouge_ini.rb \ -a compat-mode \ + -a "doc-languages=$$(LANGUAGES)" \ + -a "lcnc-lang=$1" \ + -a "lcnc-srcdir=$$(realpath $$(DOC_SRCDIR))" \ + -a "lcnc-adocdir=$$(realpath $$(DOC_OUT_ADOC))" \ -a xref-root=$3 \ -a "xref-exclude=$4" \ -a "relindir=$$(shell dirname $$*)" \ diff --git a/docs/src/extensions/image_resolver.rb b/docs/src/extensions/image_resolver.rb index 57b0a745c2a..b8f286ebf89 100644 --- a/docs/src/extensions/image_resolver.rb +++ b/docs/src/extensions/image_resolver.rb @@ -13,8 +13,16 @@ # convention BsAtHome proposed on #4053: img_en.ext is the default, # img_<lang>.ext (when it exists) is the translated override. # -# Requires the document to be loaded with sourcemap: true (passed on the -# CLI as -a sourcemap=true) so blocks expose .file. +# For HTML the resolver also rewrites every image reference (and click-to- +# enlarge link=) to a shared pool so each picture is stored once instead of +# copied into every language tree: +# generic image -> <cssrel>image/<srcrel> +# localized foo_<lang> -> <cssrel><lang>/image/<srcrel> +# <srcrel> is the path relative to docs/src; <cssrel> (lcnc-cssrel) is the +# path back to the html/ root. .html-images-stamp materialises the pool as +# real files (no symlinks). The page language comes from the lcnc-lang +# attribute, not the path, since the Asciidoctor 2.0 CLI has no sourcemap +# option. The PDF backend is unchanged (absolute paths).
require 'asciidoctor' require 'asciidoctor/extensions' @@ -22,18 +30,13 @@ module LinuxCNCDocs class ImageResolver < Asciidoctor::Extensions::Treeprocessor IMAGE_EXTS = %w[.png .svg .jpg .jpeg].freeze - # Inline image: macros, with target as captured group 1. Skip the - # block image:: form (handled by find_by(:image)) and anything that - # already looks like a URL or absolute path. - INLINE_IMAGE_RE = /(? variant. Probing docs/src means an + # inline image on a translated page resolves even though .adoc-images-stamp + # stages only block images. + def resolve_pool_file(ref, base_dir, lang, document) + bases = pool_bases(base_dir, document) + if lang != 'en' + swapped = swap_lang(ref, lang) + if swapped != ref + bases.each do |b| + r = probe_file(File.expand_path(swapped, b)) + return r if r + end + end + end + bases.each do |b| + r = probe_file(File.expand_path(ref, b)) + return r if r + end + nil + end + + def probe_file(path) + return path if File.file?(path) + resolve_extension(path) + end + + # Candidate base directories for a page: the staged adoc dir itself, + # plus the matching docs/src dir and the English build/adoc dir. + def pool_bases(base_dir, document) + list = [base_dir] + srcdir = document.attr('lcnc-srcdir') + adocdir = document.attr('lcnc-adocdir') + if srcdir && adocdir + bd = File.expand_path(base_dir) + if bd.start_with?(adocdir + '/') + rel = bd[(adocdir.length + 1)..-1] # e.g. "de/config" or "en/config" + lang_tags(document).each do |l| + if rel == l + rel = '' + break + elsif rel.start_with?(l + '/') + rel = rel[(l.length + 1)..-1] + break + end end + list << (rel.empty? ? srcdir : File.join(srcdir, rel)) + list << File.join(adocdir, 'en', rel) end end + list.uniq + end - # No translated variant available (or English doc); the original - # target stays for HTML, only PDF needs the absolute rewrite. 
- return unless pdf - abs = resolve_candidate(File.expand_path(target, base_dir), lang_re) - return unless abs - node.set_attr('target', abs) - apply_default_width(node) - apply_default_alignment(node) if pdf + # Path of the resolved image relative to docs/src. resolve_candidate + # returns a file either under docs/src or under the staging build/adoc + # tree (build/adoc/en/... or build/adoc//...); strip whichever + # root applies, and the leading staging-language segment. + def pool_srcrel(abs, document) + srcdir = document.attr('lcnc-srcdir') + adocdir = document.attr('lcnc-adocdir') + p = File.expand_path(abs) + return p[(srcdir.length + 1)..-1] if srcdir && p.start_with?(srcdir + '/') + if adocdir && p.start_with?(adocdir + '/') + rel = p[(adocdir.length + 1)..-1] + lang_tags(document).each do |l| + return rel[(l.length + 1)..-1] if rel.start_with?(l + '/') + end + return rel + end + nil end # Rewrite an `*_en.` filename to `*_.`. The check @@ -143,15 +263,18 @@ def resolve_candidate(path, lang_re) end def rewrite_inline_in_block(block) - base_dir = File.dirname(File.expand_path(block.file)) + src = block.file || block.document.attr('docfile') + return unless src + base_dir = File.dirname(File.expand_path(src)) lang_re = lang_re_for(block.document) - lang = lang_re && (m = lang_re.match(block.file.to_s)) ? m[1] : 'en' + lang = detect_lang(block.document, src, lang_re) pdf = block.document.backend == 'pdf' + document = block.document # :paragraph / :literal / :sidebar etc. carry source in .lines (Array). if block.respond_to?(:lines=) && block.lines.is_a?(Array) && !block.lines.empty? 
changed = false - new_lines = block.lines.map { |ln| rewrite_inline(ln, base_dir, lang, lang_re, pdf) { changed = true } } + new_lines = block.lines.map { |ln| rewrite_inline(ln, base_dir, lang, lang_re, pdf, document) { changed = true } } block.lines = new_lines if changed end @@ -160,35 +283,38 @@ def rewrite_inline_in_block(block) old = block.instance_variable_get(:@text) if old.is_a?(String) && !old.empty? changed = false - new_text = rewrite_inline(old, base_dir, lang, lang_re, pdf) { changed = true } + new_text = rewrite_inline(old, base_dir, lang, lang_re, pdf, document) { changed = true } block.text = new_text if changed end end end - def rewrite_inline(text, base_dir, lang, lang_re, pdf) + def rewrite_inline(text, base_dir, lang, lang_re, pdf, document) text.gsub(INLINE_IMAGE_RE) do full = Regexp.last_match(0) target = Regexp.last_match(1) - next full if target.start_with?('http://', 'https://', '/') + body = Regexp.last_match(2) + next full if target.start_with?('http://', 'https://', '/', '<') next full if target.include?('{') - if lang != 'en' - swapped = swap_lang(target, lang) - if swapped != target - abs = resolve_candidate(File.expand_path(swapped, base_dir), lang_re) - if abs - yield if block_given? - next "image:#{pdf ? abs : swapped}[" - end - end + if pdf + abs = resolve_target(target, base_dir, lang, lang_re) + next full unless abs + yield if block_given? + next "image:#{abs}[#{body}]" end - next full unless pdf - candidate = resolve_candidate(File.expand_path(target, base_dir), lang_re) - if candidate + new_target = pool_target(target, base_dir, lang, lang_re, document) + new_body = body.gsub(INLINE_LINK_RE) do + q = Regexp.last_match(1) + lval = Regexp.last_match(2) + next Regexp.last_match(0) if lval.start_with?('http://', 'https://', '/', '#', '<') || lval.include?('{') + lp = pool_target(lval, base_dir, lang, lang_re, document) + lp ? 
"link=#{q}#{lp}#{q}" : Regexp.last_match(0) + end + if new_target || new_body != body yield if block_given? - "image:#{candidate}[" + "image:#{new_target || target}[#{new_body}]" else full end @@ -209,7 +335,7 @@ def apply_default_width(node) return if node.attr('width') node.set_attr('pdfwidth', '75%') end - + # center images by default if no alignmen is given def apply_default_alignment(node) return if node.context == :inline_image