Skip to content

Commit 95e3cc8

Browse files
authored
fix(parse): preserve inline children inside self-contained block-level HTML (#192)
1 parent 82095d2 commit 95e3cc8

3 files changed

Lines changed: 124 additions & 9 deletions

File tree

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
## Input
2+
3+
```md
4+
<p><img src="/foo.png" alt="x"></p>
5+
```
6+
7+
## AST
8+
9+
```json
10+
{
11+
"frontmatter": {},
12+
"meta": {},
13+
"nodes": [
14+
[
15+
"p",
16+
{
17+
"$": { "html": 1, "block": 1 }
18+
},
19+
[
20+
"img",
21+
{
22+
"$": { "html": 1, "block": 1 },
23+
"src": "/foo.png",
24+
"alt": "x"
25+
}
26+
]
27+
]
28+
]
29+
}
30+
```
31+
32+
## HTML
33+
34+
```html
35+
<p><img src="/foo.png" alt="x" /></p>
36+
```
37+
38+
## Markdown
39+
40+
```md
41+
<p><img src="/foo.png" alt="x" /></p>
42+
```

packages/comark/src/internal/parse/token-processor.ts

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,20 @@ export function marmdownItTokensToComarkTree(
4848
let i = 0
4949
let endLine = options.startLine
5050
while (i < tokens.length) {
51+
const token = tokens[i]
52+
53+
// An html_block whose own content already closes its outer element
54+
// (e.g. `<p><img></p>`, `<div>foo</div>`, `<img>`, `<!-- ... -->`) has no
55+
// paired html_block_close later in the stream.
56+
if (token.type === 'html_block' && htmlBlockHasOwnClose(token.content || '')) {
57+
const htmlNodes = htmlToComarkNodes(token.content || '')
58+
for (const htmlNode of htmlNodes) {
59+
nodes.push(htmlNode)
60+
}
61+
i++
62+
continue
63+
}
64+
5165
const result = processBlockToken(tokens, i, false, state)
5266
if (result.node) {
5367
if (options.preservePositions) {
@@ -69,6 +83,24 @@ export function marmdownItTokensToComarkTree(
6983
return nodes
7084
}
7185

86+
/**
 * Whether an `html_block` token's content already closes its own outer element.
 * The block tokeniser only emits an `html_block_close` for lines that begin with
 * `</`, so any block whose closer sits on the opener's line (`<p><img></p>`),
 * including void elements and comments, has no companion close token.
 *
 * This is a textual heuristic, not an HTML parse: it reads the name of the first
 * tag in the content and reports whether a closing tag with that name appears
 * anywhere in the same content (case-insensitively).
 *
 * @param content - Raw text of the `html_block` token.
 * @returns `true` when the content is a comment/declaration/processing
 *   instruction, starts with a void element, or contains a closing tag matching
 *   its first opening tag; `false` otherwise (including empty content).
 */
function htmlBlockHasOwnClose(content: string): boolean {
  const trimmed = content.trim()
  if (!trimmed) return false
  // Comments, declarations, CDATA, processing instructions: self-terminating.
  if (trimmed.startsWith('<!') || trimmed.startsWith('<?')) return true
  // Tag names may contain hyphens (custom elements such as `<my-widget>`).
  // Capturing only `[a-zA-Z0-9]` would truncate the name to `my` and then look
  // for a `</my>` closer that never exists.
  const tag = /^<\s*([a-zA-Z][a-zA-Z0-9-]*)/.exec(trimmed)?.[1]
  if (!tag) return false
  if (VOID_ELEMENTS.has(tag.toLowerCase())) return true
  // `tag` consists solely of ASCII letters, digits and hyphens, none of which
  // are special outside a character class, so it needs no escaping here.
  return new RegExp(`</\\s*${tag}\\s*>`, 'i').test(trimmed)
}
103+
72104
/**
73105
* Extract and process attributes from a token's attrs array
74106
*/
@@ -289,18 +321,11 @@ function processBlockToken(
289321
return { node: [null, {}, inner] as unknown as ComarkNode, nextIndex: startIndex + 1 }
290322
}
291323

292-
const htmlNodes = htmlToComarkNodes(content)
293-
const [node1] = htmlNodes
324+
const children = processBlockChildren(tokens, startIndex + 1, 'html_block_close', false, false, false, state)
325+
const [node1] = htmlToComarkNodes(content)
294326
if (!node1) {
295327
return { node: null, nextIndex: startIndex + 1 }
296328
}
297-
298-
const isVoid = Array.isArray(node1) && VOID_ELEMENTS.has(node1[0] as string)
299-
if (isVoid) {
300-
return { node: node1, nextIndex: startIndex + 1 }
301-
}
302-
303-
const children = processBlockChildren(tokens, startIndex + 1, 'html_block_close', false, false, false, state)
304329
const node = [node1[0]!, node1[1]! as ComarkElementAttributes, ...children.nodes] as ComarkNode
305330

306331
return { node, nextIndex: children.nextIndex + 1 }
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import { describe, expect, it } from 'vitest'
2+
import { parse } from '../src/index'
3+
4+
describe('block-level raw HTML', () => {
  // Builders for the expected node shapes; each call returns a fresh literal so
  // no expectation shares object identity with another.
  const htmlBlockAttrs = () => ({ $: { html: 1, block: 1 } })
  const fooImage = () => ['img', { ...htmlBlockAttrs(), src: '/foo.png', alt: 'x' }]
  const paragraphWithImage = () => ['p', htmlBlockAttrs(), fooImage()]

  it('preserves inline children inside a self-contained block-level <p>', async () => {
    const { nodes } = await parse('<p><img src="/foo.png" alt="x"></p>')

    expect(nodes).toEqual([paragraphWithImage()])
  })

  it('preserves mixed text and inline children inside a single-line block-level <p>', async () => {
    const { nodes } = await parse('<p>hello <img src="/foo.png" alt="x"> world</p>')

    const expected = [['p', htmlBlockAttrs(), 'hello', fooImage(), 'world']]
    expect(nodes).toEqual(expected)
  })

  it('does not merge the following markdown paragraph into the preceding block-level <p>', async () => {
    // Heading, raw HTML paragraph and plain paragraph, each separated by a blank line.
    const md = `# Hello

<p><img src="/foo.png" alt="x"></p>

That is some text here.`

    const { nodes } = await parse(md)

    const expected = [
      ['h1', { id: 'hello' }, 'Hello'],
      paragraphWithImage(),
      ['p', {}, 'That is some text here.'],
    ]
    expect(nodes).toEqual(expected)
  })

  it('preserves text inside a single-line block-level <div>', async () => {
    const { nodes } = await parse('<div>foo</div>')

    expect(nodes).toEqual([['div', htmlBlockAttrs(), 'foo']])
  })
})

0 commit comments

Comments
 (0)