Skip to content

Commit

Permalink
Merge pull request #550 from tdakkota/fix/html-parser-entity-generation
Browse files Browse the repository at this point in the history
fix(entity): length and offset computing
  • Loading branch information
ernado authored Oct 21, 2021
2 parents 6af6bda + 6d119fb commit ac4d875
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 20 deletions.
37 changes: 23 additions & 14 deletions telegram/message/entity/format.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,16 @@ import (
"github.com/gotd/td/tg"
)

// utf8entity describes a formatted span of the built message in bytes:
// callers fill it from the message buffer length and len() of the appended
// string, as opposed to the ComputeLength-based values handed to formatters.
type utf8entity struct {
	// offset is the byte index in the message where the span starts;
	// length is the size of the span in bytes.
	offset, length int
}

// Builder builds message string and text entities.
type Builder struct {
entities []tg.MessageEntityClass
// lengths stores offset/length data too, but measured in bytes of the UTF-8 encoded message
lengths []utf8entity
// We store the index of the first entity added by the last Format call.
// It is needed to trim trailing space in all entities of the last text block.
lastFormatIndex int
Expand Down Expand Up @@ -44,24 +51,25 @@ func (b *Builder) Complete() (string, []tg.MessageEntityClass) {
entities := b.entities
b.reset()

// If there are not entities or last text block does not have entities
// If there are no entities or last text block does not have entities,
// so we just return built message.
if len(entities) == 0 || b.lastFormatIndex >= len(entities) {
if len(b.lengths) == 0 || b.lastFormatIndex >= len(entities) {
return msg, entities
}

// Since Telegram client does not handle space after formatted message
// we should compute length of the last block to trim it.
// Get first entity of last text block.
entity := entities[len(entities)-1]
offset := entity.GetOffset()
entity := b.lengths[len(b.lengths)-1]
offset := entity.offset
length := entity.length
// Get last text block.
lastBlock := msg[offset:]
// Trim this block.
trimmed := strings.TrimRightFunc(lastBlock, unicode.IsSpace)

// If there is a difference, we should change the length of all the entities.
if len(trimmed) != len(lastBlock) {
if length >= len(lastBlock) && len(trimmed) != len(lastBlock) {
length := ComputeLength(trimmed)
for idx := range entities[b.lastFormatIndex:] {
setLength(idx, length, entities[b.lastFormatIndex:])
Expand Down Expand Up @@ -102,13 +110,6 @@ func ComputeLength(s string) int {
return n
}

// AddEntities appends the given raw entities to the builder exactly as
// passed and returns the builder for chaining.
//
// Use carefully: the entities are stored without any adjustment, and this
// method updates neither b.lengths nor b.lastFormatIndex.
func (b *Builder) AddEntities(e ...tg.MessageEntityClass) *Builder {
	if len(e) == 0 {
		// Nothing to add; the builder is returned untouched.
		return b
	}
	b.entities = append(b.entities, e...)
	return b
}

func (b *Builder) appendMessage(s string, formats ...Formatter) *Builder {
if s == "" {
return b
Expand All @@ -117,11 +118,19 @@ func (b *Builder) appendMessage(s string, formats ...Formatter) *Builder {
offset := ComputeLength(b.message.String())
length := ComputeLength(s)

b.appendEntities(offset, length, utf8entity{
offset: b.message.Len(),
length: len(s),
}, formats...)
b.message.WriteString(s)
return b
}

// appendEntities creates one entity per formatter for the span described by
// offset and length, and records the byte-based span u alongside each of
// them in b.lengths.
//
// The text itself is not written here: this function has no string
// parameter, so callers are responsible for writing to b.message.
//
// It returns the builder to allow call chaining.
func (b *Builder) appendEntities(offset, length int, u utf8entity, formats ...Formatter) *Builder {
	// Remember where this batch starts; Complete uses this index to adjust
	// the entities added by the most recent call.
	b.lastFormatIndex = len(b.entities)
	for _, format := range formats {
		b.entities = append(b.entities, format(offset, length))
		// One utf8entity per produced entity, appended in the same order.
		b.lengths = append(b.lengths, u)
	}

	return b
}
21 changes: 15 additions & 6 deletions telegram/message/entity/html.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,16 @@ import (
)

// stackElem is the state kept for an opened HTML tag until its end tag is
// processed.
type stackElem struct {
	// offset is the parser offset at the moment the tag was opened; the
	// parser advances it by ComputeLength of every text token.
	offset int
	// utf8offset is the length of the builder message buffer at the moment
	// the tag was opened.
	utf8offset int
	// tag is the tag name used to select the formatter.
	tag string
	// format builds the entity when the tag is closed; it may be nil, in
	// which case no entity is produced.
	format Formatter
}

type htmlParser struct {
tokenizer *html.Tokenizer
builder *Builder
offset int
stack []stackElem
attr map[string]string
userResolver func(id int64) (tg.InputUserClass, error)
Expand Down Expand Up @@ -52,7 +54,8 @@ func (p *htmlParser) startTag() error {
p.fillAttrs()
}

e.offset = p.builder.message.Len()
e.offset = p.offset
e.utf8offset = p.builder.message.Len()
// See https://core.telegram.org/bots/api#html-style.
switch e.tag {
case "b", "strong":
Expand Down Expand Up @@ -123,7 +126,11 @@ func (p *htmlParser) endTag() error {

length := ComputeLength(p.builder.message.String())
if s.format != nil {
p.builder.entities = append(p.builder.entities, s.format(s.offset, length-s.offset))
u8 := utf8entity{
offset: s.utf8offset,
length: p.builder.message.Len() - s.utf8offset,
}
p.builder.appendEntities(s.offset, length-s.offset, u8, s.format)
}
return nil
}
Expand All @@ -138,7 +145,9 @@ func (p *htmlParser) parse() error {
}
return nil
case html.TextToken:
p.builder.message.Write(p.tokenizer.Text())
text := p.tokenizer.Text()
p.builder.message.Write(text)
p.offset += ComputeLength(string(text))
case html.StartTagToken:
if err := p.startTag(); err != nil {
return err
Expand Down
73 changes: 73 additions & 0 deletions telegram/message/entity/html_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"testing"

"github.com/stretchr/testify/require"
"golang.org/x/net/html"

"github.com/gotd/td/tg"
)
Expand Down Expand Up @@ -60,3 +61,75 @@ func TestHTML(t *testing.T) {
})
}
}

// TestIssue525 feeds multi-line HTML snippets (one Russian, one English)
// through htmlParser and checks the offset and length of every entity
// returned by Builder.Complete.
//
// In the Russian case the expected offsets count each Cyrillic letter once
// (the italic entity starts at 7, right after the six-letter first line and
// its newline), so they are not byte offsets.
func TestIssue525(t *testing.T) {
// test returns a subtest body that parses text and compares the resulting
// entities with expected.
test := func(text string, expected []tg.MessageEntityClass) func(t *testing.T) {
return func(t *testing.T) {
a := require.New(t)

// The parser is assembled by hand around a fresh Builder; no user
// resolver is set.
b := Builder{}
p := htmlParser{
tokenizer: html.NewTokenizer(strings.NewReader(text)),
builder: &b,
attr: map[string]string{},
userResolver: nil,
}

a.NoError(p.parse())
// Only the entities are checked; the built message text is discarded.
_, entities := b.Complete()
a.Equal(expected, entities)
}
}

t.Run("Ru", test(`Строка
<i>Строка текста курсивом</i>
Обычный текст с <a href="https://google.com">Ссылкой</a> внутри, и
ещё одна ссылка - <a href="https://go.dev">Здесь</a>.
Ещё одна строка.
`,
[]tg.MessageEntityClass{
&tg.MessageEntityItalic{
Offset: 7,
Length: 22,
},
&tg.MessageEntityTextURL{
Offset: 47,
Length: 7,
URL: "https://google.com",
},
&tg.MessageEntityTextURL{
Offset: 83,
Length: 5,
URL: "https://go.dev",
},
}),
)
t.Run("En", test(`Line
<i>Italic line of text</i>
Normal line of text with <a href="https://google.com">Link</a> inside, and
another link now - <a href="https://go.dev">Here</a>.
One more line.
`,
[]tg.MessageEntityClass{
&tg.MessageEntityItalic{
Offset: 5,
Length: 19,
},
&tg.MessageEntityTextURL{
Offset: 51,
Length: 4,
URL: "https://google.com",
},
&tg.MessageEntityTextURL{
Offset: 87,
Length: 4,
URL: "https://go.dev",
},
}),
)

}

0 comments on commit ac4d875

Please sign in to comment.