1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
|
From 15ae97b1c2c14f1263cdc853673c4129625323de Mon Sep 17 00:00:00 2001
From: Marko Mäkelä <marko.makela@mariadb.com>
Date: Thu, 8 Feb 2024 08:09:20 +0000
Subject: [PATCH] MDEV-32578 row_merge_fts_doc_tokenize() handles parser plugin inconsistently
When mysql/mysql-server@0c954c2
added a plugin interface for FULLTEXT INDEX tokenization to MySQL 5.7,
fts_tokenize_ctx::processed_len got a second meaning, which is only
partly implemented in row_merge_fts_doc_tokenize().
This inconsistency could cause a crash when using FULLTEXT...WITH PARSER.
A test case that crashes MySQL 8.0 when using an n-gram parser and
single-character words does not crash MySQL 5.7, because there the
buf_full condition in row_merge_fts_doc_tokenize() is not met.
This change is inspired by
mysql/mysql-server@38e9a07
that appeared in MySQL 5.7.44.
CVE: CVE-2023-22084
Upstream-Status: Backport [https://github.com/MariaDB/server/commit/15ae97b1c2c1]
Signed-off-by: Yogita Urade <yogita.urade@windriver.com>
---
storage/innobase/include/row0ftsort.h | 6 +++++-
storage/innobase/row/row0ftsort.cc | 11 ++++++++---
2 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h
index 65508caf..3ffa8243 100644
--- a/storage/innobase/include/row0ftsort.h
+++ b/storage/innobase/include/row0ftsort.h
@@ -104,7 +104,10 @@ typedef UT_LIST_BASE_NODE_T(row_fts_token_t) fts_token_list_t;
/** Structure stores information from string tokenization operation */
struct fts_tokenize_ctx {
- ulint processed_len; /*!< processed string length */
+ /** the processed string length in bytes
+ (when using the built-in tokenizer),
+ or the number of row_merge_fts_doc_tokenize_by_parser() calls */
+ ulint processed_len;
ulint init_pos; /*!< doc start position */
ulint buf_used; /*!< the sort buffer (ID) when
tokenization stops, which
@@ -115,6 +118,7 @@ struct fts_tokenize_ctx {
ib_rbt_t* cached_stopword;/*!< in: stopword list */
dfield_t sort_field[FTS_NUM_FIELDS_SORT];
/*!< in: sort field */
+ /** parsed tokens (when using an external parser) */
fts_token_list_t fts_token_list;
fts_tokenize_ctx() :
diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc
index 86e96624..406ff60f 100644
--- a/storage/innobase/row/row0ftsort.cc
+++ b/storage/innobase/row/row0ftsort.cc
@@ -491,7 +491,10 @@ row_merge_fts_doc_tokenize(
/* Tokenize the data and add each word string, its corresponding
doc id and position to sort buffer */
- while (t_ctx->processed_len < doc->text.f_len) {
+ while (parser
+ ? (!t_ctx->processed_len
+ || UT_LIST_GET_LEN(t_ctx->fts_token_list))
+ : t_ctx->processed_len < doc->text.f_len) {
ulint idx = 0;
ulint cur_len;
doc_id_t write_doc_id;
@@ -831,7 +834,8 @@ void fts_parallel_tokenization(
/* Not yet finish processing the "doc" on hand,
continue processing it */
ut_ad(doc.text.f_str);
- ut_ad(t_ctx.processed_len < doc.text.f_len);
+ ut_ad(buf[0]->index->parser
+ || t_ctx.processed_len < doc.text.f_len);
}
processed = row_merge_fts_doc_tokenize(
@@ -841,7 +845,8 @@ void fts_parallel_tokenization(
/* Current sort buffer full, need to recycle */
if (!processed) {
- ut_ad(t_ctx.processed_len < doc.text.f_len);
+ ut_ad(buf[0]->index->parser
+ || t_ctx.processed_len < doc.text.f_len);
ut_ad(t_ctx.rows_added[t_ctx.buf_used]);
break;
}
--
2.40.0
|