meta-oe/recipes-dbs/mysql/mariadb/CVE-2023-22084.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91

From 15ae97b1c2c14f1263cdc853673c4129625323de Mon Sep 17 00:00:00 2001
From: Marko Mäkelä <marko.makela@mariadb.com>
Date: Thu, 8 Feb 2024 08:09:20 +0000
Subject: [PATCH] MDEV-32578 row_merge_fts_doc_tokenize() handles parser plugin
  inconsistently

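When mysql/mysql-server@0c954c2
added a plugin interface for FULLTEXT INDEX tokenization to MySQL 5.7,
fts_tokenize_ctx::processed_len got a second meaning: with a parser
plugin it counts row_merge_fts_doc_tokenize_by_parser() calls instead
of the processed string length in bytes. That second meaning was only
partly implemented in row_merge_fts_doc_tokenize().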

This inconsistency could cause a crash when using FULLTEXT...WITH PARSER.
A test case that crashes MySQL 8.0 when using an n-gram parser and
single-character words does not crash MySQL 5.7, because the
buf_full condition in row_merge_fts_doc_tokenize() is not met there.

This change is inspired by
mysql/mysql-server@38e9a07,
which appeared in MySQL 5.7.44.

CVE: CVE-2023-22084
Upstream-Status: Backport [https://github.com/MariaDB/server/commit/15ae97b1c2c1]

Signed-off-by: Yogita Urade <yogita.urade@windriver.com>
---
 storage/innobase/include/row0ftsort.h |  6 +++++-
 storage/innobase/row/row0ftsort.cc    | 11 ++++++++---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h
index 65508caf..3ffa8243 100644
--- a/storage/innobase/include/row0ftsort.h
+++ b/storage/innobase/include/row0ftsort.h
@@ -104,7 +104,10 @@ typedef UT_LIST_BASE_NODE_T(row_fts_token_t)     fts_token_list_t;

 /** Structure stores information from string tokenization operation */
 struct fts_tokenize_ctx {
-	ulint			processed_len;  /*!< processed string length */
+	/** the processed string length in bytes
+	(when using the built-in tokenizer),
+	or the number of row_merge_fts_doc_tokenize_by_parser() calls */
+	ulint			processed_len;
	ulint			init_pos;       /*!< doc start position */
	ulint			buf_used;       /*!< the sort buffer (ID) when
						tokenization stops, which
@@ -115,6 +118,7 @@ struct fts_tokenize_ctx {
	ib_rbt_t*		cached_stopword;/*!< in: stopword list */
	dfield_t		sort_field[FTS_NUM_FIELDS_SORT];
						/*!< in: sort field */
+	/** parsed tokens (when using an external parser) */
	fts_token_list_t	fts_token_list;

	fts_tokenize_ctx() :
diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc
index 86e96624..406ff60f 100644
--- a/storage/innobase/row/row0ftsort.cc
+++ b/storage/innobase/row/row0ftsort.cc
@@ -491,7 +491,10 @@ row_merge_fts_doc_tokenize(

	/* Tokenize the data and add each word string, its corresponding
	doc id and position to sort buffer */
-	while (t_ctx->processed_len < doc->text.f_len) {
+	while (parser
+               ? (!t_ctx->processed_len
+                  || UT_LIST_GET_LEN(t_ctx->fts_token_list))
+               : t_ctx->processed_len < doc->text.f_len) {
		ulint		idx = 0;
		ulint		cur_len;
		doc_id_t	write_doc_id;
@@ -831,7 +834,8 @@ void fts_parallel_tokenization(
			/* Not yet finish processing the "doc" on hand,
			continue processing it */
			ut_ad(doc.text.f_str);
-			ut_ad(t_ctx.processed_len < doc.text.f_len);
+			ut_ad(buf[0]->index->parser
+			      || t_ctx.processed_len < doc.text.f_len);
		}

		processed = row_merge_fts_doc_tokenize(
@@ -841,7 +845,8 @@ void fts_parallel_tokenization(

		/* Current sort buffer full, need to recycle */
		if (!processed) {
-			ut_ad(t_ctx.processed_len < doc.text.f_len);
+			ut_ad(buf[0]->index->parser
+			      || t_ctx.processed_len < doc.text.f_len);
			ut_ad(t_ctx.rows_added[t_ctx.buf_used]);
			break;
		}
--
2.40.0