Skip to content

Commit a0a060a

Browse files
committed
diff/push: allow safe type conversions to/from vector (MySQL 9)
MySQL 9 added a vector column type, with vector(n) consisting of n 4-byte floats. When altering a table, vector columns may be converted to other binary column types safely (and vice versa), as long as the new length cannot possibly cause truncation. Going from vector(x) to vector(y) is also safe, as long as x < y. This commit improves `skeema diff` and `skeema push` to no longer require the --allow-unsafe option when performing these vector-related column type modifications. Additionally, this commit refactors some old code in the same file to no longer use regular expressions. Older Skeema code (especially circa 2017) tended to lean on regexp logic in places where Go 1.18's strings.Cut can handle the task fairly succinctly and more efficiently.
1 parent 665f978 commit a0a060a

File tree

2 files changed

+110
-90
lines changed

2 files changed

+110
-90
lines changed

internal/tengo/table_alter_clause.go

Lines changed: 97 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ package tengo
22

33
import (
44
"fmt"
5-
"regexp"
65
"strconv"
76
"strings"
87
)
@@ -375,8 +374,6 @@ type ModifyColumn struct {
375374
InUniqueConstraint bool // true if column is part of a unique index (or PK) in both old and new version of table
376375
}
377376

378-
var reDisplayWidth = regexp.MustCompile(`(tinyint|smallint|mediumint|int|bigint)\((\d+)\)( unsigned)?( zerofill)?`)
379-
380377
// Clause returns a MODIFY COLUMN clause of an ALTER TABLE statement.
381378
func (mc ModifyColumn) Clause(mods StatementModifiers) string {
382379
var positionClause string
@@ -419,6 +416,48 @@ func (mc ModifyColumn) Clause(mods StatementModifiers) string {
419416
return "MODIFY COLUMN " + mc.NewColumn.Definition(mods.Flavor) + positionClause
420417
}
421418

419+
// splitTypeSize is a helper function used by ModifyColumn.Unsafe(). It
420+
// separates the column type from its size modifier.
421+
func splitTypeSize(input string) (typ string, size uint64, ok bool) {
422+
before, after, ok := strings.Cut(input, "(")
423+
if !ok {
424+
return "", 0, false
425+
}
426+
after, _, ok = strings.Cut(after, ")") // strip modifiers e.g. " unsigned"
427+
if !ok {
428+
return "", 0, false
429+
}
430+
size, err := strconv.ParseUint(after, 10, 64)
431+
if err != nil {
432+
return "", 0, false
433+
}
434+
return before, size, true
435+
}
436+
437+
// splitType2Sizes is a helper function used by ModifyColumn.Unsafe(). It
438+
// separates the column type from its comma-separated size modifiers, for use
439+
// with types that have 2 such values (e.g. decimal).
440+
func splitType2Sizes(input string) (typ string, size1, size2 uint64, ok bool) {
441+
before, after, ok := strings.Cut(input, "(")
442+
if !ok {
443+
return "", 0, 0, false
444+
}
445+
after, _, ok = strings.Cut(after, ")") // strip modifiers e.g. " unsigned"
446+
if !ok {
447+
return "", 0, 0, false
448+
}
449+
str1, str2, ok := strings.Cut(after, ",")
450+
if !ok {
451+
return "", 0, 0, false
452+
}
453+
size1, err1 := strconv.ParseUint(str1, 10, 64)
454+
size2, err2 := strconv.ParseUint(str2, 10, 64)
455+
if err1 != nil || err2 != nil {
456+
return "", 0, 0, false
457+
}
458+
return before, size1, size2, true
459+
}
460+
422461
// Unsafe returns true if this clause is potentially destroys/corrupts existing
423462
// data, or restricts the range of data that may be stored. (Although the server
424463
// can also catch the latter case and prevent the ALTER, this only happens if
@@ -493,89 +532,56 @@ func (mc ModifyColumn) Unsafe(mods StatementModifiers) (unsafe bool, reason stri
493532
// decimal(a,b) -> decimal(x,y) unsafe if x < a or y < b: reduces range of
494533
// values that may be stored in the column
495534
if bothSamePrefix("decimal") {
496-
re := regexp.MustCompile(`^decimal\((\d+),(\d+)\)`)
497-
oldMatches := re.FindStringSubmatch(oldType)
498-
newMatches := re.FindStringSubmatch(newType)
499-
if oldMatches == nil || newMatches == nil {
500-
return true, genericReason
501-
}
502-
oldPrecision, _ := strconv.Atoi(oldMatches[1])
503-
oldScale, _ := strconv.Atoi(oldMatches[2])
504-
newPrecision, _ := strconv.Atoi(newMatches[1])
505-
newScale, _ := strconv.Atoi(newMatches[2])
506-
if newPrecision < oldPrecision || newScale < oldScale {
535+
_, oldPrecision, oldScale, oldOK := splitType2Sizes(oldType)
536+
_, newPrecision, newScale, newOK := splitType2Sizes(newType)
537+
if !oldOK || !newOK || newPrecision < oldPrecision || newScale < oldScale {
507538
return true, genericReason
508539
}
509540
return false, ""
510541
}
511542

512543
// bit(x) -> bit(y) unsafe if y < x
513544
if bothSamePrefix("bit") {
514-
re := regexp.MustCompile(`^bit\((\d+)\)`)
515-
oldMatches := re.FindStringSubmatch(oldType)
516-
newMatches := re.FindStringSubmatch(newType)
517-
if oldMatches == nil || newMatches == nil {
518-
return true, genericReason
519-
}
520-
oldSize, _ := strconv.Atoi(oldMatches[1])
521-
newSize, _ := strconv.Atoi(newMatches[1])
522-
if newSize < oldSize {
545+
_, oldSize, oldOK := splitTypeSize(oldType)
546+
_, newSize, newOK := splitTypeSize(newType)
547+
if !oldOK || !newOK || newSize < oldSize {
523548
return true, genericReason
524549
}
525550
return false, ""
526551
}
527552

528553
// time, timestamp, datetime: unsafe if decreasing or removing fractional
529554
// second precision (which reduces range of allowed values), but always safe
530-
// if adding fsp when none was there before.
555+
// if adding FSP when none was there before.
531556
if bothSamePrefix("time", "timestamp", "datetime") {
532557
// Since "time" and "timestamp" both begin with prefix "time", bothSamePrefix
533558
// will be tricked and we need to handle that mismatch explicitly
534559
if strings.HasPrefix(oldType, "timestamp") != strings.HasPrefix(newType, "timestamp") {
535560
return true, genericReason
536561
}
537-
if !strings.ContainsRune(oldType, '(') {
562+
_, oldFSP, oldHasFSP := splitTypeSize(oldType)
563+
if !oldHasFSP {
538564
return false, ""
539-
} else if !strings.ContainsRune(newType, '(') {
540-
return true, genericReason
541-
}
542-
re := regexp.MustCompile(`^[^(]+\((\d+)\)`)
543-
oldMatches := re.FindStringSubmatch(oldType)
544-
newMatches := re.FindStringSubmatch(newType)
545-
if oldMatches == nil || newMatches == nil {
546-
return true, genericReason
547565
}
548-
oldSize, _ := strconv.Atoi(oldMatches[1])
549-
newSize, _ := strconv.Atoi(newMatches[1])
550-
if newSize < oldSize {
566+
_, newFSP, newHasFSP := splitTypeSize(newType)
567+
if !newHasFSP || newFSP < oldFSP {
551568
return true, genericReason
552569
}
553570
return false, ""
554571
}
555572

556573
// float or double:
557-
// double -> double(x,y) or float -> float(x,y) unsafe
558-
// double(x,y) -> double or float(x,y) -> float IS safe (no parens = hardware max used)
559-
// double(a,b) -> double(x,y) or float(a,b) -> float(x,y) unsafe if x < a or y < b
574+
// double -> double(x,y); or float -> float(x,y) always unsafe
575+
// double(x,y) -> double; or float(x,y) -> float IS safe (no parens = hardware max used)
576+
// double(a,b) -> double(x,y); or float(a,b) -> float(x,y) unsafe if x < a or y < b
560577
// Converting from float to double may be safe (same rules as above), but double to float always unsafe
561578
// No extra check for unsigned->signed needed; although float/double support these, they don't affect max values
562579
if bothSamePrefix("float", "double") || (strings.HasPrefix(oldType, "float") && strings.HasPrefix(newType, "double")) {
563-
if !strings.ContainsRune(newType, '(') { // no parens = max allowed for type
580+
_, oldPrecision, oldScale, oldParens := splitType2Sizes(oldType)
581+
_, newPrecision, newScale, newParens := splitType2Sizes(newType)
582+
if !newParens { // no parens = max allowed for type
564583
return false, ""
565-
} else if !strings.ContainsRune(oldType, '(') {
566-
return true, genericReason
567-
}
568-
re := regexp.MustCompile(`^(?:float|double)\((\d+),(\d+)\)`)
569-
oldMatches := re.FindStringSubmatch(oldType)
570-
newMatches := re.FindStringSubmatch(newType)
571-
if oldMatches == nil || newMatches == nil {
572-
return true, genericReason
573-
}
574-
oldPrecision, _ := strconv.Atoi(oldMatches[1])
575-
oldScale, _ := strconv.Atoi(oldMatches[2])
576-
newPrecision, _ := strconv.Atoi(newMatches[1])
577-
newScale, _ := strconv.Atoi(newMatches[2])
578-
if newPrecision < oldPrecision || newScale < oldScale {
584+
} else if !oldParens || newPrecision < oldPrecision || newScale < oldScale {
579585
return true, genericReason
580586
}
581587
return false, ""
@@ -604,28 +610,26 @@ func (mc ModifyColumn) Unsafe(mods StatementModifiers) (unsafe bool, reason stri
604610

605611
// Conversions between string types (char, varchar, *text): unsafe if
606612
// new size < old size
607-
isStringType := func(typ string) (bool, uint64) {
613+
stringTypeSize := func(typ string) (uint64, bool) {
608614
textMap := map[string]uint64{
609615
"tinytext": 255,
610616
"text": 65535,
611617
"mediumtext": 16777215,
612618
"longtext": 4294967295,
613619
}
614620
if textLen, ok := textMap[typ]; ok {
615-
return true, textLen
621+
return textLen, true
616622
}
617-
re := regexp.MustCompile(`^(?:varchar|char)\((\d+)\)`)
618-
matches := re.FindStringSubmatch(typ)
619-
if matches == nil {
620-
return false, 0
623+
base, size, ok := splitTypeSize(typ)
624+
if ok && (base == "varchar" || base == "char") {
625+
return size, true
621626
}
622-
size, err := strconv.ParseUint(matches[1], 10, 64)
623-
return err == nil, size
627+
return 0, false
624628
}
625-
oldString, oldStringSize := isStringType(oldType)
626-
newString, newStringSize := isStringType(newType)
627-
if oldString && newString {
628-
if newStringSize < oldStringSize {
629+
oldSize, oldIsString := stringTypeSize(oldType)
630+
newSize, newIsString := stringTypeSize(newType)
631+
if oldIsString && newIsString {
632+
if newSize < oldSize {
629633
return true, genericReason
630634
}
631635
return false, ""
@@ -648,7 +652,7 @@ func (mc ModifyColumn) Unsafe(mods StatementModifiers) (unsafe bool, reason stri
648652
if isConversionBetween("inet6", "binary(16)", "char(39)", "varchar(39)") { // MariaDB 10.5+ inet6 type
649653
return false, ""
650654
}
651-
if isConversionBetween("inet4", "binary(4)", "char(15)", "varchar(15)") { // MariaDB 10.10+ inet4 type
655+
if isConversionBetween("inet4", "binary(4)", "char(15)", "varchar(15)") { // MariaDB 10.10+ inet4 type. (Also see special case below.)
652656
return false, ""
653657
}
654658
if isConversionBetween("uuid", "binary(16)", "char(32)", "varchar(32)", "char(36)", "varchar(36)") { // MariaDB 10.7+ uuid type
@@ -660,36 +664,43 @@ func (mc ModifyColumn) Unsafe(mods StatementModifiers) (unsafe bool, reason stri
660664
return false, ""
661665
}
662666

663-
// Conversions between variable-length binary types (varbinary, *blob):
664-
// unsafe if new size < old size
665-
// Note: This logic intentionally does not handle fixed-length binary(x)
666-
// conversions. Any changes with binary(x), even to binary(y) with y>x, are
667-
// treated as unsafe. The right-zero-padding behavior of binary type means any
668-
// size change effectively modifies the stored values if they are big-endian.
669-
isVarBinType := func(typ string) (bool, uint64) {
667+
// Conversions between binary types (binary, varbinary, *blob, vector):
668+
// * unsafe if new size < old size
669+
// * unsafe if converting a fixed-length binary col to/from anything other
670+
// than a vector, due to the right-side zero-padding behavior of binary
671+
// having unintended effects. Even converting binary(x) to binary(y) with y>x
672+
// is unsafe because the zero-padding potentially impacts any trailing big-
673+
// endian value. But we permit this for vectors since they have a well-
674+
// defined encoding.
675+
binaryTypeSize := func(typ string) (base string, size uint64, isBinary bool) {
670676
blobMap := map[string]uint64{
671677
"tinyblob": 255,
672678
"blob": 65535,
673679
"mediumblob": 16777215,
674680
"longblob": 4294967295,
675681
}
676682
if blobLen, ok := blobMap[typ]; ok {
677-
return true, blobLen
678-
}
679-
re := regexp.MustCompile(`^varbinary\((\d+)\)`)
680-
matches := re.FindStringSubmatch(typ)
681-
if matches == nil {
682-
return false, 0
683+
return typ, blobLen, true
684+
}
685+
base, size, ok := splitTypeSize(typ)
686+
if ok {
687+
if base == "binary" || base == "varbinary" {
688+
return base, size, true
689+
} else if base == "vector" {
690+
return base, size * 4, true // each vector dimension is a 4-byte float
691+
}
683692
}
684-
size, err := strconv.ParseUint(matches[1], 10, 64)
685-
return err == nil, size
693+
return "", 0, false
686694
}
687-
oldVarBin, oldVarBinSize := isVarBinType(oldType)
688-
newVarBin, newVarBinSize := isVarBinType(newType)
689-
if oldVarBin && newVarBin {
690-
if newVarBinSize < oldVarBinSize {
695+
oldBase, oldSize, oldIsBinary := binaryTypeSize(oldType)
696+
newBase, newSize, newIsBinary := binaryTypeSize(newType)
697+
if oldIsBinary && newIsBinary {
698+
if newSize < oldSize {
691699
return true, genericReason
692700
}
701+
if (oldBase == "binary" && newBase != "vector") || (newBase == "binary" && oldBase != "vector") {
702+
return true, "modification to column " + mc.OldColumn.Name + " may have unintended effects due to zero-padding behavior of binary type"
703+
}
693704
return false, ""
694705
}
695706

internal/tengo/table_diff_test.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -699,8 +699,8 @@ func TestModifyColumnUnsafe(t *testing.T) {
699699
{"int unsigned", "int"},
700700
{"bigint(11)", "bigint(11) unsigned"},
701701
{"int(11)", "bigint(20) unsigned"},
702-
{"enum('a', 'b', 'c')", "enum('a', 'aa', 'b', 'c'"},
703-
{"set('abc', 'def', 'ghi')", "set('abc', 'def')"},
702+
{"enum('a','b','c')", "enum('a','aa','b','c'"},
703+
{"set('abc','def','ghi')", "set('abc','def')"},
704704
{"decimal(10,5)", "decimal(10,4)"},
705705
{"decimal(10,5)", "decimal(9,5)"},
706706
{"decimal(10,5)", "decimal(9,6)"},
@@ -731,6 +731,8 @@ func TestModifyColumnUnsafe(t *testing.T) {
731731
{"tinytext", "char(200)"},
732732
{"tinyblob", "longtext"},
733733
{"binary(5)", "binary(10)"},
734+
{"binary(5)", "varbinary(10)"},
735+
{"tinyblob", "binary(4000)"},
734736
{"bit(10)", "bit(9)"},
735737
{"binary(17)", "inet6"},
736738
{"inet6", "varbinary(16)"},
@@ -740,6 +742,10 @@ func TestModifyColumnUnsafe(t *testing.T) {
740742
{"inet4", "inet6"}, // unsafe with empty StatementModifiers; see add'l testing later below
741743
{"char(31)", "uuid"},
742744
{"uuid", "binary(15)"},
745+
{"vector(10)", "binary(36)"},
746+
{"vector(64)", "tinyblob"},
747+
{"tinyblob", "vector(63)"},
748+
{"vector(4)", "varchar(4000)"},
743749
}
744750
for _, types := range expectUnsafe {
745751
assertUnsafe(types[0], types[1], true)
@@ -750,8 +756,8 @@ func TestModifyColumnUnsafe(t *testing.T) {
750756
{"mediumint(4)", "mediumint(3)"},
751757
{"int zerofill", "int"},
752758
{"int(10) unsigned", "bigint(20)"},
753-
{"enum('a', 'b', 'c')", "enum('a', 'b', 'c', 'd')"},
754-
{"set('abc', 'def', 'ghi')", "set('abc', 'def', 'ghi', 'jkl')"},
759+
{"enum('a','b','c')", "enum('a','b','c','d')"},
760+
{"set('abc','def','ghi')", "set('abc','def','ghi','jkl')"},
755761
{"decimal(9,4)", "decimal(10,4)"},
756762
{"decimal(9,4)", "decimal(9,5)"},
757763
{"decimal(9,4) unsigned", "decimal(9,4)"},
@@ -786,6 +792,9 @@ func TestModifyColumnUnsafe(t *testing.T) {
786792
{"varchar(15)", "inet4"},
787793
{"uuid", "varchar(32)"},
788794
{"binary(16)", "uuid"},
795+
{"vector(10)", "binary(40)"},
796+
{"vector(63)", "tinyblob"},
797+
{"tinyblob", "vector(64)"},
789798
}
790799
for _, types := range expectSafe {
791800
assertUnsafe(types[0], types[1], false)

0 commit comments

Comments
 (0)