From 978b4176f1a31a497aaadd33f21659b318832c95 Mon Sep 17 00:00:00 2001
From: Daenney <daenney@users.noreply.github.com>
Date: Wed, 12 Jun 2024 14:21:34 +0200
Subject: [PATCH] [chore] Upgrade wasm-sqlite to v0.16.2 (#2997)

---
 go.mod                                        |   4 +-
 go.sum                                        |   8 +-
 .../ncruces/go-sqlite3/internal/util/json.go  |   2 +-
 vendor/github.com/ncruces/go-sqlite3/json.go  |   3 +-
 .../github.com/ncruces/go-sqlite3/pointer.go  |   3 +-
 vendor/github.com/ncruces/go-sqlite3/stmt.go  |   2 +-
 vendor/github.com/ncruces/go-sqlite3/value.go |   2 +-
 .../ncruces/go-sqlite3/vfs/memdb/memdb.go     |  29 +-
 .../github.com/ncruces/go-sqlite3/vfs/shm.go  |   3 +
 .../ncruces/go-sqlite3/vfs/shm_bsd.go         |  16 +-
 .../github.com/tetratelabs/wazero/config.go   |  19 +-
 .../wazero/experimental/checkpoint.go         |  13 -
 .../wazero/experimental/listener.go           |   6 -
 .../experimental/sys/syscall_errno_windows.go |   6 +-
 .../engine/interpreter/interpreter.go         |   3 +
 .../wazevo/backend/executable_context.go      |  28 +-
 .../wazevo/backend/isa/amd64/machine.go       |  16 +-
 .../engine/wazevo/backend/isa/arm64/abi.go    |  30 +-
 .../backend/isa/arm64/abi_entry_preamble.go   |  29 +-
 .../wazevo/backend/isa/arm64/abi_go_call.go   | 119 ++---
 .../engine/wazevo/backend/isa/arm64/instr.go  | 438 +++++++++---------
 .../backend/isa/arm64/instr_encoding.go       | 136 +++---
 .../backend/isa/arm64/lower_constant.go       |   6 +-
 .../wazevo/backend/isa/arm64/lower_instr.go   | 379 +++++++--------
 .../wazevo/backend/isa/arm64/lower_mem.go     |  59 ++-
 .../wazevo/backend/isa/arm64/machine.go       |  17 +-
 .../isa/arm64/machine_pro_epi_logue.go        |  51 +-
 .../backend/isa/arm64/machine_regalloc.go     |  10 +-
 .../engine/wazevo/backend/regalloc.go         |  12 +-
 .../wazevo/backend/regalloc/regalloc.go       | 140 +++---
 .../engine/wazevo/backend/regalloc/regset.go  |  44 +-
 .../internal/engine/wazevo/call_engine.go     |  18 -
 .../internal/engine/wazevo/engine_cache.go    |   7 +
 .../engine/wazevo/frontend/frontend.go        |  21 +-
 .../internal/engine/wazevo/frontend/lower.go  |  12 +-
 .../internal/engine/wazevo/module_engine.go   |  31 +-
 .../internal/engine/wazevo/ssa/basic_block.go |  73 +--
 .../internal/engine/wazevo/ssa/builder.go     | 104 +++--
 .../wazero/internal/engine/wazevo/ssa/pass.go |  41 +-
 .../engine/wazevo/ssa/pass_blk_layouts.go     |  16 +-
 .../internal/engine/wazevo/ssa/pass_cfg.go    |  49 +-
 .../wazero/internal/engine/wazevo/ssa/type.go |   3 +
 .../internal/engine/wazevo/wazevoapi/pool.go  |   4 +-
 .../wazero/internal/platform/cpuid.go         |   5 +
 .../wazero/internal/platform/cpuid_amd64.go   |  36 +-
 .../internal/platform/cpuid_unsupported.go    |   9 +-
 .../wazero/internal/platform/mmap_unix.go     |   2 -
 .../internal/platform/mmap_unsupported.go     |   2 -
 .../wazero/internal/platform/mmap_windows.go  |   2 -
 .../tetratelabs/wazero/internal/sysfs/file.go |   3 -
 .../wazero/internal/wasm/engine.go            |   3 +
 .../wazero/internal/wasm/func_validation.go   |   7 -
 .../wazero/internal/wasm/memory.go            |  23 +-
 .../wazero/internal/wasm/module.go            |   7 +-
 vendor/modules.txt                            |   4 +-
 55 files changed, 1075 insertions(+), 1040 deletions(-)

diff --git a/go.mod b/go.mod
index 1fe0bf5d2..149a88117 100644
--- a/go.mod
+++ b/go.mod
@@ -44,7 +44,7 @@ require (
 	github.com/miekg/dns v1.1.59
 	github.com/minio/minio-go/v7 v7.0.71
 	github.com/mitchellh/mapstructure v1.5.0
-	github.com/ncruces/go-sqlite3 v0.16.1
+	github.com/ncruces/go-sqlite3 v0.16.2
 	github.com/oklog/ulid v1.3.1
 	github.com/prometheus/client_golang v1.19.1
 	github.com/spf13/cobra v1.8.0
@@ -199,7 +199,7 @@ require (
 	github.com/superseriousbusiness/go-jpeg-image-structure/v2 v2.0.0-20220321154430-d89a106fdabe // indirect
 	github.com/superseriousbusiness/go-png-image-structure/v2 v2.0.1-SSB // indirect
 	github.com/tdewolff/parse/v2 v2.7.14 // indirect
-	github.com/tetratelabs/wazero v1.7.2 // indirect
+	github.com/tetratelabs/wazero v1.7.3 // indirect
 	github.com/tmthrgd/go-hex v0.0.0-20190904060850-447a3041c3bc // indirect
 	github.com/toqueteos/webbrowser v1.2.0 // indirect
 	github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
diff --git a/go.sum b/go.sum
index 81d950a8b..df382c545 100644
--- a/go.sum
+++ b/go.sum
@@ -445,8 +445,8 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G
 github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
 github.com/moul/http2curl v1.0.0 h1:dRMWoAtb+ePxMlLkrCbAqh4TlPHXvoGUSQ323/9Zahs=
 github.com/moul/http2curl v1.0.0/go.mod h1:8UbvGypXm98wA/IqH45anm5Y2Z6ep6O31QGOAZ3H0fQ=
-github.com/ncruces/go-sqlite3 v0.16.1 h1:1wHv7s8y+fWK44UIliotJ42ZV41A5T0sjIAqGmnMrkc=
-github.com/ncruces/go-sqlite3 v0.16.1/go.mod h1:feFXbBcbLtxNk6XWG1ROt8MS9+E45yCW3G8o4ixIqZ8=
+github.com/ncruces/go-sqlite3 v0.16.2 h1:HesVRr0BC6QSGSEQfEXOntFWS9wd4Z8ms4nJzfUv4Rg=
+github.com/ncruces/go-sqlite3 v0.16.2/go.mod h1:wkUIvOrAjFQnefVlivJfcowKUcfMHs4mvLfhVanzHHI=
 github.com/ncruces/go-strftime v0.1.9 h1:bY0MQC28UADQmHmaF5dgpLmImcShSi2kHU9XLdhx/f4=
 github.com/ncruces/go-strftime v0.1.9/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
 github.com/ncruces/julianday v1.0.0 h1:fH0OKwa7NWvniGQtxdJRxAgkBMolni2BjDHaWTxqt7M=
@@ -562,8 +562,8 @@ github.com/tdewolff/test v1.0.11-0.20240106005702-7de5f7df4739 h1:IkjBCtQOOjIn03
 github.com/tdewolff/test v1.0.11-0.20240106005702-7de5f7df4739/go.mod h1:XPuWBzvdUzhCuxWO1ojpXsyzsA5bFoS3tO/Q3kFuTG8=
 github.com/technologize/otel-go-contrib v1.1.1 h1:wZH9aSPNWZWIkEh3vfaKfMb15AJ80jJ1aVj/4GZdqIw=
 github.com/technologize/otel-go-contrib v1.1.1/go.mod h1:dCN/wj2WyUO8aFZFdIN+6tfJHImjTML/8r2YVYAy3So=
-github.com/tetratelabs/wazero v1.7.2 h1:1+z5nXJNwMLPAWaTePFi49SSTL0IMx/i3Fg8Yc25GDc=
-github.com/tetratelabs/wazero v1.7.2/go.mod h1:ytl6Zuh20R/eROuyDaGPkp82O9C/DJfXAwJfQ3X6/7Y=
+github.com/tetratelabs/wazero v1.7.3 h1:PBH5KVahrt3S2AHgEjKu4u+LlDbbk+nsGE3KLucy6Rw=
+github.com/tetratelabs/wazero v1.7.3/go.mod h1:ytl6Zuh20R/eROuyDaGPkp82O9C/DJfXAwJfQ3X6/7Y=
 github.com/tidwall/btree v0.0.0-20191029221954-400434d76274 h1:G6Z6HvJuPjG6XfNGi/feOATzeJrfgTNJY+rGrHbA04E=
 github.com/tidwall/btree v0.0.0-20191029221954-400434d76274/go.mod h1:huei1BkDWJ3/sLXmO+bsCNELL+Bp2Kks9OLyQFkzvA8=
 github.com/tidwall/buntdb v1.1.2 h1:noCrqQXL9EKMtcdwJcmuVKSEjqu1ua99RHHgbLTEHRo=
diff --git a/vendor/github.com/ncruces/go-sqlite3/internal/util/json.go b/vendor/github.com/ncruces/go-sqlite3/internal/util/json.go
index c0ba38cf0..7f6849a42 100644
--- a/vendor/github.com/ncruces/go-sqlite3/internal/util/json.go
+++ b/vendor/github.com/ncruces/go-sqlite3/internal/util/json.go
@@ -26,7 +26,7 @@ func (j JSON) Scan(value any) error {
 		buf = v.AppendFormat(buf, time.RFC3339Nano)
 		buf = append(buf, '"')
 	case nil:
-		buf = append(buf, "null"...)
+		buf = []byte("null")
 	default:
 		panic(AssertErr())
 	}
diff --git a/vendor/github.com/ncruces/go-sqlite3/json.go b/vendor/github.com/ncruces/go-sqlite3/json.go
index 9b2565e87..2b762c092 100644
--- a/vendor/github.com/ncruces/go-sqlite3/json.go
+++ b/vendor/github.com/ncruces/go-sqlite3/json.go
@@ -5,7 +5,8 @@ import "github.com/ncruces/go-sqlite3/internal/util"
 // JSON returns a value that can be used as an argument to
 // [database/sql.DB.Exec], [database/sql.Row.Scan] and similar methods to
 // store value as JSON, or decode JSON into value.
-// JSON should NOT be used with [BindJSON] or [ResultJSON].
+// JSON should NOT be used with [Stmt.BindJSON], [Stmt.ColumnJSON],
+// [Value.JSON], or [Context.ResultJSON].
 func JSON(value any) any {
 	return util.JSON{Value: value}
 }
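
The helper documented in the hunk above is the public entry point for JSON columns, so a brief usage sketch may help reviewers of this upgrade; it is illustrative only and not part of the patch. It assumes the ncruces/go-sqlite3 database/sql driver (registered as "sqlite3" by its driver package) and the embedded SQLite build from the embed package; the table and column names are made up.

package main

import (
	"database/sql"
	"log"

	"github.com/ncruces/go-sqlite3"
	_ "github.com/ncruces/go-sqlite3/driver" // registers the "sqlite3" driver (assumed here)
	_ "github.com/ncruces/go-sqlite3/embed"  // embedded SQLite Wasm binary (assumed here)
)

func main() {
	db, err := sql.Open("sqlite3", ":memory:")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	if _, err := db.Exec(`CREATE TABLE t (doc TEXT)`); err != nil {
		log.Fatal(err)
	}

	// Encode a Go value as JSON on the way in.
	in := map[string]any{"answer": 42}
	if _, err := db.Exec(`INSERT INTO t (doc) VALUES (?)`, sqlite3.JSON(in)); err != nil {
		log.Fatal(err)
	}

	// Decode the stored JSON back into a Go value on the way out.
	var out map[string]any
	if err := db.QueryRow(`SELECT doc FROM t`).Scan(sqlite3.JSON(&out)); err != nil {
		log.Fatal(err)
	}
	log.Println(out["answer"])
}
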
diff --git a/vendor/github.com/ncruces/go-sqlite3/pointer.go b/vendor/github.com/ncruces/go-sqlite3/pointer.go
index 611c1528c..0e2418b99 100644
--- a/vendor/github.com/ncruces/go-sqlite3/pointer.go
+++ b/vendor/github.com/ncruces/go-sqlite3/pointer.go
@@ -4,7 +4,8 @@ import "github.com/ncruces/go-sqlite3/internal/util"
 
 // Pointer returns a pointer to a value that can be used as an argument to
 // [database/sql.DB.Exec] and similar methods.
-// Pointer should NOT be used with [BindPointer] or [ResultPointer].
+// Pointer should NOT be used with [Stmt.BindPointer],
+// [Value.Pointer], or [Context.ResultPointer].
 //
 // https://sqlite.org/bindptr.html
 func Pointer[T any](value T) any {
diff --git a/vendor/github.com/ncruces/go-sqlite3/stmt.go b/vendor/github.com/ncruces/go-sqlite3/stmt.go
index ac40e3802..381a7d06b 100644
--- a/vendor/github.com/ncruces/go-sqlite3/stmt.go
+++ b/vendor/github.com/ncruces/go-sqlite3/stmt.go
@@ -564,7 +564,7 @@ func (s *Stmt) ColumnJSON(col int, ptr any) error {
 	var data []byte
 	switch s.ColumnType(col) {
 	case NULL:
-		data = append(data, "null"...)
+		data = []byte("null")
 	case TEXT:
 		data = s.ColumnRawText(col)
 	case BLOB:
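
For context on the NULL branches changed above and in value.go below: the new []byte("null") produces the same bytes as the old append, and encoding/json treats that literal as JSON null, which clears a destination pointer. A minimal stdlib-only illustration:

package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	s := "previous value"
	p := &s
	// Unmarshaling the JSON literal "null" into a pointer sets it to nil,
	// which is how an SQL NULL column surfaces through ColumnJSON.
	if err := json.Unmarshal([]byte("null"), &p); err != nil {
		panic(err)
	}
	fmt.Println(p == nil) // true
}
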
diff --git a/vendor/github.com/ncruces/go-sqlite3/value.go b/vendor/github.com/ncruces/go-sqlite3/value.go
index d0edf215b..1894ff4f1 100644
--- a/vendor/github.com/ncruces/go-sqlite3/value.go
+++ b/vendor/github.com/ncruces/go-sqlite3/value.go
@@ -177,7 +177,7 @@ func (v Value) JSON(ptr any) error {
 	var data []byte
 	switch v.Type() {
 	case NULL:
-		data = append(data, "null"...)
+		data = []byte("null")
 	case TEXT:
 		data = v.RawText()
 	case BLOB:
diff --git a/vendor/github.com/ncruces/go-sqlite3/vfs/memdb/memdb.go b/vendor/github.com/ncruces/go-sqlite3/vfs/memdb/memdb.go
index 8dc57ab9c..f21335d8e 100644
--- a/vendor/github.com/ncruces/go-sqlite3/vfs/memdb/memdb.go
+++ b/vendor/github.com/ncruces/go-sqlite3/vfs/memdb/memdb.go
@@ -75,11 +75,6 @@ func (memVFS) FullPathname(name string) (string, error) {
 type memDB struct {
 	name string
 
-	// +checklocks:lockMtx
-	pending *memFile
-	// +checklocks:lockMtx
-	reserved *memFile
-
 	// +checklocks:dataMtx
 	data []*[sectorSize]byte
 
@@ -88,6 +83,10 @@ type memDB struct {
 
 	// +checklocks:lockMtx
 	shared int
+	// +checklocks:lockMtx
+	reserved bool
+	// +checklocks:lockMtx
+	pending bool
 
 	// +checklocks:memoryMtx
 	refs int
@@ -214,24 +213,24 @@ func (m *memFile) Lock(lock vfs.LockLevel) error {
 
 	switch lock {
 	case vfs.LOCK_SHARED:
-		if m.pending != nil {
+		if m.pending {
 			return sqlite3.BUSY
 		}
 		m.shared++
 
 	case vfs.LOCK_RESERVED:
-		if m.reserved != nil {
+		if m.reserved {
 			return sqlite3.BUSY
 		}
-		m.reserved = m
+		m.reserved = true
 
 	case vfs.LOCK_EXCLUSIVE:
 		if m.lock < vfs.LOCK_PENDING {
-			if m.pending != nil {
+			if m.pending {
 				return sqlite3.BUSY
 			}
 			m.lock = vfs.LOCK_PENDING
-			m.pending = m
+			m.pending = true
 		}
 
 		for before := time.Now(); m.shared > 1; {
@@ -256,11 +255,11 @@ func (m *memFile) Unlock(lock vfs.LockLevel) error {
 	m.lockMtx.Lock()
 	defer m.lockMtx.Unlock()
 
-	if m.pending == m {
-		m.pending = nil
+	if m.pending && m.lock >= vfs.LOCK_PENDING {
+		m.pending = false
 	}
-	if m.reserved == m {
-		m.reserved = nil
+	if m.reserved && m.lock >= vfs.LOCK_RESERVED {
+		m.reserved = false
 	}
 	if lock < vfs.LOCK_SHARED {
 		m.shared--
@@ -275,7 +274,7 @@ func (m *memFile) CheckReservedLock() (bool, error) {
 	}
 	m.lockMtx.Lock()
 	defer m.lockMtx.Unlock()
-	return m.reserved != nil, nil
+	return m.reserved, nil
 }
 
 func (m *memFile) SectorSize() int {
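
The memdb hunks above replace the *memFile owner pointers with plain booleans guarded by lockMtx. A simplified, self-contained sketch of that boolean lock-flag scheme follows; the type and method names are invented for illustration and are not the vendored memdb types.

package main

import (
	"errors"
	"sync"
)

var errBusy = errors.New("database is locked")

// dbLockState mirrors the idea of the shared/reserved/pending fields above:
// a counter for shared readers plus two boolean flags under one mutex.
type dbLockState struct {
	mu       sync.Mutex
	shared   int  // number of connections holding a shared lock
	reserved bool // one connection intends to write
	pending  bool // a connection is waiting to upgrade to exclusive
}

func (s *dbLockState) lockShared() error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.pending {
		return errBusy // don't admit new readers while a writer is waiting
	}
	s.shared++
	return nil
}

func (s *dbLockState) lockReserved() error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.reserved {
		return errBusy // at most one reserved lock at a time
	}
	s.reserved = true
	return nil
}

func main() {
	var s dbLockState
	_ = s.lockShared()
	_ = s.lockReserved()
}
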
diff --git a/vendor/github.com/ncruces/go-sqlite3/vfs/shm.go b/vendor/github.com/ncruces/go-sqlite3/vfs/shm.go
index 58da34df4..7b0d4b677 100644
--- a/vendor/github.com/ncruces/go-sqlite3/vfs/shm.go
+++ b/vendor/github.com/ncruces/go-sqlite3/vfs/shm.go
@@ -125,6 +125,9 @@ func (s *vfsShm) shmMap(ctx context.Context, mod api.Module, id, size int32, ext
 		return 0, _IOERR_SHMMAP
 	}
 	s.regions = append(s.regions, r)
+	if s.readOnly {
+		return r.Ptr, _READONLY
+	}
 	return r.Ptr, _OK
 }
 
diff --git a/vendor/github.com/ncruces/go-sqlite3/vfs/shm_bsd.go b/vendor/github.com/ncruces/go-sqlite3/vfs/shm_bsd.go
index 3b45b3087..8c2abee81 100644
--- a/vendor/github.com/ncruces/go-sqlite3/vfs/shm_bsd.go
+++ b/vendor/github.com/ncruces/go-sqlite3/vfs/shm_bsd.go
@@ -101,13 +101,13 @@ func (s *vfsShm) shmOpen() (rc _ErrorCode) {
 		return _OK
 	}
 
-	// Open file read-write, as it will be shared.
+	// Always open file read-write, as it will be shared.
 	f, err := os.OpenFile(s.path,
 		unix.O_RDWR|unix.O_CREAT|unix.O_NOFOLLOW, 0666)
 	if err != nil {
 		return _CANTOPEN
 	}
-	// Close if file if it's not nil.
+	// Closes file if it's not nil.
 	defer func() { f.Close() }()
 
 	fi, err := f.Stat()
@@ -145,17 +145,14 @@ func (s *vfsShm) shmOpen() (rc _ErrorCode) {
 		info: fi,
 		refs: 1,
 	}
-	f = nil
-	add := true
+	f = nil // Don't close the file.
 	for i, g := range vfsShmFiles {
 		if g == nil {
 			vfsShmFiles[i] = s.vfsShmFile
-			add = false
+			return rc
 		}
 	}
-	if add {
-		vfsShmFiles = append(vfsShmFiles, s.vfsShmFile)
-	}
+	vfsShmFiles = append(vfsShmFiles, s.vfsShmFile)
 	return rc
 }
 
@@ -195,6 +192,9 @@ func (s *vfsShm) shmMap(ctx context.Context, mod api.Module, id, size int32, ext
 		return 0, _IOERR_SHMMAP
 	}
 	s.regions = append(s.regions, r)
+	if s.readOnly {
+		return r.Ptr, _READONLY
+	}
 	return r.Ptr, _OK
 }
 
diff --git a/vendor/github.com/tetratelabs/wazero/config.go b/vendor/github.com/tetratelabs/wazero/config.go
index 819a76df5..d3656849c 100644
--- a/vendor/github.com/tetratelabs/wazero/config.go
+++ b/vendor/github.com/tetratelabs/wazero/config.go
@@ -148,7 +148,7 @@ type RuntimeConfig interface {
 	//	customSections := c.CustomSections()
 	WithCustomSections(bool) RuntimeConfig
 
-	// WithCloseOnContextDone ensures the executions of functions to be closed under one of the following circumstances:
+	// WithCloseOnContextDone ensures the executions of functions to be terminated under one of the following circumstances:
 	//
 	// 	- context.Context passed to the Call method of api.Function is canceled during execution. (i.e. ctx by context.WithCancel)
 	// 	- context.Context passed to the Call method of api.Function reaches timeout during execution. (i.e. ctx by context.WithTimeout or context.WithDeadline)
@@ -159,6 +159,8 @@ type RuntimeConfig interface {
 	// entire underlying OS thread which runs the api.Function call. See "Why it's safe to execute runtime-generated
 	// machine codes against async Goroutine preemption" section in RATIONALE.md for detail.
 	//
+	// Upon the termination of the function executions, api.Module is closed.
+	//
 	// Note that this comes with a bit of extra cost when enabled. The reason is that internally this forces
 	// interpreter and compiler runtimes to insert the periodical checks on the conditions above. For that reason,
 	// this is disabled by default.
@@ -217,9 +219,18 @@ const (
 // part. wazero automatically performs ahead-of-time compilation as needed when
 // Runtime.CompileModule is invoked.
 //
-// Warning: This panics at runtime if the runtime.GOOS or runtime.GOARCH does not
-// support compiler. Use NewRuntimeConfig to safely detect and fallback to
-// NewRuntimeConfigInterpreter if needed.
+// # Warning
+//
+//   - This panics at runtime if the runtime.GOOS or runtime.GOARCH does not
+//     support compiler. Use NewRuntimeConfig to safely detect and fallback to
+//     NewRuntimeConfigInterpreter if needed.
+//
+//   - If you are using wazero in buildmode=c-archive or c-shared, make sure that you set up the alternate signal stack
+//     by using, e.g. `sigaltstack` combined with `SA_ONSTACK` flag on `sigaction` on Linux,
+//     before calling any api.Function. This is because the Go runtime does not set up the alternate signal stack
+//     for c-archive or c-shared modes, and wazero uses the different stack than the calling Goroutine.
+//     Hence, the signal handler might get invoked on the wazero's stack, which may cause a stack overflow.
+//     https://github.com/tetratelabs/wazero/blob/2092c0a879f30d49d7b37f333f4547574b8afe0d/internal/integration_test/fuzz/fuzz/tests/sigstack.rs#L19-L36
 func NewRuntimeConfigCompiler() RuntimeConfig {
 	ret := engineLessConfig.clone()
 	ret.engineKind = engineKindCompiler
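
The WithCloseOnContextDone documentation touched above now states that the api.Module is closed once execution is terminated. A hedged usage sketch follows; the wasm bytes and the busy_loop export are placeholders, not something this patch provides.

package main

import (
	"context"
	"log"
	"time"

	"github.com/tetratelabs/wazero"
)

func run(ctx context.Context, wasm []byte) error {
	// Opt in to termination on context cancellation or timeout.
	r := wazero.NewRuntimeWithConfig(ctx,
		wazero.NewRuntimeConfig().WithCloseOnContextDone(true))
	defer r.Close(ctx)

	mod, err := r.Instantiate(ctx, wasm)
	if err != nil {
		return err
	}

	// A guest function that loops forever is interrupted at the deadline,
	// and per the doc change above the module is closed as well.
	callCtx, cancel := context.WithTimeout(ctx, 100*time.Millisecond)
	defer cancel()
	_, err = mod.ExportedFunction("busy_loop").Call(callCtx) // "busy_loop" is a placeholder export
	return err
}

func main() {
	// nil wasm is a stand-in; Instantiate will fail without real module bytes.
	log.Println(run(context.Background(), nil))
}
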
diff --git a/vendor/github.com/tetratelabs/wazero/experimental/checkpoint.go b/vendor/github.com/tetratelabs/wazero/experimental/checkpoint.go
index 443c5a294..c75db615e 100644
--- a/vendor/github.com/tetratelabs/wazero/experimental/checkpoint.go
+++ b/vendor/github.com/tetratelabs/wazero/experimental/checkpoint.go
@@ -21,13 +21,6 @@ type Snapshotter interface {
 	Snapshot() Snapshot
 }
 
-// EnableSnapshotterKey is a context key to indicate that snapshotting should be enabled.
-// The context.Context passed to a exported function invocation should have this key set
-// to a non-nil value, and host functions will be able to retrieve it using SnapshotterKey.
-//
-// Deprecated: use WithSnapshotter to enable snapshots.
-type EnableSnapshotterKey = expctxkeys.EnableSnapshotterKey
-
 // WithSnapshotter enables snapshots.
 // Passing the returned context to a exported function invocation enables snapshots,
 // and allows host functions to retrieve the Snapshotter using GetSnapshotter.
@@ -35,12 +28,6 @@ func WithSnapshotter(ctx context.Context) context.Context {
 	return context.WithValue(ctx, expctxkeys.EnableSnapshotterKey{}, struct{}{})
 }
 
-// SnapshotterKey is a context key to access a Snapshotter from a host function.
-// It is only present if EnableSnapshotter was set in the function invocation context.
-//
-// Deprecated: use GetSnapshotter to get the snapshotter.
-type SnapshotterKey = expctxkeys.SnapshotterKey
-
 // GetSnapshotter gets the Snapshotter from a host function.
 // It is only present if WithSnapshotter was called with the function invocation context.
 func GetSnapshotter(ctx context.Context) Snapshotter {
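
With the deprecated context-key types removed above, the remaining API is WithSnapshotter plus GetSnapshotter. A hedged sketch of that flow follows; the host-module name and wiring are illustrative, not part of the patch.

package main

import (
	"context"

	"github.com/tetratelabs/wazero"
	"github.com/tetratelabs/wazero/api"
	"github.com/tetratelabs/wazero/experimental"
)

func main() {
	ctx := context.Background()
	r := wazero.NewRuntime(ctx)
	defer r.Close(ctx)

	// Host function that captures a snapshot when the guest calls env.checkpoint.
	// "env" and "checkpoint" are illustrative names.
	var snap experimental.Snapshot
	_, err := r.NewHostModuleBuilder("env").
		NewFunctionBuilder().
		WithFunc(func(ctx context.Context, _ api.Module) {
			// Retrieve the Snapshotter made available by WithSnapshotter.
			snap = experimental.GetSnapshotter(ctx).Snapshot()
		}).
		Export("checkpoint").
		Instantiate(ctx)
	if err != nil {
		panic(err)
	}

	// Enable snapshotting for calls made with this context, then invoke a
	// guest export (omitted here) that calls env.checkpoint.
	ctx = experimental.WithSnapshotter(ctx)
	_, _ = ctx, snap
}
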
diff --git a/vendor/github.com/tetratelabs/wazero/experimental/listener.go b/vendor/github.com/tetratelabs/wazero/experimental/listener.go
index b2ba1fe83..55fc6b668 100644
--- a/vendor/github.com/tetratelabs/wazero/experimental/listener.go
+++ b/vendor/github.com/tetratelabs/wazero/experimental/listener.go
@@ -24,12 +24,6 @@ type StackIterator interface {
 	ProgramCounter() ProgramCounter
 }
 
-// FunctionListenerFactoryKey is a context.Context Value key.
-// Its associated value should be a FunctionListenerFactory.
-//
-// Deprecated: use WithFunctionListenerFactory to enable snapshots.
-type FunctionListenerFactoryKey = expctxkeys.FunctionListenerFactoryKey
-
 // WithFunctionListenerFactory registers a FunctionListenerFactory
 // with the context.
 func WithFunctionListenerFactory(ctx context.Context, factory FunctionListenerFactory) context.Context {
diff --git a/vendor/github.com/tetratelabs/wazero/experimental/sys/syscall_errno_windows.go b/vendor/github.com/tetratelabs/wazero/experimental/sys/syscall_errno_windows.go
index 761a1f9dc..5ebc1780f 100644
--- a/vendor/github.com/tetratelabs/wazero/experimental/sys/syscall_errno_windows.go
+++ b/vendor/github.com/tetratelabs/wazero/experimental/sys/syscall_errno_windows.go
@@ -23,6 +23,10 @@ const (
 	// instead of syscall.ENOTDIR
 	_ERROR_DIRECTORY = syscall.Errno(0x10B)
 
+	// _ERROR_NOT_A_REPARSE_POINT is a Windows error returned by os.Readlink
+	// instead of syscall.EINVAL
+	_ERROR_NOT_A_REPARSE_POINT = syscall.Errno(0x1126)
+
 	// _ERROR_INVALID_SOCKET is a Windows error returned by winsock_select
 	// when a given handle is not a socket.
 	_ERROR_INVALID_SOCKET = syscall.Errno(0x2736)
@@ -51,7 +55,7 @@ func errorToErrno(err error) Errno {
 			return EBADF
 		case syscall.ERROR_PRIVILEGE_NOT_HELD:
 			return EPERM
-		case _ERROR_NEGATIVE_SEEK, _ERROR_INVALID_NAME:
+		case _ERROR_NEGATIVE_SEEK, _ERROR_INVALID_NAME, _ERROR_NOT_A_REPARSE_POINT:
 			return EINVAL
 		}
 		errno, _ := syscallToErrno(err)
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/interpreter.go b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/interpreter.go
index a89ddc457..18c5f4252 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/interpreter.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/interpreter.go
@@ -98,6 +98,9 @@ func (e *moduleEngine) SetGlobalValue(idx wasm.Index, lo, hi uint64) {
 // OwnsGlobals implements the same method as documented on wasm.ModuleEngine.
 func (e *moduleEngine) OwnsGlobals() bool { return false }
 
+// MemoryGrown implements wasm.ModuleEngine.
+func (e *moduleEngine) MemoryGrown() {}
+
 // callEngine holds context per moduleEngine.Call, and shared across all the
 // function calls originating from the same moduleEngine.Call execution.
 //
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go
index 81c6a6b62..8e9571b20 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go
@@ -43,7 +43,7 @@ type ExecutableContextT[Instr any] struct {
 	labelPositionPool wazevoapi.Pool[LabelPosition[Instr]]
 	NextLabel         Label
 	// LabelPositions maps a label to the instructions of the region which the label represents.
-	LabelPositions     map[Label]*LabelPosition[Instr]
+	LabelPositions     []*LabelPosition[Instr]
 	OrderedBlockLabels []*LabelPosition[Instr]
 
 	// PerBlockHead and PerBlockEnd are the head and tail of the instruction list per currently-compiled ssa.BasicBlock.
@@ -67,7 +67,6 @@ func NewExecutableContextT[Instr any](
 		setNext:           setNext,
 		setPrev:           setPrev,
 		labelPositionPool: wazevoapi.NewPool[LabelPosition[Instr]](resetLabelPosition[Instr]),
-		LabelPositions:    make(map[Label]*LabelPosition[Instr]),
 		NextLabel:         LabelInvalid,
 	}
 }
@@ -97,11 +96,7 @@ func (e *ExecutableContextT[Instr]) StartBlock(blk ssa.BasicBlock) {
 	end := e.allocateNop0()
 	e.PerBlockHead, e.PerBlockEnd = end, end
 
-	labelPos, ok := e.LabelPositions[l]
-	if !ok {
-		labelPos = e.AllocateLabelPosition(l)
-		e.LabelPositions[l] = labelPos
-	}
+	labelPos := e.GetOrAllocateLabelPosition(l)
 	e.OrderedBlockLabels = append(e.OrderedBlockLabels, labelPos)
 	labelPos.Begin, labelPos.End = end, end
 	labelPos.SB = blk
@@ -146,8 +141,8 @@ func (e *ExecutableContextT[T]) FlushPendingInstructions() {
 func (e *ExecutableContextT[T]) Reset() {
 	e.labelPositionPool.Reset()
 	e.InstructionPool.Reset()
-	for l := Label(0); l <= e.NextLabel; l++ {
-		delete(e.LabelPositions, l)
+	for i := range e.LabelPositions {
+		e.LabelPositions[i] = nil
 	}
 	e.PendingInstructions = e.PendingInstructions[:0]
 	e.OrderedBlockLabels = e.OrderedBlockLabels[:0]
@@ -163,10 +158,17 @@ func (e *ExecutableContextT[T]) AllocateLabel() Label {
 	return e.NextLabel
 }
 
-func (e *ExecutableContextT[T]) AllocateLabelPosition(la Label) *LabelPosition[T] {
-	l := e.labelPositionPool.Allocate()
-	l.L = la
-	return l
+func (e *ExecutableContextT[T]) GetOrAllocateLabelPosition(l Label) *LabelPosition[T] {
+	if len(e.LabelPositions) <= int(l) {
+		e.LabelPositions = append(e.LabelPositions, make([]*LabelPosition[T], int(l)+1-len(e.LabelPositions))...)
+	}
+	ret := e.LabelPositions[l]
+	if ret == nil {
+		ret = e.labelPositionPool.Allocate()
+		ret.L = l
+		e.LabelPositions[l] = ret
+	}
+	return ret
 }
 
 func (e *ExecutableContextT[T]) GetOrAllocateSSABlockLabel(blk ssa.BasicBlock) Label {
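
The hunks above turn LabelPositions from a map into a slice indexed by Label, growing it with nils on demand. A self-contained sketch of that dense-integer-key pattern follows; the names are illustrative, not the vendored types.

package main

import "fmt"

// entry stands in for a per-label record; id plays the role of the Label key.
type entry struct{ id int }

type table struct {
	byID []*entry // index == id; nil means "not allocated yet"
}

func (t *table) getOrAllocate(id int) *entry {
	if len(t.byID) <= id {
		// Grow with nils up to and including id, mirroring the hunk above.
		t.byID = append(t.byID, make([]*entry, id+1-len(t.byID))...)
	}
	e := t.byID[id]
	if e == nil {
		e = &entry{id: id}
		t.byID[id] = e
	}
	return e
}

// reset keeps the backing array and drops the pointers, like Reset above.
func (t *table) reset() {
	for i := range t.byID {
		t.byID[i] = nil
	}
}

func main() {
	var t table
	fmt.Println(t.getOrAllocate(3).id) // prints 3
	t.reset()
}
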
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go
index 310ad2203..61ae6f406 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go
@@ -1906,8 +1906,10 @@ func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) {
 func (m *machine) Format() string {
 	ectx := m.ectx
 	begins := map[*instruction]backend.Label{}
-	for l, pos := range ectx.LabelPositions {
-		begins[pos.Begin] = l
+	for _, pos := range ectx.LabelPositions {
+		if pos != nil {
+			begins[pos.Begin] = pos.L
+		}
 	}
 
 	irBlocks := map[backend.Label]ssa.BasicBlockID{}
@@ -1950,7 +1952,10 @@ func (m *machine) encodeWithoutSSA(root *instruction) {
 		offset := int64(len(*bufPtr))
 		if cur.kind == nop0 {
 			l := cur.nop0Label()
-			if pos, ok := ectx.LabelPositions[l]; ok {
+			if int(l) >= len(ectx.LabelPositions) {
+				continue
+			}
+			if pos := ectx.LabelPositions[l]; pos != nil {
 				pos.BinaryOffset = offset
 			}
 		}
@@ -2005,7 +2010,7 @@ func (m *machine) Encode(ctx context.Context) (err error) {
 			switch cur.kind {
 			case nop0:
 				l := cur.nop0Label()
-				if pos, ok := ectx.LabelPositions[l]; ok {
+				if pos := ectx.LabelPositions[l]; pos != nil {
 					pos.BinaryOffset = offset
 				}
 			case sourceOffsetInfo:
@@ -2165,8 +2170,7 @@ func (m *machine) allocateBrTarget() (nop *instruction, l backend.Label) { //nol
 func (m *machine) allocateLabel() *labelPosition {
 	ectx := m.ectx
 	l := ectx.AllocateLabel()
-	pos := ectx.AllocateLabelPosition(l)
-	ectx.LabelPositions[l] = pos
+	pos := ectx.GetOrAllocateLabelPosition(l)
 	return pos
 }
 
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go
index 6615471c6..4eaa13ce1 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go
@@ -101,13 +101,14 @@ func (m *machine) LowerParams(args []ssa.Value) {
 			bits := arg.Type.Bits()
 			// At this point of compilation, we don't yet know how much space exist below the return address.
 			// So we instruct the address mode to add the `argStackOffset` to the offset at the later phase of compilation.
-			amode := addressMode{imm: arg.Offset, rn: spVReg, kind: addressModeKindArgStackSpace}
+			amode := m.amodePool.Allocate()
+			*amode = addressMode{imm: arg.Offset, rn: spVReg, kind: addressModeKindArgStackSpace}
 			load := m.allocateInstr()
 			switch arg.Type {
 			case ssa.TypeI32, ssa.TypeI64:
-				load.asULoad(operandNR(reg), amode, bits)
+				load.asULoad(reg, amode, bits)
 			case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
-				load.asFpuLoad(operandNR(reg), amode, bits)
+				load.asFpuLoad(reg, amode, bits)
 			default:
 				panic("BUG")
 			}
@@ -169,7 +170,8 @@ func (m *machine) LowerReturns(rets []ssa.Value) {
 
 			// At this point of compilation, we don't yet know how much space exist below the return address.
 			// So we instruct the address mode to add the `retStackOffset` to the offset at the later phase of compilation.
-			amode := addressMode{imm: r.Offset, rn: spVReg, kind: addressModeKindResultStackSpace}
+			amode := m.amodePool.Allocate()
+			*amode = addressMode{imm: r.Offset, rn: spVReg, kind: addressModeKindResultStackSpace}
 			store := m.allocateInstr()
 			store.asStore(operandNR(reg), amode, bits)
 			m.insert(store)
@@ -215,9 +217,9 @@ func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex i
 		ldr := m.allocateInstr()
 		switch r.Type {
 		case ssa.TypeI32, ssa.TypeI64:
-			ldr.asULoad(operandNR(reg), amode, r.Type.Bits())
+			ldr.asULoad(reg, amode, r.Type.Bits())
 		case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
-			ldr.asFpuLoad(operandNR(reg), amode, r.Type.Bits())
+			ldr.asFpuLoad(reg, amode, r.Type.Bits())
 		default:
 			panic("BUG")
 		}
@@ -225,7 +227,7 @@ func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex i
 	}
 }
 
-func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) (*instruction, addressMode) {
+func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) (*instruction, *addressMode) {
 	exct := m.executableContext
 	exct.PendingInstructions = exct.PendingInstructions[:0]
 	mode := m.resolveAddressModeForOffset(offset, dstBits, rn, allowTmpRegUse)
@@ -235,15 +237,15 @@ func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset
 	return cur, mode
 }
 
-func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) addressMode {
+func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) *addressMode {
 	if rn.RegType() != regalloc.RegTypeInt {
 		panic("BUG: rn should be a pointer: " + formatVRegSized(rn, 64))
 	}
-	var amode addressMode
+	amode := m.amodePool.Allocate()
 	if offsetFitsInAddressModeKindRegUnsignedImm12(dstBits, offset) {
-		amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: rn, imm: offset}
+		*amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: rn, imm: offset}
 	} else if offsetFitsInAddressModeKindRegSignedImm9(offset) {
-		amode = addressMode{kind: addressModeKindRegSignedImm9, rn: rn, imm: offset}
+		*amode = addressMode{kind: addressModeKindRegSignedImm9, rn: rn, imm: offset}
 	} else {
 		var indexReg regalloc.VReg
 		if allowTmpRegUse {
@@ -253,7 +255,7 @@ func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn reg
 			indexReg = m.compiler.AllocateVReg(ssa.TypeI64)
 			m.lowerConstantI64(indexReg, offset)
 		}
-		amode = addressMode{kind: addressModeKindRegReg, rn: rn, rm: indexReg, extOp: extendOpUXTX /* indicates index rm is 64-bit */}
+		*amode = addressMode{kind: addressModeKindRegReg, rn: rn, rm: indexReg, extOp: extendOpUXTX /* indicates index rm is 64-bit */}
 	}
 	return amode
 }
@@ -315,7 +317,7 @@ func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add b
 		} else {
 			ao = aluOpSub
 		}
-		alu.asALU(ao, operandNR(rd), operandNR(spVReg), imm12Operand, true)
+		alu.asALU(ao, rd, operandNR(spVReg), imm12Operand, true)
 		m.insert(alu)
 	} else {
 		m.lowerConstantI64(tmpRegVReg, diff)
@@ -326,7 +328,7 @@ func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add b
 		} else {
 			ao = aluOpSub
 		}
-		alu.asALU(ao, operandNR(rd), operandNR(spVReg), operandNR(tmpRegVReg), true)
+		alu.asALU(ao, rd, operandNR(spVReg), operandNR(tmpRegVReg), true)
 		m.insert(alu)
 	}
 }
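
This hunk, and several arm64 hunks below, replace by-value addressMode fields with pointers handed out by m.amodePool.Allocate(). A hedged, generic sketch of such a reusable allocation pool follows; it illustrates the idea only and is not the vendored wazevoapi.Pool implementation.

package main

import "fmt"

// pool hands out pointers to reusable values so per-instruction structs (like
// the address modes above) are not heap-allocated one by one on every compile.
type pool[T any] struct {
	allocated []*T
	used      int
}

func (p *pool[T]) allocate() *T {
	if p.used == len(p.allocated) {
		p.allocated = append(p.allocated, new(T))
	}
	ptr := p.allocated[p.used]
	p.used++
	return ptr
}

// reset zeroes the slots and makes them available again; pointers returned
// before reset must not be used afterwards.
func (p *pool[T]) reset() {
	var zero T
	for i := 0; i < p.used; i++ {
		*p.allocated[i] = zero
	}
	p.used = 0
}

// addrMode is a stand-in for the real addressMode struct.
type addrMode struct{ imm int64 }

func main() {
	var p pool[addrMode]
	a := p.allocate()
	a.imm = 16
	fmt.Println(a.imm)
	p.reset()
}
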
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go
index 7a9cceb33..f8b5d97ac 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go
@@ -59,25 +59,26 @@ func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regallo
 	} else {
 		postIndexImm = 8
 	}
-	loadMode := addressMode{kind: addressModeKindPostIndex, rn: paramSlicePtr, imm: postIndexImm}
+	loadMode := m.amodePool.Allocate()
+	*loadMode = addressMode{kind: addressModeKindPostIndex, rn: paramSlicePtr, imm: postIndexImm}
 
 	instr := m.allocateInstr()
 	switch typ {
 	case ssa.TypeI32:
-		instr.asULoad(loadTargetReg, loadMode, 32)
+		instr.asULoad(loadTargetReg.reg(), loadMode, 32)
 	case ssa.TypeI64:
-		instr.asULoad(loadTargetReg, loadMode, 64)
+		instr.asULoad(loadTargetReg.reg(), loadMode, 64)
 	case ssa.TypeF32:
-		instr.asFpuLoad(loadTargetReg, loadMode, 32)
+		instr.asFpuLoad(loadTargetReg.reg(), loadMode, 32)
 	case ssa.TypeF64:
-		instr.asFpuLoad(loadTargetReg, loadMode, 64)
+		instr.asFpuLoad(loadTargetReg.reg(), loadMode, 64)
 	case ssa.TypeV128:
-		instr.asFpuLoad(loadTargetReg, loadMode, 128)
+		instr.asFpuLoad(loadTargetReg.reg(), loadMode, 128)
 	}
 	cur = linkInstr(cur, instr)
 
 	if isStackArg {
-		var storeMode addressMode
+		var storeMode *addressMode
 		cur, storeMode = m.resolveAddressModeForOffsetAndInsert(cur, argStartOffsetFromSP+arg.Offset, bits, spVReg, true)
 		toStack := m.allocateInstr()
 		toStack.asStore(loadTargetReg, storeMode, bits)
@@ -113,21 +114,22 @@ func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr reg
 	}
 
 	if isStackArg {
-		var loadMode addressMode
+		var loadMode *addressMode
 		cur, loadMode = m.resolveAddressModeForOffsetAndInsert(cur, resultStartOffsetFromSP+result.Offset, bits, spVReg, true)
 		toReg := m.allocateInstr()
 		switch typ {
 		case ssa.TypeI32, ssa.TypeI64:
-			toReg.asULoad(storeTargetReg, loadMode, bits)
+			toReg.asULoad(storeTargetReg.reg(), loadMode, bits)
 		case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
-			toReg.asFpuLoad(storeTargetReg, loadMode, bits)
+			toReg.asFpuLoad(storeTargetReg.reg(), loadMode, bits)
 		default:
 			panic("TODO?")
 		}
 		cur = linkInstr(cur, toReg)
 	}
 
-	mode := addressMode{kind: addressModeKindPostIndex, rn: resultSlicePtr, imm: postIndexImm}
+	mode := m.amodePool.Allocate()
+	*mode = addressMode{kind: addressModeKindPostIndex, rn: resultSlicePtr, imm: postIndexImm}
 	instr := m.allocateInstr()
 	instr.asStore(storeTargetReg, mode, bits)
 	cur = linkInstr(cur, instr)
@@ -214,11 +216,12 @@ func (m *machine) move64(dst, src regalloc.VReg, prev *instruction) *instruction
 
 func (m *machine) loadOrStoreAtExecutionContext(d regalloc.VReg, offset wazevoapi.Offset, store bool, prev *instruction) *instruction {
 	instr := m.allocateInstr()
-	mode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: savedExecutionContextPtr, imm: offset.I64()}
+	mode := m.amodePool.Allocate()
+	*mode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: savedExecutionContextPtr, imm: offset.I64()}
 	if store {
 		instr.asStore(operandNR(d), mode, 64)
 	} else {
-		instr.asULoad(operandNR(d), mode, 64)
+		instr.asULoad(d, mode, 64)
 	}
 	return linkInstr(prev, instr)
 }
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go
index 466b1f960..99e6bb482 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go
@@ -87,7 +87,8 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *
 		// Module context is always the second argument.
 		moduleCtrPtr := x1VReg
 		store := m.allocateInstr()
-		amode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtrPtr, imm: offset}
+		amode := m.amodePool.Allocate()
+		*amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtrPtr, imm: offset}
 		store.asStore(operandNR(moduleCtrPtr), amode, 64)
 		cur = linkInstr(cur, store)
 	}
@@ -120,11 +121,9 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *
 		} else {
 			sizeInBits = 64
 		}
-		store.asStore(operandNR(v),
-			addressMode{
-				kind: addressModeKindPostIndex,
-				rn:   arg0ret0AddrReg, imm: int64(sizeInBits / 8),
-			}, sizeInBits)
+		amode := m.amodePool.Allocate()
+		*amode = addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg, imm: int64(sizeInBits / 8)}
+		store.asStore(operandNR(v), amode, sizeInBits)
 		cur = linkInstr(cur, store)
 	}
 
@@ -139,7 +138,7 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *
 		frameSizeReg = xzrVReg
 		sliceSizeReg = xzrVReg
 	}
-	_amode := addressModePreOrPostIndex(spVReg, -16, true)
+	_amode := addressModePreOrPostIndex(m, spVReg, -16, true)
 	storeP := m.allocateInstr()
 	storeP.asStorePair64(frameSizeReg, sliceSizeReg, _amode)
 	cur = linkInstr(cur, storeP)
@@ -165,8 +164,8 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *
 	cur = m.addsAddOrSubStackPointer(cur, spVReg, frameInfoSize+goCallStackSize, true)
 	ldr := m.allocateInstr()
 	// And load the return address.
-	ldr.asULoad(operandNR(lrVReg),
-		addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
+	amode := addressModePreOrPostIndex(m, spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */)
+	ldr.asULoad(lrVReg, amode, 64)
 	cur = linkInstr(cur, ldr)
 
 	originalRet0Reg := x17VReg // Caller save, so we can use it for whatever we want.
@@ -183,23 +182,24 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *
 		r := &abi.Rets[i]
 		if r.Kind == backend.ABIArgKindReg {
 			loadIntoReg := m.allocateInstr()
-			mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
+			mode := m.amodePool.Allocate()
+			*mode = addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
 			switch r.Type {
 			case ssa.TypeI32:
 				mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
-				loadIntoReg.asULoad(operandNR(r.Reg), mode, 32)
+				loadIntoReg.asULoad(r.Reg, mode, 32)
 			case ssa.TypeI64:
 				mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
-				loadIntoReg.asULoad(operandNR(r.Reg), mode, 64)
+				loadIntoReg.asULoad(r.Reg, mode, 64)
 			case ssa.TypeF32:
 				mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
-				loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 32)
+				loadIntoReg.asFpuLoad(r.Reg, mode, 32)
 			case ssa.TypeF64:
 				mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
-				loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 64)
+				loadIntoReg.asFpuLoad(r.Reg, mode, 64)
 			case ssa.TypeV128:
 				mode.imm = 16
-				loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 128)
+				loadIntoReg.asFpuLoad(r.Reg, mode, 128)
 			default:
 				panic("TODO")
 			}
@@ -208,28 +208,29 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *
 			// First we need to load the value to a temporary just like ^^.
 			intTmp, floatTmp := x11VReg, v11VReg
 			loadIntoTmpReg := m.allocateInstr()
-			mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
+			mode := m.amodePool.Allocate()
+			*mode = addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
 			var resultReg regalloc.VReg
 			switch r.Type {
 			case ssa.TypeI32:
 				mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
-				loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 32)
+				loadIntoTmpReg.asULoad(intTmp, mode, 32)
 				resultReg = intTmp
 			case ssa.TypeI64:
 				mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
-				loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 64)
+				loadIntoTmpReg.asULoad(intTmp, mode, 64)
 				resultReg = intTmp
 			case ssa.TypeF32:
 				mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
-				loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 32)
+				loadIntoTmpReg.asFpuLoad(floatTmp, mode, 32)
 				resultReg = floatTmp
 			case ssa.TypeF64:
 				mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
-				loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 64)
+				loadIntoTmpReg.asFpuLoad(floatTmp, mode, 64)
 				resultReg = floatTmp
 			case ssa.TypeV128:
 				mode.imm = 16
-				loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 128)
+				loadIntoTmpReg.asFpuLoad(floatTmp, mode, 128)
 				resultReg = floatTmp
 			default:
 				panic("TODO")
@@ -258,12 +259,13 @@ func (m *machine) saveRegistersInExecutionContext(cur *instruction, regs []regal
 		case regalloc.RegTypeFloat:
 			sizeInBits = 128
 		}
-		store.asStore(operandNR(v),
-			addressMode{
-				kind: addressModeKindRegUnsignedImm12,
-				// Execution context is always the first argument.
-				rn: x0VReg, imm: offset,
-			}, sizeInBits)
+		mode := m.amodePool.Allocate()
+		*mode = addressMode{
+			kind: addressModeKindRegUnsignedImm12,
+			// Execution context is always the first argument.
+			rn: x0VReg, imm: offset,
+		}
+		store.asStore(operandNR(v), mode, sizeInBits)
 		store.prev = cur
 		cur.next = store
 		cur = store
@@ -276,7 +278,7 @@ func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []re
 	offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
 	for _, v := range regs {
 		load := m.allocateInstr()
-		var as func(dst operand, amode addressMode, sizeInBits byte)
+		var as func(dst regalloc.VReg, amode *addressMode, sizeInBits byte)
 		var sizeInBits byte
 		switch v.RegType() {
 		case regalloc.RegTypeInt:
@@ -286,12 +288,13 @@ func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []re
 			as = load.asFpuLoad
 			sizeInBits = 128
 		}
-		as(operandNR(v),
-			addressMode{
-				kind: addressModeKindRegUnsignedImm12,
-				// Execution context is always the first argument.
-				rn: x0VReg, imm: offset,
-			}, sizeInBits)
+		mode := m.amodePool.Allocate()
+		*mode = addressMode{
+			kind: addressModeKindRegUnsignedImm12,
+			// Execution context is always the first argument.
+			rn: x0VReg, imm: offset,
+		}
+		as(v, mode, sizeInBits)
 		cur = linkInstr(cur, load)
 		offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally load regs at the offset of multiple of 16.
 	}
@@ -324,11 +327,9 @@ func (m *machine) setExitCode(cur *instruction, execCtr regalloc.VReg, exitCode
 
 	// Set the exit status on the execution context.
 	setExistStatus := m.allocateInstr()
-	setExistStatus.asStore(operandNR(constReg),
-		addressMode{
-			kind: addressModeKindRegUnsignedImm12,
-			rn:   execCtr, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
-		}, 32)
+	mode := m.amodePool.Allocate()
+	*mode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtr, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64()}
+	setExistStatus.asStore(operandNR(constReg), mode, 32)
 	cur = linkInstr(cur, setExistStatus)
 	return cur
 }
@@ -340,12 +341,13 @@ func (m *machine) storeReturnAddressAndExit(cur *instruction) *instruction {
 	cur = linkInstr(cur, adr)
 
 	storeReturnAddr := m.allocateInstr()
-	storeReturnAddr.asStore(operandNR(tmpRegVReg),
-		addressMode{
-			kind: addressModeKindRegUnsignedImm12,
-			// Execution context is always the first argument.
-			rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
-		}, 64)
+	mode := m.amodePool.Allocate()
+	*mode = addressMode{
+		kind: addressModeKindRegUnsignedImm12,
+		// Execution context is always the first argument.
+		rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
+	}
+	storeReturnAddr.asStore(operandNR(tmpRegVReg), mode, 64)
 	cur = linkInstr(cur, storeReturnAddr)
 
 	// Exit the execution.
@@ -364,11 +366,12 @@ func (m *machine) saveCurrentStackPointer(cur *instruction, execCtr regalloc.VRe
 	cur = linkInstr(cur, movSp)
 
 	strSp := m.allocateInstr()
-	strSp.asStore(operandNR(tmpRegVReg),
-		addressMode{
-			kind: addressModeKindRegUnsignedImm12,
-			rn:   execCtr, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
-		}, 64)
+	mode := m.amodePool.Allocate()
+	*mode = addressMode{
+		kind: addressModeKindRegUnsignedImm12,
+		rn:   execCtr, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
+	}
+	strSp.asStore(operandNR(tmpRegVReg), mode, 64)
 	cur = linkInstr(cur, strSp)
 	return cur
 }
@@ -376,27 +379,28 @@ func (m *machine) saveCurrentStackPointer(cur *instruction, execCtr regalloc.VRe
 func (m *machine) goFunctionCallLoadStackArg(cur *instruction, originalArg0Reg regalloc.VReg, arg *backend.ABIArg, intVReg, floatVReg regalloc.VReg) (*instruction, regalloc.VReg) {
 	load := m.allocateInstr()
 	var result regalloc.VReg
-	mode := addressMode{kind: addressModeKindPostIndex, rn: originalArg0Reg}
+	mode := m.amodePool.Allocate()
+	*mode = addressMode{kind: addressModeKindPostIndex, rn: originalArg0Reg}
 	switch arg.Type {
 	case ssa.TypeI32:
 		mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
-		load.asULoad(operandNR(intVReg), mode, 32)
+		load.asULoad(intVReg, mode, 32)
 		result = intVReg
 	case ssa.TypeI64:
 		mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
-		load.asULoad(operandNR(intVReg), mode, 64)
+		load.asULoad(intVReg, mode, 64)
 		result = intVReg
 	case ssa.TypeF32:
 		mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
-		load.asFpuLoad(operandNR(floatVReg), mode, 32)
+		load.asFpuLoad(floatVReg, mode, 32)
 		result = floatVReg
 	case ssa.TypeF64:
 		mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
-		load.asFpuLoad(operandNR(floatVReg), mode, 64)
+		load.asFpuLoad(floatVReg, mode, 64)
 		result = floatVReg
 	case ssa.TypeV128:
 		mode.imm = 16
-		load.asFpuLoad(operandNR(floatVReg), mode, 128)
+		load.asFpuLoad(floatVReg, mode, 128)
 		result = floatVReg
 	default:
 		panic("TODO")
@@ -408,7 +412,8 @@ func (m *machine) goFunctionCallLoadStackArg(cur *instruction, originalArg0Reg r
 
 func (m *machine) goFunctionCallStoreStackResult(cur *instruction, originalRet0Reg regalloc.VReg, result *backend.ABIArg, resultVReg regalloc.VReg) *instruction {
 	store := m.allocateInstr()
-	mode := addressMode{kind: addressModeKindPostIndex, rn: originalRet0Reg}
+	mode := m.amodePool.Allocate()
+	*mode = addressMode{kind: addressModeKindPostIndex, rn: originalRet0Reg}
 	var sizeInBits byte
 	switch result.Type {
 	case ssa.TypeI32, ssa.TypeF32:
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go
index 8aabc5997..7121cb538 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go
@@ -3,10 +3,12 @@ package arm64
 import (
 	"fmt"
 	"math"
+	"unsafe"
 
 	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
 	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
 	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+	"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
 )
 
 type (
@@ -22,9 +24,9 @@ type (
 	// TODO: optimize the layout later once the impl settles.
 	instruction struct {
 		prev, next          *instruction
-		u1, u2, u3          uint64
-		rd, rm, rn, ra      operand
-		amode               addressMode
+		u1, u2              uint64
+		rd                  regalloc.VReg
+		rm, rn              operand
 		kind                instructionKind
 		addedBeforeRegAlloc bool
 	}
@@ -174,7 +176,7 @@ func (i *instruction) Defs(regs *[]regalloc.VReg) []regalloc.VReg {
 	switch defKinds[i.kind] {
 	case defKindNone:
 	case defKindRD:
-		*regs = append(*regs, i.rd.nr())
+		*regs = append(*regs, i.rd)
 	case defKindCall:
 		_, _, retIntRealRegs, retFloatRealRegs, _ := backend.ABIInfoFromUint64(i.u2)
 		for i := byte(0); i < retIntRealRegs; i++ {
@@ -194,7 +196,7 @@ func (i *instruction) AssignDef(reg regalloc.VReg) {
 	switch defKinds[i.kind] {
 	case defKindNone:
 	case defKindRD:
-		i.rd = i.rd.assignReg(reg)
+		i.rd = reg
 	case defKindCall:
 		panic("BUG: call instructions shouldn't be assigned")
 	default:
@@ -329,7 +331,7 @@ func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg {
 		if rm := i.rm.reg(); rm.Valid() {
 			*regs = append(*regs, rm)
 		}
-		if ra := i.ra.reg(); ra.Valid() {
+		if ra := regalloc.VReg(i.u2); ra.Valid() {
 			*regs = append(*regs, ra)
 		}
 	case useKindRNRN1RM:
@@ -341,18 +343,20 @@ func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg {
 			*regs = append(*regs, rm)
 		}
 	case useKindAMode:
-		if amodeRN := i.amode.rn; amodeRN.Valid() {
+		amode := i.getAmode()
+		if amodeRN := amode.rn; amodeRN.Valid() {
 			*regs = append(*regs, amodeRN)
 		}
-		if amodeRM := i.amode.rm; amodeRM.Valid() {
+		if amodeRM := amode.rm; amodeRM.Valid() {
 			*regs = append(*regs, amodeRM)
 		}
 	case useKindRNAMode:
 		*regs = append(*regs, i.rn.reg())
-		if amodeRN := i.amode.rn; amodeRN.Valid() {
+		amode := i.getAmode()
+		if amodeRN := amode.rn; amodeRN.Valid() {
 			*regs = append(*regs, amodeRN)
 		}
-		if amodeRM := i.amode.rm; amodeRM.Valid() {
+		if amodeRM := amode.rm; amodeRM.Valid() {
 			*regs = append(*regs, amodeRM)
 		}
 	case useKindCond:
@@ -374,7 +378,7 @@ func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg {
 	case useKindRDRewrite:
 		*regs = append(*regs, i.rn.reg())
 		*regs = append(*regs, i.rm.reg())
-		*regs = append(*regs, i.rd.reg())
+		*regs = append(*regs, i.rd)
 	default:
 		panic(fmt.Sprintf("useKind for %v not defined", i))
 	}
@@ -408,8 +412,8 @@ func (i *instruction) AssignUse(index int, reg regalloc.VReg) {
 				i.rm = i.rm.assignReg(reg)
 			}
 		} else {
-			if rd := i.rd.reg(); rd.Valid() {
-				i.rd = i.rd.assignReg(reg)
+			if rd := i.rd; rd.Valid() {
+				i.rd = reg
 			}
 		}
 	case useKindRNRN1RM:
@@ -435,32 +439,36 @@ func (i *instruction) AssignUse(index int, reg regalloc.VReg) {
 				i.rm = i.rm.assignReg(reg)
 			}
 		} else {
-			if ra := i.ra.reg(); ra.Valid() {
-				i.ra = i.ra.assignReg(reg)
+			if ra := regalloc.VReg(i.u2); ra.Valid() {
+				i.u2 = uint64(reg)
 			}
 		}
 	case useKindAMode:
 		if index == 0 {
-			if amodeRN := i.amode.rn; amodeRN.Valid() {
-				i.amode.rn = reg
+			amode := i.getAmode()
+			if amodeRN := amode.rn; amodeRN.Valid() {
+				amode.rn = reg
 			}
 		} else {
-			if amodeRM := i.amode.rm; amodeRM.Valid() {
-				i.amode.rm = reg
+			amode := i.getAmode()
+			if amodeRM := amode.rm; amodeRM.Valid() {
+				amode.rm = reg
 			}
 		}
 	case useKindRNAMode:
 		if index == 0 {
 			i.rn = i.rn.assignReg(reg)
 		} else if index == 1 {
-			if amodeRN := i.amode.rn; amodeRN.Valid() {
-				i.amode.rn = reg
+			amode := i.getAmode()
+			if amodeRN := amode.rn; amodeRN.Valid() {
+				amode.rn = reg
 			} else {
 				panic("BUG")
 			}
 		} else {
-			if amodeRM := i.amode.rm; amodeRM.Valid() {
-				i.amode.rm = reg
+			amode := i.getAmode()
+			if amodeRM := amode.rm; amodeRM.Valid() {
+				amode.rm = reg
 			} else {
 				panic("BUG")
 			}
@@ -503,35 +511,35 @@ func (i *instruction) callFuncRef() ssa.FuncRef {
 }
 
 // shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false)
-func (i *instruction) asMOVZ(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
+func (i *instruction) asMOVZ(dst regalloc.VReg, imm uint64, shift uint32, dst64bit bool) {
 	i.kind = movZ
-	i.rd = operandNR(dst)
+	i.rd = dst
 	i.u1 = imm
-	i.u2 = shift
+	i.u2 = uint64(shift)
 	if dst64bit {
-		i.u3 = 1
+		i.u2 |= 1 << 32
 	}
 }
 
 // shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false)
-func (i *instruction) asMOVK(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
+func (i *instruction) asMOVK(dst regalloc.VReg, imm uint64, shift uint32, dst64bit bool) {
 	i.kind = movK
-	i.rd = operandNR(dst)
+	i.rd = dst
 	i.u1 = imm
-	i.u2 = shift
+	i.u2 = uint64(shift)
 	if dst64bit {
-		i.u3 = 1
+		i.u2 |= 1 << 32
 	}
 }
 
 // shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false)
-func (i *instruction) asMOVN(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
+func (i *instruction) asMOVN(dst regalloc.VReg, imm uint64, shift uint32, dst64bit bool) {
 	i.kind = movN
-	i.rd = operandNR(dst)
+	i.rd = dst
 	i.u1 = imm
-	i.u2 = shift
+	i.u2 = uint64(shift)
 	if dst64bit {
-		i.u3 = 1
+		i.u2 |= 1 << 32
 	}
 }
 
@@ -553,21 +561,21 @@ func (i *instruction) asRet() {
 	i.kind = ret
 }
 
-func (i *instruction) asStorePair64(src1, src2 regalloc.VReg, amode addressMode) {
+func (i *instruction) asStorePair64(src1, src2 regalloc.VReg, amode *addressMode) {
 	i.kind = storeP64
 	i.rn = operandNR(src1)
 	i.rm = operandNR(src2)
-	i.amode = amode
+	i.setAmode(amode)
 }
 
-func (i *instruction) asLoadPair64(src1, src2 regalloc.VReg, amode addressMode) {
+func (i *instruction) asLoadPair64(src1, src2 regalloc.VReg, amode *addressMode) {
 	i.kind = loadP64
 	i.rn = operandNR(src1)
 	i.rm = operandNR(src2)
-	i.amode = amode
+	i.setAmode(amode)
 }
 
-func (i *instruction) asStore(src operand, amode addressMode, sizeInBits byte) {
+func (i *instruction) asStore(src operand, amode *addressMode, sizeInBits byte) {
 	switch sizeInBits {
 	case 8:
 		i.kind = store8
@@ -589,10 +597,10 @@ func (i *instruction) asStore(src operand, amode addressMode, sizeInBits byte) {
 		i.kind = fpuStore128
 	}
 	i.rn = src
-	i.amode = amode
+	i.setAmode(amode)
 }
 
-func (i *instruction) asSLoad(dst operand, amode addressMode, sizeInBits byte) {
+func (i *instruction) asSLoad(dst regalloc.VReg, amode *addressMode, sizeInBits byte) {
 	switch sizeInBits {
 	case 8:
 		i.kind = sLoad8
@@ -604,10 +612,10 @@ func (i *instruction) asSLoad(dst operand, amode addressMode, sizeInBits byte) {
 		panic("BUG")
 	}
 	i.rd = dst
-	i.amode = amode
+	i.setAmode(amode)
 }
 
-func (i *instruction) asULoad(dst operand, amode addressMode, sizeInBits byte) {
+func (i *instruction) asULoad(dst regalloc.VReg, amode *addressMode, sizeInBits byte) {
 	switch sizeInBits {
 	case 8:
 		i.kind = uLoad8
@@ -619,10 +627,10 @@ func (i *instruction) asULoad(dst operand, amode addressMode, sizeInBits byte) {
 		i.kind = uLoad64
 	}
 	i.rd = dst
-	i.amode = amode
+	i.setAmode(amode)
 }
 
-func (i *instruction) asFpuLoad(dst operand, amode addressMode, sizeInBits byte) {
+func (i *instruction) asFpuLoad(dst regalloc.VReg, amode *addressMode, sizeInBits byte) {
 	switch sizeInBits {
 	case 32:
 		i.kind = fpuLoad32
@@ -632,10 +640,18 @@ func (i *instruction) asFpuLoad(dst operand, amode addressMode, sizeInBits byte)
 		i.kind = fpuLoad128
 	}
 	i.rd = dst
-	i.amode = amode
+	i.setAmode(amode)
 }
 
-func (i *instruction) asVecLoad1R(rd, rn operand, arr vecArrangement) {
+func (i *instruction) getAmode() *addressMode {
+	return wazevoapi.PtrFromUintptr[addressMode](uintptr(i.u1))
+}
+
+func (i *instruction) setAmode(a *addressMode) {
+	i.u1 = uint64(uintptr(unsafe.Pointer(a)))
+}
+
+func (i *instruction) asVecLoad1R(rd regalloc.VReg, rn operand, arr vecArrangement) {
 	// NOTE: currently only has support for no-offset loads, though it is suspicious that
 	// we would need to support offset load (that is only available for post-index).
 	i.kind = vecLoad1R
@@ -646,32 +662,32 @@ func (i *instruction) asVecLoad1R(rd, rn operand, arr vecArrangement) {
 
 func (i *instruction) asCSet(rd regalloc.VReg, mask bool, c condFlag) {
 	i.kind = cSet
-	i.rd = operandNR(rd)
+	i.rd = rd
 	i.u1 = uint64(c)
 	if mask {
 		i.u2 = 1
 	}
 }
 
-func (i *instruction) asCSel(rd, rn, rm operand, c condFlag, _64bit bool) {
+func (i *instruction) asCSel(rd regalloc.VReg, rn, rm operand, c condFlag, _64bit bool) {
 	i.kind = cSel
 	i.rd = rd
 	i.rn = rn
 	i.rm = rm
 	i.u1 = uint64(c)
 	if _64bit {
-		i.u3 = 1
+		i.u2 = 1
 	}
 }
 
-func (i *instruction) asFpuCSel(rd, rn, rm operand, c condFlag, _64bit bool) {
+func (i *instruction) asFpuCSel(rd regalloc.VReg, rn, rm operand, c condFlag, _64bit bool) {
 	i.kind = fpuCSel
 	i.rd = rd
 	i.rn = rn
 	i.rm = rm
 	i.u1 = uint64(c)
 	if _64bit {
-		i.u3 = 1
+		i.u2 = 1
 	}
 }
 
@@ -691,7 +707,7 @@ func (i *instruction) asBrTableSequence(indexReg regalloc.VReg, targetIndex, tar
 }
 
 func (i *instruction) brTableSequenceOffsetsResolved() {
-	i.u3 = 1 // indicate that the offsets are resolved, for debugging.
+	i.rm.data = 1 // indicate that the offsets are resolved, for debugging.
 }
 
 func (i *instruction) brLabel() label {
@@ -701,7 +717,7 @@ func (i *instruction) brLabel() label {
 // brOffsetResolved is called when the target label is resolved.
 func (i *instruction) brOffsetResolve(offset int64) {
 	i.u2 = uint64(offset)
-	i.u3 = 1 // indicate that the offset is resolved, for debugging.
+	i.rm.data = 1 // indicate that the offset is resolved, for debugging.
 }
 
 func (i *instruction) brOffset() int64 {
@@ -714,7 +730,7 @@ func (i *instruction) asCondBr(c cond, target label, is64bit bool) {
 	i.u1 = c.asUint64()
 	i.u2 = uint64(target)
 	if is64bit {
-		i.u3 = 1
+		i.u2 |= 1 << 32
 	}
 }
 
@@ -728,17 +744,17 @@ func (i *instruction) condBrLabel() label {
 
 // condBrOffsetResolve is called when the target label is resolved.
 func (i *instruction) condBrOffsetResolve(offset int64) {
-	i.rd.data = uint64(offset)
-	i.rd.data2 = 1 // indicate that the offset is resolved, for debugging.
+	i.rn.data = uint64(offset)
+	i.rn.data2 = 1 // indicate that the offset is resolved, for debugging.
 }
 
 // condBrOffsetResolved returns true if condBrOffsetResolve is already called.
 func (i *instruction) condBrOffsetResolved() bool {
-	return i.rd.data2 == 1
+	return i.rn.data2 == 1
 }
 
 func (i *instruction) condBrOffset() int64 {
-	return int64(i.rd.data)
+	return int64(i.rn.data)
 }
 
 func (i *instruction) condBrCond() cond {
@@ -746,33 +762,33 @@ func (i *instruction) condBrCond() cond {
 }
 
 func (i *instruction) condBr64bit() bool {
-	return i.u3 == 1
+	return i.u2&(1<<32) != 0
 }
 
 func (i *instruction) asLoadFpuConst32(rd regalloc.VReg, raw uint64) {
 	i.kind = loadFpuConst32
 	i.u1 = raw
-	i.rd = operandNR(rd)
+	i.rd = rd
 }
 
 func (i *instruction) asLoadFpuConst64(rd regalloc.VReg, raw uint64) {
 	i.kind = loadFpuConst64
 	i.u1 = raw
-	i.rd = operandNR(rd)
+	i.rd = rd
 }
 
 func (i *instruction) asLoadFpuConst128(rd regalloc.VReg, lo, hi uint64) {
 	i.kind = loadFpuConst128
 	i.u1 = lo
 	i.u2 = hi
-	i.rd = operandNR(rd)
+	i.rd = rd
 }
 
 func (i *instruction) asFpuCmp(rn, rm operand, is64bit bool) {
 	i.kind = fpuCmp
 	i.rn, i.rm = rn, rm
 	if is64bit {
-		i.u3 = 1
+		i.u1 = 1
 	}
 }
 
@@ -783,12 +799,12 @@ func (i *instruction) asCCmpImm(rn operand, imm uint64, c condFlag, flag byte, i
 	i.u1 = uint64(c)
 	i.u2 = uint64(flag)
 	if is64bit {
-		i.u3 = 1
+		i.u2 |= 1 << 32
 	}
 }
 
 // asALU sets up a basic ALU instruction.
-func (i *instruction) asALU(aluOp aluOp, rd, rn, rm operand, dst64bit bool) {
+func (i *instruction) asALU(aluOp aluOp, rd regalloc.VReg, rn, rm operand, dst64bit bool) {
 	switch rm.kind {
 	case operandKindNR:
 		i.kind = aluRRR
@@ -804,22 +820,22 @@ func (i *instruction) asALU(aluOp aluOp, rd, rn, rm operand, dst64bit bool) {
 	i.u1 = uint64(aluOp)
 	i.rd, i.rn, i.rm = rd, rn, rm
 	if dst64bit {
-		i.u3 = 1
+		i.u2 |= 1 << 32
 	}
 }
 
 // asALURRRR sets up an ALU instruction with four register operands.
-func (i *instruction) asALURRRR(aluOp aluOp, rd, rn, rm, ra operand, dst64bit bool) {
+func (i *instruction) asALURRRR(aluOp aluOp, rd regalloc.VReg, rn, rm operand, ra regalloc.VReg, dst64bit bool) {
 	i.kind = aluRRRR
 	i.u1 = uint64(aluOp)
-	i.rd, i.rn, i.rm, i.ra = rd, rn, rm, ra
+	i.rd, i.rn, i.rm, i.u2 = rd, rn, rm, uint64(ra)
 	if dst64bit {
-		i.u3 = 1
+		i.u1 |= 1 << 32
 	}
 }
 
 // asALUShift sets up a shift-based ALU instruction.
-func (i *instruction) asALUShift(aluOp aluOp, rd, rn, rm operand, dst64bit bool) {
+func (i *instruction) asALUShift(aluOp aluOp, rd regalloc.VReg, rn, rm operand, dst64bit bool) {
 	switch rm.kind {
 	case operandKindNR:
 		i.kind = aluRRR // If the shift amount op is a register, then the instruction is encoded as a normal ALU instruction with two register operands.
@@ -831,17 +847,17 @@ func (i *instruction) asALUShift(aluOp aluOp, rd, rn, rm operand, dst64bit bool)
 	i.u1 = uint64(aluOp)
 	i.rd, i.rn, i.rm = rd, rn, rm
 	if dst64bit {
-		i.u3 = 1
+		i.u2 |= 1 << 32
 	}
 }
 
 func (i *instruction) asALUBitmaskImm(aluOp aluOp, rd, rn regalloc.VReg, imm uint64, dst64bit bool) {
 	i.kind = aluRRBitmaskImm
 	i.u1 = uint64(aluOp)
-	i.rn, i.rd = operandNR(rn), operandNR(rd)
+	i.rn, i.rd = operandNR(rn), rd
 	i.u2 = imm
 	if dst64bit {
-		i.u3 = 1
+		i.u1 |= 1 << 32
 	}
 }
 
@@ -852,76 +868,76 @@ func (i *instruction) asMovToFPSR(rn regalloc.VReg) {
 
 func (i *instruction) asMovFromFPSR(rd regalloc.VReg) {
 	i.kind = movFromFPSR
-	i.rd = operandNR(rd)
+	i.rd = rd
 }
 
 func (i *instruction) asBitRR(bitOp bitOp, rd, rn regalloc.VReg, is64bit bool) {
 	i.kind = bitRR
-	i.rn, i.rd = operandNR(rn), operandNR(rd)
+	i.rn, i.rd = operandNR(rn), rd
 	i.u1 = uint64(bitOp)
 	if is64bit {
 		i.u2 = 1
 	}
 }
 
-func (i *instruction) asFpuRRR(op fpuBinOp, rd, rn, rm operand, dst64bit bool) {
+func (i *instruction) asFpuRRR(op fpuBinOp, rd regalloc.VReg, rn, rm operand, dst64bit bool) {
 	i.kind = fpuRRR
 	i.u1 = uint64(op)
 	i.rd, i.rn, i.rm = rd, rn, rm
 	if dst64bit {
-		i.u3 = 1
+		i.u2 = 1
 	}
 }
 
-func (i *instruction) asFpuRR(op fpuUniOp, rd, rn operand, dst64bit bool) {
+func (i *instruction) asFpuRR(op fpuUniOp, rd regalloc.VReg, rn operand, dst64bit bool) {
 	i.kind = fpuRR
 	i.u1 = uint64(op)
 	i.rd, i.rn = rd, rn
 	if dst64bit {
-		i.u3 = 1
+		i.u2 = 1
 	}
 }
 
 func (i *instruction) asExtend(rd, rn regalloc.VReg, fromBits, toBits byte, signed bool) {
 	i.kind = extend
-	i.rn, i.rd = operandNR(rn), operandNR(rd)
+	i.rn, i.rd = operandNR(rn), rd
 	i.u1 = uint64(fromBits)
 	i.u2 = uint64(toBits)
 	if signed {
-		i.u3 = 1
+		i.u2 |= 1 << 32
 	}
 }
 
 func (i *instruction) asMove32(rd, rn regalloc.VReg) {
 	i.kind = mov32
-	i.rn, i.rd = operandNR(rn), operandNR(rd)
+	i.rn, i.rd = operandNR(rn), rd
 }
 
 func (i *instruction) asMove64(rd, rn regalloc.VReg) *instruction {
 	i.kind = mov64
-	i.rn, i.rd = operandNR(rn), operandNR(rd)
+	i.rn, i.rd = operandNR(rn), rd
 	return i
 }
 
 func (i *instruction) asFpuMov64(rd, rn regalloc.VReg) {
 	i.kind = fpuMov64
-	i.rn, i.rd = operandNR(rn), operandNR(rd)
+	i.rn, i.rd = operandNR(rn), rd
 }
 
 func (i *instruction) asFpuMov128(rd, rn regalloc.VReg) *instruction {
 	i.kind = fpuMov128
-	i.rn, i.rd = operandNR(rn), operandNR(rd)
+	i.rn, i.rd = operandNR(rn), rd
 	return i
 }
 
-func (i *instruction) asMovToVec(rd, rn operand, arr vecArrangement, index vecIndex) {
+func (i *instruction) asMovToVec(rd regalloc.VReg, rn operand, arr vecArrangement, index vecIndex) {
 	i.kind = movToVec
 	i.rd = rd
 	i.rn = rn
 	i.u1, i.u2 = uint64(arr), uint64(index)
 }
 
-func (i *instruction) asMovFromVec(rd, rn operand, arr vecArrangement, index vecIndex, signed bool) {
+func (i *instruction) asMovFromVec(rd regalloc.VReg, rn operand, arr vecArrangement, index vecIndex, signed bool) {
 	if signed {
 		i.kind = movFromVecSigned
 	} else {
@@ -932,48 +948,48 @@ func (i *instruction) asMovFromVec(rd, rn operand, arr vecArrangement, index vec
 	i.u1, i.u2 = uint64(arr), uint64(index)
 }
 
-func (i *instruction) asVecDup(rd, rn operand, arr vecArrangement) {
+func (i *instruction) asVecDup(rd regalloc.VReg, rn operand, arr vecArrangement) {
 	i.kind = vecDup
 	i.u1 = uint64(arr)
 	i.rn, i.rd = rn, rd
 }
 
-func (i *instruction) asVecDupElement(rd, rn operand, arr vecArrangement, index vecIndex) {
+func (i *instruction) asVecDupElement(rd regalloc.VReg, rn operand, arr vecArrangement, index vecIndex) {
 	i.kind = vecDupElement
 	i.u1 = uint64(arr)
 	i.rn, i.rd = rn, rd
 	i.u2 = uint64(index)
 }
 
-func (i *instruction) asVecExtract(rd, rn, rm operand, arr vecArrangement, index uint32) {
+func (i *instruction) asVecExtract(rd regalloc.VReg, rn, rm operand, arr vecArrangement, index uint32) {
 	i.kind = vecExtract
 	i.u1 = uint64(arr)
 	i.rn, i.rm, i.rd = rn, rm, rd
 	i.u2 = uint64(index)
 }
 
-func (i *instruction) asVecMovElement(rd, rn operand, arr vecArrangement, rdIndex, rnIndex vecIndex) {
+func (i *instruction) asVecMovElement(rd regalloc.VReg, rn operand, arr vecArrangement, rdIndex, rnIndex vecIndex) {
 	i.kind = vecMovElement
 	i.u1 = uint64(arr)
-	i.u2, i.u3 = uint64(rdIndex), uint64(rnIndex)
+	i.u2 = uint64(rdIndex) | uint64(rnIndex)<<32
 	i.rn, i.rd = rn, rd
 }
 
-func (i *instruction) asVecMisc(op vecOp, rd, rn operand, arr vecArrangement) {
+func (i *instruction) asVecMisc(op vecOp, rd regalloc.VReg, rn operand, arr vecArrangement) {
 	i.kind = vecMisc
 	i.u1 = uint64(op)
 	i.rn, i.rd = rn, rd
 	i.u2 = uint64(arr)
 }
 
-func (i *instruction) asVecLanes(op vecOp, rd, rn operand, arr vecArrangement) {
+func (i *instruction) asVecLanes(op vecOp, rd regalloc.VReg, rn operand, arr vecArrangement) {
 	i.kind = vecLanes
 	i.u1 = uint64(op)
 	i.rn, i.rd = rn, rd
 	i.u2 = uint64(arr)
 }
 
-func (i *instruction) asVecShiftImm(op vecOp, rd, rn, rm operand, arr vecArrangement) *instruction {
+func (i *instruction) asVecShiftImm(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) *instruction {
 	i.kind = vecShiftImm
 	i.u1 = uint64(op)
 	i.rn, i.rm, i.rd = rn, rm, rd
@@ -981,7 +997,7 @@ func (i *instruction) asVecShiftImm(op vecOp, rd, rn, rm operand, arr vecArrange
 	return i
 }
 
-func (i *instruction) asVecTbl(nregs byte, rd, rn, rm operand, arr vecArrangement) {
+func (i *instruction) asVecTbl(nregs byte, rd regalloc.VReg, rn, rm operand, arr vecArrangement) {
 	switch nregs {
 	case 0, 1:
 		i.kind = vecTbl
@@ -1000,14 +1016,14 @@ func (i *instruction) asVecTbl(nregs byte, rd, rn, rm operand, arr vecArrangemen
 	i.u2 = uint64(arr)
 }
 
-func (i *instruction) asVecPermute(op vecOp, rd, rn, rm operand, arr vecArrangement) {
+func (i *instruction) asVecPermute(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) {
 	i.kind = vecPermute
 	i.u1 = uint64(op)
 	i.rn, i.rm, i.rd = rn, rm, rd
 	i.u2 = uint64(arr)
 }
 
-func (i *instruction) asVecRRR(op vecOp, rd, rn, rm operand, arr vecArrangement) *instruction {
+func (i *instruction) asVecRRR(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) *instruction {
 	i.kind = vecRRR
 	i.u1 = uint64(op)
 	i.rn, i.rd, i.rm = rn, rd, rm
@@ -1017,7 +1033,7 @@ func (i *instruction) asVecRRR(op vecOp, rd, rn, rm operand, arr vecArrangement)
 
 // asVecRRRRewrite encodes a vector instruction that rewrites the destination register.
 // IMPORTANT: the destination register must already be defined before this instruction.
-func (i *instruction) asVecRRRRewrite(op vecOp, rd, rn, rm operand, arr vecArrangement) {
+func (i *instruction) asVecRRRRewrite(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) {
 	i.kind = vecRRRRewrite
 	i.u1 = uint64(op)
 	i.rn, i.rd, i.rm = rn, rd, rm
@@ -1033,8 +1049,8 @@ func (i *instruction) IsCopy() bool {
 
 // String implements fmt.Stringer.
 func (i *instruction) String() (str string) {
-	is64SizeBitToSize := func(u3 uint64) byte {
-		if u3 == 0 {
+	is64SizeBitToSize := func(v uint64) byte {
+		if v == 0 {
 			return 32
 		}
 		return 64
@@ -1049,46 +1065,46 @@ func (i *instruction) String() (str string) {
 			str = "nop0"
 		}
 	case aluRRR:
-		size := is64SizeBitToSize(i.u3)
+		size := is64SizeBitToSize(i.u2 >> 32)
 		str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(),
-			formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size),
+			formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size),
 			i.rm.format(size))
 	case aluRRRR:
-		size := is64SizeBitToSize(i.u3)
+		size := is64SizeBitToSize(i.u1 >> 32)
 		str = fmt.Sprintf("%s %s, %s, %s, %s", aluOp(i.u1).String(),
-			formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.ra.nr(), size))
+			formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(regalloc.VReg(i.u2), size))
 	case aluRRImm12:
-		size := is64SizeBitToSize(i.u3)
+		size := is64SizeBitToSize(i.u2 >> 32)
 		str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(),
-			formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), i.rm.format(size))
+			formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), i.rm.format(size))
 	case aluRRBitmaskImm:
-		size := is64SizeBitToSize(i.u3)
-		rd, rn := formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size)
+		size := is64SizeBitToSize(i.u1 >> 32)
+		rd, rn := formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size)
 		if size == 32 {
 			str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, uint32(i.u2))
 		} else {
 			str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, i.u2)
 		}
 	case aluRRImmShift:
-		size := is64SizeBitToSize(i.u3)
+		size := is64SizeBitToSize(i.u2 >> 32)
 		str = fmt.Sprintf("%s %s, %s, %#x",
 			aluOp(i.u1).String(),
-			formatVRegSized(i.rd.nr(), size),
+			formatVRegSized(i.rd, size),
 			formatVRegSized(i.rn.nr(), size),
 			i.rm.shiftImm(),
 		)
 	case aluRRRShift:
-		size := is64SizeBitToSize(i.u3)
+		size := is64SizeBitToSize(i.u2 >> 32)
 		str = fmt.Sprintf("%s %s, %s, %s",
 			aluOp(i.u1).String(),
-			formatVRegSized(i.rd.nr(), size),
+			formatVRegSized(i.rd, size),
 			formatVRegSized(i.rn.nr(), size),
 			i.rm.format(size),
 		)
 	case aluRRRExtend:
-		size := is64SizeBitToSize(i.u3)
+		size := is64SizeBitToSize(i.u2 >> 32)
 		str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(),
-			formatVRegSized(i.rd.nr(), size),
+			formatVRegSized(i.rd, size),
 			formatVRegSized(i.rn.nr(), size),
 			// Regardless of the source size, the register is formatted in 32-bit.
 			i.rm.format(32),
@@ -1097,57 +1113,57 @@ func (i *instruction) String() (str string) {
 		size := is64SizeBitToSize(i.u2)
 		str = fmt.Sprintf("%s %s, %s",
 			bitOp(i.u1),
-			formatVRegSized(i.rd.nr(), size),
+			formatVRegSized(i.rd, size),
 			formatVRegSized(i.rn.nr(), size),
 		)
 	case uLoad8:
-		str = fmt.Sprintf("ldrb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+		str = fmt.Sprintf("ldrb %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32))
 	case sLoad8:
-		str = fmt.Sprintf("ldrsb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+		str = fmt.Sprintf("ldrsb %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32))
 	case uLoad16:
-		str = fmt.Sprintf("ldrh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+		str = fmt.Sprintf("ldrh %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32))
 	case sLoad16:
-		str = fmt.Sprintf("ldrsh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+		str = fmt.Sprintf("ldrsh %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32))
 	case uLoad32:
-		str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+		str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32))
 	case sLoad32:
-		str = fmt.Sprintf("ldrs %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+		str = fmt.Sprintf("ldrs %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32))
 	case uLoad64:
-		str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64))
+		str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 64), i.getAmode().format(64))
 	case store8:
-		str = fmt.Sprintf("strb %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(8))
+		str = fmt.Sprintf("strb %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(8))
 	case store16:
-		str = fmt.Sprintf("strh %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(16))
+		str = fmt.Sprintf("strh %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(16))
 	case store32:
-		str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(32))
+		str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(32))
 	case store64:
-		str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64))
+		str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.getAmode().format(64))
 	case storeP64:
 		str = fmt.Sprintf("stp %s, %s, %s",
-			formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64))
+			formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.getAmode().format(64))
 	case loadP64:
 		str = fmt.Sprintf("ldp %s, %s, %s",
-			formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64))
+			formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.getAmode().format(64))
 	case mov64:
 		str = fmt.Sprintf("mov %s, %s",
-			formatVRegSized(i.rd.nr(), 64),
+			formatVRegSized(i.rd, 64),
 			formatVRegSized(i.rn.nr(), 64))
 	case mov32:
-		str = fmt.Sprintf("mov %s, %s", formatVRegSized(i.rd.nr(), 32), formatVRegSized(i.rn.nr(), 32))
+		str = fmt.Sprintf("mov %s, %s", formatVRegSized(i.rd, 32), formatVRegSized(i.rn.nr(), 32))
 	case movZ:
-		size := is64SizeBitToSize(i.u3)
-		str = fmt.Sprintf("movz %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16)
+		size := is64SizeBitToSize(i.u2 >> 32)
+		str = fmt.Sprintf("movz %s, #%#x, lsl %d", formatVRegSized(i.rd, size), uint16(i.u1), uint32(i.u2)*16)
 	case movN:
-		size := is64SizeBitToSize(i.u3)
-		str = fmt.Sprintf("movn %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16)
+		size := is64SizeBitToSize(i.u2 >> 32)
+		str = fmt.Sprintf("movn %s, #%#x, lsl %d", formatVRegSized(i.rd, size), uint16(i.u1), uint32(i.u2)*16)
 	case movK:
-		size := is64SizeBitToSize(i.u3)
-		str = fmt.Sprintf("movk %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16)
+		size := is64SizeBitToSize(i.u2 >> 32)
+		str = fmt.Sprintf("movk %s, #%#x, lsl %d", formatVRegSized(i.rd, size), uint16(i.u1), uint32(i.u2)*16)
 	case extend:
 		fromBits, toBits := byte(i.u1), byte(i.u2)
 
 		var signedStr string
-		if i.u3 == 1 {
+		if i.u2>>32 == 1 {
 			signedStr = "s"
 		} else {
 			signedStr = "u"
@@ -1161,39 +1177,39 @@ func (i *instruction) String() (str string) {
 		case 32:
 			fromStr = "w"
 		}
-		str = fmt.Sprintf("%sxt%s %s, %s", signedStr, fromStr, formatVRegSized(i.rd.nr(), toBits), formatVRegSized(i.rn.nr(), 32))
+		str = fmt.Sprintf("%sxt%s %s, %s", signedStr, fromStr, formatVRegSized(i.rd, toBits), formatVRegSized(i.rn.nr(), 32))
 	case cSel:
-		size := is64SizeBitToSize(i.u3)
+		size := is64SizeBitToSize(i.u2)
 		str = fmt.Sprintf("csel %s, %s, %s, %s",
-			formatVRegSized(i.rd.nr(), size),
+			formatVRegSized(i.rd, size),
 			formatVRegSized(i.rn.nr(), size),
 			formatVRegSized(i.rm.nr(), size),
 			condFlag(i.u1),
 		)
 	case cSet:
 		if i.u2 != 0 {
-			str = fmt.Sprintf("csetm %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1))
+			str = fmt.Sprintf("csetm %s, %s", formatVRegSized(i.rd, 64), condFlag(i.u1))
 		} else {
-			str = fmt.Sprintf("cset %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1))
+			str = fmt.Sprintf("cset %s, %s", formatVRegSized(i.rd, 64), condFlag(i.u1))
 		}
 	case cCmpImm:
-		size := is64SizeBitToSize(i.u3)
+		size := is64SizeBitToSize(i.u2 >> 32)
 		str = fmt.Sprintf("ccmp %s, #%#x, #%#x, %s",
 			formatVRegSized(i.rn.nr(), size), i.rm.data,
 			i.u2&0b1111,
 			condFlag(i.u1))
 	case fpuMov64:
 		str = fmt.Sprintf("mov %s, %s",
-			formatVRegVec(i.rd.nr(), vecArrangement8B, vecIndexNone),
+			formatVRegVec(i.rd, vecArrangement8B, vecIndexNone),
 			formatVRegVec(i.rn.nr(), vecArrangement8B, vecIndexNone))
 	case fpuMov128:
 		str = fmt.Sprintf("mov %s, %s",
-			formatVRegVec(i.rd.nr(), vecArrangement16B, vecIndexNone),
+			formatVRegVec(i.rd, vecArrangement16B, vecIndexNone),
 			formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone))
 	case fpuMovFromVec:
 		panic("TODO")
 	case fpuRR:
-		dstSz := is64SizeBitToSize(i.u3)
+		dstSz := is64SizeBitToSize(i.u2)
 		srcSz := dstSz
 		op := fpuUniOp(i.u1)
 		switch op {
@@ -1203,38 +1219,38 @@ func (i *instruction) String() (str string) {
 			srcSz = 64
 		}
 		str = fmt.Sprintf("%s %s, %s", op.String(),
-			formatVRegSized(i.rd.nr(), dstSz), formatVRegSized(i.rn.nr(), srcSz))
+			formatVRegSized(i.rd, dstSz), formatVRegSized(i.rn.nr(), srcSz))
 	case fpuRRR:
-		size := is64SizeBitToSize(i.u3)
+		size := is64SizeBitToSize(i.u2)
 		str = fmt.Sprintf("%s %s, %s, %s", fpuBinOp(i.u1).String(),
-			formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size))
+			formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size))
 	case fpuRRI:
 		panic("TODO")
 	case fpuRRRR:
 		panic("TODO")
 	case fpuCmp:
-		size := is64SizeBitToSize(i.u3)
+		size := is64SizeBitToSize(i.u1)
 		str = fmt.Sprintf("fcmp %s, %s",
 			formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size))
 	case fpuLoad32:
-		str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+		str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32))
 	case fpuStore32:
-		str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(64))
+		str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(64))
 	case fpuLoad64:
-		str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64))
+		str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 64), i.getAmode().format(64))
 	case fpuStore64:
-		str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64))
+		str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.getAmode().format(64))
 	case fpuLoad128:
-		str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 128), i.amode.format(64))
+		str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 128), i.getAmode().format(64))
 	case fpuStore128:
-		str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 128), i.amode.format(64))
+		str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 128), i.getAmode().format(64))
 	case loadFpuConst32:
-		str = fmt.Sprintf("ldr %s, #8; b 8; data.f32 %f", formatVRegSized(i.rd.nr(), 32), math.Float32frombits(uint32(i.u1)))
+		str = fmt.Sprintf("ldr %s, #8; b 8; data.f32 %f", formatVRegSized(i.rd, 32), math.Float32frombits(uint32(i.u1)))
 	case loadFpuConst64:
-		str = fmt.Sprintf("ldr %s, #8; b 16; data.f64 %f", formatVRegSized(i.rd.nr(), 64), math.Float64frombits(i.u1))
+		str = fmt.Sprintf("ldr %s, #8; b 16; data.f64 %f", formatVRegSized(i.rd, 64), math.Float64frombits(i.u1))
 	case loadFpuConst128:
 		str = fmt.Sprintf("ldr %s, #8; b 32; data.v128  %016x %016x",
-			formatVRegSized(i.rd.nr(), 128), i.u1, i.u2)
+			formatVRegSized(i.rd, 128), i.u1, i.u2)
 	case fpuToInt:
 		var op, src, dst string
 		if signed := i.u1 == 1; signed {
@@ -1242,15 +1258,15 @@ func (i *instruction) String() (str string) {
 		} else {
 			op = "fcvtzu"
 		}
-		if src64 := i.u2 == 1; src64 {
+		if src64 := i.u2&1 != 0; src64 {
 			src = formatVRegWidthVec(i.rn.nr(), vecArrangementD)
 		} else {
 			src = formatVRegWidthVec(i.rn.nr(), vecArrangementS)
 		}
-		if dst64 := i.u3 == 1; dst64 {
-			dst = formatVRegSized(i.rd.nr(), 64)
+		if dst64 := i.u2&2 != 0; dst64 {
+			dst = formatVRegSized(i.rd, 64)
 		} else {
-			dst = formatVRegSized(i.rd.nr(), 32)
+			dst = formatVRegSized(i.rd, 32)
 		}
 		str = fmt.Sprintf("%s %s, %s", op, dst, src)
 
@@ -1261,21 +1277,21 @@ func (i *instruction) String() (str string) {
 		} else {
 			op = "ucvtf"
 		}
-		if src64 := i.u2 == 1; src64 {
+		if src64 := i.u2&1 != 0; src64 {
 			src = formatVRegSized(i.rn.nr(), 64)
 		} else {
 			src = formatVRegSized(i.rn.nr(), 32)
 		}
-		if dst64 := i.u3 == 1; dst64 {
-			dst = formatVRegWidthVec(i.rd.nr(), vecArrangementD)
+		if dst64 := i.u2&2 != 0; dst64 {
+			dst = formatVRegWidthVec(i.rd, vecArrangementD)
 		} else {
-			dst = formatVRegWidthVec(i.rd.nr(), vecArrangementS)
+			dst = formatVRegWidthVec(i.rd, vecArrangementS)
 		}
 		str = fmt.Sprintf("%s %s, %s", op, dst, src)
 	case fpuCSel:
-		size := is64SizeBitToSize(i.u3)
+		size := is64SizeBitToSize(i.u2)
 		str = fmt.Sprintf("fcsel %s, %s, %s, %s",
-			formatVRegSized(i.rd.nr(), size),
+			formatVRegSized(i.rd, size),
 			formatVRegSized(i.rn.nr(), size),
 			formatVRegSized(i.rm.nr(), size),
 			condFlag(i.u1),
@@ -1291,7 +1307,7 @@ func (i *instruction) String() (str string) {
 		default:
 			panic("unsupported arrangement " + arr.String())
 		}
-		str = fmt.Sprintf("ins %s, %s", formatVRegVec(i.rd.nr(), arr, vecIndex(i.u2)), formatVRegSized(i.rn.nr(), size))
+		str = fmt.Sprintf("ins %s, %s", formatVRegVec(i.rd, arr, vecIndex(i.u2)), formatVRegSized(i.rn.nr(), size))
 	case movFromVec, movFromVecSigned:
 		var size byte
 		var opcode string
@@ -1315,23 +1331,23 @@ func (i *instruction) String() (str string) {
 		default:
 			panic("unsupported arrangement " + arr.String())
 		}
-		str = fmt.Sprintf("%s %s, %s", opcode, formatVRegSized(i.rd.nr(), size), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)))
+		str = fmt.Sprintf("%s %s, %s", opcode, formatVRegSized(i.rd, size), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)))
 	case vecDup:
 		str = fmt.Sprintf("dup %s, %s",
-			formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone),
+			formatVRegVec(i.rd, vecArrangement(i.u1), vecIndexNone),
 			formatVRegSized(i.rn.nr(), 64),
 		)
 	case vecDupElement:
 		arr := vecArrangement(i.u1)
 		str = fmt.Sprintf("dup %s, %s",
-			formatVRegVec(i.rd.nr(), arr, vecIndexNone),
+			formatVRegVec(i.rd, arr, vecIndexNone),
 			formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)),
 		)
 	case vecDupFromFpu:
 		panic("TODO")
 	case vecExtract:
 		str = fmt.Sprintf("ext %s, %s, %s, #%d",
-			formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone),
+			formatVRegVec(i.rd, vecArrangement(i.u1), vecIndexNone),
 			formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndexNone),
 			formatVRegVec(i.rm.nr(), vecArrangement(i.u1), vecIndexNone),
 			uint32(i.u2),
@@ -1340,15 +1356,15 @@ func (i *instruction) String() (str string) {
 		panic("TODO")
 	case vecMovElement:
 		str = fmt.Sprintf("mov %s, %s",
-			formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndex(i.u2)),
-			formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndex(i.u3)),
+			formatVRegVec(i.rd, vecArrangement(i.u1), vecIndex(i.u2&0xffffffff)),
+			formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndex(i.u2>>32)),
 		)
 	case vecMiscNarrow:
 		panic("TODO")
 	case vecRRR, vecRRRRewrite:
 		str = fmt.Sprintf("%s %s, %s, %s",
 			vecOp(i.u1),
-			formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone),
+			formatVRegVec(i.rd, vecArrangement(i.u2), vecIndexNone),
 			formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone),
 			formatVRegVec(i.rm.nr(), vecArrangement(i.u2), vecIndexNone),
 		)
@@ -1356,12 +1372,12 @@ func (i *instruction) String() (str string) {
 		vop := vecOp(i.u1)
 		if vop == vecOpCmeq0 {
 			str = fmt.Sprintf("cmeq %s, %s, #0",
-				formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone),
+				formatVRegVec(i.rd, vecArrangement(i.u2), vecIndexNone),
 				formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone))
 		} else {
 			str = fmt.Sprintf("%s %s, %s",
 				vop,
-				formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone),
+				formatVRegVec(i.rd, vecArrangement(i.u2), vecIndexNone),
 				formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone))
 		}
 	case vecLanes:
@@ -1379,24 +1395,24 @@ func (i *instruction) String() (str string) {
 		}
 		str = fmt.Sprintf("%s %s, %s",
 			vecOp(i.u1),
-			formatVRegWidthVec(i.rd.nr(), destArr),
+			formatVRegWidthVec(i.rd, destArr),
 			formatVRegVec(i.rn.nr(), arr, vecIndexNone))
 	case vecShiftImm:
 		arr := vecArrangement(i.u2)
 		str = fmt.Sprintf("%s %s, %s, #%d",
 			vecOp(i.u1),
-			formatVRegVec(i.rd.nr(), arr, vecIndexNone),
+			formatVRegVec(i.rd, arr, vecIndexNone),
 			formatVRegVec(i.rn.nr(), arr, vecIndexNone),
 			i.rm.shiftImm())
 	case vecTbl:
 		arr := vecArrangement(i.u2)
 		str = fmt.Sprintf("tbl %s, { %s }, %s",
-			formatVRegVec(i.rd.nr(), arr, vecIndexNone),
+			formatVRegVec(i.rd, arr, vecIndexNone),
 			formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone),
 			formatVRegVec(i.rm.nr(), arr, vecIndexNone))
 	case vecTbl2:
 		arr := vecArrangement(i.u2)
-		rd, rn, rm := i.rd.nr(), i.rn.nr(), i.rm.nr()
+		rd, rn, rm := i.rd, i.rn.nr(), i.rm.nr()
 		rn1 := regalloc.FromRealReg(rn.RealReg()+1, rn.RegType())
 		str = fmt.Sprintf("tbl %s, { %s, %s }, %s",
 			formatVRegVec(rd, arr, vecIndexNone),
@@ -1407,13 +1423,13 @@ func (i *instruction) String() (str string) {
 		arr := vecArrangement(i.u2)
 		str = fmt.Sprintf("%s %s, %s, %s",
 			vecOp(i.u1),
-			formatVRegVec(i.rd.nr(), arr, vecIndexNone),
+			formatVRegVec(i.rd, arr, vecIndexNone),
 			formatVRegVec(i.rn.nr(), arr, vecIndexNone),
 			formatVRegVec(i.rm.nr(), arr, vecIndexNone))
 	case movToFPSR:
 		str = fmt.Sprintf("msr fpsr, %s", formatVRegSized(i.rn.nr(), 64))
 	case movFromFPSR:
-		str = fmt.Sprintf("mrs %s fpsr", formatVRegSized(i.rd.nr(), 64))
+		str = fmt.Sprintf("mrs %s fpsr", formatVRegSized(i.rd, 64))
 	case call:
 		str = fmt.Sprintf("bl %s", ssa.FuncRef(i.u1))
 	case callInd:
@@ -1422,15 +1438,15 @@ func (i *instruction) String() (str string) {
 		str = "ret"
 	case br:
 		target := label(i.u1)
-		if i.u3 != 0 {
+		if i.rm.data != 0 {
 			str = fmt.Sprintf("b #%#x (%s)", i.brOffset(), target.String())
 		} else {
 			str = fmt.Sprintf("b %s", target.String())
 		}
 	case condBr:
-		size := is64SizeBitToSize(i.u3)
+		size := is64SizeBitToSize(i.u2 >> 32)
 		c := cond(i.u1)
-		target := label(i.u2)
+		target := label(i.u2 & 0xffffffff)
 		switch c.kind() {
 		case condKindRegisterZero:
 			if !i.condBrOffsetResolved() {
@@ -1456,7 +1472,7 @@ func (i *instruction) String() (str string) {
 			}
 		}
 	case adr:
-		str = fmt.Sprintf("adr %s, #%#x", formatVRegSized(i.rd.nr(), 64), int64(i.u1))
+		str = fmt.Sprintf("adr %s, #%#x", formatVRegSized(i.rd, 64), int64(i.u1))
 	case brTableSequence:
 		targetIndex := i.u1
 		str = fmt.Sprintf("br_table_sequence %s, table_index=%d", formatVRegSized(i.rn.nr(), 64), targetIndex)
@@ -1473,7 +1489,7 @@ func (i *instruction) String() (str string) {
 		case 1:
 			m = m + "b"
 		}
-		str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64))
+		str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), 64))
 	case atomicCas:
 		m := "casal"
 		size := byte(32)
@@ -1485,7 +1501,7 @@ func (i *instruction) String() (str string) {
 		case 1:
 			m = m + "b"
 		}
-		str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64))
+		str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rd, size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64))
 	case atomicLoad:
 		m := "ldar"
 		size := byte(32)
@@ -1497,7 +1513,7 @@ func (i *instruction) String() (str string) {
 		case 1:
 			m = m + "b"
 		}
-		str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64))
+		str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), 64))
 	case atomicStore:
 		m := "stlr"
 		size := byte(32)
@@ -1517,9 +1533,9 @@ func (i *instruction) String() (str string) {
 	case emitSourceOffsetInfo:
 		str = fmt.Sprintf("source_offset_info %d", ssa.SourceOffset(i.u1))
 	case vecLoad1R:
-		str = fmt.Sprintf("ld1r {%s}, [%s]", formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), formatVRegSized(i.rn.nr(), 64))
+		str = fmt.Sprintf("ld1r {%s}, [%s]", formatVRegVec(i.rd, vecArrangement(i.u1), vecIndexNone), formatVRegSized(i.rn.nr(), 64))
 	case loadConstBlockArg:
-		str = fmt.Sprintf("load_const_block_arg %s, %#x", formatVRegSized(i.rd.nr(), 64), i.u1)
+		str = fmt.Sprintf("load_const_block_arg %s, %#x", formatVRegSized(i.rd, 64), i.u1)
 	default:
 		panic(i.kind)
 	}
@@ -1528,26 +1544,26 @@ func (i *instruction) String() (str string) {
 
 func (i *instruction) asAdr(rd regalloc.VReg, offset int64) {
 	i.kind = adr
-	i.rd = operandNR(rd)
+	i.rd = rd
 	i.u1 = uint64(offset)
 }
 
-func (i *instruction) asAtomicRmw(op atomicRmwOp, rn, rs, rt operand, size uint64) {
+func (i *instruction) asAtomicRmw(op atomicRmwOp, rn, rs, rt regalloc.VReg, size uint64) {
 	i.kind = atomicRmw
-	i.rd, i.rn, i.rm = rt, rn, rs
+	i.rd, i.rn, i.rm = rt, operandNR(rn), operandNR(rs)
 	i.u1 = uint64(op)
 	i.u2 = size
 }
 
-func (i *instruction) asAtomicCas(rn, rs, rt operand, size uint64) {
+func (i *instruction) asAtomicCas(rn, rs, rt regalloc.VReg, size uint64) {
 	i.kind = atomicCas
-	i.rm, i.rn, i.rd = rt, rn, rs
+	i.rm, i.rn, i.rd = operandNR(rt), operandNR(rn), rs
 	i.u2 = size
 }
 
-func (i *instruction) asAtomicLoad(rn, rt operand, size uint64) {
+func (i *instruction) asAtomicLoad(rn, rt regalloc.VReg, size uint64) {
 	i.kind = atomicLoad
-	i.rn, i.rd = rn, rt
+	i.rn, i.rd = operandNR(rn), rt
 	i.u2 = size
 }
 
@@ -1755,12 +1771,12 @@ func (i *instruction) asLoadConstBlockArg(v uint64, typ ssa.Type, dst regalloc.V
 	i.kind = loadConstBlockArg
 	i.u1 = v
 	i.u2 = uint64(typ)
-	i.rd = operandNR(dst)
+	i.rd = dst
 	return i
 }
 
 func (i *instruction) loadConstBlockArgData() (v uint64, typ ssa.Type, dst regalloc.VReg) {
-	return i.u1, ssa.Type(i.u2), i.rd.nr()
+	return i.u1, ssa.Type(i.u2), i.rd
 }
 
 func (i *instruction) asEmitSourceOffsetInfo(l ssa.SourceOffset) *instruction {
@@ -1778,7 +1794,7 @@ func (i *instruction) asUDF() *instruction {
 	return i
 }
 
-func (i *instruction) asFpuToInt(rd, rn operand, rdSigned, src64bit, dst64bit bool) {
+func (i *instruction) asFpuToInt(rd regalloc.VReg, rn operand, rdSigned, src64bit, dst64bit bool) {
 	i.kind = fpuToInt
 	i.rn = rn
 	i.rd = rd
@@ -1789,11 +1805,11 @@ func (i *instruction) asFpuToInt(rd, rn operand, rdSigned, src64bit, dst64bit bo
 		i.u2 = 1
 	}
 	if dst64bit {
-		i.u3 = 1
+		i.u2 |= 2
 	}
 }
 
-func (i *instruction) asIntToFpu(rd, rn operand, rnSigned, src64bit, dst64bit bool) {
+func (i *instruction) asIntToFpu(rd regalloc.VReg, rn operand, rnSigned, src64bit, dst64bit bool) {
 	i.kind = intToFpu
 	i.rn = rn
 	i.rd = rd
@@ -1804,7 +1820,7 @@ func (i *instruction) asIntToFpu(rd, rn operand, rnSigned, src64bit, dst64bit bo
 		i.u2 = 1
 	}
 	if dst64bit {
-		i.u3 = 1
+		i.u2 |= 2
 	}
 }
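
The src64bit/dst64bit pair for fpuToInt and intToFpu now travels as a two-bit mask in u2 (bit 0 and bit 1), which is what the String and encodeCnvBetweenFloatInt changes test with i.u2&1 and i.u2&2. A standalone sketch of the encoding, names illustrative:

package main

import "fmt"

// encodeCvtFlags mirrors the scheme above: bit 0 = 64-bit source, bit 1 = 64-bit destination.
func encodeCvtFlags(src64, dst64 bool) (u2 uint64) {
	if src64 {
		u2 |= 1
	}
	if dst64 {
		u2 |= 2
	}
	return
}

func main() {
	u2 := encodeCvtFlags(true, false)
	fmt.Println(u2&1 != 0, u2&2 != 0) // true false
}
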
 
@@ -1817,7 +1833,7 @@ func (i *instruction) asExitSequence(ctx regalloc.VReg) *instruction {
 // aluOp determines the type of ALU operation. Instructions whose kind is one of
 // aluRRR, aluRRRR, aluRRImm12, aluRRBitmaskImm, aluRRImmShift, aluRRRShift and aluRRRExtend
 // would use this type.
-type aluOp int
+type aluOp uint32
 
 func (a aluOp) String() string {
 	switch a {
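
Presumably the reason aluOp shrinks to uint32: for the RRRR and bitmask-immediate forms the dst64bit flag now occupies the upper half of u1 (see asALURRRR and asALUBitmaskImm above), so converting u1 to a 32-bit aluOp discards the flag automatically while i.u1>>32 reads it back. A minimal illustration:

package main

import "fmt"

type aluOp uint32

func main() {
	const aluOpExample aluOp = 7           // illustrative value, not a real opcode constant
	u1 := uint64(aluOpExample) | 1<<32     // opcode in the low half, 64-bit flag at bit 32
	fmt.Println(aluOp(u1) == aluOpExample) // true: converting u1 keeps only the low 32 bits
	fmt.Println(u1>>32 == 1)               // true: the flag is read back separately
}
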
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go
index 227a96474..f0ede2d6a 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go
@@ -44,12 +44,12 @@ func (i *instruction) encode(m *machine) {
 	case callInd:
 		c.Emit4Bytes(encodeUnconditionalBranchReg(regNumberInEncoding[i.rn.realReg()], true))
 	case store8, store16, store32, store64, fpuStore32, fpuStore64, fpuStore128:
-		c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rn.realReg()], i.amode))
+		c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rn.realReg()], *i.getAmode()))
 	case uLoad8, uLoad16, uLoad32, uLoad64, sLoad8, sLoad16, sLoad32, fpuLoad32, fpuLoad64, fpuLoad128:
-		c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rd.realReg()], i.amode))
+		c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rd.RealReg()], *i.getAmode()))
 	case vecLoad1R:
 		c.Emit4Bytes(encodeVecLoad1R(
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			vecArrangement(i.u1)))
 	case condBr:
@@ -75,22 +75,22 @@ func (i *instruction) encode(m *machine) {
 			panic("BUG")
 		}
 	case movN:
-		c.Emit4Bytes(encodeMoveWideImmediate(0b00, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3))
+		c.Emit4Bytes(encodeMoveWideImmediate(0b00, regNumberInEncoding[i.rd.RealReg()], i.u1, uint32(i.u2), uint32(i.u2>>32)))
 	case movZ:
-		c.Emit4Bytes(encodeMoveWideImmediate(0b10, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3))
+		c.Emit4Bytes(encodeMoveWideImmediate(0b10, regNumberInEncoding[i.rd.RealReg()], i.u1, uint32(i.u2), uint32(i.u2>>32)))
 	case movK:
-		c.Emit4Bytes(encodeMoveWideImmediate(0b11, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3))
+		c.Emit4Bytes(encodeMoveWideImmediate(0b11, regNumberInEncoding[i.rd.RealReg()], i.u1, uint32(i.u2), uint32(i.u2>>32)))
 	case mov32:
-		to, from := i.rd.realReg(), i.rn.realReg()
+		to, from := i.rd.RealReg(), i.rn.realReg()
 		c.Emit4Bytes(encodeAsMov32(regNumberInEncoding[from], regNumberInEncoding[to]))
 	case mov64:
-		to, from := i.rd.realReg(), i.rn.realReg()
+		to, from := i.rd.RealReg(), i.rn.realReg()
 		toIsSp := to == sp
 		fromIsSp := from == sp
 		c.Emit4Bytes(encodeMov64(regNumberInEncoding[to], regNumberInEncoding[from], toIsSp, fromIsSp))
 	case loadP64, storeP64:
 		rt, rt2 := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()]
-		amode := i.amode
+		amode := i.getAmode()
 		rn := regNumberInEncoding[amode.rn.RealReg()]
 		var pre bool
 		switch amode.kind {
@@ -102,21 +102,21 @@ func (i *instruction) encode(m *machine) {
 		}
 		c.Emit4Bytes(encodePreOrPostIndexLoadStorePair64(pre, kind == loadP64, rn, rt, rt2, amode.imm))
 	case loadFpuConst32:
-		rd := regNumberInEncoding[i.rd.realReg()]
+		rd := regNumberInEncoding[i.rd.RealReg()]
 		if i.u1 == 0 {
 			c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B))
 		} else {
 			encodeLoadFpuConst32(c, rd, i.u1)
 		}
 	case loadFpuConst64:
-		rd := regNumberInEncoding[i.rd.realReg()]
+		rd := regNumberInEncoding[i.rd.RealReg()]
 		if i.u1 == 0 {
 			c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B))
 		} else {
-			encodeLoadFpuConst64(c, regNumberInEncoding[i.rd.realReg()], i.u1)
+			encodeLoadFpuConst64(c, regNumberInEncoding[i.rd.RealReg()], i.u1)
 		}
 	case loadFpuConst128:
-		rd := regNumberInEncoding[i.rd.realReg()]
+		rd := regNumberInEncoding[i.rd.RealReg()]
 		lo, hi := i.u1, i.u2
 		if lo == 0 && hi == 0 {
 			c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement16B))
@@ -126,35 +126,35 @@ func (i *instruction) encode(m *machine) {
 	case aluRRRR:
 		c.Emit4Bytes(encodeAluRRRR(
 			aluOp(i.u1),
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			regNumberInEncoding[i.rm.realReg()],
-			regNumberInEncoding[i.ra.realReg()],
-			uint32(i.u3),
+			regNumberInEncoding[regalloc.VReg(i.u2).RealReg()],
+			uint32(i.u1>>32),
 		))
 	case aluRRImmShift:
 		c.Emit4Bytes(encodeAluRRImm(
 			aluOp(i.u1),
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			uint32(i.rm.shiftImm()),
-			uint32(i.u3),
+			uint32(i.u2>>32),
 		))
 	case aluRRR:
 		rn := i.rn.realReg()
 		c.Emit4Bytes(encodeAluRRR(
 			aluOp(i.u1),
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[rn],
 			regNumberInEncoding[i.rm.realReg()],
-			i.u3 == 1,
+			i.u2>>32 == 1,
 			rn == sp,
 		))
 	case aluRRRExtend:
 		rm, exo, to := i.rm.er()
 		c.Emit4Bytes(encodeAluRRRExtend(
 			aluOp(i.u1),
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			regNumberInEncoding[rm.RealReg()],
 			exo,
@@ -164,25 +164,25 @@ func (i *instruction) encode(m *machine) {
 		r, amt, sop := i.rm.sr()
 		c.Emit4Bytes(encodeAluRRRShift(
 			aluOp(i.u1),
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			regNumberInEncoding[r.RealReg()],
 			uint32(amt),
 			sop,
-			i.u3 == 1,
+			i.u2>>32 == 1,
 		))
 	case aluRRBitmaskImm:
 		c.Emit4Bytes(encodeAluBitmaskImmediate(
 			aluOp(i.u1),
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			i.u2,
-			i.u3 == 1,
+			i.u1>>32 == 1,
 		))
 	case bitRR:
 		c.Emit4Bytes(encodeBitRR(
 			bitOp(i.u1),
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			uint32(i.u2)),
 		)
@@ -190,22 +190,22 @@ func (i *instruction) encode(m *machine) {
 		imm12, shift := i.rm.imm12()
 		c.Emit4Bytes(encodeAluRRImm12(
 			aluOp(i.u1),
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			imm12, shift,
-			i.u3 == 1,
+			i.u2>>32 == 1,
 		))
 	case fpuRRR:
 		c.Emit4Bytes(encodeFpuRRR(
 			fpuBinOp(i.u1),
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			regNumberInEncoding[i.rm.realReg()],
-			i.u3 == 1,
+			i.u2 == 1,
 		))
 	case fpuMov64, fpuMov128:
 		// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/MOV--vector---Move-vector--an-alias-of-ORR--vector--register--
-		rd := regNumberInEncoding[i.rd.realReg()]
+		rd := regNumberInEncoding[i.rd.RealReg()]
 		rn := regNumberInEncoding[i.rn.realReg()]
 		var q uint32
 		if kind == fpuMov128 {
@@ -213,7 +213,7 @@ func (i *instruction) encode(m *machine) {
 		}
 		c.Emit4Bytes(q<<30 | 0b1110101<<21 | rn<<16 | 0b000111<<10 | rn<<5 | rd)
 	case cSet:
-		rd := regNumberInEncoding[i.rd.realReg()]
+		rd := regNumberInEncoding[i.rd.RealReg()]
 		cf := condFlag(i.u1)
 		if i.u2 == 1 {
 			// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/CSETM--Conditional-Set-Mask--an-alias-of-CSINV-
@@ -225,12 +225,12 @@ func (i *instruction) encode(m *machine) {
 			c.Emit4Bytes(0b1001101010011111<<16 | uint32(cf.invert())<<12 | 0b111111<<5 | rd)
 		}
 	case extend:
-		c.Emit4Bytes(encodeExtend(i.u3 == 1, byte(i.u1), byte(i.u2), regNumberInEncoding[i.rd.realReg()], regNumberInEncoding[i.rn.realReg()]))
+		c.Emit4Bytes(encodeExtend((i.u2>>32) == 1, byte(i.u1), byte(i.u2), regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()]))
 	case fpuCmp:
 		// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/FCMP--Floating-point-quiet-Compare--scalar--?lang=en
 		rn, rm := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()]
 		var ftype uint32
-		if i.u3 == 1 {
+		if i.u1 == 1 {
 			ftype = 0b01 // double precision.
 		}
 		c.Emit4Bytes(0b1111<<25 | ftype<<22 | 1<<21 | rm<<16 | 0b1<<13 | rn<<5)
@@ -242,34 +242,34 @@ func (i *instruction) encode(m *machine) {
 			c.Emit4Bytes(0)
 		}
 	case adr:
-		c.Emit4Bytes(encodeAdr(regNumberInEncoding[i.rd.realReg()], uint32(i.u1)))
+		c.Emit4Bytes(encodeAdr(regNumberInEncoding[i.rd.RealReg()], uint32(i.u1)))
 	case cSel:
 		c.Emit4Bytes(encodeConditionalSelect(
 			kind,
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			regNumberInEncoding[i.rm.realReg()],
 			condFlag(i.u1),
-			i.u3 == 1,
+			i.u2 == 1,
 		))
 	case fpuCSel:
 		c.Emit4Bytes(encodeFpuCSel(
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			regNumberInEncoding[i.rm.realReg()],
 			condFlag(i.u1),
-			i.u3 == 1,
+			i.u2 == 1,
 		))
 	case movToVec:
 		c.Emit4Bytes(encodeMoveToVec(
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			vecArrangement(byte(i.u1)),
 			vecIndex(i.u2),
 		))
 	case movFromVec, movFromVecSigned:
 		c.Emit4Bytes(encodeMoveFromVec(
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			vecArrangement(byte(i.u1)),
 			vecIndex(i.u2),
@@ -277,18 +277,18 @@ func (i *instruction) encode(m *machine) {
 		))
 	case vecDup:
 		c.Emit4Bytes(encodeVecDup(
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			vecArrangement(byte(i.u1))))
 	case vecDupElement:
 		c.Emit4Bytes(encodeVecDupElement(
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			vecArrangement(byte(i.u1)),
 			vecIndex(i.u2)))
 	case vecExtract:
 		c.Emit4Bytes(encodeVecExtract(
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			regNumberInEncoding[i.rm.realReg()],
 			vecArrangement(byte(i.u1)),
@@ -296,35 +296,35 @@ func (i *instruction) encode(m *machine) {
 	case vecPermute:
 		c.Emit4Bytes(encodeVecPermute(
 			vecOp(i.u1),
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			regNumberInEncoding[i.rm.realReg()],
 			vecArrangement(byte(i.u2))))
 	case vecMovElement:
 		c.Emit4Bytes(encodeVecMovElement(
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			vecArrangement(i.u1),
-			uint32(i.u2), uint32(i.u3),
+			uint32(i.u2), uint32(i.u2>>32),
 		))
 	case vecMisc:
 		c.Emit4Bytes(encodeAdvancedSIMDTwoMisc(
 			vecOp(i.u1),
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			vecArrangement(i.u2),
 		))
 	case vecLanes:
 		c.Emit4Bytes(encodeVecLanes(
 			vecOp(i.u1),
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			vecArrangement(i.u2),
 		))
 	case vecShiftImm:
 		c.Emit4Bytes(encodeVecShiftImm(
 			vecOp(i.u1),
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			uint32(i.rm.shiftImm()),
 			vecArrangement(i.u2),
@@ -332,7 +332,7 @@ func (i *instruction) encode(m *machine) {
 	case vecTbl:
 		c.Emit4Bytes(encodeVecTbl(
 			1,
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			regNumberInEncoding[i.rm.realReg()],
 			vecArrangement(i.u2)),
@@ -340,7 +340,7 @@ func (i *instruction) encode(m *machine) {
 	case vecTbl2:
 		c.Emit4Bytes(encodeVecTbl(
 			2,
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			regNumberInEncoding[i.rm.realReg()],
 			vecArrangement(i.u2)),
@@ -353,9 +353,9 @@ func (i *instruction) encode(m *machine) {
 	case fpuRR:
 		c.Emit4Bytes(encodeFloatDataOneSource(
 			fpuUniOp(i.u1),
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
-			i.u3 == 1,
+			i.u2 == 1,
 		))
 	case vecRRR:
 		if op := vecOp(i.u1); op == vecOpBsl || op == vecOpBit || op == vecOpUmlal {
@@ -365,14 +365,14 @@ func (i *instruction) encode(m *machine) {
 	case vecRRRRewrite:
 		c.Emit4Bytes(encodeVecRRR(
 			vecOp(i.u1),
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			regNumberInEncoding[i.rm.realReg()],
 			vecArrangement(i.u2),
 		))
 	case cCmpImm:
 		// Conditional compare (immediate) in https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en
-		sf := uint32(i.u3 & 0b1)
+		sf := uint32((i.u2 >> 32) & 0b1)
 		nzcv := uint32(i.u2 & 0b1111)
 		cond := uint32(condFlag(i.u1))
 		imm := uint32(i.rm.data & 0b11111)
@@ -381,7 +381,7 @@ func (i *instruction) encode(m *machine) {
 			sf<<31 | 0b111101001<<22 | imm<<16 | cond<<12 | 0b1<<11 | rn<<5 | nzcv,
 		)
 	case movFromFPSR:
-		rt := regNumberInEncoding[i.rd.realReg()]
+		rt := regNumberInEncoding[i.rd.RealReg()]
 		c.Emit4Bytes(encodeSystemRegisterMove(rt, true))
 	case movToFPSR:
 		rt := regNumberInEncoding[i.rn.realReg()]
@@ -390,13 +390,13 @@ func (i *instruction) encode(m *machine) {
 		c.Emit4Bytes(encodeAtomicRmw(
 			atomicRmwOp(i.u1),
 			regNumberInEncoding[i.rm.realReg()],
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			uint32(i.u2),
 		))
 	case atomicCas:
 		c.Emit4Bytes(encodeAtomicCas(
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			regNumberInEncoding[i.rm.realReg()],
 			regNumberInEncoding[i.rn.realReg()],
 			uint32(i.u2),
@@ -404,7 +404,7 @@ func (i *instruction) encode(m *machine) {
 	case atomicLoad:
 		c.Emit4Bytes(encodeAtomicLoadStore(
 			regNumberInEncoding[i.rn.realReg()],
-			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rd.RealReg()],
 			uint32(i.u2),
 			1,
 		))
@@ -810,7 +810,7 @@ func encodeFloatDataOneSource(op fpuUniOp, rd, rn uint32, dst64bit bool) uint32
 // encodeCnvBetweenFloatInt encodes as "Conversion between floating-point and integer" in
 // https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
 func encodeCnvBetweenFloatInt(i *instruction) uint32 {
-	rd := regNumberInEncoding[i.rd.realReg()]
+	rd := regNumberInEncoding[i.rd.RealReg()]
 	rn := regNumberInEncoding[i.rn.realReg()]
 
 	var opcode uint32
@@ -822,8 +822,8 @@ func encodeCnvBetweenFloatInt(i *instruction) uint32 {
 		rmode = 0b00
 
 		signed := i.u1 == 1
-		src64bit := i.u2 == 1
-		dst64bit := i.u3 == 1
+		src64bit := i.u2&1 != 0
+		dst64bit := i.u2&2 != 0
 		if signed {
 			opcode = 0b010
 		} else {
@@ -841,8 +841,8 @@ func encodeCnvBetweenFloatInt(i *instruction) uint32 {
 		rmode = 0b11
 
 		signed := i.u1 == 1
-		src64bit := i.u2 == 1
-		dst64bit := i.u3 == 1
+		src64bit := i.u2&1 != 0
+		dst64bit := i.u2&2 != 0
 
 		if signed {
 			opcode = 0b000
@@ -1787,13 +1787,13 @@ func encodeCBZCBNZ(rt uint32, nz bool, imm19 uint32, _64bit bool) (ret uint32) {
 // https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en
 //
 // "shift" must have been divided by 16 at this point.
-func encodeMoveWideImmediate(opc uint32, rd uint32, imm, shift, _64bit uint64) (ret uint32) {
+func encodeMoveWideImmediate(opc uint32, rd uint32, imm uint64, shift, _64bit uint32) (ret uint32) {
 	ret = rd
 	ret |= uint32(imm&0xffff) << 5
-	ret |= (uint32(shift)) << 21
+	ret |= (shift) << 21
 	ret |= 0b100101 << 23
 	ret |= opc << 29
-	ret |= uint32(_64bit) << 31
+	ret |= _64bit << 31
 	return
 }
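
The movZ/movN/movK call sites above now split u2 at bit 32: uint32(i.u2) is the shift (already divided by 16) and uint32(i.u2>>32) is the 64-bit selector, matching the narrowed uint32 parameters here. A small standalone check of the bit layout, reusing the function exactly as shown in this hunk with illustrative arguments:

package main

import "fmt"

// Copied from the hunk above; "shift" must already have been divided by 16.
func encodeMoveWideImmediate(opc uint32, rd uint32, imm uint64, shift, _64bit uint32) (ret uint32) {
	ret = rd
	ret |= uint32(imm&0xffff) << 5
	ret |= shift << 21
	ret |= 0b100101 << 23
	ret |= opc << 29
	ret |= _64bit << 31
	return
}

func main() {
	// Roughly "movz x1, #0x1234, lsl 16": opc=0b10, shift already divided by 16 -> 1, 64-bit.
	fmt.Printf("%#x\n", encodeMoveWideImmediate(0b10, 1, 0x1234, 1, 1))
}
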
 
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go
index 698b382d4..6c6824fb0 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go
@@ -284,18 +284,18 @@ func (m *machine) load64bitConst(c int64, dst regalloc.VReg) {
 
 func (m *machine) insertMOVZ(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
 	instr := m.allocateInstr()
-	instr.asMOVZ(dst, v, uint64(shift), dst64)
+	instr.asMOVZ(dst, v, uint32(shift), dst64)
 	m.insert(instr)
 }
 
 func (m *machine) insertMOVK(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
 	instr := m.allocateInstr()
-	instr.asMOVK(dst, v, uint64(shift), dst64)
+	instr.asMOVK(dst, v, uint32(shift), dst64)
 	m.insert(instr)
 }
 
 func (m *machine) insertMOVN(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
 	instr := m.allocateInstr()
-	instr.asMOVN(dst, v, uint64(shift), dst64)
+	instr.asMOVN(dst, v, uint32(shift), dst64)
 	m.insert(instr)
 }
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go
index 2bb234e8c..048bf3204 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go
@@ -52,11 +52,11 @@ func (m *machine) lowerBrTable(i *ssa.Instruction) {
 	maxIndexReg := m.compiler.AllocateVReg(ssa.TypeI32)
 	m.lowerConstantI32(maxIndexReg, int32(len(targets)-1))
 	subs := m.allocateInstr()
-	subs.asALU(aluOpSubS, operandNR(xzrVReg), indexOperand, operandNR(maxIndexReg), false)
+	subs.asALU(aluOpSubS, xzrVReg, indexOperand, operandNR(maxIndexReg), false)
 	m.insert(subs)
 	csel := m.allocateInstr()
 	adjustedIndex := m.compiler.AllocateVReg(ssa.TypeI32)
-	csel.asCSel(operandNR(adjustedIndex), operandNR(maxIndexReg), indexOperand, hs, false)
+	csel.asCSel(adjustedIndex, operandNR(maxIndexReg), indexOperand, hs, false)
 	m.insert(csel)
 
 	brSequence := m.allocateInstr()
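
The pattern repeated throughout this file: destination registers are now handed to the instruction constructors as bare regalloc.VReg values, and only source operands keep the operand wrapper (operandNR to wrap, .nr() to unwrap). A self-contained sketch of what that wrapper pair roughly looks like; the internal layout is assumed here purely for illustration:

package main

import "fmt"

type vReg uint64 // stand-in for regalloc.VReg

type operandKind byte

const operandKindNR operandKind = 0

// operand is assumed, for this sketch only, to carry the register in its data field.
type operand struct {
	kind operandKind
	data uint64
}

func operandNR(r vReg) operand { return operand{kind: operandKindNR, data: uint64(r)} }

func (o operand) nr() vReg { return vReg(o.data) }

func main() {
	dst := vReg(5)      // destinations are now passed around as plain registers
	src := operandNR(7) // sources stay wrapped so they can also be immediates, shifts, etc.
	fmt.Println(dst, src.nr())
}
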
@@ -249,7 +249,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 			rc := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone)
 			rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 			rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
-			rd := operandNR(m.compiler.VRegOf(instr.Return()))
+			rd := m.compiler.VRegOf(instr.Return())
 			m.lowerSelectVec(rc, rn, rm, rd)
 		} else {
 			m.lowerSelect(c, x, y, instr.Return())
@@ -270,7 +270,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		x, ctx := instr.Arg2()
 		result := instr.Return()
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(result))
+		rd := m.compiler.VRegOf(result)
 		ctxVReg := m.compiler.VRegOf(ctx)
 		m.lowerFpuToInt(rd, rn, ctxVReg, true, x.Type() == ssa.TypeF64,
 			result.Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat)
@@ -278,7 +278,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		x, ctx := instr.Arg2()
 		result := instr.Return()
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(result))
+		rd := m.compiler.VRegOf(result)
 		ctxVReg := m.compiler.VRegOf(ctx)
 		m.lowerFpuToInt(rd, rn, ctxVReg, false, x.Type() == ssa.TypeF64,
 			result.Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat)
@@ -286,25 +286,25 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		x := instr.Arg()
 		result := instr.Return()
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(result))
+		rd := m.compiler.VRegOf(result)
 		m.lowerIntToFpu(rd, rn, true, x.Type() == ssa.TypeI64, result.Type().Bits() == 64)
 	case ssa.OpcodeFcvtFromUint:
 		x := instr.Arg()
 		result := instr.Return()
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(result))
+		rd := m.compiler.VRegOf(result)
 		m.lowerIntToFpu(rd, rn, false, x.Type() == ssa.TypeI64, result.Type().Bits() == 64)
 	case ssa.OpcodeFdemote:
 		v := instr.Arg()
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 		cnt := m.allocateInstr()
 		cnt.asFpuRR(fpuUniOpCvt64To32, rd, rn, false)
 		m.insert(cnt)
 	case ssa.OpcodeFpromote:
 		v := instr.Arg()
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 		cnt := m.allocateInstr()
 		cnt.asFpuRR(fpuUniOpCvt32To64, rd, rn, true)
 		m.insert(cnt)
@@ -343,15 +343,15 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		ctxVReg := m.compiler.VRegOf(ctx)
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 		m.lowerIDiv(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSdiv)
 	case ssa.OpcodeSrem, ssa.OpcodeUrem:
 		x, y, ctx := instr.Arg3()
 		ctxVReg := m.compiler.VRegOf(ctx)
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
-		m.lowerIRem(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem)
+		rd := m.compiler.VRegOf(instr.Return())
+		m.lowerIRem(ctxVReg, rd, rn.nr(), rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem)
 	case ssa.OpcodeVconst:
 		result := m.compiler.VRegOf(instr.Return())
 		lo, hi := instr.VconstData()
@@ -362,7 +362,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		x := instr.Arg()
 		ins := m.allocateInstr()
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 		ins.asVecMisc(vecOpNot, rd, rn, vecArrangement16B)
 		m.insert(ins)
 	case ssa.OpcodeVbxor:
@@ -382,12 +382,12 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
 		creg := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone)
-		tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+		tmp := m.compiler.AllocateVReg(ssa.TypeV128)
 
 		// creg is overwritten by BSL, so we copy it into a temporary register before the instruction
 		// in case it is used somewhere else.
 		mov := m.allocateInstr()
-		mov.asFpuMov128(tmp.nr(), creg.nr())
+		mov.asFpuMov128(tmp, creg.nr())
 		m.insert(mov)
 
 		ins := m.allocateInstr()
@@ -396,7 +396,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 
 		mov2 := m.allocateInstr()
 		rd := m.compiler.VRegOf(instr.Return())
-		mov2.asFpuMov128(rd, tmp.nr())
+		mov2.asFpuMov128(rd, tmp)
 		m.insert(mov2)
 	case ssa.OpcodeVanyTrue, ssa.OpcodeVallTrue:
 		x, lane := instr.ArgWithLane()
@@ -405,12 +405,12 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 			arr = ssaLaneToArrangement(lane)
 		}
 		rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 		m.lowerVcheckTrue(op, rm, rd, arr)
 	case ssa.OpcodeVhighBits:
 		x, lane := instr.ArgWithLane()
 		rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 		arr := ssaLaneToArrangement(lane)
 		m.lowerVhighBits(rm, rd, arr)
 	case ssa.OpcodeVIadd:
@@ -441,9 +441,9 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 			panic("unsupported lane " + lane.String())
 		}
 
-		widenLo := m.allocateInstr().asVecShiftImm(widen, tmpLo, vv, operandShiftImm(0), loArr)
-		widenHi := m.allocateInstr().asVecShiftImm(widen, tmpHi, vv, operandShiftImm(0), hiArr)
-		addp := m.allocateInstr().asVecRRR(vecOpAddp, operandNR(m.compiler.VRegOf(instr.Return())), tmpLo, tmpHi, dstArr)
+		widenLo := m.allocateInstr().asVecShiftImm(widen, tmpLo.nr(), vv, operandShiftImm(0), loArr)
+		widenHi := m.allocateInstr().asVecShiftImm(widen, tmpHi.nr(), vv, operandShiftImm(0), hiArr)
+		addp := m.allocateInstr().asVecRRR(vecOpAddp, m.compiler.VRegOf(instr.Return()), tmpLo, tmpHi, dstArr)
 		m.insert(widenLo)
 		m.insert(widenHi)
 		m.insert(addp)
@@ -493,7 +493,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		arr := ssaLaneToArrangement(lane)
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 		m.lowerVIMul(rd, rn, rm, arr)
 	case ssa.OpcodeVIabs:
 		m.lowerVecMisc(vecOpAbs, instr)
@@ -507,7 +507,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		arr := ssaLaneToArrangement(lane)
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 		m.lowerVShift(op, rd, rn, rm, arr)
 	case ssa.OpcodeVSqrt:
 		m.lowerVecMisc(vecOpFsqrt, instr)
@@ -547,18 +547,18 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		x, lane := instr.ArgWithLane()
 		arr := ssaLaneToArrangement(lane)
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 		m.lowerVfpuToInt(rd, rn, arr, op == ssa.OpcodeVFcvtToSintSat)
 	case ssa.OpcodeVFcvtFromSint, ssa.OpcodeVFcvtFromUint:
 		x, lane := instr.ArgWithLane()
 		arr := ssaLaneToArrangement(lane)
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 		m.lowerVfpuFromInt(rd, rn, arr, op == ssa.OpcodeVFcvtFromSint)
 	case ssa.OpcodeSwidenLow, ssa.OpcodeUwidenLow:
 		x, lane := instr.ArgWithLane()
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 
 		var arr vecArrangement
 		switch lane {
@@ -580,7 +580,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 	case ssa.OpcodeSwidenHigh, ssa.OpcodeUwidenHigh:
 		x, lane := instr.ArgWithLane()
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 
 		arr := ssaLaneToArrangement(lane)
 
@@ -607,9 +607,9 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		}
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 
-		tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+		tmp := m.compiler.AllocateVReg(ssa.TypeV128)
 
 		loQxtn := m.allocateInstr()
 		hiQxtn := m.allocateInstr()
@@ -628,7 +628,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		m.insert(hiQxtn)
 
 		mov := m.allocateInstr()
-		mov.asFpuMov128(rd.nr(), tmp.nr())
+		mov.asFpuMov128(rd, tmp)
 		m.insert(mov)
 	case ssa.OpcodeFvpromoteLow:
 		x, lane := instr.ArgWithLane()
@@ -637,7 +637,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		}
 		ins := m.allocateInstr()
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 		ins.asVecMisc(vecOpFcvtl, rd, rn, vecArrangement2S)
 		m.insert(ins)
 	case ssa.OpcodeFvdemote:
@@ -647,14 +647,14 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		}
 		ins := m.allocateInstr()
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 		ins.asVecMisc(vecOpFcvtn, rd, rn, vecArrangement2S)
 		m.insert(ins)
 	case ssa.OpcodeExtractlane:
 		x, index, signed, lane := instr.ExtractlaneData()
 
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 
 		mov := m.allocateInstr()
 		switch lane {
@@ -680,12 +680,12 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		x, y, index, lane := instr.InsertlaneData()
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
-		tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+		rd := m.compiler.VRegOf(instr.Return())
+		tmpReg := m.compiler.AllocateVReg(ssa.TypeV128)
 
 		// Initially mov rn to tmp.
 		mov1 := m.allocateInstr()
-		mov1.asFpuMov128(tmpReg.nr(), rn.nr())
+		mov1.asFpuMov128(tmpReg, rn.nr())
 		m.insert(mov1)
 
 		// movToVec and vecMovElement do not clear the remaining bits to zero,
@@ -709,14 +709,14 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 
 		// Finally mov tmp to rd.
 		mov3 := m.allocateInstr()
-		mov3.asFpuMov128(rd.nr(), tmpReg.nr())
+		mov3.asFpuMov128(rd, tmpReg)
 		m.insert(mov3)
 
 	case ssa.OpcodeSwizzle:
 		x, y, lane := instr.Arg2WithLane()
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 
 		arr := ssaLaneToArrangement(lane)
 
@@ -729,14 +729,14 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		x, y, lane1, lane2 := instr.ShuffleData()
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 
 		m.lowerShuffle(rd, rn, rm, lane1, lane2)
 
 	case ssa.OpcodeSplat:
 		x, lane := instr.ArgWithLane()
 		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
+		rd := m.compiler.VRegOf(instr.Return())
 
 		dup := m.allocateInstr()
 		switch lane {
@@ -760,12 +760,12 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		xx, yy := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone),
 			m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
 		tmp, tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
-		m.insert(m.allocateInstr().asVecRRR(vecOpSmull, tmp, xx, yy, vecArrangement8H))
-		m.insert(m.allocateInstr().asVecRRR(vecOpSmull2, tmp2, xx, yy, vecArrangement8H))
-		m.insert(m.allocateInstr().asVecRRR(vecOpAddp, tmp, tmp, tmp2, vecArrangement4S))
+		m.insert(m.allocateInstr().asVecRRR(vecOpSmull, tmp.nr(), xx, yy, vecArrangement8H))
+		m.insert(m.allocateInstr().asVecRRR(vecOpSmull2, tmp2.nr(), xx, yy, vecArrangement8H))
+		m.insert(m.allocateInstr().asVecRRR(vecOpAddp, tmp.nr(), tmp, tmp2, vecArrangement4S))
 
-		rd := operandNR(m.compiler.VRegOf(instr.Return()))
-		m.insert(m.allocateInstr().asFpuMov128(rd.nr(), tmp.nr()))
+		rd := m.compiler.VRegOf(instr.Return())
+		m.insert(m.allocateInstr().asFpuMov128(rd, tmp.nr()))
 
 	case ssa.OpcodeLoadSplat:
 		ptr, offset, lane := instr.LoadSplatData()
@@ -794,7 +794,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 	m.executableContext.FlushPendingInstructions()
 }
 
-func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) {
+func (m *machine) lowerShuffle(rd regalloc.VReg, rn, rm operand, lane1, lane2 uint64) {
 	// `tbl2` requires 2 consecutive registers, so we arbitrarily pick v29, v30.
 	vReg, wReg := v29VReg, v30VReg
 
@@ -822,7 +822,7 @@ func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) {
 	m.insert(tbl2)
 }
 
-func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangement) {
+func (m *machine) lowerVShift(op ssa.Opcode, rd regalloc.VReg, rn, rm operand, arr vecArrangement) {
 	var modulo byte
 	switch arr {
 	case vecArrangement16B:
@@ -847,13 +847,13 @@ func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangem
 	if op != ssa.OpcodeVIshl {
 		// Negate the amount to turn this into a right shift.
 		neg := m.allocateInstr()
-		neg.asALU(aluOpSub, rtmp, operandNR(xzrVReg), rtmp, true)
+		neg.asALU(aluOpSub, rtmp.nr(), operandNR(xzrVReg), rtmp, true)
 		m.insert(neg)
 	}
 
 	// Copy the shift amount into a vector register as sshl/ushl requires it to be there.
 	dup := m.allocateInstr()
-	dup.asVecDup(vtmp, rtmp, arr)
+	dup.asVecDup(vtmp.nr(), rtmp, arr)
 	m.insert(dup)
 
 	if op == ssa.OpcodeVIshl || op == ssa.OpcodeVSshr {
@@ -867,7 +867,7 @@ func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangem
 	}
 }
 
-func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangement) {
+func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm operand, rd regalloc.VReg, arr vecArrangement) {
 	tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
 
 	// Special case VallTrue for i64x2.
@@ -878,11 +878,11 @@ func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangem
 		//	cset dst, eq
 
 		ins := m.allocateInstr()
-		ins.asVecMisc(vecOpCmeq0, tmp, rm, vecArrangement2D)
+		ins.asVecMisc(vecOpCmeq0, tmp.nr(), rm, vecArrangement2D)
 		m.insert(ins)
 
 		addp := m.allocateInstr()
-		addp.asVecRRR(vecOpAddp, tmp, tmp, tmp, vecArrangement2D)
+		addp.asVecRRR(vecOpAddp, tmp.nr(), tmp, tmp, vecArrangement2D)
 		m.insert(addp)
 
 		fcmp := m.allocateInstr()
@@ -890,7 +890,7 @@ func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangem
 		m.insert(fcmp)
 
 		cset := m.allocateInstr()
-		cset.asCSet(rd.nr(), false, eq)
+		cset.asCSet(rd, false, eq)
 		m.insert(cset)
 
 		return
@@ -900,10 +900,10 @@ func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangem
 	ins := m.allocateInstr()
 	if op == ssa.OpcodeVanyTrue {
 		// 	umaxp v4?.16b, v2?.16b, v2?.16b
-		ins.asVecRRR(vecOpUmaxp, tmp, rm, rm, vecArrangement16B)
+		ins.asVecRRR(vecOpUmaxp, tmp.nr(), rm, rm, vecArrangement16B)
 	} else {
 		// 	uminv d4?, v2?.4s
-		ins.asVecLanes(vecOpUminv, tmp, rm, arr)
+		ins.asVecLanes(vecOpUminv, tmp.nr(), rm, arr)
 	}
 	m.insert(ins)
 
@@ -917,15 +917,15 @@ func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangem
 	m.insert(movv)
 
 	fc := m.allocateInstr()
-	fc.asCCmpImm(rd, uint64(0), al, 0, true)
+	fc.asCCmpImm(operandNR(rd), uint64(0), al, 0, true)
 	m.insert(fc)
 
 	cset := m.allocateInstr()
-	cset.asCSet(rd.nr(), false, ne)
+	cset.asCSet(rd, false, ne)
 	m.insert(cset)
 }
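
A scalar model may help here: VanyTrue reduces via UMAXP, so a non-zero maximum means at least one lane was non-zero, while VallTrue reduces via UMINV, so a non-zero minimum means every lane was non-zero. A minimal Go sketch of those semantics (the function names and the []uint32 lane representation are illustrative, not part of wazero):

// anyTrue/allTrue model what lowerVcheckTrue computes for integer lanes.
func anyTrue(lanes []uint32) bool {
	var max uint32
	for _, l := range lanes {
		if l > max { // mirrors UMAXP: keep the largest lane
			max = l
		}
	}
	return max != 0 // non-zero max <=> at least one non-zero lane
}

func allTrue(lanes []uint32) bool {
	min := ^uint32(0)
	for _, l := range lanes {
		if l < min { // mirrors UMINV: keep the smallest lane
			min = l
		}
	}
	return min != 0 // non-zero min <=> no zero lane
}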
 
-func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
+func (m *machine) lowerVhighBits(rm operand, rd regalloc.VReg, arr vecArrangement) {
 	r0 := operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
 	v0 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
 	v1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
@@ -947,7 +947,7 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
 		// Right arithmetic shift on the original vector and store the result into v1. So we have:
 		// v1[i] = 0xff if vi<0, 0 otherwise.
 		sshr := m.allocateInstr()
-		sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(7), vecArrangement16B)
+		sshr.asVecShiftImm(vecOpSshr, v1.nr(), rm, operandShiftImm(7), vecArrangement16B)
 		m.insert(sshr)
 
 		// Load the bit mask into r0.
@@ -958,7 +958,7 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
 
 		// dup r0 to v0.
 		dup := m.allocateInstr()
-		dup.asVecDup(v0, r0, vecArrangement2D)
+		dup.asVecDup(v0.nr(), r0, vecArrangement2D)
 		m.insert(dup)
 
 		// Lane-wise logical AND with the bit mask, meaning that we have
@@ -967,23 +967,23 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
 		// Below, we use the following notation:
 		// wi := (1 << i) if vi<0, 0 otherwise.
 		and := m.allocateInstr()
-		and.asVecRRR(vecOpAnd, v1, v1, v0, vecArrangement16B)
+		and.asVecRRR(vecOpAnd, v1.nr(), v1, v0, vecArrangement16B)
 		m.insert(and)
 
 		// Swap the lower and higher 8 byte elements, and write it into v0, meaning that we have
 		// v0[i] = w(i+8) if i < 8, w(i-8) otherwise.
 		ext := m.allocateInstr()
-		ext.asVecExtract(v0, v1, v1, vecArrangement16B, uint32(8))
+		ext.asVecExtract(v0.nr(), v1, v1, vecArrangement16B, uint32(8))
 		m.insert(ext)
 
 		// v = [w0, w8, ..., w7, w15]
 		zip1 := m.allocateInstr()
-		zip1.asVecPermute(vecOpZip1, v0, v1, v0, vecArrangement16B)
+		zip1.asVecPermute(vecOpZip1, v0.nr(), v1, v0, vecArrangement16B)
 		m.insert(zip1)
 
 		// v.h[0] = w0 + ... + w15
 		addv := m.allocateInstr()
-		addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H)
+		addv.asVecLanes(vecOpAddv, v0.nr(), v0, vecArrangement8H)
 		m.insert(addv)
 
 		// Extract the v.h[0] as the result.
@@ -1006,7 +1006,7 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
 		// Right arithmetic shift on the original vector and store the result into v1. So we have:
 		// v[i] = 0xffff if vi<0, 0 otherwise.
 		sshr := m.allocateInstr()
-		sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(15), vecArrangement8H)
+		sshr.asVecShiftImm(vecOpSshr, v1.nr(), rm, operandShiftImm(15), vecArrangement8H)
 		m.insert(sshr)
 
 		// Load the bit mask into r0.
@@ -1014,26 +1014,26 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
 
 		// dup r0 to vector v0.
 		dup := m.allocateInstr()
-		dup.asVecDup(v0, r0, vecArrangement2D)
+		dup.asVecDup(v0.nr(), r0, vecArrangement2D)
 		m.insert(dup)
 
 		lsl := m.allocateInstr()
-		lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(4), true)
+		lsl.asALUShift(aluOpLsl, r0.nr(), r0, operandShiftImm(4), true)
 		m.insert(lsl)
 
 		movv := m.allocateInstr()
-		movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1))
+		movv.asMovToVec(v0.nr(), r0, vecArrangementD, vecIndex(1))
 		m.insert(movv)
 
 		// Lane-wise logical AND with the bitmask, meaning that we have
 		// v[i] = (1 << i)     if vi<0, 0 otherwise for i=0..3
 		//      = (1 << (i+4)) if vi<0, 0 otherwise for i=3..7
 		and := m.allocateInstr()
-		and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B)
+		and.asVecRRR(vecOpAnd, v0.nr(), v1, v0, vecArrangement16B)
 		m.insert(and)
 
 		addv := m.allocateInstr()
-		addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H)
+		addv.asVecLanes(vecOpAddv, v0.nr(), v0, vecArrangement8H)
 		m.insert(addv)
 
 		movfv := m.allocateInstr()
@@ -1055,7 +1055,7 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
 		// Right arithmetic shift on the original vector and store the result into v1. So we have:
 		// v[i] = 0xffffffff if vi<0, 0 otherwise.
 		sshr := m.allocateInstr()
-		sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(31), vecArrangement4S)
+		sshr.asVecShiftImm(vecOpSshr, v1.nr(), rm, operandShiftImm(31), vecArrangement4S)
 		m.insert(sshr)
 
 		// Load the bit mask into r0.
@@ -1063,26 +1063,26 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
 
 		// dup r0 to vector v0.
 		dup := m.allocateInstr()
-		dup.asVecDup(v0, r0, vecArrangement2D)
+		dup.asVecDup(v0.nr(), r0, vecArrangement2D)
 		m.insert(dup)
 
 		lsl := m.allocateInstr()
-		lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(2), true)
+		lsl.asALUShift(aluOpLsl, r0.nr(), r0, operandShiftImm(2), true)
 		m.insert(lsl)
 
 		movv := m.allocateInstr()
-		movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1))
+		movv.asMovToVec(v0.nr(), r0, vecArrangementD, vecIndex(1))
 		m.insert(movv)
 
 		// Lane-wise logical AND with the bitmask, meaning that we have
 		// v[i] = (1 << i)     if vi<0, 0 otherwise for i in [0, 1]
 		//      = (1 << (i+4)) if vi<0, 0 otherwise for i in [2, 3]
 		and := m.allocateInstr()
-		and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B)
+		and.asVecRRR(vecOpAnd, v0.nr(), v1, v0, vecArrangement16B)
 		m.insert(and)
 
 		addv := m.allocateInstr()
-		addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement4S)
+		addv.asVecLanes(vecOpAddv, v0.nr(), v0, vecArrangement4S)
 		m.insert(addv)
 
 		movfv := m.allocateInstr()
@@ -1102,21 +1102,21 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
 
 		// Move the higher 64-bit int into r0.
 		movv1 := m.allocateInstr()
-		movv1.asMovFromVec(r0, rm, vecArrangementD, vecIndex(1), false)
+		movv1.asMovFromVec(r0.nr(), rm, vecArrangementD, vecIndex(1), false)
 		m.insert(movv1)
 
 		// Move the sign bit into the least significant bit.
 		lsr1 := m.allocateInstr()
-		lsr1.asALUShift(aluOpLsr, r0, r0, operandShiftImm(63), true)
+		lsr1.asALUShift(aluOpLsr, r0.nr(), r0, operandShiftImm(63), true)
 		m.insert(lsr1)
 
 		lsr2 := m.allocateInstr()
-		lsr2.asALUShift(aluOpLsr, rd, rd, operandShiftImm(63), true)
+		lsr2.asALUShift(aluOpLsr, rd, operandNR(rd), operandShiftImm(63), true)
 		m.insert(lsr2)
 
 		// rd = (r0<<1) | rd
 		lsl := m.allocateInstr()
-		lsl.asALU(aluOpAdd, rd, rd, operandSR(r0.nr(), 1, shiftOpLSL), false)
+		lsl.asALU(aluOpAdd, rd, operandNR(rd), operandSR(r0.nr(), 1, shiftOpLSL), false)
 		m.insert(lsl)
 	default:
 		panic("Unsupported " + arr.String())
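
All three sized cases above compute the same thing as an x86-style move-mask: keep only each lane's sign bit, weight lane i by 1<<i, and sum across lanes. A scalar sketch of the 16-byte case (the name and the [16]uint8 representation are illustrative only):

// vhighBits16B: bit i of the result is the sign bit of byte lane i.
func vhighBits16B(lanes [16]uint8) uint16 {
	var mask uint16
	for i, b := range lanes {
		if b&0x80 != 0 { // SSHR #7 leaves 0xff for negative lanes, 0 otherwise
			mask |= 1 << uint(i) // the AND with the (1<<i) mask plus the horizontal add
		}
	}
	return mask
}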
@@ -1128,7 +1128,7 @@ func (m *machine) lowerVecMisc(op vecOp, instr *ssa.Instruction) {
 	arr := ssaLaneToArrangement(lane)
 	ins := m.allocateInstr()
 	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-	rd := operandNR(m.compiler.VRegOf(instr.Return()))
+	rd := m.compiler.VRegOf(instr.Return())
 	ins.asVecMisc(op, rd, rn, arr)
 	m.insert(ins)
 }
@@ -1137,22 +1137,22 @@ func (m *machine) lowerVecRRR(op vecOp, x, y, ret ssa.Value, arr vecArrangement)
 	ins := m.allocateInstr()
 	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
-	rd := operandNR(m.compiler.VRegOf(ret))
+	rd := m.compiler.VRegOf(ret)
 	ins.asVecRRR(op, rd, rn, rm, arr)
 	m.insert(ins)
 }
 
-func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) {
+func (m *machine) lowerVIMul(rd regalloc.VReg, rn, rm operand, arr vecArrangement) {
 	if arr != vecArrangement2D {
 		mul := m.allocateInstr()
 		mul.asVecRRR(vecOpMul, rd, rn, rm, arr)
 		m.insert(mul)
 	} else {
-		tmp1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
-		tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
-		tmp3 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+		tmp1 := m.compiler.AllocateVReg(ssa.TypeV128)
+		tmp2 := m.compiler.AllocateVReg(ssa.TypeV128)
+		tmp3 := m.compiler.AllocateVReg(ssa.TypeV128)
 
-		tmpRes := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+		tmpRes := m.compiler.AllocateVReg(ssa.TypeV128)
 
 		// Following the algorithm in https://chromium-review.googlesource.com/c/v8/v8/+/1781696
 		rev64 := m.allocateInstr()
@@ -1160,7 +1160,7 @@ func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) {
 		m.insert(rev64)
 
 		mul := m.allocateInstr()
-		mul.asVecRRR(vecOpMul, tmp2, tmp2, rn, vecArrangement4S)
+		mul.asVecRRR(vecOpMul, tmp2, operandNR(tmp2), rn, vecArrangement4S)
 		m.insert(mul)
 
 		xtn1 := m.allocateInstr()
@@ -1168,7 +1168,7 @@ func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) {
 		m.insert(xtn1)
 
 		addp := m.allocateInstr()
-		addp.asVecRRR(vecOpAddp, tmp2, tmp2, tmp2, vecArrangement4S)
+		addp.asVecRRR(vecOpAddp, tmp2, operandNR(tmp2), operandNR(tmp2), vecArrangement4S)
 		m.insert(addp)
 
 		xtn2 := m.allocateInstr()
@@ -1179,15 +1179,15 @@ func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) {
 		// In short, in the UMLAL instruction the result register is also one of the source registers, and
 		// the value already in the result register is significant.
 		shll := m.allocateInstr()
-		shll.asVecMisc(vecOpShll, tmpRes, tmp2, vecArrangement2S)
+		shll.asVecMisc(vecOpShll, tmpRes, operandNR(tmp2), vecArrangement2S)
 		m.insert(shll)
 
 		umlal := m.allocateInstr()
-		umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, tmp3, tmp1, vecArrangement2S)
+		umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, operandNR(tmp3), operandNR(tmp1), vecArrangement2S)
 		m.insert(umlal)
 
 		mov := m.allocateInstr()
-		mov.asFpuMov128(rd.nr(), tmpRes.nr())
+		mov.asFpuMov128(rd, tmpRes)
 		m.insert(mov)
 	}
 }
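
The 2D branch follows the V8 sequence referenced in the comment; the arithmetic behind it is plain schoolbook multiplication on 32-bit halves, where the high*high term vanishes modulo 2^64. A standalone sketch of that identity (the name is illustrative):

// mul64From32 reproduces the identity behind the REV64/MUL/ADDP/XTN/SHLL/UMLAL sequence:
// (aHi*2^32 + aLo) * (bHi*2^32 + bLo) = ((aHi*bLo + aLo*bHi) << 32) + aLo*bLo  (mod 2^64).
func mul64From32(a, b uint64) uint64 {
	aLo, aHi := a&0xffffffff, a>>32
	bLo, bHi := b&0xffffffff, b>>32
	cross := (aHi*bLo + aLo*bHi) << 32 // accumulated from the swapped-half multiplies
	return cross + aLo*bLo             // the widening low*low product added by UMLAL
}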
@@ -1203,7 +1203,7 @@ func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) {
 	// BSL modifies the destination register, so we need to use a temporary register so that
 	// the actual definition of the destination register happens *after* the BSL instruction.
 	// That way, we can force the spill instruction to be inserted after the BSL instruction.
-	tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+	tmp := m.compiler.AllocateVReg(ssa.TypeV128)
 
 	fcmgt := m.allocateInstr()
 	if max {
@@ -1220,17 +1220,17 @@ func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) {
 
 	res := operandNR(m.compiler.VRegOf(instr.Return()))
 	mov2 := m.allocateInstr()
-	mov2.asFpuMov128(res.nr(), tmp.nr())
+	mov2.asFpuMov128(res.nr(), tmp)
 	m.insert(mov2)
 }
 
-func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) {
+func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn regalloc.VReg, rm operand, _64bit, signed bool) {
 	div := m.allocateInstr()
 
 	if signed {
-		div.asALU(aluOpSDiv, rd, rn, rm, _64bit)
+		div.asALU(aluOpSDiv, rd, operandNR(rn), rm, _64bit)
 	} else {
-		div.asALU(aluOpUDiv, rd, rn, rm, _64bit)
+		div.asALU(aluOpUDiv, rd, operandNR(rn), rm, _64bit)
 	}
 	m.insert(div)
 
@@ -1239,11 +1239,11 @@ func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bi
 
 	// rd = rn-rd*rm by MSUB instruction.
 	msub := m.allocateInstr()
-	msub.asALURRRR(aluOpMSub, rd, rd, rm, rn, _64bit)
+	msub.asALURRRR(aluOpMSub, rd, operandNR(rd), rm, rn, _64bit)
 	m.insert(msub)
 }
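
There is no remainder instruction on arm64, so the lowering above derives it from the quotient: divide, then MSUB computes rn - quotient*rm. The identity it relies on, as a scalar sketch (this assumes the divide-by-zero check inserted via exitIfNot has already fired for d == 0):

// remViaMSub mirrors the SDIV/UDIV + MSUB pairing: q = n / d, remainder = n - q*d.
func remViaMSub(n, d int64) int64 {
	q := n / d     // what SDIV leaves in rd
	return n - q*d // what the MSUB above computes: rn - rd*rm
}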
 
-func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) {
+func (m *machine) lowerIDiv(execCtxVReg, rd regalloc.VReg, rn, rm operand, _64bit, signed bool) {
 	div := m.allocateInstr()
 
 	if signed {
@@ -1260,7 +1260,7 @@ func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bi
 		// We need to check the signed overflow which happens iff "math.MinInt{32,64} / -1"
 		minusOneCheck := m.allocateInstr()
 		// Sets eq condition if rm == -1.
-		minusOneCheck.asALU(aluOpAddS, operandNR(xzrVReg), rm, operandImm12(1, 0), _64bit)
+		minusOneCheck.asALU(aluOpAddS, xzrVReg, rm, operandImm12(1, 0), _64bit)
 		m.insert(minusOneCheck)
 
 		ccmp := m.allocateInstr()
@@ -1290,20 +1290,20 @@ func (m *machine) exitIfNot(execCtxVReg regalloc.VReg, c cond, cond64bit bool, c
 func (m *machine) lowerFcopysign(x, y, ret ssa.Value) {
 	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
-	var tmpI, tmpF operand
+	var tmpI, tmpF regalloc.VReg
 	_64 := x.Type() == ssa.TypeF64
 	if _64 {
-		tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
-		tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
+		tmpF = m.compiler.AllocateVReg(ssa.TypeF64)
+		tmpI = m.compiler.AllocateVReg(ssa.TypeI64)
 	} else {
-		tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF32))
-		tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
+		tmpF = m.compiler.AllocateVReg(ssa.TypeF32)
+		tmpI = m.compiler.AllocateVReg(ssa.TypeI32)
 	}
 	rd := m.compiler.VRegOf(ret)
-	m.lowerFcopysignImpl(operandNR(rd), rn, rm, tmpI, tmpF, _64)
+	m.lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF, _64)
 }
 
-func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool) {
+func (m *machine) lowerFcopysignImpl(rd regalloc.VReg, rn, rm operand, tmpI, tmpF regalloc.VReg, _64bit bool) {
 	// This is exactly the same code emitted by GCC for "__builtin_copysign":
 	//
 	//    mov     x0, -9223372036854775808
@@ -1313,26 +1313,26 @@ func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool
 
 	setMSB := m.allocateInstr()
 	if _64bit {
-		m.lowerConstantI64(tmpI.nr(), math.MinInt64)
-		setMSB.asMovToVec(tmpF, tmpI, vecArrangementD, vecIndex(0))
+		m.lowerConstantI64(tmpI, math.MinInt64)
+		setMSB.asMovToVec(tmpF, operandNR(tmpI), vecArrangementD, vecIndex(0))
 	} else {
-		m.lowerConstantI32(tmpI.nr(), math.MinInt32)
-		setMSB.asMovToVec(tmpF, tmpI, vecArrangementS, vecIndex(0))
+		m.lowerConstantI32(tmpI, math.MinInt32)
+		setMSB.asMovToVec(tmpF, operandNR(tmpI), vecArrangementS, vecIndex(0))
 	}
 	m.insert(setMSB)
 
-	tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
+	tmpReg := m.compiler.AllocateVReg(ssa.TypeF64)
 
 	mov := m.allocateInstr()
-	mov.asFpuMov64(tmpReg.nr(), rn.nr())
+	mov.asFpuMov64(tmpReg, rn.nr())
 	m.insert(mov)
 
 	vbit := m.allocateInstr()
-	vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, tmpF, vecArrangement8B)
+	vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, operandNR(tmpF), vecArrangement8B)
 	m.insert(vbit)
 
 	movDst := m.allocateInstr()
-	movDst.asFpuMov64(rd.nr(), tmpReg.nr())
+	movDst.asFpuMov64(rd, tmpReg)
 	m.insert(movDst)
 }
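
At the bit level, the GCC-equivalent sequence above just splices the sign bit of rm onto the magnitude of rn; math.MinInt64 (or math.MinInt32) is simply the integer whose only set bit is the sign bit. A sketch on raw IEEE-754 bit patterns (the name is illustrative; the real lowering stays in FPU registers via VBIT):

// copysign64Bits: magnitude of x, sign of y, exactly the bits the mask above selects.
func copysign64Bits(x, y uint64) uint64 {
	const signBit = uint64(1) << 63 // the math.MinInt64 constant reinterpreted as uint64
	return (x &^ signBit) | (y & signBit)
}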
 
@@ -1340,7 +1340,7 @@ func (m *machine) lowerBitcast(instr *ssa.Instruction) {
 	v, dstType := instr.BitcastData()
 	srcType := v.Type()
 	rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
-	rd := operandNR(m.compiler.VRegOf(instr.Return()))
+	rd := m.compiler.VRegOf(instr.Return())
 	srcInt := srcType.IsInt()
 	dstInt := dstType.IsInt()
 	switch {
@@ -1371,14 +1371,14 @@ func (m *machine) lowerBitcast(instr *ssa.Instruction) {
 
 func (m *machine) lowerFpuUniOp(op fpuUniOp, in, out ssa.Value) {
 	rn := m.getOperand_NR(m.compiler.ValueDefinition(in), extModeNone)
-	rd := operandNR(m.compiler.VRegOf(out))
+	rd := m.compiler.VRegOf(out)
 
 	neg := m.allocateInstr()
 	neg.asFpuRR(op, rd, rn, in.Type().Bits() == 64)
 	m.insert(neg)
 }
 
-func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) {
+func (m *machine) lowerFpuToInt(rd regalloc.VReg, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) {
 	if !nonTrapping {
 		// First of all, we have to clear the FPU flags.
 		flagClear := m.allocateInstr()
@@ -1405,7 +1405,7 @@ func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64
 		// Check if the conversion was undefined by comparing the status with 1.
 		// See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register
 		alu := m.allocateInstr()
-		alu.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpReg), operandImm12(1, 0), true)
+		alu.asALU(aluOpSubS, xzrVReg, operandNR(tmpReg), operandImm12(1, 0), true)
 		m.insert(alu)
 
 		// If it is not undefined, we can return the result.
@@ -1429,7 +1429,7 @@ func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64
 	}
 }
 
-func (m *machine) lowerIntToFpu(rd, rn operand, signed, src64bit, dst64bit bool) {
+func (m *machine) lowerIntToFpu(rd regalloc.VReg, rn operand, signed, src64bit, dst64bit bool) {
 	cvt := m.allocateInstr()
 	cvt.asIntToFpu(rd, rn, signed, src64bit, dst64bit)
 	m.insert(cvt)
@@ -1456,7 +1456,7 @@ func (m *machine) lowerFpuBinOp(si *ssa.Instruction) {
 	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
 	rn := m.getOperand_NR(xDef, extModeNone)
 	rm := m.getOperand_NR(yDef, extModeNone)
-	rd := operandNR(m.compiler.VRegOf(si.Return()))
+	rd := m.compiler.VRegOf(si.Return())
 	instr.asFpuRRR(op, rd, rn, rm, x.Type().Bits() == 64)
 	m.insert(instr)
 }
@@ -1482,7 +1482,7 @@ func (m *machine) lowerSubOrAdd(si *ssa.Instruction, add bool) {
 	case !add && yNegated: // rn+rm = x-(-y) = x-y
 		aop = aluOpAdd
 	}
-	rd := operandNR(m.compiler.VRegOf(si.Return()))
+	rd := m.compiler.VRegOf(si.Return())
 	alu := m.allocateInstr()
 	alu.asALU(aop, rd, rn, rm, x.Type().Bits() == 64)
 	m.insert(alu)
@@ -1527,7 +1527,7 @@ func (m *machine) lowerIcmp(si *ssa.Instruction) {
 	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext)
 	rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), ext)
 	alu := m.allocateInstr()
-	alu.asALU(aluOpSubS, operandNR(xzrVReg), rn, rm, in64bit)
+	alu.asALU(aluOpSubS, xzrVReg, rn, rm, in64bit)
 	m.insert(alu)
 
 	cset := m.allocateInstr()
@@ -1542,7 +1542,7 @@ func (m *machine) lowerVIcmp(si *ssa.Instruction) {
 
 	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
-	rd := operandNR(m.compiler.VRegOf(si.Return()))
+	rd := m.compiler.VRegOf(si.Return())
 
 	switch flag {
 	case eq:
@@ -1554,7 +1554,7 @@ func (m *machine) lowerVIcmp(si *ssa.Instruction) {
 		cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr)
 		m.insert(cmp)
 		not := m.allocateInstr()
-		not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B)
+		not.asVecMisc(vecOpNot, rd, operandNR(rd), vecArrangement16B)
 		m.insert(not)
 	case ge:
 		cmp := m.allocateInstr()
@@ -1598,7 +1598,7 @@ func (m *machine) lowerVFcmp(si *ssa.Instruction) {
 
 	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
-	rd := operandNR(m.compiler.VRegOf(si.Return()))
+	rd := m.compiler.VRegOf(si.Return())
 
 	switch flag {
 	case eq:
@@ -1610,7 +1610,7 @@ func (m *machine) lowerVFcmp(si *ssa.Instruction) {
 		cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr)
 		m.insert(cmp)
 		not := m.allocateInstr()
-		not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B)
+		not.asVecMisc(vecOpNot, rd, operandNR(rd), vecArrangement16B)
 		m.insert(not)
 	case ge:
 		cmp := m.allocateInstr()
@@ -1631,7 +1631,7 @@ func (m *machine) lowerVFcmp(si *ssa.Instruction) {
 	}
 }
 
-func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool) {
+func (m *machine) lowerVfpuToInt(rd regalloc.VReg, rn operand, arr vecArrangement, signed bool) {
 	cvt := m.allocateInstr()
 	if signed {
 		cvt.asVecMisc(vecOpFcvtzs, rd, rn, arr)
@@ -1643,15 +1643,15 @@ func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool
 	if arr == vecArrangement2D {
 		narrow := m.allocateInstr()
 		if signed {
-			narrow.asVecMisc(vecOpSqxtn, rd, rd, vecArrangement2S)
+			narrow.asVecMisc(vecOpSqxtn, rd, operandNR(rd), vecArrangement2S)
 		} else {
-			narrow.asVecMisc(vecOpUqxtn, rd, rd, vecArrangement2S)
+			narrow.asVecMisc(vecOpUqxtn, rd, operandNR(rd), vecArrangement2S)
 		}
 		m.insert(narrow)
 	}
 }
 
-func (m *machine) lowerVfpuFromInt(rd, rn operand, arr vecArrangement, signed bool) {
+func (m *machine) lowerVfpuFromInt(rd regalloc.VReg, rn operand, arr vecArrangement, signed bool) {
 	cvt := m.allocateInstr()
 	if signed {
 		cvt.asVecMisc(vecOpScvtf, rd, rn, arr)
@@ -1665,7 +1665,7 @@ func (m *machine) lowerShifts(si *ssa.Instruction, ext extMode, aluOp aluOp) {
 	x, amount := si.Arg2()
 	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext)
 	rm := m.getOperand_ShiftImm_NR(m.compiler.ValueDefinition(amount), ext, x.Type().Bits())
-	rd := operandNR(m.compiler.VRegOf(si.Return()))
+	rd := m.compiler.VRegOf(si.Return())
 
 	alu := m.allocateInstr()
 	alu.asALUShift(aluOp, rd, rn, rm, x.Type().Bits() == 64)
@@ -1678,11 +1678,11 @@ func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp, ignoreResult
 	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
 	rn := m.getOperand_NR(xDef, extModeNone)
 
-	var rd operand
+	var rd regalloc.VReg
 	if ignoreResult {
-		rd = operandNR(xzrVReg)
+		rd = xzrVReg
 	} else {
-		rd = operandNR(m.compiler.VRegOf(si.Return()))
+		rd = m.compiler.VRegOf(si.Return())
 	}
 
 	_64 := x.Type().Bits() == 64
@@ -1691,7 +1691,7 @@ func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp, ignoreResult
 		c := instr.ConstantVal()
 		if isBitMaskImmediate(c, _64) {
 			// Constant bit wise operations can be lowered to a single instruction.
-			alu.asALUBitmaskImm(op, rd.nr(), rn.nr(), c, _64)
+			alu.asALUBitmaskImm(op, rd, rn.nr(), c, _64)
 			m.insert(alu)
 			return
 		}
@@ -1709,25 +1709,25 @@ func (m *machine) lowerRotl(si *ssa.Instruction) {
 
 	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
-	var tmp operand
+	var tmp regalloc.VReg
 	if _64 {
-		tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
+		tmp = m.compiler.AllocateVReg(ssa.TypeI64)
 	} else {
-		tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
+		tmp = m.compiler.AllocateVReg(ssa.TypeI32)
 	}
-	rd := operandNR(m.compiler.VRegOf(r))
+	rd := m.compiler.VRegOf(r)
 
 	// Encode rotl as neg + rotr: neg is a sub against the zero-reg.
 	m.lowerRotlImpl(rd, rn, rm, tmp, _64)
 }
 
-func (m *machine) lowerRotlImpl(rd, rn, rm, tmp operand, is64bit bool) {
+func (m *machine) lowerRotlImpl(rd regalloc.VReg, rn, rm operand, tmp regalloc.VReg, is64bit bool) {
 	// Encode rotl as neg + rotr: neg is a sub against the zero-reg.
 	neg := m.allocateInstr()
 	neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rm, is64bit)
 	m.insert(neg)
 	alu := m.allocateInstr()
-	alu.asALU(aluOpRotR, rd, rn, tmp, is64bit)
+	alu.asALU(aluOpRotR, rd, rn, operandNR(tmp), is64bit)
 	m.insert(alu)
 }
 
@@ -1737,7 +1737,7 @@ func (m *machine) lowerRotr(si *ssa.Instruction) {
 	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
 	rn := m.getOperand_NR(xDef, extModeNone)
 	rm := m.getOperand_NR(yDef, extModeNone)
-	rd := operandNR(m.compiler.VRegOf(si.Return()))
+	rd := m.compiler.VRegOf(si.Return())
 
 	alu := m.allocateInstr()
 	alu.asALU(aluOpRotR, rd, rn, rm, si.Return().Type().Bits() == 64)
@@ -1797,7 +1797,7 @@ func (m *machine) lowerImul(x, y, result ssa.Value) {
 	// TODO: if this comes before Add/Sub, we could merge it by putting it into the place of xzrVReg.
 
 	mul := m.allocateInstr()
-	mul.asALURRRR(aluOpMAdd, operandNR(rd), rn, rm, operandNR(xzrVReg), x.Type().Bits() == 64)
+	mul.asALURRRR(aluOpMAdd, rd, rn, rm, xzrVReg, x.Type().Bits() == 64)
 	m.insert(mul)
 }
 
@@ -1849,22 +1849,22 @@ func (m *machine) lowerPopcnt(x, result ssa.Value) {
 	//    mov x5, v0.d[0]     ;; finally we mov the result back to a GPR
 	//
 
-	rd := operandNR(m.compiler.VRegOf(result))
+	rd := m.compiler.VRegOf(result)
 	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 
 	rf1 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
 	ins := m.allocateInstr()
-	ins.asMovToVec(rf1, rn, vecArrangementD, vecIndex(0))
+	ins.asMovToVec(rf1.nr(), rn, vecArrangementD, vecIndex(0))
 	m.insert(ins)
 
 	rf2 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
 	cnt := m.allocateInstr()
-	cnt.asVecMisc(vecOpCnt, rf2, rf1, vecArrangement16B)
+	cnt.asVecMisc(vecOpCnt, rf2.nr(), rf1, vecArrangement16B)
 	m.insert(cnt)
 
 	rf3 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
 	uaddlv := m.allocateInstr()
-	uaddlv.asVecLanes(vecOpUaddlv, rf3, rf2, vecArrangement8B)
+	uaddlv.asVecLanes(vecOpUaddlv, rf3.nr(), rf2, vecArrangement8B)
 	m.insert(uaddlv)
 
 	mov := m.allocateInstr()
@@ -1879,32 +1879,35 @@ func (m *machine) lowerExitWithCode(execCtxVReg regalloc.VReg, code wazevoapi.Ex
 	loadExitCodeConst.asMOVZ(tmpReg1, uint64(code), 0, true)
 
 	setExitCode := m.allocateInstr()
-	setExitCode.asStore(operandNR(tmpReg1),
-		addressMode{
-			kind: addressModeKindRegUnsignedImm12,
-			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
-		}, 32)
+	mode := m.amodePool.Allocate()
+	*mode = addressMode{
+		kind: addressModeKindRegUnsignedImm12,
+		rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
+	}
+	setExitCode.asStore(operandNR(tmpReg1), mode, 32)
 
 	// In order to unwind the stack, we also need to push the current stack pointer:
 	tmp2 := m.compiler.AllocateVReg(ssa.TypeI64)
 	movSpToTmp := m.allocateInstr()
 	movSpToTmp.asMove64(tmp2, spVReg)
 	strSpToExecCtx := m.allocateInstr()
-	strSpToExecCtx.asStore(operandNR(tmp2),
-		addressMode{
-			kind: addressModeKindRegUnsignedImm12,
-			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
-		}, 64)
+	mode2 := m.amodePool.Allocate()
+	*mode2 = addressMode{
+		kind: addressModeKindRegUnsignedImm12,
+		rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
+	}
+	strSpToExecCtx.asStore(operandNR(tmp2), mode2, 64)
 	// Also the address of this exit.
 	tmp3 := m.compiler.AllocateVReg(ssa.TypeI64)
 	currentAddrToTmp := m.allocateInstr()
 	currentAddrToTmp.asAdr(tmp3, 0)
 	storeCurrentAddrToExecCtx := m.allocateInstr()
-	storeCurrentAddrToExecCtx.asStore(operandNR(tmp3),
-		addressMode{
-			kind: addressModeKindRegUnsignedImm12,
-			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
-		}, 64)
+	mode3 := m.amodePool.Allocate()
+	*mode3 = addressMode{
+		kind: addressModeKindRegUnsignedImm12,
+		rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
+	}
+	storeCurrentAddrToExecCtx.asStore(operandNR(tmp3), mode3, 64)
 
 	exitSeq := m.allocateInstr()
 	exitSeq.asExitSequence(execCtxVReg)
@@ -1937,7 +1940,7 @@ func (m *machine) lowerIcmpToFlag(x, y ssa.Value, signed bool) {
 	alu.asALU(
 		aluOpSubS,
 		// We don't need the result, just need to set flags.
-		operandNR(xzrVReg),
+		xzrVReg,
 		rn,
 		rm,
 		x.Type().Bits() == 64,
@@ -2012,7 +2015,7 @@ func (m *machine) lowerSelect(c, x, y, result ssa.Value) {
 		alu.asALU(
 			aluOpSubS,
 			// We don't need the result, just need to set flags.
-			operandNR(xzrVReg),
+			xzrVReg,
 			rn,
 			operandNR(xzrVReg),
 			c.Type().Bits() == 64,
@@ -2024,7 +2027,7 @@ func (m *machine) lowerSelect(c, x, y, result ssa.Value) {
 	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
 	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
 
-	rd := operandNR(m.compiler.VRegOf(result))
+	rd := m.compiler.VRegOf(result)
 	switch x.Type() {
 	case ssa.TypeI32, ssa.TypeI64:
 		// csel rd, rn, rm, cc
@@ -2041,10 +2044,10 @@ func (m *machine) lowerSelect(c, x, y, result ssa.Value) {
 	}
 }
 
-func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) {
+func (m *machine) lowerSelectVec(rc, rn, rm operand, rd regalloc.VReg) {
 	// First check if `rc` is zero or not.
 	checkZero := m.allocateInstr()
-	checkZero.asALU(aluOpSubS, operandNR(xzrVReg), rc, operandNR(xzrVReg), false)
+	checkZero.asALU(aluOpSubS, xzrVReg, rc, operandNR(xzrVReg), false)
 	m.insert(checkZero)
 
 	// Then use CSETM to set all bits to one if `rc` is zero.
@@ -2054,7 +2057,7 @@ func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) {
 	m.insert(cset)
 
 	// Then move the bits to the result vector register.
-	tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+	tmp2 := m.compiler.AllocateVReg(ssa.TypeV128)
 	dup := m.allocateInstr()
 	dup.asVecDup(tmp2, operandNR(allOnesOrZero), vecArrangement2D)
 	m.insert(dup)
@@ -2067,7 +2070,7 @@ func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) {
 
 	// Finally, move the result to the destination register.
 	mov2 := m.allocateInstr()
-	mov2.asFpuMov128(rd.nr(), tmp2.nr())
+	mov2.asFpuMov128(rd, tmp2)
 	m.insert(mov2)
 }
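
CSETM followed by DUP turns the scalar condition into an all-ones or all-zeros 128-bit mask, and BSL then selects bitwise between the two operands. The select identity, per 64-bit chunk, as a sketch (the name is illustrative):

// bitSelect: take bits from a where mask is 1, from b where mask is 0.
// With the mask all-ones or all-zeros this picks a or b wholesale, which is
// exactly what the vector select above needs.
func bitSelect(mask, a, b uint64) uint64 {
	return (a & mask) | (b &^ mask)
}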
 
@@ -2099,28 +2102,28 @@ func (m *machine) lowerAtomicRmw(si *ssa.Instruction) {
 	addr, val := si.Arg2()
 	addrDef, valDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(val)
 	rn := m.getOperand_NR(addrDef, extModeNone)
-	rt := operandNR(m.compiler.VRegOf(si.Return()))
+	rt := m.compiler.VRegOf(si.Return())
 	rs := m.getOperand_NR(valDef, extModeNone)
 
 	_64 := si.Return().Type().Bits() == 64
-	var tmp operand
+	var tmp regalloc.VReg
 	if _64 {
-		tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
+		tmp = m.compiler.AllocateVReg(ssa.TypeI64)
 	} else {
-		tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
+		tmp = m.compiler.AllocateVReg(ssa.TypeI32)
 	}
-	m.lowerAtomicRmwImpl(op, rn, rs, rt, tmp, size, negateArg, flipArg, _64)
+	m.lowerAtomicRmwImpl(op, rn.nr(), rs.nr(), rt, tmp, size, negateArg, flipArg, _64)
 }
 
-func (m *machine) lowerAtomicRmwImpl(op atomicRmwOp, rn, rs, rt, tmp operand, size uint64, negateArg, flipArg, dst64bit bool) {
+func (m *machine) lowerAtomicRmwImpl(op atomicRmwOp, rn, rs, rt, tmp regalloc.VReg, size uint64, negateArg, flipArg, dst64bit bool) {
 	switch {
 	case negateArg:
 		neg := m.allocateInstr()
-		neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rs, dst64bit)
+		neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), operandNR(rs), dst64bit)
 		m.insert(neg)
 	case flipArg:
 		flip := m.allocateInstr()
-		flip.asALU(aluOpOrn, tmp, operandNR(xzrVReg), rs, dst64bit)
+		flip.asALU(aluOpOrn, tmp, operandNR(xzrVReg), operandNR(rs), dst64bit)
 		m.insert(flip)
 	default:
 		tmp = rs
@@ -2139,32 +2142,32 @@ func (m *machine) lowerAtomicCas(si *ssa.Instruction) {
 	rn := m.getOperand_NR(addrDef, extModeNone)
 	rt := m.getOperand_NR(replDef, extModeNone)
 	rs := m.getOperand_NR(expDef, extModeNone)
-	tmp := operandNR(m.compiler.AllocateVReg(si.Return().Type()))
+	tmp := m.compiler.AllocateVReg(si.Return().Type())
 
 	_64 := si.Return().Type().Bits() == 64
 	// rs is overwritten by CAS, so we first copy it into a temporary register
 	// in case it is used somewhere else.
 	mov := m.allocateInstr()
 	if _64 {
-		mov.asMove64(tmp.nr(), rs.nr())
+		mov.asMove64(tmp, rs.nr())
 	} else {
-		mov.asMove32(tmp.nr(), rs.nr())
+		mov.asMove32(tmp, rs.nr())
 	}
 	m.insert(mov)
 
-	m.lowerAtomicCasImpl(rn, tmp, rt, size)
+	m.lowerAtomicCasImpl(rn.nr(), tmp, rt.nr(), size)
 
 	mov2 := m.allocateInstr()
 	rd := m.compiler.VRegOf(si.Return())
 	if _64 {
-		mov2.asMove64(rd, tmp.nr())
+		mov2.asMove64(rd, tmp)
 	} else {
-		mov2.asMove32(rd, tmp.nr())
+		mov2.asMove32(rd, tmp)
 	}
 	m.insert(mov2)
 }
 
-func (m *machine) lowerAtomicCasImpl(rn, rs, rt operand, size uint64) {
+func (m *machine) lowerAtomicCasImpl(rn, rs, rt regalloc.VReg, size uint64) {
 	cas := m.allocateInstr()
 	cas.asAtomicCas(rn, rs, rt, size)
 	m.insert(cas)
@@ -2176,12 +2179,12 @@ func (m *machine) lowerAtomicLoad(si *ssa.Instruction) {
 
 	addrDef := m.compiler.ValueDefinition(addr)
 	rn := m.getOperand_NR(addrDef, extModeNone)
-	rt := operandNR(m.compiler.VRegOf(si.Return()))
+	rt := m.compiler.VRegOf(si.Return())
 
-	m.lowerAtomicLoadImpl(rn, rt, size)
+	m.lowerAtomicLoadImpl(rn.nr(), rt, size)
 }
 
-func (m *machine) lowerAtomicLoadImpl(rn, rt operand, size uint64) {
+func (m *machine) lowerAtomicLoadImpl(rn, rt regalloc.VReg, size uint64) {
 	ld := m.allocateInstr()
 	ld.asAtomicLoad(rn, rt, size)
 	m.insert(ld)
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go
index 4842eaa38..fd0760d72 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go
@@ -24,6 +24,14 @@ type (
 	addressModeKind byte
 )
 
+func resetAddressMode(a *addressMode) {
+	a.kind = 0
+	a.rn = 0
+	a.rm = 0
+	a.extOp = 0
+	a.imm = 0
+}
+
 const (
 	// addressModeKindRegExtended takes a base register and an index register. The index register is sign/zero-extended,
 	// and then scaled by bits(type)/8.
@@ -140,15 +148,17 @@ func (a addressMode) format(dstSizeBits byte) (ret string) {
 	return
 }
 
-func addressModePreOrPostIndex(rn regalloc.VReg, imm int64, preIndex bool) addressMode {
+func addressModePreOrPostIndex(m *machine, rn regalloc.VReg, imm int64, preIndex bool) *addressMode {
 	if !offsetFitsInAddressModeKindRegSignedImm9(imm) {
 		panic(fmt.Sprintf("BUG: offset %#x does not fit in addressModeKindRegSignedImm9", imm))
 	}
+	mode := m.amodePool.Allocate()
 	if preIndex {
-		return addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm}
+		*mode = addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm}
 	} else {
-		return addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm}
+		*mode = addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm}
 	}
+	return mode
 }
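
As a reminder of the two addressing kinds returned here: pre-index applies the immediate to the base register before the access and writes the updated address back, while post-index accesses at the current base and applies the immediate afterwards. A scalar sketch of just the base-register bookkeeping (illustrative, not the emitter's model):

// preIndex models e.g. "str x0, [sp, #-16]!": the base is updated first, then used as the address.
func preIndex(base *int64, imm int64) (addr int64) {
	*base += imm
	return *base
}

// postIndex models e.g. "ldr x0, [sp], #16": the base is used as the address first, then updated.
func postIndex(base *int64, imm int64) (addr int64) {
	addr = *base
	*base += imm
	return addr
}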
 
 func offsetFitsInAddressModeKindRegUnsignedImm12(dstSizeInBits byte, offset int64) bool {
@@ -207,9 +217,9 @@ func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, ret
 	amode := m.lowerToAddressMode(ptr, offset, size)
 	load := m.allocateInstr()
 	if signed {
-		load.asSLoad(operandNR(ret), amode, size)
+		load.asSLoad(ret, amode, size)
 	} else {
-		load.asULoad(operandNR(ret), amode, size)
+		load.asULoad(ret, amode, size)
 	}
 	m.insert(load)
 }
@@ -221,11 +231,11 @@ func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, ret ssa.
 	load := m.allocateInstr()
 	switch typ {
 	case ssa.TypeI32, ssa.TypeI64:
-		load.asULoad(operandNR(dst), amode, typ.Bits())
+		load.asULoad(dst, amode, typ.Bits())
 	case ssa.TypeF32, ssa.TypeF64:
-		load.asFpuLoad(operandNR(dst), amode, typ.Bits())
+		load.asFpuLoad(dst, amode, typ.Bits())
 	case ssa.TypeV128:
-		load.asFpuLoad(operandNR(dst), amode, 128)
+		load.asFpuLoad(dst, amode, 128)
 	default:
 		panic("TODO")
 	}
@@ -239,7 +249,7 @@ func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, lane ssa.VecLane,
 	m.lowerConstantI64(offsetReg, int64(offset))
 	addedBase := m.addReg64ToReg64(base, offsetReg)
 
-	rd := operandNR(m.compiler.VRegOf(ret))
+	rd := m.compiler.VRegOf(ret)
 
 	ld1r := m.allocateInstr()
 	ld1r.asVecLoad1R(rd, operandNR(addedBase), ssaLaneToArrangement(lane))
@@ -258,7 +268,7 @@ func (m *machine) lowerStore(si *ssa.Instruction) {
 }
 
 // lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions.
-func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode addressMode) {
+func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode *addressMode) {
 	// TODO: currently the instruction selection logic doesn't support addressModeKindRegScaledExtended and
 	// addressModeKindRegScaled since collectAddends doesn't take ssa.OpcodeIshl into account. This should be fixed
 	// to support more efficient address resolution.
@@ -272,32 +282,33 @@ func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte
 // During the construction, this might emit additional instructions.
 //
 // Extracted as a separate function for easy testing.
-func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], a64s *wazevoapi.Queue[regalloc.VReg], size byte, offset int64) (amode addressMode) {
+func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], a64s *wazevoapi.Queue[regalloc.VReg], size byte, offset int64) (amode *addressMode) {
+	amode = m.amodePool.Allocate()
 	switch a64sExist, a32sExist := !a64s.Empty(), !a32s.Empty(); {
 	case a64sExist && a32sExist:
 		var base regalloc.VReg
 		base = a64s.Dequeue()
 		var a32 addend32
 		a32 = a32s.Dequeue()
-		amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext}
+		*amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext}
 	case a64sExist && offsetFitsInAddressModeKindRegUnsignedImm12(size, offset):
 		var base regalloc.VReg
 		base = a64s.Dequeue()
-		amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset}
+		*amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset}
 		offset = 0
 	case a64sExist && offsetFitsInAddressModeKindRegSignedImm9(offset):
 		var base regalloc.VReg
 		base = a64s.Dequeue()
-		amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset}
+		*amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset}
 		offset = 0
 	case a64sExist:
 		var base regalloc.VReg
 		base = a64s.Dequeue()
 		if !a64s.Empty() {
 			index := a64s.Dequeue()
-			amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */}
+			*amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */}
 		} else {
-			amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
+			*amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
 		}
 	case a32sExist:
 		base32 := a32s.Dequeue()
@@ -314,14 +325,14 @@ func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32],
 
 		if !a32s.Empty() {
 			index := a32s.Dequeue()
-			amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext}
+			*amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext}
 		} else {
-			amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
+			*amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
 		}
 	default: // Only static offsets.
 		tmpReg := m.compiler.AllocateVReg(ssa.TypeI64)
 		m.lowerConstantI64(tmpReg, offset)
-		amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0}
+		*amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0}
 		offset = 0
 	}
 
@@ -411,13 +422,13 @@ func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) {
 	rd = m.compiler.AllocateVReg(ssa.TypeI64)
 	alu := m.allocateInstr()
 	if imm12Op, ok := asImm12Operand(uint64(c)); ok {
-		alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), imm12Op, true)
+		alu.asALU(aluOpAdd, rd, operandNR(r), imm12Op, true)
 	} else if imm12Op, ok = asImm12Operand(uint64(-c)); ok {
-		alu.asALU(aluOpSub, operandNR(rd), operandNR(r), imm12Op, true)
+		alu.asALU(aluOpSub, rd, operandNR(r), imm12Op, true)
 	} else {
 		tmp := m.compiler.AllocateVReg(ssa.TypeI64)
 		m.load64bitConst(c, tmp)
-		alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), operandNR(tmp), true)
+		alu.asALU(aluOpAdd, rd, operandNR(r), operandNR(tmp), true)
 	}
 	m.insert(alu)
 	return
@@ -426,7 +437,7 @@ func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) {
 func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) {
 	rd = m.compiler.AllocateVReg(ssa.TypeI64)
 	alu := m.allocateInstr()
-	alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandNR(rm), true)
+	alu.asALU(aluOpAdd, rd, operandNR(rn), operandNR(rm), true)
 	m.insert(alu)
 	return
 }
@@ -434,7 +445,7 @@ func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) {
 func (m *machine) addRegToReg64Ext(rn, rm regalloc.VReg, ext extendOp) (rd regalloc.VReg) {
 	rd = m.compiler.AllocateVReg(ssa.TypeI64)
 	alu := m.allocateInstr()
-	alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandER(rm, ext, 64), true)
+	alu.asALU(aluOpAdd, rd, operandNR(rn), operandER(rm, ext, 64), true)
 	m.insert(alu)
 	return
 }
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go
index b435d9ba9..5f584f928 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go
@@ -21,6 +21,8 @@ type (
 		regAlloc   regalloc.Allocator
 		regAllocFn *backend.RegAllocFunction[*instruction, *machine]
 
+		amodePool wazevoapi.Pool[addressMode]
+
 		// addendsWorkQueue is used during address lowering, defined here for reuse.
 		addendsWorkQueue wazevoapi.Queue[ssa.Value]
 		addends32        wazevoapi.Queue[addend32]
@@ -105,6 +107,7 @@ func NewBackend() backend.Machine {
 		spillSlots:        make(map[regalloc.VRegID]int64),
 		executableContext: newExecutableContext(),
 		regAlloc:          regalloc.NewAllocator(regInfo),
+		amodePool:         wazevoapi.NewPool[addressMode](resetAddressMode),
 	}
 	return m
 }
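
With the new amodePool field, address modes are handed out as pooled pointers and recycled per compiled function; resetAddressMode (added in lower_mem.go above) is the hook that zeroes an entry for reuse. A stripped-down sketch of that allocate/reset pattern (the type and names below are illustrative; the real wazevoapi.Pool may differ in detail):

// pool is a minimal grow-only allocator with a reset hook, mirroring how
// m.amodePool is used: Allocate while lowering, Reset between functions.
type pool[T any] struct {
	items []*T
	used  int
	reset func(*T)
}

func newPool[T any](reset func(*T)) pool[T] { return pool[T]{reset: reset} }

// Allocate returns a recycled or freshly created *T.
func (p *pool[T]) Allocate() *T {
	if p.used == len(p.items) {
		p.items = append(p.items, new(T))
	}
	v := p.items[p.used]
	p.used++
	return v
}

// Reset makes every handed-out item available again after zeroing it.
func (p *pool[T]) Reset() {
	for _, v := range p.items[:p.used] {
		p.reset(v) // e.g. resetAddressMode clears all addressMode fields
	}
	p.used = 0
}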
@@ -149,6 +152,7 @@ func (m *machine) Reset() {
 	m.maxRequiredStackSizeForCalls = 0
 	m.executableContext.Reset()
 	m.jmpTableTargets = m.jmpTableTargets[:0]
+	m.amodePool.Reset()
 }
 
 // SetCurrentABI implements backend.Machine SetCurrentABI.
@@ -183,9 +187,8 @@ func (m *machine) allocateBrTarget() (nop *instruction, l label) {
 	l = ectx.AllocateLabel()
 	nop = m.allocateInstr()
 	nop.asNop0WithLabel(l)
-	pos := ectx.AllocateLabelPosition(l)
+	pos := ectx.GetOrAllocateLabelPosition(l)
 	pos.Begin, pos.End = nop, nop
-	ectx.LabelPositions[l] = pos
 	return
 }
 
@@ -209,7 +212,7 @@ func (m *machine) allocateNop() *instruction {
 }
 
 func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruction) {
-	amode := &i.amode
+	amode := i.getAmode()
 	switch amode.kind {
 	case addressModeKindResultStackSpace:
 		amode.imm += ret0offset
@@ -281,7 +284,7 @@ func (m *machine) resolveRelativeAddresses(ctx context.Context) {
 				switch cur.kind {
 				case nop0:
 					l := cur.nop0Label()
-					if pos, ok := ectx.LabelPositions[l]; ok {
+					if pos := ectx.LabelPositions[l]; pos != nil {
 						pos.BinaryOffset = offset + size
 					}
 				case condBr:
@@ -428,8 +431,10 @@ func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk *
 func (m *machine) Format() string {
 	ectx := m.executableContext
 	begins := map[*instruction]label{}
-	for l, pos := range ectx.LabelPositions {
-		begins[pos.Begin] = l
+	for _, pos := range ectx.LabelPositions {
+		if pos != nil {
+			begins[pos.Begin] = pos.L
+		}
 	}
 
 	irBlocks := map[label]ssa.BasicBlockID{}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go
index 466fac464..d9032f921 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go
@@ -70,7 +70,7 @@ func (m *machine) setupPrologue() {
 		//                                          +-----------------+ <----- SP
 		//                                             (low address)
 		//
-		_amode := addressModePreOrPostIndex(spVReg,
+		_amode := addressModePreOrPostIndex(m, spVReg,
 			-16,  // stack pointer must be 16-byte aligned.
 			true, // Decrement before store.
 		)
@@ -159,7 +159,7 @@ func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruc
 		sizeOfArgRetReg = tmpRegVReg
 
 		subSp := m.allocateInstr()
-		subSp.asALU(aluOpSub, operandNR(spVReg), operandNR(spVReg), operandNR(sizeOfArgRetReg), true)
+		subSp.asALU(aluOpSub, spVReg, operandNR(spVReg), operandNR(sizeOfArgRetReg), true)
 		cur = linkInstr(cur, subSp)
 	} else {
 		sizeOfArgRetReg = xzrVReg
@@ -168,7 +168,7 @@ func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruc
 	// Saves the return address (lr) and the size_of_arg_ret below the SP.
 	// size_of_arg_ret is used for stack unwinding.
 	pstr := m.allocateInstr()
-	amode := addressModePreOrPostIndex(spVReg, -16, true /* decrement before store */)
+	amode := addressModePreOrPostIndex(m, spVReg, -16, true /* decrement before store */)
 	pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode)
 	cur = linkInstr(cur, pstr)
 	return cur
@@ -182,7 +182,7 @@ func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction {
 	} else {
 		frameSizeReg = xzrVReg
 	}
-	_amode := addressModePreOrPostIndex(spVReg,
+	_amode := addressModePreOrPostIndex(m, spVReg,
 		-16,  // stack pointer must be 16-byte aligned.
 		true, // Decrement before store.
 	)
@@ -213,7 +213,7 @@ func (m *machine) postRegAlloc() {
 			m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0]
 		default:
 			// Removes the redundant copy instruction.
-			if cur.IsCopy() && cur.rn.realReg() == cur.rd.realReg() {
+			if cur.IsCopy() && cur.rn.realReg() == cur.rd.RealReg() {
 				prev, next := cur.prev, cur.next
 				// Remove the copy instruction.
 				prev.next = next
@@ -286,16 +286,16 @@ func (m *machine) setupEpilogueAfter(cur *instruction) {
 		for i := range m.clobberedRegs {
 			vr := m.clobberedRegs[l-i] // reverse order to restore.
 			load := m.allocateInstr()
-			amode := addressModePreOrPostIndex(spVReg,
+			amode := addressModePreOrPostIndex(m, spVReg,
 				16,    // stack pointer must be 16-byte aligned.
 				false, // Increment after store.
 			)
 			// TODO: pair loads to reduce the number of instructions.
 			switch regTypeToRegisterSizeInBits(vr.RegType()) {
 			case 64: // save int reg.
-				load.asULoad(operandNR(vr), amode, 64)
+				load.asULoad(vr, amode, 64)
 			case 128: // save vector reg.
-				load.asFpuLoad(operandNR(vr), amode, 128)
+				load.asFpuLoad(vr, amode, 128)
 			}
 			cur = linkInstr(cur, load)
 		}
@@ -317,8 +317,8 @@ func (m *machine) setupEpilogueAfter(cur *instruction) {
 	//    SP----> +-----------------+
 
 	ldr := m.allocateInstr()
-	ldr.asULoad(operandNR(lrVReg),
-		addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
+	ldr.asULoad(lrVReg,
+		addressModePreOrPostIndex(m, spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
 	cur = linkInstr(cur, ldr)
 
 	if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 {
@@ -351,14 +351,14 @@ func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instructi
 	if immm12op, ok := asImm12Operand(uint64(requiredStackSize)); ok {
 		// sub tmp, sp, #requiredStackSize
 		sub := m.allocateInstr()
-		sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), immm12op, true)
+		sub.asALU(aluOpSub, tmpRegVReg, operandNR(spVReg), immm12op, true)
 		cur = linkInstr(cur, sub)
 	} else {
 		// This case, we first load the requiredStackSize into the temporary register,
 		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
 		// Then subtract it.
 		sub := m.allocateInstr()
-		sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true)
+		sub.asALU(aluOpSub, tmpRegVReg, operandNR(spVReg), operandNR(tmpRegVReg), true)
 		cur = linkInstr(cur, sub)
 	}
 
@@ -366,16 +366,18 @@ func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instructi
 
 	// ldr tmp2, [executionContext #StackBottomPtr]
 	ldr := m.allocateInstr()
-	ldr.asULoad(operandNR(tmp2), addressMode{
+	amode := m.amodePool.Allocate()
+	*amode = addressMode{
 		kind: addressModeKindRegUnsignedImm12,
 		rn:   x0VReg, // execution context is always the first argument.
 		imm:  wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(),
-	}, 64)
+	}
+	ldr.asULoad(tmp2, amode, 64)
 	cur = linkInstr(cur, ldr)
 
 	// subs xzr, tmp, tmp2
 	subs := m.allocateInstr()
-	subs.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpRegVReg), operandNR(tmp2), true)
+	subs.asALU(aluOpSubS, xzrVReg, operandNR(tmpRegVReg), operandNR(tmp2), true)
 	cur = linkInstr(cur, subs)
 
 	// b.ge #imm
@@ -388,22 +390,25 @@ func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instructi
 		// First load the requiredStackSize into the temporary register,
 		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
 		setRequiredStackSize := m.allocateInstr()
-		setRequiredStackSize.asStore(operandNR(tmpRegVReg),
-			addressMode{
-				kind: addressModeKindRegUnsignedImm12,
-				// Execution context is always the first argument.
-				rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(),
-			}, 64)
+		amode := m.amodePool.Allocate()
+		*amode = addressMode{
+			kind: addressModeKindRegUnsignedImm12,
+			// Execution context is always the first argument.
+			rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(),
+		}
+		setRequiredStackSize.asStore(operandNR(tmpRegVReg), amode, 64)
 
 		cur = linkInstr(cur, setRequiredStackSize)
 	}
 
 	ldrAddress := m.allocateInstr()
-	ldrAddress.asULoad(operandNR(tmpRegVReg), addressMode{
+	amode2 := m.amodePool.Allocate()
+	*amode2 = addressMode{
 		kind: addressModeKindRegUnsignedImm12,
 		rn:   x0VReg, // execution context is always the first argument
 		imm:  wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(),
-	}, 64)
+	}
+	ldrAddress.asULoad(tmpRegVReg, amode2, 64)
 	cur = linkInstr(cur, ldrAddress)
 
 	// Then jumps to the stack grow call sequence's address, meaning
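
addressModePreOrPostIndex and the loads/stores above now take the machine so that addressMode values come out of m.amodePool instead of being embedded by value in every instruction. A rough sketch of such a rewindable pool, assuming only the Allocate and Reset operations visible in this patch (the real wazevoapi.Pool may be implemented differently):

	type amode struct {
		kind int
		imm  int64
	}

	// amodePool hands out *amode values and reuses them across compilations;
	// Reset frees nothing, it only rewinds the cursor.
	type amodePool struct {
		allocated []*amode
		next      int
	}

	func (p *amodePool) Allocate() *amode {
		if p.next == len(p.allocated) {
			p.allocated = append(p.allocated, &amode{})
		}
		a := p.allocated[p.next]
		p.next++
		*a = amode{} // zero the reused slot before handing it out.
		return a
	}

	func (p *amodePool) Reset() { p.next = 0 }
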
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go
index 1c8793b73..c7eb92cc2 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go
@@ -91,7 +91,7 @@ func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, aft
 	}
 
 	offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
-	var amode addressMode
+	var amode *addressMode
 	cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true)
 	store := m.allocateInstr()
 	store.asStore(operandNR(v), amode, typ.Bits())
@@ -116,16 +116,16 @@ func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, af
 	}
 
 	offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
-	var amode addressMode
+	var amode *addressMode
 	cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true)
 	load := m.allocateInstr()
 	switch typ {
 	case ssa.TypeI32, ssa.TypeI64:
-		load.asULoad(operandNR(v), amode, typ.Bits())
+		load.asULoad(v, amode, typ.Bits())
 	case ssa.TypeF32, ssa.TypeF64:
-		load.asFpuLoad(operandNR(v), amode, typ.Bits())
+		load.asFpuLoad(v, amode, typ.Bits())
 	case ssa.TypeV128:
-		load.asFpuLoad(operandNR(v), amode, 128)
+		load.asFpuLoad(v, amode, 128)
 	default:
 		panic("TODO")
 	}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go
index 3f36c84e5..655370786 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go
@@ -35,7 +35,7 @@ type (
 		iter                   int
 		reversePostOrderBlocks []RegAllocBlock[I, m]
 		// labelToRegAllocBlockIndex maps label to the index of reversePostOrderBlocks.
-		labelToRegAllocBlockIndex map[Label]int
+		labelToRegAllocBlockIndex [] /* Label to */ int
 		loopNestingForestRoots    []ssa.BasicBlock
 	}
 
@@ -56,10 +56,9 @@ type (
 // NewRegAllocFunction returns a new RegAllocFunction.
 func NewRegAllocFunction[I regalloc.InstrConstraint, M RegAllocFunctionMachine[I]](m M, ssb ssa.Builder, c Compiler) *RegAllocFunction[I, M] {
 	return &RegAllocFunction[I, M]{
-		m:                         m,
-		ssb:                       ssb,
-		c:                         c,
-		labelToRegAllocBlockIndex: make(map[Label]int),
+		m:   m,
+		ssb: ssb,
+		c:   c,
 	}
 }
 
@@ -74,6 +73,9 @@ func (f *RegAllocFunction[I, M]) AddBlock(sb ssa.BasicBlock, l Label, begin, end
 		end:   end,
 		id:    int(sb.ID()),
 	})
+	if len(f.labelToRegAllocBlockIndex) <= int(l) {
+		f.labelToRegAllocBlockIndex = append(f.labelToRegAllocBlockIndex, make([]int, int(l)-len(f.labelToRegAllocBlockIndex)+1)...)
+	}
 	f.labelToRegAllocBlockIndex[l] = i
 }
 
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go
index b4450d56f..eacb6a7ef 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go
@@ -60,9 +60,8 @@ type (
 		phiDefInstListPool       wazevoapi.Pool[phiDefInstList]
 
 		// The following fields are reused in various places.
-		blks             []Block
-		reals            []RealReg
-		currentOccupants regInUseSet
+		blks  []Block
+		reals []RealReg
 
 		// Following two fields are updated while iterating the blocks in the reverse postorder.
 		state       state
@@ -755,7 +754,8 @@ func (a *Allocator) allocBlock(f Function, blk Block) {
 		killSet := a.reals[:0]
 
 		// Gather the set of registers that will be used in the current instruction.
-		for _, use := range instr.Uses(&a.vs) {
+		uses := instr.Uses(&a.vs)
+		for _, use := range uses {
 			if use.IsRealReg() {
 				r := use.RealReg()
 				currentUsedSet = currentUsedSet.add(r)
@@ -770,7 +770,7 @@ func (a *Allocator) allocBlock(f Function, blk Block) {
 			}
 		}
 
-		for i, use := range instr.Uses(&a.vs) {
+		for i, use := range uses {
 			if !use.IsRealReg() {
 				vs := s.getVRegState(use.ID())
 				killed := vs.lastUse == pc
@@ -944,8 +944,7 @@ func (a *Allocator) allocBlock(f Function, blk Block) {
 func (a *Allocator) releaseCallerSavedRegs(addrReg RealReg) {
 	s := &a.state
 
-	for i := 0; i < 64; i++ {
-		allocated := RealReg(i)
+	for allocated := RealReg(0); allocated < 64; allocated++ {
 		if allocated == addrReg { // If this is the call indirect, we should not touch the addr register.
 			continue
 		}
@@ -974,11 +973,10 @@ func (a *Allocator) fixMergeState(f Function, blk Block) {
 	bID := blk.ID()
 	blkSt := a.getOrAllocateBlockState(bID)
 	desiredOccupants := &blkSt.startRegs
-	aliveOnRegVRegs := make(map[VReg]RealReg)
-	for i := 0; i < 64; i++ {
-		r := RealReg(i)
-		if v := blkSt.startRegs.get(r); v.Valid() {
-			aliveOnRegVRegs[v] = r
+	var desiredOccupantsSet RegSet
+	for i, v := range desiredOccupants {
+		if v != VRegInvalid {
+			desiredOccupantsSet = desiredOccupantsSet.add(RealReg(i))
 		}
 	}
 
@@ -987,56 +985,38 @@ func (a *Allocator) fixMergeState(f Function, blk Block) {
 	}
 
 	s.currentBlockID = bID
-	a.updateLiveInVRState(a.getOrAllocateBlockState(bID))
+	a.updateLiveInVRState(blkSt)
 
-	currentOccupants := &a.currentOccupants
 	for i := 0; i < preds; i++ {
-		currentOccupants.reset()
 		if i == blkSt.startFromPredIndex {
 			continue
 		}
 
-		currentOccupantsRev := make(map[VReg]RealReg)
 		pred := blk.Pred(i)
 		predSt := a.getOrAllocateBlockState(pred.ID())
-		for ii := 0; ii < 64; ii++ {
-			r := RealReg(ii)
-			if v := predSt.endRegs.get(r); v.Valid() {
-				if _, ok := aliveOnRegVRegs[v]; !ok {
-					continue
-				}
-				currentOccupants.add(r, v)
-				currentOccupantsRev[v] = r
-			}
-		}
 
 		s.resetAt(predSt)
 
 		// Finds the free registers if any.
 		intTmp, floatTmp := VRegInvalid, VRegInvalid
 		if intFree := s.findAllocatable(
-			a.regInfo.AllocatableRegisters[RegTypeInt], desiredOccupants.set,
+			a.regInfo.AllocatableRegisters[RegTypeInt], desiredOccupantsSet,
 		); intFree != RealRegInvalid {
 			intTmp = FromRealReg(intFree, RegTypeInt)
 		}
 		if floatFree := s.findAllocatable(
-			a.regInfo.AllocatableRegisters[RegTypeFloat], desiredOccupants.set,
+			a.regInfo.AllocatableRegisters[RegTypeFloat], desiredOccupantsSet,
 		); floatFree != RealRegInvalid {
 			floatTmp = FromRealReg(floatFree, RegTypeFloat)
 		}
 
-		if wazevoapi.RegAllocLoggingEnabled {
-			fmt.Println("\t", pred.ID(), ":", currentOccupants.format(a.regInfo))
-		}
-
-		for ii := 0; ii < 64; ii++ {
-			r := RealReg(ii)
+		for r := RealReg(0); r < 64; r++ {
 			desiredVReg := desiredOccupants.get(r)
 			if !desiredVReg.Valid() {
 				continue
 			}
 
-			currentVReg := currentOccupants.get(r)
+			currentVReg := s.regsInUse.get(r)
 			if desiredVReg.ID() == currentVReg.ID() {
 				continue
 			}
@@ -1048,86 +1028,95 @@ func (a *Allocator) fixMergeState(f Function, blk Block) {
 			} else {
 				tmpRealReg = floatTmp
 			}
-			a.reconcileEdge(f, r, pred, currentOccupants, currentOccupantsRev, currentVReg, desiredVReg, tmpRealReg, typ)
+			a.reconcileEdge(f, r, pred, currentVReg, desiredVReg, tmpRealReg, typ)
 		}
 	}
 }
 
+// reconcileEdge reconciles the register state between the current block and the predecessor for the real register `r`.
+//
+//   - currentVReg is the current VReg value that sits on the register `r`. This can be VRegInvalid if the register is not used at the end of the predecessor.
+//   - desiredVReg is the desired VReg value that should be on the register `r`.
+//   - freeReg is the temporary register that can be used to swap the values, which may or may not be used.
+//   - typ is the register type of `r`.
 func (a *Allocator) reconcileEdge(f Function,
 	r RealReg,
 	pred Block,
-	currentOccupants *regInUseSet,
-	currentOccupantsRev map[VReg]RealReg,
 	currentVReg, desiredVReg VReg,
 	freeReg VReg,
 	typ RegType,
 ) {
+	// There are four cases to consider:
+	// 1. currentVReg is valid, but desiredVReg is on the stack.
+	// 2. Both currentVReg and desiredVReg are valid.
+	// 3. desiredVReg is on a different register than `r` and currentVReg is not valid.
+	// 4. desiredVReg is on the stack and currentVReg is not valid.
+
 	s := &a.state
 	if currentVReg.Valid() {
-		// Both are on reg.
-		er, ok := currentOccupantsRev[desiredVReg]
-		if !ok {
+		desiredState := s.getVRegState(desiredVReg.ID())
+		er := desiredState.r
+		if er == RealRegInvalid {
+			// Case 1: currentVReg is valid, but desiredVReg is on the stack.
 			if wazevoapi.RegAllocLoggingEnabled {
 				fmt.Printf("\t\tv%d is desired to be on %s, but currently on the stack\n",
 					desiredVReg.ID(), a.regInfo.RealRegName(r),
 				)
 			}
-			// This case is that the desired value is on the stack, but currentVReg is on the target register.
-			// We need to move the current value to the stack, and reload the desired value.
+			// We need to move the current value to the stack, and reload the desired value into the register.
 			// TODO: we can do better here.
 			f.StoreRegisterBefore(currentVReg.SetRealReg(r), pred.LastInstrForInsertion())
-			delete(currentOccupantsRev, currentVReg)
+			s.releaseRealReg(r)
 
 			s.getVRegState(desiredVReg.ID()).recordReload(f, pred)
 			f.ReloadRegisterBefore(desiredVReg.SetRealReg(r), pred.LastInstrForInsertion())
-			currentOccupants.add(r, desiredVReg)
-			currentOccupantsRev[desiredVReg] = r
+			s.useRealReg(r, desiredVReg)
 			return
-		}
-
-		if wazevoapi.RegAllocLoggingEnabled {
-			fmt.Printf("\t\tv%d is desired to be on %s, but currently on %s\n",
-				desiredVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er),
+		} else {
+			// Case 2: Both currentVReg and desiredVReg are valid.
+			if wazevoapi.RegAllocLoggingEnabled {
+				fmt.Printf("\t\tv%d is desired to be on %s, but currently on %s\n",
+					desiredVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er),
+				)
+			}
+			// In this case, we need to swap the current and desired values.
+			f.SwapBefore(
+				currentVReg.SetRealReg(r),
+				desiredVReg.SetRealReg(er),
+				freeReg,
+				pred.LastInstrForInsertion(),
 			)
-		}
-		f.SwapBefore(
-			currentVReg.SetRealReg(r),
-			desiredVReg.SetRealReg(er),
-			freeReg,
-			pred.LastInstrForInsertion(),
-		)
-		s.allocatedRegSet = s.allocatedRegSet.add(freeReg.RealReg())
-		currentOccupantsRev[desiredVReg] = r
-		currentOccupantsRev[currentVReg] = er
-		currentOccupants.add(r, desiredVReg)
-		currentOccupants.add(er, currentVReg)
-		if wazevoapi.RegAllocLoggingEnabled {
-			fmt.Printf("\t\tv%d previously on %s moved to %s\n", currentVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er))
+			s.allocatedRegSet = s.allocatedRegSet.add(freeReg.RealReg())
+			s.releaseRealReg(r)
+			s.releaseRealReg(er)
+			s.useRealReg(r, desiredVReg)
+			s.useRealReg(er, currentVReg)
+			if wazevoapi.RegAllocLoggingEnabled {
+				fmt.Printf("\t\tv%d previously on %s moved to %s\n", currentVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er))
+			}
 		}
 	} else {
-		// Desired is on reg, but currently the target register is not used.
 		if wazevoapi.RegAllocLoggingEnabled {
 			fmt.Printf("\t\tv%d is desired to be on %s, current not used\n",
 				desiredVReg.ID(), a.regInfo.RealRegName(r),
 			)
 		}
-		if currentReg, ok := currentOccupantsRev[desiredVReg]; ok {
+		if currentReg := s.getVRegState(desiredVReg.ID()).r; currentReg != RealRegInvalid {
+			// Case 3: desiredVReg is on a different register than `r` and currentVReg is not valid.
+			// We simply need to move the desired value into `r`.
 			f.InsertMoveBefore(
 				FromRealReg(r, typ),
 				desiredVReg.SetRealReg(currentReg),
 				pred.LastInstrForInsertion(),
 			)
-			currentOccupants.remove(currentReg)
+			s.releaseRealReg(currentReg)
 		} else {
+			// Case 4: desiredVReg is on the stack and currentVReg is not valid.
+			// We simply need to reload the desired value into `r`.
 			s.getVRegState(desiredVReg.ID()).recordReload(f, pred)
 			f.ReloadRegisterBefore(desiredVReg.SetRealReg(r), pred.LastInstrForInsertion())
 		}
-		currentOccupantsRev[desiredVReg] = r
-		currentOccupants.add(r, desiredVReg)
-	}
-
-	if wazevoapi.RegAllocLoggingEnabled {
-		fmt.Println("\t", pred.ID(), ":", currentOccupants.format(a.regInfo))
+		s.useRealReg(r, desiredVReg)
 	}
 }
 
@@ -1169,8 +1158,7 @@ func (a *Allocator) scheduleSpill(f Function, vs *vrState) {
 	}
 	for pos != definingBlk {
 		st := a.getOrAllocateBlockState(pos.ID())
-		for ii := 0; ii < 64; ii++ {
-			rr := RealReg(ii)
+		for rr := RealReg(0); rr < 64; rr++ {
 			if st.startRegs.get(rr) == v {
 				r = rr
 				// Already in the register, so we can place the spill at the beginning of the block.
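
The rewritten reconcileEdge derives its four cases from the allocator's own vreg state rather than the per-edge currentOccupants maps that were removed. A condensed, hypothetical sketch of just the decision (types and names are stand-ins, not wazero's):

	type action int

	const (
		spillThenReload action = iota // case 1
		swapRegisters                 // case 2
		moveBetweenRegs               // case 3
		reloadFromStack               // case 4
	)

	// decide picks the edge-fixup action for real register r.
	//   - currentValid: some vreg occupies r at the end of the predecessor.
	//   - desiredOnReg: the desired vreg already lives in some (possibly other) register.
	func decide(currentValid, desiredOnReg bool) action {
		switch {
		case currentValid && !desiredOnReg:
			return spillThenReload // evict the occupant of r, reload desired into r.
		case currentValid && desiredOnReg:
			return swapRegisters // swap r with the register currently holding desired.
		case desiredOnReg:
			return moveBetweenRegs // r is free: move desired into it.
		default:
			return reloadFromStack // r is free and desired is spilled: reload it.
		}
	}
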
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go
index e9bf60661..04a8e8f4d 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go
@@ -46,23 +46,24 @@ func (rs RegSet) Range(f func(allocatedRealReg RealReg)) {
 	}
 }
 
-type regInUseSet struct {
-	set RegSet
-	vrs [64]VReg
+type regInUseSet [64]VReg
+
+func newRegInUseSet() regInUseSet {
+	var ret regInUseSet
+	ret.reset()
+	return ret
 }
 
 func (rs *regInUseSet) reset() {
-	rs.set = 0
-	for i := range rs.vrs {
-		rs.vrs[i] = VRegInvalid
+	for i := range rs {
+		rs[i] = VRegInvalid
 	}
 }
 
 func (rs *regInUseSet) format(info *RegisterInfo) string { //nolint:unused
 	var ret []string
-	for i := 0; i < 64; i++ {
-		if rs.set&(1<<uint(i)) != 0 {
-			vr := rs.vrs[i]
+	for i, vr := range rs {
+		if vr != VRegInvalid {
 			ret = append(ret, fmt.Sprintf("(%s->v%d)", info.RealRegName(RealReg(i)), vr.ID()))
 		}
 	}
@@ -70,39 +71,28 @@ func (rs *regInUseSet) format(info *RegisterInfo) string { //nolint:unused
 }
 
 func (rs *regInUseSet) has(r RealReg) bool {
-	if r >= 64 {
-		return false
-	}
-	return rs.set&(1<<uint(r)) != 0
+	return r < 64 && rs[r] != VRegInvalid
 }
 
 func (rs *regInUseSet) get(r RealReg) VReg {
-	if r >= 64 {
-		return VRegInvalid
-	}
-	return rs.vrs[r]
+	return rs[r]
 }
 
 func (rs *regInUseSet) remove(r RealReg) {
-	if r >= 64 {
-		return
-	}
-	rs.set &= ^(1 << uint(r))
-	rs.vrs[r] = VRegInvalid
+	rs[r] = VRegInvalid
 }
 
 func (rs *regInUseSet) add(r RealReg, vr VReg) {
 	if r >= 64 {
 		return
 	}
-	rs.set |= 1 << uint(r)
-	rs.vrs[r] = vr
+	rs[r] = vr
 }
 
 func (rs *regInUseSet) range_(f func(allocatedRealReg RealReg, vr VReg)) {
-	for i := 0; i < 64; i++ {
-		if rs.set&(1<<uint(i)) != 0 {
-			f(RealReg(i), rs.vrs[i])
+	for i, vr := range rs {
+		if vr != VRegInvalid {
+			f(RealReg(i), vr)
 		}
 	}
 }
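
regInUseSet above drops its separate 64-bit occupancy mask: a fixed array of VRegs with VRegInvalid as the sentinel already encodes which real registers are taken. The same idea as a stand-alone sketch with simplified types:

	type vreg uint32

	const vregInvalid = vreg(0xffffffff)

	// inUse maps each of the 64 real registers to the virtual register occupying it,
	// or vregInvalid when the slot is free.
	type inUse [64]vreg

	func (s *inUse) reset() {
		for i := range s {
			s[i] = vregInvalid
		}
	}

	func (s *inUse) has(r int) bool    { return r < 64 && s[r] != vregInvalid }
	func (s *inUse) add(r int, v vreg) { s[r] = v }
	func (s *inUse) remove(r int)      { s[r] = vregInvalid }

	func (s *inUse) forEach(f func(r int, v vreg)) {
		for i, v := range s {
			if v != vregInvalid {
				f(i, v)
			}
		}
	}
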
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/call_engine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/call_engine.go
index 3379c4dde..72ce44e26 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/call_engine.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/call_engine.go
@@ -2,7 +2,6 @@ package wazevo
 
 import (
 	"context"
-	"encoding/binary"
 	"fmt"
 	"reflect"
 	"runtime"
@@ -310,15 +309,6 @@ func (c *callEngine) callWithStack(ctx context.Context, paramResultStack []uint6
 				*argRes = uint64(0xffffffff) // = -1 in signed 32-bit integer.
 			} else {
 				*argRes = uint64(res)
-				calleeOpaque := opaqueViewFromPtr(uintptr(unsafe.Pointer(c.execCtx.callerModuleContextPtr)))
-				if mod.Source.MemorySection != nil { // Local memory.
-					putLocalMemory(calleeOpaque, 8 /* local memory begins at 8 */, mem)
-				} else {
-					// Imported memory's owner at offset 16 of the callerModuleContextPtr.
-					opaquePtr := uintptr(binary.LittleEndian.Uint64(calleeOpaque[16:]))
-					importedMemOwner := opaqueViewFromPtr(opaquePtr)
-					putLocalMemory(importedMemOwner, 8 /* local memory begins at 8 */, mem)
-				}
 			}
 			c.execCtx.exitCode = wazevoapi.ExitCodeOK
 			afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall)
@@ -525,14 +515,6 @@ func (c *callEngine) callerModuleInstance() *wasm.ModuleInstance {
 	return moduleInstanceFromOpaquePtr(c.execCtx.callerModuleContextPtr)
 }
 
-func opaqueViewFromPtr(ptr uintptr) []byte {
-	var opaque []byte
-	sh := (*reflect.SliceHeader)(unsafe.Pointer(&opaque))
-	sh.Data = ptr
-	setSliceLimits(sh, 24, 24)
-	return opaque
-}
-
 const callStackCeiling = uintptr(50000000) // in uint64 (8 bytes) == 400000000 bytes in total == 400mb.
 
 func (c *callEngine) growStackWithGuarded() (newSP uintptr, newFP uintptr, err error) {
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine_cache.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine_cache.go
index f7c0450ae..e49353dc8 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine_cache.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine_cache.go
@@ -31,6 +31,13 @@ func fileCacheKey(m *wasm.Module) (ret filecache.Key) {
 	s := sha256.New()
 	s.Write(m.ID[:])
 	s.Write(magic)
+	// Write the CPU features so that the compiled module is cached per CPU feature set.
+	// This prevents a module compiled with incompatible CPU features from being reused.
+	cpu := platform.CpuFeatures.Raw()
+	// Reuse the `ret` buffer to hold the 8 bytes of the CPU features so that we avoid an extra allocation.
+	binary.LittleEndian.PutUint64(ret[:8], cpu)
+	s.Write(ret[:8])
+	// Finally, write the hash to the ret buffer.
 	s.Sum(ret[:0])
 	return
 }
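
fileCacheKey now folds the raw CPU-feature bitset into the hash, so a module compiled against one instruction-set baseline is never served from the cache on a CPU that lacks it. A self-contained sketch of the same keying scheme (the module ID here is a placeholder value):

	package main

	import (
		"crypto/sha256"
		"encoding/binary"
		"fmt"
	)

	// cacheKey hashes the module ID together with the CPU-feature bitset.
	func cacheKey(moduleID [32]byte, cpuFeatures uint64) (key [32]byte) {
		h := sha256.New()
		h.Write(moduleID[:])

		var buf [8]byte
		binary.LittleEndian.PutUint64(buf[:], cpuFeatures)
		h.Write(buf[:])

		h.Sum(key[:0]) // Sum appends into the provided slice, filling key.
		return
	}

	func main() {
		var id [32]byte // placeholder module ID.
		fmt.Printf("%x\n", cacheKey(id, 1<<19|1<<20)) // e.g. the SSE4.1 and SSE4.2 bits.
	}
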
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/frontend.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/frontend.go
index 873a35a55..42cc21dcd 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/frontend.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/frontend.go
@@ -301,26 +301,7 @@ func (c *Compiler) declareWasmLocals(entry ssa.BasicBlock) {
 		st := WasmTypeToSSAType(typ)
 		variable := c.ssaBuilder.DeclareVariable(st)
 		c.setWasmLocalVariable(wasm.Index(i)+localCount, variable)
-
-		zeroInst := c.ssaBuilder.AllocateInstruction()
-		switch st {
-		case ssa.TypeI32:
-			zeroInst.AsIconst32(0)
-		case ssa.TypeI64:
-			zeroInst.AsIconst64(0)
-		case ssa.TypeF32:
-			zeroInst.AsF32const(0)
-		case ssa.TypeF64:
-			zeroInst.AsF64const(0)
-		case ssa.TypeV128:
-			zeroInst.AsVconst(0, 0)
-		default:
-			panic("TODO: " + wasm.ValueTypeName(typ))
-		}
-
-		c.ssaBuilder.InsertInstruction(zeroInst)
-		value := zeroInst.Return()
-		c.ssaBuilder.DefineVariable(variable, value, entry)
+		c.ssaBuilder.InsertZeroValue(st)
 	}
 }
 
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/lower.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/lower.go
index 5096a6365..ff963e605 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/lower.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/lower.go
@@ -1086,16 +1086,8 @@ func (c *Compiler) lowerCurrentOpcode() {
 			break
 		}
 		variable := c.localVariable(index)
-		if _, ok := c.m.NonStaticLocals[c.wasmLocalFunctionIndex][index]; ok {
-			state.push(builder.MustFindValue(variable))
-		} else {
-			// If a local is static, we can simply find it in the entry block which is either a function param
-			// or a zero value. This fast pass helps to avoid the overhead of searching the entire function plus
-			// avoid adding unnecessary block arguments.
-			// TODO: I think this optimization should be done in a SSA pass like passRedundantPhiEliminationOpt,
-			// 	but somehow there's some corner cases that it fails to optimize.
-			state.push(builder.MustFindValueInBlk(variable, c.ssaBuilder.EntryBlock()))
-		}
+		state.push(builder.MustFindValue(variable))
+
 	case wasm.OpcodeLocalSet:
 		index := c.readI32u()
 		if state.unreachable {
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/module_engine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/module_engine.go
index ba8f546c0..efa1b9bba 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/module_engine.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/module_engine.go
@@ -86,16 +86,6 @@ func newAlignedOpaque(size int) moduleContextOpaque {
 	return buf
 }
 
-func putLocalMemory(opaque []byte, offset wazevoapi.Offset, mem *wasm.MemoryInstance) {
-	s := uint64(len(mem.Buffer))
-	var b uint64
-	if len(mem.Buffer) > 0 {
-		b = uint64(uintptr(unsafe.Pointer(&mem.Buffer[0])))
-	}
-	binary.LittleEndian.PutUint64(opaque[offset:], b)
-	binary.LittleEndian.PutUint64(opaque[offset+8:], s)
-}
-
 func (m *moduleEngine) setupOpaque() {
 	inst := m.module
 	offsets := &m.parent.offsets
@@ -106,7 +96,7 @@ func (m *moduleEngine) setupOpaque() {
 	)
 
 	if lm := offsets.LocalMemoryBegin; lm >= 0 {
-		putLocalMemory(opaque, lm, inst.MemoryInstance)
+		m.putLocalMemory()
 	}
 
 	// Note: imported memory is resolved in ResolveImportedFunction.
@@ -227,6 +217,25 @@ func (m *moduleEngine) SetGlobalValue(i wasm.Index, lo, hi uint64) {
 // OwnsGlobals implements the same method as documented on wasm.ModuleEngine.
 func (m *moduleEngine) OwnsGlobals() bool { return true }
 
+// MemoryGrown implements wasm.ModuleEngine.
+func (m *moduleEngine) MemoryGrown() {
+	m.putLocalMemory()
+}
+
+// putLocalMemory writes the local memory buffer pointer and length to the opaque buffer.
+func (m *moduleEngine) putLocalMemory() {
+	mem := m.module.MemoryInstance
+	offset := m.parent.offsets.LocalMemoryBegin
+
+	s := uint64(len(mem.Buffer))
+	var b uint64
+	if len(mem.Buffer) > 0 {
+		b = uint64(uintptr(unsafe.Pointer(&mem.Buffer[0])))
+	}
+	binary.LittleEndian.PutUint64(m.opaque[offset:], b)
+	binary.LittleEndian.PutUint64(m.opaque[offset+8:], s)
+}
+
 // ResolveImportedFunction implements wasm.ModuleEngine.
 func (m *moduleEngine) ResolveImportedFunction(index, indexInImportedModule wasm.Index, importedModuleEngine wasm.ModuleEngine) {
 	executableOffset, moduleCtxOffset, typeIDOffset := m.parent.offsets.ImportedFunctionOffset(index)
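
putLocalMemory, now also reachable through the new MemoryGrown hook, publishes the memory buffer to JIT-compiled code by storing its base pointer and length little-endian inside the opaque module context. A stand-alone sketch of that layout trick (the offsets are illustrative, not wazero's actual layout):

	package main

	import (
		"encoding/binary"
		"fmt"
		"unsafe"
	)

	// putMemoryView writes the base pointer and length of buf at a fixed offset of opaque,
	// so machine code can fetch both with two plain 64-bit loads.
	func putMemoryView(opaque []byte, offset int, buf []byte) {
		var base uint64
		if len(buf) > 0 {
			base = uint64(uintptr(unsafe.Pointer(&buf[0])))
		}
		binary.LittleEndian.PutUint64(opaque[offset:], base)
		binary.LittleEndian.PutUint64(opaque[offset+8:], uint64(len(buf)))
	}

	func main() {
		opaque := make([]byte, 32)
		mem := make([]byte, 65536)
		putMemoryView(opaque, 8, mem)
		fmt.Println(binary.LittleEndian.Uint64(opaque[16:])) // prints 65536
	}

Because the buffer may be reallocated when the Wasm memory grows, the view has to be rewritten at that point, which is what routing growth through MemoryGrown achieves.
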
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block.go
index 10b6b4b62..39627b989 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block.go
@@ -49,21 +49,12 @@ type BasicBlock interface {
 	// ReturnBlock returns true if this block represents the function return.
 	ReturnBlock() bool
 
-	// FormatHeader returns the debug string of this block, not including instruction.
-	FormatHeader(b Builder) string
-
 	// Valid is true if this block is still valid even after optimizations.
 	Valid() bool
 
 	// Sealed is true if this block has been sealed.
 	Sealed() bool
 
-	// BeginPredIterator returns the first predecessor of this block.
-	BeginPredIterator() BasicBlock
-
-	// NextPredIterator returns the next predecessor of this block.
-	NextPredIterator() BasicBlock
-
 	// Preds returns the number of predecessors of this block.
 	Preds() int
 
@@ -88,10 +79,11 @@ type (
 	basicBlock struct {
 		id                      BasicBlockID
 		rootInstr, currentInstr *Instruction
-		params                  []blockParam
-		predIter                int
-		preds                   []basicBlockPredecessorInfo
-		success                 []*basicBlock
+		// params are Values that represent parameters to a basicBlock.
+		// Each parameter can be considered as the output of a PHI instruction in traditional SSA.
+		params  []Value
+		preds   []basicBlockPredecessorInfo
+		success []*basicBlock
 		// singlePred is the alias to preds[0] for fast lookup, and only set after Seal is called.
 		singlePred *basicBlock
 		// lastDefinitions maps Variable to its last definition in this block.
@@ -116,11 +108,14 @@ type (
 
 		// loopNestingForestChildren holds the children of this block in the loop nesting forest.
 		// Non-empty if and only if this block is a loop header (i.e. loopHeader=true)
-		loopNestingForestChildren []BasicBlock
+		loopNestingForestChildren wazevoapi.VarLength[BasicBlock]
 
 		// reversePostOrder is used to sort all the blocks in the function in reverse post order.
 		// This is used in builder.LayoutBlocks.
-		reversePostOrder int
+		reversePostOrder int32
+
+		// visited is used during various traversals.
+		visited int32
 
 		// child and sibling are the ones in the dominator tree.
 		child, sibling *basicBlock
@@ -128,15 +123,6 @@ type (
 	// BasicBlockID is the unique ID of a basicBlock.
 	BasicBlockID uint32
 
-	// blockParam implements Value and represents a parameter to a basicBlock.
-	blockParam struct {
-		// value is the Value that corresponds to the parameter in this block,
-		// and can be considered as an output of PHI instruction in traditional SSA.
-		value Value
-		// typ is the type of the parameter.
-		typ Type
-	}
-
 	unknownValue struct {
 		// variable is the variable that this unknownValue represents.
 		variable Variable
@@ -145,6 +131,9 @@ type (
 	}
 )
 
+// basicBlockVarLengthNil is the default nil value for basicBlock.loopNestingForestChildren.
+var basicBlockVarLengthNil = wazevoapi.NewNilVarLength[BasicBlock]()
+
 const basicBlockIDReturnBlock = 0xffffffff
 
 // Name implements BasicBlock.Name.
@@ -190,13 +179,13 @@ func (bb *basicBlock) ReturnBlock() bool {
 // AddParam implements BasicBlock.AddParam.
 func (bb *basicBlock) AddParam(b Builder, typ Type) Value {
 	paramValue := b.allocateValue(typ)
-	bb.params = append(bb.params, blockParam{typ: typ, value: paramValue})
+	bb.params = append(bb.params, paramValue)
 	return paramValue
 }
 
 // addParamOn adds a parameter to this block whose value is already allocated.
-func (bb *basicBlock) addParamOn(typ Type, value Value) {
-	bb.params = append(bb.params, blockParam{typ: typ, value: value})
+func (bb *basicBlock) addParamOn(value Value) {
+	bb.params = append(bb.params, value)
 }
 
 // Params implements BasicBlock.Params.
@@ -206,8 +195,7 @@ func (bb *basicBlock) Params() int {
 
 // Param implements BasicBlock.Param.
 func (bb *basicBlock) Param(i int) Value {
-	p := &bb.params[i]
-	return p.value
+	return bb.params[i]
 }
 
 // Valid implements BasicBlock.Valid.
@@ -248,22 +236,6 @@ func (bb *basicBlock) NumPreds() int {
 	return len(bb.preds)
 }
 
-// BeginPredIterator implements BasicBlock.BeginPredIterator.
-func (bb *basicBlock) BeginPredIterator() BasicBlock {
-	bb.predIter = 0
-	return bb.NextPredIterator()
-}
-
-// NextPredIterator implements BasicBlock.NextPredIterator.
-func (bb *basicBlock) NextPredIterator() BasicBlock {
-	if bb.predIter >= len(bb.preds) {
-		return nil
-	}
-	pred := bb.preds[bb.predIter].blk
-	bb.predIter++
-	return pred
-}
-
 // Preds implements BasicBlock.Preds.
 func (bb *basicBlock) Preds() int {
 	return len(bb.preds)
@@ -305,7 +277,8 @@ func resetBasicBlock(bb *basicBlock) {
 	bb.unknownValues = bb.unknownValues[:0]
 	bb.lastDefinitions = wazevoapi.ResetMap(bb.lastDefinitions)
 	bb.reversePostOrder = -1
-	bb.loopNestingForestChildren = bb.loopNestingForestChildren[:0]
+	bb.visited = 0
+	bb.loopNestingForestChildren = basicBlockVarLengthNil
 	bb.loopHeader = false
 	bb.sibling = nil
 	bb.child = nil
@@ -335,11 +308,11 @@ func (bb *basicBlock) addPred(blk BasicBlock, branch *Instruction) {
 	pred.success = append(pred.success, bb)
 }
 
-// FormatHeader implements BasicBlock.FormatHeader.
-func (bb *basicBlock) FormatHeader(b Builder) string {
+// formatHeader returns the string representation of the header of the basicBlock.
+func (bb *basicBlock) formatHeader(b Builder) string {
 	ps := make([]string, len(bb.params))
 	for i, p := range bb.params {
-		ps[i] = p.value.formatWithType(b)
+		ps[i] = p.formatWithType(b)
 	}
 
 	if len(bb.preds) > 0 {
@@ -398,7 +371,7 @@ func (bb *basicBlock) String() string {
 
 // LoopNestingForestChildren implements BasicBlock.LoopNestingForestChildren.
 func (bb *basicBlock) LoopNestingForestChildren() []BasicBlock {
-	return bb.loopNestingForestChildren
+	return bb.loopNestingForestChildren.View()
 }
 
 // LoopHeader implements BasicBlock.LoopHeader.
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/builder.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/builder.go
index 1fc84d2ea..0b700c4b1 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/builder.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/builder.go
@@ -54,9 +54,6 @@ type Builder interface {
 	// MustFindValue searches the latest definition of the given Variable and returns the result.
 	MustFindValue(variable Variable) Value
 
-	// MustFindValueInBlk is the same as MustFindValue except it searches the latest definition from the given BasicBlock.
-	MustFindValueInBlk(variable Variable, blk BasicBlock) Value
-
 	// FindValueInLinearPath tries to find the latest definition of the given Variable in the linear path to the current BasicBlock.
 	// If it cannot find the definition, or it's not sealed yet, it returns ValueInvalid.
 	FindValueInLinearPath(variable Variable) Value
@@ -127,7 +124,11 @@ type Builder interface {
 	// Idom returns the immediate dominator of the given BasicBlock.
 	Idom(blk BasicBlock) BasicBlock
 
+	// VarLengthPool returns the VarLengthPool of Value.
 	VarLengthPool() *wazevoapi.VarLengthPool[Value]
+
+	// InsertZeroValue inserts a zero value constant instruction of the given type.
+	InsertZeroValue(t Type)
 }
 
 // NewBuilder returns a new Builder implementation.
@@ -135,10 +136,10 @@ func NewBuilder() Builder {
 	return &builder{
 		instructionsPool:               wazevoapi.NewPool[Instruction](resetInstruction),
 		basicBlocksPool:                wazevoapi.NewPool[basicBlock](resetBasicBlock),
+		varLengthBasicBlockPool:        wazevoapi.NewVarLengthPool[BasicBlock](),
 		varLengthPool:                  wazevoapi.NewVarLengthPool[Value](),
 		valueAnnotations:               make(map[ValueID]string),
 		signatures:                     make(map[SignatureID]*Signature),
-		blkVisited:                     make(map[*basicBlock]int),
 		valueIDAliases:                 make(map[ValueID]Value),
 		redundantParameterIndexToValue: make(map[int]Value),
 		returnBlk:                      &basicBlock{id: basicBlockIDReturnBlock},
@@ -177,12 +178,13 @@ type builder struct {
 	dominators []*basicBlock
 	sparseTree dominatorSparseTree
 
+	varLengthBasicBlockPool wazevoapi.VarLengthPool[BasicBlock]
+
 	// loopNestingForestRoots are the roots of the loop nesting forest.
 	loopNestingForestRoots []BasicBlock
 
 	// The followings are used for optimization passes/deterministic compilation.
 	instStack                      []*Instruction
-	blkVisited                     map[*basicBlock]int
 	valueIDToInstruction           []*Instruction
 	blkStack                       []*basicBlock
 	blkStack2                      []*basicBlock
@@ -200,6 +202,32 @@ type builder struct {
 	donePostBlockLayoutPasses bool
 
 	currentSourceOffset SourceOffset
+
+	// zeros are the zero value constants for each type.
+	zeros [typeEnd]Value
+}
+
+// InsertZeroValue implements Builder.InsertZeroValue.
+func (b *builder) InsertZeroValue(t Type) {
+	if b.zeros[t].Valid() {
+		return
+	}
+	zeroInst := b.AllocateInstruction()
+	switch t {
+	case TypeI32:
+		zeroInst.AsIconst32(0)
+	case TypeI64:
+		zeroInst.AsIconst64(0)
+	case TypeF32:
+		zeroInst.AsF32const(0)
+	case TypeF64:
+		zeroInst.AsF64const(0)
+	case TypeV128:
+		zeroInst.AsVconst(0, 0)
+	default:
+		panic("TODO: " + t.String())
+	}
+	b.zeros[t] = zeroInst.Insert(b).Return()
 }
 
 func (b *builder) VarLengthPool() *wazevoapi.VarLengthPool[Value] {
@@ -215,10 +243,12 @@ func (b *builder) ReturnBlock() BasicBlock {
 func (b *builder) Init(s *Signature) {
 	b.nextVariable = 0
 	b.currentSignature = s
+	b.zeros = [typeEnd]Value{ValueInvalid, ValueInvalid, ValueInvalid, ValueInvalid, ValueInvalid, ValueInvalid}
 	resetBasicBlock(b.returnBlk)
 	b.instructionsPool.Reset()
 	b.basicBlocksPool.Reset()
 	b.varLengthPool.Reset()
+	b.varLengthBasicBlockPool.Reset()
 	b.donePreBlockLayoutPasses = false
 	b.doneBlockLayout = false
 	b.donePostBlockLayoutPasses = false
@@ -231,11 +261,6 @@ func (b *builder) Init(s *Signature) {
 	b.blkStack2 = b.blkStack2[:0]
 	b.dominators = b.dominators[:0]
 	b.loopNestingForestRoots = b.loopNestingForestRoots[:0]
-
-	for i := 0; i < b.basicBlocksPool.Allocated(); i++ {
-		blk := b.basicBlocksPool.View(i)
-		delete(b.blkVisited, blk)
-	}
 	b.basicBlocksPool.Reset()
 
 	for v := ValueID(0); v < b.nextValueID; v++ {
@@ -448,11 +473,6 @@ func (b *builder) findValueInLinearPath(variable Variable, blk *basicBlock) Valu
 	return ValueInvalid
 }
 
-func (b *builder) MustFindValueInBlk(variable Variable, blk BasicBlock) Value {
-	typ := b.definedVariableType(variable)
-	return b.findValue(typ, variable, blk.(*basicBlock))
-}
-
 // MustFindValue implements Builder.MustFindValue.
 func (b *builder) MustFindValue(variable Variable) Value {
 	typ := b.definedVariableType(variable)
@@ -482,6 +502,9 @@ func (b *builder) findValue(typ Type, variable Variable, blk *basicBlock) Value
 			value:    value,
 		})
 		return value
+	} else if blk.EntryBlock() {
+		// If this is the entry block, the variable is uninitialized here, so it takes the zero value.
+		return b.zeros[b.definedVariableType(variable)]
 	}
 
 	if pred := blk.singlePred; pred != nil {
@@ -495,21 +518,42 @@ func (b *builder) findValue(typ Type, variable Variable, blk *basicBlock) Value
 	// If this block has multiple predecessors, we have to gather the definitions,
 	// and treat them as an argument to this block.
 	//
-	// The first thing is to define a new parameter to this block which may or may not be redundant, but
-	// later we eliminate trivial params in an optimization pass. This must be done before finding the
-	// definitions in the predecessors so that we can break the cycle.
-	paramValue := blk.AddParam(b, typ)
-	b.DefineVariable(variable, paramValue, blk)
-
-	// After the new param is added, we have to manipulate the original branching instructions
-	// in predecessors so that they would pass the definition of `variable` as the argument to
-	// the newly added PHI.
+	// But before that, we check whether all the possible definitions are the same Value.
+	tmpValue := b.allocateValue(typ)
+	// Break the cycle by defining the variable with the tmpValue.
+	b.DefineVariable(variable, tmpValue, blk)
+	// Check whether all the predecessors have the same definition.
+	uniqueValue := ValueInvalid
 	for i := range blk.preds {
-		pred := &blk.preds[i]
-		value := b.findValue(typ, variable, pred.blk)
-		pred.branch.addArgumentBranchInst(b, value)
+		predValue := b.findValue(typ, variable, blk.preds[i].blk)
+		if uniqueValue == ValueInvalid {
+			uniqueValue = predValue
+		} else if uniqueValue != predValue {
+			uniqueValue = ValueInvalid
+			break
+		}
+	}
+
+	if uniqueValue != ValueInvalid {
+		// If all the predecessors have the same definition, we can use that value.
+		b.DefineVariable(variable, uniqueValue, blk)
+		b.alias(tmpValue, uniqueValue)
+		return uniqueValue
+	} else {
+		// Otherwise, turn tmpValue into a parameter of this block. The parameter may or may not be redundant;
+		// trivial params are eliminated later in an optimization pass. The cycle was already broken above by
+		// defining the variable as tmpValue before visiting the predecessors.
+		blk.addParamOn(tmpValue)
+		// After the new param is added, we have to manipulate the original branching instructions
+		// in predecessors so that they would pass the definition of `variable` as the argument to
+		// the newly added PHI.
+		for i := range blk.preds {
+			pred := &blk.preds[i]
+			value := b.findValue(typ, variable, pred.blk)
+			pred.branch.addArgumentBranchInst(b, value)
+		}
+		return tmpValue
 	}
-	return paramValue
 }
 
 // Seal implements Builder.Seal.
@@ -523,7 +567,7 @@ func (b *builder) Seal(raw BasicBlock) {
 	for _, v := range blk.unknownValues {
 		variable, phiValue := v.variable, v.value
 		typ := b.definedVariableType(variable)
-		blk.addParamOn(typ, phiValue)
+		blk.addParamOn(phiValue)
 		for i := range blk.preds {
 			pred := &blk.preds[i]
 			predValue := b.findValue(typ, variable, pred.blk)
@@ -566,7 +610,7 @@ func (b *builder) Format() string {
 	}
 	for bb := iterBegin(); bb != nil; bb = iterNext() {
 		str.WriteByte('\n')
-		str.WriteString(bb.FormatHeader(b))
+		str.WriteString(bb.formatHeader(b))
 		str.WriteByte('\n')
 
 		for cur := bb.Root(); cur != nil; cur = cur.Next() {
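
The new findValue path first probes whether every predecessor resolves the variable to one and the same value; only when they disagree does the temporary placeholder become a real block parameter (PHI). A hypothetical, stripped-down sketch of that check, with stand-in types and callbacks for the builder's machinery:

	type value int

	const valueInvalid value = -1

	// resolveAcrossPreds returns the value a multi-predecessor block should use for a variable.
	//   - placeholder is the temporary value already defined to break lookup cycles.
	//   - predValues are the values the variable resolves to in each predecessor.
	func resolveAcrossPreds(
		placeholder value,
		predValues []value,
		alias func(from, to value), // record that placeholder is just another name for a value.
		addParam func(value),       // promote placeholder to a block parameter (PHI).
	) value {
		unique := valueInvalid
		for _, v := range predValues {
			if unique == valueInvalid {
				unique = v
			} else if unique != v {
				unique = valueInvalid
				break
			}
		}
		if unique != valueInvalid {
			alias(placeholder, unique) // all predecessors agree: no PHI is needed.
			return unique
		}
		addParam(placeholder) // predecessors disagree: keep the PHI.
		return placeholder
	}
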
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass.go
index a2e986cd1..89ec34b7e 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass.go
@@ -22,9 +22,9 @@ func (b *builder) RunPasses() {
 func (b *builder) runPreBlockLayoutPasses() {
 	passSortSuccessors(b)
 	passDeadBlockEliminationOpt(b)
-	passRedundantPhiEliminationOpt(b)
 	// The result of passCalculateImmediateDominators will be used by various passes below.
 	passCalculateImmediateDominators(b)
+	passRedundantPhiEliminationOpt(b)
 	passNopInstElimination(b)
 
 	// TODO: implement either conversion of irreducible CFG into reducible one, or irreducible CFG detection where we panic.
@@ -78,12 +78,11 @@ func (b *builder) runFinalizingPasses() {
 // passDeadBlockEliminationOpt searches the unreachable blocks, and sets the basicBlock.invalid flag true if so.
 func passDeadBlockEliminationOpt(b *builder) {
 	entryBlk := b.entryBlk()
-	b.clearBlkVisited()
 	b.blkStack = append(b.blkStack, entryBlk)
 	for len(b.blkStack) > 0 {
 		reachableBlk := b.blkStack[len(b.blkStack)-1]
 		b.blkStack = b.blkStack[:len(b.blkStack)-1]
-		b.blkVisited[reachableBlk] = 0 // the value won't be used in this pass.
+		reachableBlk.visited = 1
 
 		if !reachableBlk.sealed && !reachableBlk.ReturnBlock() {
 			panic(fmt.Sprintf("%s is not sealed", reachableBlk))
@@ -94,7 +93,7 @@ func passDeadBlockEliminationOpt(b *builder) {
 		}
 
 		for _, succ := range reachableBlk.success {
-			if _, ok := b.blkVisited[succ]; ok {
+			if succ.visited == 1 {
 				continue
 			}
 			b.blkStack = append(b.blkStack, succ)
@@ -102,13 +101,16 @@ func passDeadBlockEliminationOpt(b *builder) {
 	}
 
 	for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() {
-		if _, ok := b.blkVisited[blk]; !ok {
+		if blk.visited != 1 {
 			blk.invalid = true
 		}
+		blk.visited = 0
 	}
 }
 
 // passRedundantPhiEliminationOpt eliminates the redundant PHIs (in our terminology, parameters of a block).
+// This requires the reverse post-order traversal to be calculated before calling this function,
+// hence passCalculateImmediateDominators must be called before this.
 func passRedundantPhiEliminationOpt(b *builder) {
 	redundantParameterIndexes := b.ints[:0] // reuse the slice from previous iterations.
 
@@ -118,15 +120,18 @@ func passRedundantPhiEliminationOpt(b *builder) {
 	//  relatively small. For example, sqlite speedtest binary results in the large number of redundant PHIs,
 	//  the maximum number of iteration was 22, which seems to be acceptable but not that small either since the
 	//  complexity here is O(BlockNum * Iterations) at the worst case where BlockNum might be the order of thousands.
+	//  -- Note --
+	// 	Currently, each iteration can visit the blocks in any order, but empirically it converges quickly when
+	// 	running on the reverse post-order. It might be possible to optimize this further by using the dominator tree.
 	for {
 		changed := false
-		_ = b.blockIteratorBegin() // skip entry block!
+		_ = b.blockIteratorReversePostOrderBegin() // skip entry block!
 		// Below, we intentionally use the named iteration variable name, as this comes with inevitable nested for loops!
-		for blk := b.blockIteratorNext(); blk != nil; blk = b.blockIteratorNext() {
+		for blk := b.blockIteratorReversePostOrderNext(); blk != nil; blk = b.blockIteratorReversePostOrderNext() {
 			paramNum := len(blk.params)
 
 			for paramIndex := 0; paramIndex < paramNum; paramIndex++ {
-				phiValue := blk.params[paramIndex].value
+				phiValue := blk.params[paramIndex]
 				redundant := true
 
 				nonSelfReferencingValue := ValueInvalid
@@ -184,7 +189,7 @@ func passRedundantPhiEliminationOpt(b *builder) {
 
 			// Still need to have the definition of the value of the PHI (previously as the parameter).
 			for _, redundantParamIndex := range redundantParameterIndexes {
-				phiValue := blk.params[redundantParamIndex].value
+				phiValue := blk.params[redundantParamIndex]
 				onlyValue := b.redundantParameterIndexToValue[redundantParamIndex]
 				// Create an alias in this block from the only phi argument to the phi value.
 				b.alias(phiValue, onlyValue)
@@ -227,10 +232,10 @@ func passRedundantPhiEliminationOpt(b *builder) {
 func passDeadCodeEliminationOpt(b *builder) {
 	nvid := int(b.nextValueID)
 	if nvid >= len(b.valueRefCounts) {
-		b.valueRefCounts = append(b.valueRefCounts, make([]int, b.nextValueID)...)
+		b.valueRefCounts = append(b.valueRefCounts, make([]int, nvid-len(b.valueRefCounts)+1)...)
 	}
 	if nvid >= len(b.valueIDToInstruction) {
-		b.valueIDToInstruction = append(b.valueIDToInstruction, make([]*Instruction, b.nextValueID)...)
+		b.valueIDToInstruction = append(b.valueIDToInstruction, make([]*Instruction, nvid-len(b.valueIDToInstruction)+1)...)
 	}
 
 	// First, we gather all the instructions with side effects.
@@ -350,22 +355,10 @@ func (b *builder) incRefCount(id ValueID, from *Instruction) {
 	b.valueRefCounts[id]++
 }
 
-// clearBlkVisited clears the b.blkVisited map so that we can reuse it for multiple places.
-func (b *builder) clearBlkVisited() {
-	b.blkStack2 = b.blkStack2[:0]
-	for key := range b.blkVisited {
-		b.blkStack2 = append(b.blkStack2, key)
-	}
-	for _, blk := range b.blkStack2 {
-		delete(b.blkVisited, blk)
-	}
-	b.blkStack2 = b.blkStack2[:0]
-}
-
 // passNopInstElimination eliminates the instructions which are essentially no-ops.
 func passNopInstElimination(b *builder) {
 	if int(b.nextValueID) >= len(b.valueIDToInstruction) {
-		b.valueIDToInstruction = append(b.valueIDToInstruction, make([]*Instruction, b.nextValueID)...)
+		b.valueIDToInstruction = append(b.valueIDToInstruction, make([]*Instruction, int(b.nextValueID)-len(b.valueIDToInstruction)+1)...)
 	}
 
 	for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() {
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_blk_layouts.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_blk_layouts.go
index 9068180a0..584b5eade 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_blk_layouts.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_blk_layouts.go
@@ -23,8 +23,6 @@ import (
 //
 // This heuristic is done in maybeInvertBranches function.
 func passLayoutBlocks(b *builder) {
-	b.clearBlkVisited()
-
 	// We might end up splitting critical edges which adds more basic blocks,
 	// so we store the currently existing basic blocks in nonSplitBlocks temporarily.
 	// That way we can iterate over the original basic blocks while appending new ones into reversePostOrderedBasicBlocks.
@@ -47,20 +45,20 @@ func passLayoutBlocks(b *builder) {
 	for _, blk := range nonSplitBlocks {
 		for i := range blk.preds {
 			pred := blk.preds[i].blk
-			if _, ok := b.blkVisited[pred]; ok || !pred.Valid() {
+			if pred.visited == 1 || !pred.Valid() {
 				continue
 			} else if pred.reversePostOrder < blk.reversePostOrder {
 				// This means the edge is critical, and this pred is the trampoline and yet to be inserted.
 				// Split edge trampolines must come before the destination in reverse post-order.
 				b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, pred)
-				b.blkVisited[pred] = 0 // mark as inserted, the value is not used.
+				pred.visited = 1 // mark as inserted.
 			}
 		}
 
 		// Now that we've already added all the potential trampoline blocks incoming to this block,
 		// we can add this block itself.
 		b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, blk)
-		b.blkVisited[blk] = 0 // mark as inserted, the value is not used.
+		blk.visited = 1 // mark as inserted.
 
 		if len(blk.success) < 2 {
 			// There won't be critical edge originating from this block.
@@ -116,7 +114,7 @@ func passLayoutBlocks(b *builder) {
 			if fallthroughBranch.opcode == OpcodeJump && fallthroughBranch.blk == trampoline {
 				// This can be lowered as fallthrough at the end of the block.
 				b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, trampoline)
-				b.blkVisited[trampoline] = 0 // mark as inserted, the value is not used.
+				trampoline.visited = 1 // mark as inserted.
 			} else {
 				uninsertedTrampolines = append(uninsertedTrampolines, trampoline)
 			}
@@ -126,7 +124,7 @@ func passLayoutBlocks(b *builder) {
 			if trampoline.success[0].reversePostOrder <= trampoline.reversePostOrder { // "<=", not "<" because the target might be itself.
 				// This means the critical edge was backward, so we insert after the current block immediately.
 				b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, trampoline)
-				b.blkVisited[trampoline] = 0 // mark as inserted, the value is not used.
+				trampoline.visited = 1 // mark as inserted.
 			} // If the target is forward, we can wait to insert until the target is inserted.
 		}
 		uninsertedTrampolines = uninsertedTrampolines[:0] // Reuse the stack for the next block.
@@ -142,8 +140,8 @@ func passLayoutBlocks(b *builder) {
 
 	if wazevoapi.SSAValidationEnabled {
 		for _, trampoline := range trampolines {
-			if _, ok := b.blkVisited[trampoline]; !ok {
-				panic("BUG: trampoline block not inserted: " + trampoline.FormatHeader(b))
+			if trampoline.visited != 1 {
+				panic("BUG: trampoline block not inserted: " + trampoline.formatHeader(b))
 			}
 			trampoline.validate(b)
 		}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_cfg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_cfg.go
index 50cb9c475..e8288c4bd 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_cfg.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_cfg.go
@@ -15,10 +15,6 @@ import (
 // At the last of pass, this function also does the loop detection and sets the basicBlock.loop flag.
 func passCalculateImmediateDominators(b *builder) {
 	reversePostOrder := b.reversePostOrderedBasicBlocks[:0]
-	exploreStack := b.blkStack[:0]
-	b.clearBlkVisited()
-
-	entryBlk := b.entryBlk()
 
 	// Store the reverse postorder from the entrypoint into reversePostOrder slice.
 	// This calculation of reverse postorder is not described in the paper,
@@ -28,14 +24,17 @@ func passCalculateImmediateDominators(b *builder) {
 	// which is a reasonable assumption as long as SSA Builder is properly used.
 	//
 	// First we push blocks in postorder iteratively visit successors of the entry block.
-	exploreStack = append(exploreStack, entryBlk)
+	entryBlk := b.entryBlk()
+	exploreStack := append(b.blkStack[:0], entryBlk)
+	// These flags are used to track the state of each block in the DFS traversal.
+	// The state is stored temporarily in the basicBlock.visited field.
 	const visitStateUnseen, visitStateSeen, visitStateDone = 0, 1, 2
-	b.blkVisited[entryBlk] = visitStateSeen
+	entryBlk.visited = visitStateSeen
 	for len(exploreStack) > 0 {
 		tail := len(exploreStack) - 1
 		blk := exploreStack[tail]
 		exploreStack = exploreStack[:tail]
-		switch b.blkVisited[blk] {
+		switch blk.visited {
 		case visitStateUnseen:
 			// This is likely a bug in the frontend.
 			panic("BUG: unsupported CFG")
@@ -48,16 +47,18 @@ func passCalculateImmediateDominators(b *builder) {
 				if succ.ReturnBlock() || succ.invalid {
 					continue
 				}
-				if b.blkVisited[succ] == visitStateUnseen {
-					b.blkVisited[succ] = visitStateSeen
+				if succ.visited == visitStateUnseen {
+					succ.visited = visitStateSeen
 					exploreStack = append(exploreStack, succ)
 				}
 			}
 			// Finally, we could pop this block once we pop all of its successors.
-			b.blkVisited[blk] = visitStateDone
+			blk.visited = visitStateDone
 		case visitStateDone:
 			// Note: at this point we push blk in postorder despite its name.
 			reversePostOrder = append(reversePostOrder, blk)
+		default:
+			panic("BUG")
 		}
 	}
 	// At this point, reversePostOrder has postorder actually, so we reverse it.
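
The traversal above is an iterative DFS that keeps one of three states (unseen, seen, done) per block and emits a block in postorder on its second pop. The same algorithm over a plain adjacency list, as a sketch:

	const (
		unseen = iota
		seen
		done
	)

	// postorder emits node IDs in DFS postorder; reverse the result for reverse postorder.
	func postorder(succ [][]int, entry int) []int {
		state := make([]int, len(succ))
		stack := []int{entry}
		state[entry] = seen
		var order []int
		for len(stack) > 0 {
			n := stack[len(stack)-1]
			stack = stack[:len(stack)-1]
			switch state[n] {
			case seen:
				// Re-push n so it is emitted only after all of its successors are done.
				stack = append(stack, n)
				for _, s := range succ[n] {
					if state[s] == unseen {
						state[s] = seen
						stack = append(stack, s)
					}
				}
				state[n] = done
			case done:
				order = append(order, n)
			}
		}
		return order
	}
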
@@ -67,7 +68,7 @@ func passCalculateImmediateDominators(b *builder) {
 	}
 
 	for i, blk := range reversePostOrder {
-		blk.reversePostOrder = i
+		blk.reversePostOrder = int32(i)
 	}
 
 	// Reuse the dominators slice if possible from the previous computation of function.
@@ -180,7 +181,7 @@ func passBuildLoopNestingForest(b *builder) {
 			b.loopNestingForestRoots = append(b.loopNestingForestRoots, blk)
 		} else if n == ent {
 		} else if n.loopHeader {
-			n.loopNestingForestChildren = append(n.loopNestingForestChildren, blk)
+			n.loopNestingForestChildren = n.loopNestingForestChildren.Append(&b.varLengthBasicBlockPool, blk)
 		}
 	}
 
@@ -193,7 +194,7 @@ func passBuildLoopNestingForest(b *builder) {
 
 func printLoopNestingForest(root *basicBlock, depth int) {
 	fmt.Println(strings.Repeat("\t", depth), "loop nesting forest root:", root.ID())
-	for _, child := range root.loopNestingForestChildren {
+	for _, child := range root.loopNestingForestChildren.View() {
 		fmt.Println(strings.Repeat("\t", depth+1), "child:", child.ID())
 		if child.LoopHeader() {
 			printLoopNestingForest(child.(*basicBlock), depth+2)
@@ -202,10 +203,10 @@ func printLoopNestingForest(root *basicBlock, depth int) {
 }
 
 type dominatorSparseTree struct {
-	time         int
+	time         int32
 	euler        []*basicBlock
-	first, depth []int
-	table        [][]int
+	first, depth []int32
+	table        [][]int32
 }
 
 // passBuildDominatorTree builds the dominator tree for the function, and constructs builder.sparseTree.
@@ -232,11 +233,11 @@ func passBuildDominatorTree(b *builder) {
 	n := b.basicBlocksPool.Allocated()
 	st := &b.sparseTree
 	st.euler = append(st.euler[:0], make([]*basicBlock, 2*n-1)...)
-	st.first = append(st.first[:0], make([]int, n)...)
+	st.first = append(st.first[:0], make([]int32, n)...)
 	for i := range st.first {
 		st.first[i] = -1
 	}
-	st.depth = append(st.depth[:0], make([]int, 2*n-1)...)
+	st.depth = append(st.depth[:0], make([]int32, 2*n-1)...)
 	st.time = 0
 
 	// Start building the sparse tree.
@@ -244,9 +245,9 @@ func passBuildDominatorTree(b *builder) {
 	st.buildSparseTable()
 }
 
-func (dt *dominatorSparseTree) eulerTour(node *basicBlock, height int) {
+func (dt *dominatorSparseTree) eulerTour(node *basicBlock, height int32) {
 	if wazevoapi.SSALoggingEnabled {
-		fmt.Println(strings.Repeat("\t", height), "euler tour:", node.ID())
+		fmt.Println(strings.Repeat("\t", int(height)), "euler tour:", node.ID())
 	}
 	dt.euler[dt.time] = node
 	dt.depth[dt.time] = height
@@ -270,13 +271,13 @@ func (dt *dominatorSparseTree) buildSparseTable() {
 	table := dt.table
 
 	if n >= len(table) {
-		table = append(table, make([][]int, n+1)...)
+		table = append(table, make([][]int32, n-len(table)+1)...)
 	}
 	for i := range table {
 		if len(table[i]) < k {
-			table[i] = append(table[i], make([]int, k)...)
+			table[i] = append(table[i], make([]int32, k-len(table[i]))...)
 		}
-		table[i][0] = i
+		table[i][0] = int32(i)
 	}
 
 	for j := 1; 1<<j <= n; j++ {
@@ -292,7 +293,7 @@ func (dt *dominatorSparseTree) buildSparseTable() {
 }
 
 // rmq performs a range minimum query on the sparse table.
-func (dt *dominatorSparseTree) rmq(l, r int) int {
+func (dt *dominatorSparseTree) rmq(l, r int32) int32 {
 	table := dt.table
 	depth := dt.depth
 	j := int(math.Log2(float64(r - l + 1)))
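The hunk above narrows the sparse-table indices to int32 and sizes rows lazily, but the underlying technique is unchanged: an O(1) range-minimum query over Euler-tour depths after O(n log n) preprocessing. As a standalone, hedged sketch of that technique in Go (all names below are invented for illustration; this is not the wazero code itself):

package main

import (
	"fmt"
	"math"
)

// sparseTable answers range-minimum queries over depth[] in O(1) after
// O(n log n) preprocessing; table[i][j] holds the index of the minimum
// depth in the half-open window [i, i+2^j).
type sparseTable struct {
	depth []int32
	table [][]int32
}

func build(depth []int32) *sparseTable {
	n := len(depth)
	k := int(math.Log2(float64(n))) + 1
	table := make([][]int32, n)
	for i := range table {
		table[i] = make([]int32, k)
		table[i][0] = int32(i) // a window of size 1 is its own minimum
	}
	for j := 1; 1<<j <= n; j++ {
		for i := 0; i+(1<<j) <= n; i++ {
			a, b := table[i][j-1], table[i+(1<<(j-1))][j-1]
			if depth[a] < depth[b] {
				table[i][j] = a
			} else {
				table[i][j] = b
			}
		}
	}
	return &sparseTable{depth: depth, table: table}
}

// rmq returns the index of the minimum depth in the inclusive range [l, r]
// by overlapping two precomputed windows of size 2^j.
func (st *sparseTable) rmq(l, r int32) int32 {
	j := int(math.Log2(float64(r - l + 1)))
	a, b := st.table[l][j], st.table[r-int32(1<<j)+1][j]
	if st.depth[a] < st.depth[b] {
		return a
	}
	return b
}

func main() {
	st := build([]int32{0, 1, 2, 1, 0, 1, 2})
	fmt.Println(st.rmq(1, 5)) // prints 4: depth[4] == 0 is the minimum in [1,5]
}

Given the euler/first/depth fields above, this appears to be the standard Euler-tour LCA construction: the lowest common ancestor of two blocks is euler[rmq(first[a], first[b])], which is how a sparse tree of this shape answers dominance queries.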
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/type.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/type.go
index e8c8cd9de..73daf4269 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/type.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/type.go
@@ -21,6 +21,9 @@ const (
 
 	// TypeV128 represents 128-bit SIMD vectors.
 	TypeV128
+
+	// -- Do not add new types after this line. ----
+	typeEnd
 )
 
 // String implements fmt.Stringer.
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/pool.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/pool.go
index 3149fdc9e..313e34f9a 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/pool.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/pool.go
@@ -69,7 +69,7 @@ type IDedPool[T any] struct {
 
 // NewIDedPool returns a new IDedPool.
 func NewIDedPool[T any](resetFn func(*T)) IDedPool[T] {
-	return IDedPool[T]{pool: NewPool[T](resetFn)}
+	return IDedPool[T]{pool: NewPool[T](resetFn), maxIDEncountered: -1}
 }
 
 // GetOrAllocate returns the T with the given id.
@@ -97,7 +97,7 @@ func (p *IDedPool[T]) Get(id int) *T {
 // Reset resets the pool.
 func (p *IDedPool[T]) Reset() {
 	p.pool.Reset()
-	for i := range p.idToItems {
+	for i := 0; i <= p.maxIDEncountered; i++ {
 		p.idToItems[i] = nil
 	}
 	p.maxIDEncountered = -1
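The Reset change above only clears slots up to maxIDEncountered, and NewIDedPool now seeds that high-water mark with -1 so an untouched pool resets in O(1). A minimal, hedged sketch of the same high-water-mark idea (toy types invented for the example, not wazero's IDedPool):

package main

import "fmt"

// idPool is a toy ID-indexed pool: reset only clears slots up to the
// largest ID handed out since the last reset, instead of walking the
// whole backing slice.
type idPool[T any] struct {
	items []*T
	maxID int // highest ID seen since the last reset; -1 means "none"
}

func newIDPool[T any]() *idPool[T] {
	return &idPool[T]{maxID: -1}
}

func (p *idPool[T]) getOrAllocate(id int) *T {
	if id >= len(p.items) {
		p.items = append(p.items, make([]*T, id-len(p.items)+1)...)
	}
	if p.items[id] == nil {
		p.items[id] = new(T)
	}
	if id > p.maxID {
		p.maxID = id
	}
	return p.items[id]
}

func (p *idPool[T]) reset() {
	for i := 0; i <= p.maxID; i++ {
		p.items[i] = nil // only touch the slots that could be non-nil
	}
	p.maxID = -1
}

func main() {
	p := newIDPool[int]()
	*p.getOrAllocate(3) = 42
	p.reset()
	fmt.Println(p.items[3] == nil, p.maxID) // true -1
}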
diff --git a/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid.go b/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid.go
index 25d7d3fdc..0dc6ec19c 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid.go
@@ -6,6 +6,9 @@ type CpuFeatureFlags interface {
 	Has(cpuFeature CpuFeature) bool
 	// HasExtra returns true when the specified extraFlag (represented as uint64) is supported
 	HasExtra(cpuFeature CpuFeature) bool
+	// Raw returns the raw bitset that represents CPU features used by wazero. This can be used for cache keying.
+	// For now, we only use four features, so uint64 is enough.
+	Raw() uint64
 }
 
 type CpuFeature uint64
@@ -17,9 +20,11 @@ const (
 	CpuFeatureAmd64SSE4_1 CpuFeature = 1 << 19
 	// CpuFeatureAmd64SSE4_2 is the flag to query CpuFeatureFlags.Has for SSEv4.2 capabilities on amd64
 	CpuFeatureAmd64SSE4_2 CpuFeature = 1 << 20
+	// Note: when adding new features, ensure that the feature is included in CpuFeatureFlags.Raw.
 )
 
 const (
 	// CpuExtraFeatureAmd64ABM is the flag to query CpuFeatureFlags.HasExtra for Advanced Bit Manipulation capabilities (e.g. LZCNT) on amd64
 	CpuExtraFeatureAmd64ABM CpuFeature = 1 << 5
+	// Note: when adding new features, ensure that the feature is included in CpuFeatureFlags.Raw.
 )
diff --git a/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid_amd64.go b/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid_amd64.go
index 8c9f1a9f3..fbdb53936 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid_amd64.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid_amd64.go
@@ -2,10 +2,10 @@
 
 package platform
 
-// CpuFeatures exposes the capabilities for this CPU, queried via the Has, HasExtra methods
-var CpuFeatures CpuFeatureFlags = loadCpuFeatureFlags()
+// CpuFeatures exposes the capabilities for this CPU, queried via the Has, HasExtra methods.
+var CpuFeatures = loadCpuFeatureFlags()
 
-// cpuFeatureFlags implements CpuFeatureFlags interface
+// cpuFeatureFlags implements CpuFeatureFlags interface.
 type cpuFeatureFlags struct {
 	flags      uint64
 	extraFlags uint64
@@ -15,13 +15,13 @@ type cpuFeatureFlags struct {
 // implemented in impl_amd64.s
 func cpuid(arg1, arg2 uint32) (eax, ebx, ecx, edx uint32)
 
-// cpuidAsBitmap combines the result of invoking cpuid to uint64 bitmap
+// cpuidAsBitmap combines the result of invoking cpuid into a uint64 bitmap.
 func cpuidAsBitmap(arg1, arg2 uint32) uint64 {
 	_ /* eax */, _ /* ebx */, ecx, edx := cpuid(arg1, arg2)
 	return (uint64(edx) << 32) | uint64(ecx)
 }
 
-// loadStandardRange load flags from the standard range, panics otherwise
+// loadStandardRange loads flags from the standard range, panics otherwise.
 func loadStandardRange(id uint32) uint64 {
 	// ensure that the id is in the valid range, returned by cpuid(0,0)
 	maxRange, _, _, _ := cpuid(0, 0)
@@ -31,7 +31,7 @@ func loadStandardRange(id uint32) uint64 {
 	return cpuidAsBitmap(id, 0)
 }
 
-// loadStandardRange load flags from the extended range, panics otherwise
+// loadExtendedRange loads flags from the extended range, panics otherwise.
 func loadExtendedRange(id uint32) uint64 {
 	// ensure that the id is in the valid range, returned by cpuid(0x80000000,0)
 	maxRange, _, _, _ := cpuid(0x80000000, 0)
@@ -48,12 +48,32 @@ func loadCpuFeatureFlags() CpuFeatureFlags {
 	}
 }
 
-// Has implements the same method on the CpuFeatureFlags interface
+// Has implements the same method on the CpuFeatureFlags interface.
 func (f *cpuFeatureFlags) Has(cpuFeature CpuFeature) bool {
 	return (f.flags & uint64(cpuFeature)) != 0
 }
 
-// HasExtra implements the same method on the CpuFeatureFlags interface
+// HasExtra implements the same method on the CpuFeatureFlags interface.
 func (f *cpuFeatureFlags) HasExtra(cpuFeature CpuFeature) bool {
 	return (f.extraFlags & uint64(cpuFeature)) != 0
 }
+
+// Raw implements the same method on the CpuFeatureFlags interface.
+func (f *cpuFeatureFlags) Raw() uint64 {
+	// Below, we only set the first 4 bits for the features we care about,
+	// instead of setting all the unnecessary bits obtained from the CPUID instruction.
+	var ret uint64
+	if f.Has(CpuFeatureAmd64SSE3) {
+		ret = 1 << 0
+	}
+	if f.Has(CpuFeatureAmd64SSE4_1) {
+		ret |= 1 << 1
+	}
+	if f.Has(CpuFeatureAmd64SSE4_2) {
+		ret |= 1 << 2
+	}
+	if f.HasExtra(CpuExtraFeatureAmd64ABM) {
+		ret |= 1 << 3
+	}
+	return ret
+}
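Raw packs only the four features the compiler cares about into the low bits, so the value is stable across CPUs that differ just in unrelated CPUID bits. A hedged sketch of how such a bitset could be folded into a compilation-cache key (the cacheKey helper and its layout are invented for illustration; this is not wazero's actual cache format):

package main

import (
	"crypto/sha256"
	"encoding/binary"
	"fmt"
)

// cacheKey mixes a module hash with the CPU feature bitset so that machine
// code compiled assuming, say, SSE4.1 is never served to a CPU without it.
func cacheKey(moduleHash [32]byte, rawFeatures uint64) [32]byte {
	var buf [40]byte
	copy(buf[:32], moduleHash[:])
	binary.LittleEndian.PutUint64(buf[32:], rawFeatures)
	return sha256.Sum256(buf[:])
}

func main() {
	module := sha256.Sum256([]byte("example module bytes"))
	fmt.Printf("%x\n", cacheKey(module, 0b0111)) // bits 0-2: SSE3, SSE4.1, SSE4.2
}

Keying on the feature bitset means a cache populated on one machine cannot hand back code that relies on instructions a different machine lacks.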
diff --git a/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid_unsupported.go b/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid_unsupported.go
index 8ae826d36..291bcea65 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid_unsupported.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/platform/cpuid_unsupported.go
@@ -4,11 +4,14 @@ package platform
 
 var CpuFeatures CpuFeatureFlags = &cpuFeatureFlags{}
 
-// cpuFeatureFlags implements CpuFeatureFlags for unsupported platforms
+// cpuFeatureFlags implements CpuFeatureFlags for unsupported platforms.
 type cpuFeatureFlags struct{}
 
-// Has implements the same method on the CpuFeatureFlags interface
+// Has implements the same method on the CpuFeatureFlags interface.
 func (c *cpuFeatureFlags) Has(cpuFeature CpuFeature) bool { return false }
 
-// HasExtra implements the same method on the CpuFeatureFlags interface
+// HasExtra implements the same method on the CpuFeatureFlags interface.
 func (c *cpuFeatureFlags) HasExtra(cpuFeature CpuFeature) bool { return false }
+
+// Raw implements the same method on the CpuFeatureFlags interface.
+func (c *cpuFeatureFlags) Raw() uint64 { return 0 }
diff --git a/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_unix.go b/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_unix.go
index a61996d58..b0519003b 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_unix.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_unix.go
@@ -12,8 +12,6 @@ const (
 	mmapProtARM64 = syscall.PROT_READ | syscall.PROT_WRITE
 )
 
-const MmapSupported = true
-
 func munmapCodeSegment(code []byte) error {
 	return syscall.Munmap(code)
 }
diff --git a/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_unsupported.go b/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_unsupported.go
index 27833db37..079aa643f 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_unsupported.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_unsupported.go
@@ -9,8 +9,6 @@ import (
 
 var errUnsupported = fmt.Errorf("mmap unsupported on GOOS=%s. Use interpreter instead.", runtime.GOOS)
 
-const MmapSupported = false
-
 func munmapCodeSegment(code []byte) error {
 	panic(errUnsupported)
 }
diff --git a/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_windows.go b/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_windows.go
index 69fcb6d6b..03a254d4a 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_windows.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/platform/mmap_windows.go
@@ -21,8 +21,6 @@ const (
 	windows_PAGE_EXECUTE_READWRITE uintptr = 0x00000040
 )
 
-const MmapSupported = true
-
 func munmapCodeSegment(code []byte) error {
 	return freeMemory(code)
 }
diff --git a/vendor/github.com/tetratelabs/wazero/internal/sysfs/file.go b/vendor/github.com/tetratelabs/wazero/internal/sysfs/file.go
index 9a77205bb..fdbf1fde0 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/sysfs/file.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/sysfs/file.go
@@ -38,9 +38,6 @@ func NewStdioFile(stdin bool, f fs.File) (fsapi.File, error) {
 }
 
 func OpenFile(path string, flag experimentalsys.Oflag, perm fs.FileMode) (*os.File, experimentalsys.Errno) {
-	if flag&experimentalsys.O_DIRECTORY != 0 && flag&(experimentalsys.O_WRONLY|experimentalsys.O_RDWR) != 0 {
-		return nil, experimentalsys.EISDIR // invalid to open a directory writeable
-	}
 	return openFile(path, flag, perm)
 }
 
diff --git a/vendor/github.com/tetratelabs/wazero/internal/wasm/engine.go b/vendor/github.com/tetratelabs/wazero/internal/wasm/engine.go
index 58a458217..61a342ef2 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/wasm/engine.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/wasm/engine.go
@@ -69,4 +69,7 @@ type ModuleEngine interface {
 	// FunctionInstanceReference returns Reference for the given Index for a FunctionInstance. The returned values are used by
 	// the initialization via ElementSegment.
 	FunctionInstanceReference(funcIndex Index) Reference
+
+	// MemoryGrown notifies the engine that the memory has grown.
+	MemoryGrown()
 }
diff --git a/vendor/github.com/tetratelabs/wazero/internal/wasm/func_validation.go b/vendor/github.com/tetratelabs/wazero/internal/wasm/func_validation.go
index 8da689076..ce2c7254d 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/wasm/func_validation.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/wasm/func_validation.go
@@ -67,11 +67,6 @@ func (m *Module) validateFunctionWithMaxStackValues(
 	declaredFunctionIndexes map[Index]struct{},
 	br *bytes.Reader,
 ) error {
-	nonStaticLocals := make(map[Index]struct{})
-	if len(m.NonStaticLocals) > 0 {
-		m.NonStaticLocals[idx] = nonStaticLocals
-	}
-
 	functionType := &m.TypeSection[m.FunctionSection[idx]]
 	code := &m.CodeSection[idx]
 	body := code.Body
@@ -357,7 +352,6 @@ func (m *Module) validateFunctionWithMaxStackValues(
 					return fmt.Errorf("invalid local index for %s %d >= %d(=len(locals)+len(parameters))",
 						OpcodeLocalSetName, index, l)
 				}
-				nonStaticLocals[index] = struct{}{}
 				var expType ValueType
 				if index < inputLen {
 					expType = functionType.Params[index]
@@ -373,7 +367,6 @@ func (m *Module) validateFunctionWithMaxStackValues(
 					return fmt.Errorf("invalid local index for %s %d >= %d(=len(locals)+len(parameters))",
 						OpcodeLocalTeeName, index, l)
 				}
-				nonStaticLocals[index] = struct{}{}
 				var expType ValueType
 				if index < inputLen {
 					expType = functionType.Params[index]
diff --git a/vendor/github.com/tetratelabs/wazero/internal/wasm/memory.go b/vendor/github.com/tetratelabs/wazero/internal/wasm/memory.go
index 5cc5012da..947b16112 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/wasm/memory.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/wasm/memory.go
@@ -59,11 +59,14 @@ type MemoryInstance struct {
 	// with a fixed weight of 1 and no spurious notifications.
 	waiters sync.Map
 
+	// ownerModuleEngine is the module engine that owns this memory instance.
+	ownerModuleEngine ModuleEngine
+
 	expBuffer experimental.LinearMemory
 }
 
 // NewMemoryInstance creates a new instance based on the parameters in the SectionIDMemory.
-func NewMemoryInstance(memSec *Memory, allocator experimental.MemoryAllocator) *MemoryInstance {
+func NewMemoryInstance(memSec *Memory, allocator experimental.MemoryAllocator, moduleEngine ModuleEngine) *MemoryInstance {
 	minBytes := MemoryPagesToBytesNum(memSec.Min)
 	capBytes := MemoryPagesToBytesNum(memSec.Cap)
 	maxBytes := MemoryPagesToBytesNum(memSec.Max)
@@ -89,12 +92,13 @@ func NewMemoryInstance(memSec *Memory, allocator experimental.MemoryAllocator) *
 		buffer = make([]byte, minBytes, capBytes)
 	}
 	return &MemoryInstance{
-		Buffer:    buffer,
-		Min:       memSec.Min,
-		Cap:       memoryBytesNumToPages(uint64(cap(buffer))),
-		Max:       memSec.Max,
-		Shared:    memSec.IsShared,
-		expBuffer: expBuffer,
+		Buffer:            buffer,
+		Min:               memSec.Min,
+		Cap:               memoryBytesNumToPages(uint64(cap(buffer))),
+		Max:               memSec.Max,
+		Shared:            memSec.IsShared,
+		expBuffer:         expBuffer,
+		ownerModuleEngine: moduleEngine,
 	}
 }
 
@@ -247,14 +251,12 @@ func (m *MemoryInstance) Grow(delta uint32) (result uint32, ok bool) {
 			m.Buffer = buffer
 			m.Cap = newPages
 		}
-		return currentPages, true
 	} else if newPages > m.Cap { // grow the memory.
 		if m.Shared {
 			panic("shared memory cannot be grown, this is a bug in wazero")
 		}
 		m.Buffer = append(m.Buffer, make([]byte, MemoryPagesToBytesNum(delta))...)
 		m.Cap = newPages
-		return currentPages, true
 	} else { // We already have the capacity we need.
 		if m.Shared {
 			// We assume grow is called under a guest lock.
@@ -264,8 +266,9 @@ func (m *MemoryInstance) Grow(delta uint32) (result uint32, ok bool) {
 		} else {
 			m.Buffer = m.Buffer[:MemoryPagesToBytesNum(newPages)]
 		}
-		return currentPages, true
 	}
+	m.ownerModuleEngine.MemoryGrown()
+	return currentPages, true
 }
 
 // Pages implements the same method as documented on api.Memory.
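After this refactor, every successful branch of Grow falls through to a single ownerModuleEngine.MemoryGrown() call before returning the previous page count. A toy, hedged sketch of that shape (invented names; the real MemoryInstance also handles shared memory and custom allocators):

package main

import "fmt"

const pageSize = 65536 // wasm page size in bytes

// memory is a toy linear memory: extend the buffer, fire one "grown"
// callback on every successful path, and return the previous page count.
type memory struct {
	buf     []byte
	maxPage uint32
	grown   func() // stand-in for ModuleEngine.MemoryGrown
}

func (m *memory) grow(delta uint32) (previousPages uint32, ok bool) {
	previousPages = uint32(len(m.buf) / pageSize)
	newPages := previousPages + delta
	if newPages > m.maxPage {
		return 0, false // beyond the declared maximum: fail, no callback
	}
	if int(newPages)*pageSize > cap(m.buf) {
		m.buf = append(m.buf, make([]byte, int(delta)*pageSize)...) // reallocate
	} else {
		m.buf = m.buf[:int(newPages)*pageSize] // capacity already available
	}
	if m.grown != nil {
		m.grown() // single notification point, mirroring the diff above
	}
	return previousPages, true
}

func main() {
	m := &memory{buf: make([]byte, pageSize, 4*pageSize), maxPage: 4,
		grown: func() { fmt.Println("memory grown") }}
	fmt.Println(m.grow(2)) // prints "memory grown", then "1 true"
}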
diff --git a/vendor/github.com/tetratelabs/wazero/internal/wasm/module.go b/vendor/github.com/tetratelabs/wazero/internal/wasm/module.go
index 68573b918..8369ad9ed 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/wasm/module.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/wasm/module.go
@@ -185,9 +185,6 @@ type Module struct {
 	// as described in https://yurydelendik.github.io/webassembly-dwarf/, though it is not specified in the Wasm
 	// specification: https://github.com/WebAssembly/debugging/issues/1
 	DWARFLines *wasmdebug.DWARFLines
-
-	// NonStaticLocals collects the local indexes that will change its value through either local.get or local.tee.
-	NonStaticLocals []map[Index]struct{}
 }
 
 // ModuleID represents sha256 hash value uniquely assigned to Module.
@@ -366,8 +363,6 @@ func (m *Module) validateFunctions(enabledFeatures api.CoreFeatures, functions [
 	br := bytes.NewReader(nil)
 	// Also, we reuse the stacks across multiple function validations to reduce allocations.
 	vs := &stacks{}
-	// Non-static locals are gathered during validation and used in the down-stream compilation.
-	m.NonStaticLocals = make([]map[Index]struct{}, len(m.FunctionSection))
 	for idx, typeIndex := range m.FunctionSection {
 		if typeIndex >= typeCount {
 			return fmt.Errorf("invalid %s: type section index %d out of range", m.funcDesc(SectionIDFunction, Index(idx)), typeIndex)
@@ -655,7 +650,7 @@ func paramNames(localNames IndirectNameMap, funcIdx uint32, paramLen int) []stri
 func (m *ModuleInstance) buildMemory(module *Module, allocator experimental.MemoryAllocator) {
 	memSec := module.MemorySection
 	if memSec != nil {
-		m.MemoryInstance = NewMemoryInstance(memSec, allocator)
+		m.MemoryInstance = NewMemoryInstance(memSec, allocator, m.Engine)
 		m.MemoryInstance.definition = &module.MemoryDefinitionSection[0]
 	}
 }
diff --git a/vendor/modules.txt b/vendor/modules.txt
index d0fd99a6c..201dcdd5c 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -517,7 +517,7 @@ github.com/modern-go/concurrent
 # github.com/modern-go/reflect2 v1.0.2
 ## explicit; go 1.12
 github.com/modern-go/reflect2
-# github.com/ncruces/go-sqlite3 v0.16.1
+# github.com/ncruces/go-sqlite3 v0.16.2
 ## explicit; go 1.21
 github.com/ncruces/go-sqlite3
 github.com/ncruces/go-sqlite3/driver
@@ -833,7 +833,7 @@ github.com/tdewolff/parse/v2/strconv
 # github.com/technologize/otel-go-contrib v1.1.1
 ## explicit; go 1.17
 github.com/technologize/otel-go-contrib/otelginmetrics
-# github.com/tetratelabs/wazero v1.7.2
+# github.com/tetratelabs/wazero v1.7.3
 ## explicit; go 1.20
 github.com/tetratelabs/wazero
 github.com/tetratelabs/wazero/api