]> git.baikalelectronics.ru Git - kernel.git/commitdiff
KVM: x86: fix MSR_IA32_TSC read for nested migration
author: Maxim Levitsky <mlevitsk@redhat.com>
Mon, 21 Sep 2020 10:38:05 +0000 (13:38 +0300)
committer: Paolo Bonzini <pbonzini@redhat.com>
Thu, 24 Sep 2020 17:35:07 +0000 (13:35 -0400)
MSR reads/writes should always access the L1 state, since the (nested)
hypervisor should intercept all the MSRs it wants to adjust, and those
that it doesn't should be read by the guest as if the host had read them.

However, IA32_TSC is an exception. Even when not intercepted, the guest still
reads the value + TSC offset.
The write, however, does not take any TSC offset into account.

This is documented in Intel's SDM and seems to happen on AMD as well.

This creates a problem when userspace wants to read the IA32_TSC value and then
write it back (e.g. for migration).

In this case it reads the L2 value, but the write is interpreted as an L1 value.
To fix this, make userspace-initiated reads of IA32_TSC return the L1 value
as well.

Huge thanks to Dave Gilbert for helping me understand these very confusing
semantics of MSR writes.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Message-Id: <20200921103805.9102-2-mlevitsk@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
arch/x86/kvm/x86.c

index 67362607e396d7704e9cf01b9c95f352f59a47f3..e3fe1d126cc3547f6420120c71059c3748da7b31 100644 (file)
@@ -3224,9 +3224,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_POWER_CTL:
                msr_info->data = vcpu->arch.msr_ia32_power_ctl;
                break;
-       case MSR_IA32_TSC:
-               msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
+       case MSR_IA32_TSC: {
+               /*
+                * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
+                * even when not intercepted. AMD manual doesn't explicitly
+                * state this but appears to behave the same.
+                *
+                * On userspace reads and writes, however, we unconditionally
+                * operate L1's TSC value to ensure backwards-compatible
+                * behavior for migration.
+                */
+               u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
+                                                           vcpu->arch.tsc_offset;
+
+               msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
                break;
+       }
        case MSR_MTRRcap:
        case 0x200 ... 0x2ff:
                return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);