pkgsrc/sysutils/xenkernel411/patches/patch-XSA286


$NetBSD: patch-XSA286,v 1.2 2020/11/12 11:29:25 bouyer Exp $
From: Jan Beulich <jbeulich@suse.com>
Subject: x86: don't allow clearing of TF_kernel_mode for other than 64-bit PV
The flag is really only meant for those; both HVM and 32-bit PV tell
kernel from user mode based on CPL/RPL. Remove the all-question-marks
comment and, to be on the safe side, also suppress clearing for 32-bit
PV (this isn't a fast path, after all).
Remove no longer necessary is_pv_32bit_*() from sh_update_cr3() and
sh_walk_guest_tables(). Note that shadow_one_bit_disable() already
assumes the new behavior.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
Acked-by: George Dunlap <george.dunlap@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
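As a condensed reminder of how each guest type tells kernel from user mode
(this merely restates the description above; it is not new behaviour):

    /*
     *   HVM       : CPL/RPL        (TF_kernel_mode stays set)
     *   32-bit PV : CPL/RPL        (TF_kernel_mode stays set)
     *   64-bit PV : TF_kernel_mode (the only case allowed to clear it)
     */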
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 35857dbe86..1d0ac81c5b 100644
--- xen/arch/x86/domain.c.orig
+++ xen/arch/x86/domain.c
@@ -804,9 +804,15 @@ int arch_set_info_guest(
v->fpu_initialised = !!(flags & VGCF_I387_VALID);
- v->arch.flags &= ~TF_kernel_mode;
- if ( (flags & VGCF_in_kernel) || is_hvm_domain(d)/*???*/ )
- v->arch.flags |= TF_kernel_mode;
+ v->arch.flags |= TF_kernel_mode;
+ if ( unlikely(!(flags & VGCF_in_kernel)) &&
+ /*
+ * TF_kernel_mode is only allowed to be clear for 64-bit PV. See
+ * update_cr3(), sh_update_cr3(), sh_walk_guest_tables(), and
+ * shadow_one_bit_disable() for why that is.
+ */
+ !is_hvm_domain(d) && !is_pv_32bit_domain(d) )
+ v->arch.flags &= ~TF_kernel_mode;
v->arch.vgc_flags = flags;
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 8ab343d16e..a2ebb4943f 100644
--- xen/arch/x86/mm/shadow/multi.c.orig
+++ xen/arch/x86/mm/shadow/multi.c
@@ -180,7 +180,7 @@ sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
INVALID_MFN, v->arch.paging.shadow.gl3e);
#else /* 32 or 64 */
const struct domain *d = v->domain;
- mfn_t root_mfn = ((v->arch.flags & TF_kernel_mode) || is_pv_32bit_domain(d)
+ mfn_t root_mfn = (v->arch.flags & TF_kernel_mode
? pagetable_get_mfn(v->arch.guest_table)
: pagetable_get_mfn(v->arch.guest_table_user));
void *root_map = map_domain_page(root_mfn);
@@ -4018,7 +4018,7 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool noflush)
v, (unsigned long)pagetable_get_pfn(v->arch.guest_table));
#if GUEST_PAGING_LEVELS == 4
- if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32bit_domain(d) )
+ if ( !(v->arch.flags & TF_kernel_mode) )
gmfn = pagetable_get_mfn(v->arch.guest_table_user);
else
#endif
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: split L4 and L3 parts of the walk out of do_page_walk()
The L3 one at least is going to be re-used by a subsequent patch, and
splitting the L4 one then as well seems only natural.
This is part of XSA-286.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
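The resulting layering, roughly sketched with the signatures introduced in
the hunk below (the L2 and L1 steps stay in do_page_walk() for now):

    static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr);
    static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr);

    /* do_page_walk() -> page_walk_get_l3e() -> page_walk_get_l4e() */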
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 3bd157967a..e73daa55e4 100644
--- xen/arch/x86/x86_64/mm.c.orig
+++ xen/arch/x86/x86_64/mm.c
@@ -44,26 +44,47 @@ unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
l2_pgentry_t *compat_idle_pg_table_l2;
-void *do_page_walk(struct vcpu *v, unsigned long addr)
+static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
{
- unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
- l4_pgentry_t l4e, *l4t;
- l3_pgentry_t l3e, *l3t;
- l2_pgentry_t l2e, *l2t;
- l1_pgentry_t l1e, *l1t;
+ unsigned long mfn = pagetable_get_pfn(root);
+ l4_pgentry_t *l4t, l4e;
- if ( !is_pv_vcpu(v) || !is_canonical_address(addr) )
- return NULL;
+ if ( !is_canonical_address(addr) )
+ return l4e_empty();
l4t = map_domain_page(_mfn(mfn));
l4e = l4t[l4_table_offset(addr)];
unmap_domain_page(l4t);
+
+ return l4e;
+}
+
+static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
+{
+ l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
+ l3_pgentry_t *l3t, l3e;
+
if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
- return NULL;
+ return l3e_empty();
l3t = map_l3t_from_l4e(l4e);
l3e = l3t[l3_table_offset(addr)];
unmap_domain_page(l3t);
+
+ return l3e;
+}
+
+void *do_page_walk(struct vcpu *v, unsigned long addr)
+{
+ l3_pgentry_t l3e;
+ l2_pgentry_t l2e, *l2t;
+ l1_pgentry_t l1e, *l1t;
+ unsigned long mfn;
+
+ if ( !is_pv_vcpu(v) )
+ return NULL;
+
+ l3e = page_walk_get_l3e(v->arch.guest_table, addr);
mfn = l3e_get_pfn(l3e);
if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
return NULL;
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: check page types in do_page_walk()
For page table entries read to be guaranteed valid, transiently locking
the pages and validating their types is necessary. Note that guest use
of linear page tables is intentionally not taken into account here, as
ordinary data (guest stacks) can't possibly live inside page tables.
This is part of XSA-286.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
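Every level below follows the same read pattern; a condensed sketch for the
L2 step (identifiers as in the hunks below; the actual code bails out early
when the lock cannot be taken, and skips the lock for current's root table,
which cannot go away):

    l2_pgentry_t l2e = l2e_empty();          /* default: treat as not present */
    struct page_info *pg = mfn_to_page(mfn);

    if ( page_lock(pg) )
    {
        /* Only read the entry if the page is still a validated L2 table. */
        if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
        {
            const l2_pgentry_t *l2t = map_domain_page(mfn);

            l2e = l2t[l2_table_offset(addr)];
            unmap_domain_page(l2t);
        }
        page_unlock(pg);
    }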
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index e73daa55e4..1ca9547d68 100644
--- xen/arch/x86/x86_64/mm.c.orig
+++ xen/arch/x86/x86_64/mm.c
@@ -46,15 +46,29 @@ l2_pgentry_t *compat_idle_pg_table_l2;
static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
{
- unsigned long mfn = pagetable_get_pfn(root);
- l4_pgentry_t *l4t, l4e;
+ mfn_t mfn = pagetable_get_mfn(root);
+ /* current's root page table can't disappear under our feet. */
+ bool need_lock = !mfn_eq(mfn, pagetable_get_mfn(current->arch.guest_table));
+ struct page_info *pg;
+ l4_pgentry_t l4e = l4e_empty();
if ( !is_canonical_address(addr) )
return l4e_empty();
- l4t = map_domain_page(_mfn(mfn));
- l4e = l4t[l4_table_offset(addr)];
- unmap_domain_page(l4t);
+ pg = mfn_to_page(mfn);
+ if ( need_lock && !page_lock(pg) )
+ return l4e_empty();
+
+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
+ {
+ l4_pgentry_t *l4t = map_domain_page(mfn);
+
+ l4e = l4t[l4_table_offset(addr)];
+ unmap_domain_page(l4t);
+ }
+
+ if ( need_lock )
+ page_unlock(pg);
return l4e;
}
@@ -62,14 +76,26 @@ static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
{
l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
- l3_pgentry_t *l3t, l3e;
+ mfn_t mfn = l4e_get_mfn(l4e);
+ struct page_info *pg;
+ l3_pgentry_t l3e = l3e_empty();
if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
return l3e_empty();
- l3t = map_l3t_from_l4e(l4e);
- l3e = l3t[l3_table_offset(addr)];
- unmap_domain_page(l3t);
+ pg = mfn_to_page(mfn);
+ if ( !page_lock(pg) )
+ return l3e_empty();
+
+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table )
+ {
+ l3_pgentry_t *l3t = map_domain_page(mfn);
+
+ l3e = l3t[l3_table_offset(addr)];
+ unmap_domain_page(l3t);
+ }
+
+ page_unlock(pg);
return l3e;
}
@@ -77,44 +103,67 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
void *do_page_walk(struct vcpu *v, unsigned long addr)
{
l3_pgentry_t l3e;
- l2_pgentry_t l2e, *l2t;
- l1_pgentry_t l1e, *l1t;
- unsigned long mfn;
+ l2_pgentry_t l2e = l2e_empty();
+ l1_pgentry_t l1e = l1e_empty();
+ mfn_t mfn;
+ struct page_info *pg;
if ( !is_pv_vcpu(v) )
return NULL;
l3e = page_walk_get_l3e(v->arch.guest_table, addr);
- mfn = l3e_get_pfn(l3e);
- if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+ mfn = l3e_get_mfn(l3e);
+ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
return NULL;
if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
{
- mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
+ mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)));
goto ret;
}
- l2t = map_domain_page(_mfn(mfn));
- l2e = l2t[l2_table_offset(addr)];
- unmap_domain_page(l2t);
- mfn = l2e_get_pfn(l2e);
- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+ pg = mfn_to_page(mfn);
+ if ( !page_lock(pg) )
+ return NULL;
+
+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+ {
+ const l2_pgentry_t *l2t = map_domain_page(mfn);
+
+ l2e = l2t[l2_table_offset(addr)];
+ unmap_domain_page(l2t);
+ }
+
+ page_unlock(pg);
+
+ mfn = l2e_get_mfn(l2e);
+ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
return NULL;
if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
{
- mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
+ mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)));
goto ret;
}
- l1t = map_domain_page(_mfn(mfn));
- l1e = l1t[l1_table_offset(addr)];
- unmap_domain_page(l1t);
- mfn = l1e_get_pfn(l1e);
- if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+ pg = mfn_to_page(mfn);
+ if ( !page_lock(pg) )
+ return NULL;
+
+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+ {
+ const l1_pgentry_t *l1t = map_domain_page(mfn);
+
+ l1e = l1t[l1_table_offset(addr)];
+ unmap_domain_page(l1t);
+ }
+
+ page_unlock(pg);
+
+ mfn = l1e_get_mfn(l1e);
+ if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
return NULL;
ret:
- return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK);
+ return map_domain_page(mfn) + (addr & ~PAGE_MASK);
}
/*
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in map_guest_l1e()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Replace the linear L2 table access by an actual page walk.
This is part of XSA-286.
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index 80bf280fb2..ee08c13881 100644
--- xen/arch/x86/pv/mm.c.orig
+++ xen/arch/x86/pv/mm.c
@@ -40,11 +40,14 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
if ( unlikely(!__addr_ok(linear)) )
return NULL;
- /* Find this l1e and its enclosing l1mfn in the linear map. */
- if ( __copy_from_user(&l2e,
- &__linear_l2_table[l2_linear_offset(linear)],
- sizeof(l2_pgentry_t)) )
+ if ( unlikely(!(current->arch.flags & TF_kernel_mode)) )
+ {
+ ASSERT_UNREACHABLE();
return NULL;
+ }
+
+ /* Find this l1e and its enclosing l1mfn. */
+ l2e = page_walk_get_l2e(current->arch.guest_table, linear);
/* Check flags that it will be safe to read the l1e. */
if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 1ca9547d68..dfa33ba894 100644
--- xen/arch/x86/x86_64/mm.c.orig
+++ xen/arch/x86/x86_64/mm.c
@@ -100,6 +100,34 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
return l3e;
}
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
+{
+ l3_pgentry_t l3e = page_walk_get_l3e(root, addr);
+ mfn_t mfn = l3e_get_mfn(l3e);
+ struct page_info *pg;
+ l2_pgentry_t l2e = l2e_empty();
+
+ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
+ (l3e_get_flags(l3e) & _PAGE_PSE) )
+ return l2e_empty();
+
+ pg = mfn_to_page(mfn);
+ if ( !page_lock(pg) )
+ return l2e_empty();
+
+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+ {
+ l2_pgentry_t *l2t = map_domain_page(mfn);
+
+ l2e = l2t[l2_table_offset(addr)];
+ unmap_domain_page(l2t);
+ }
+
+ page_unlock(pg);
+
+ return l2e;
+}
+
void *do_page_walk(struct vcpu *v, unsigned long addr)
{
l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 7825691d06..afafe87fe7 100644
--- xen/include/asm-x86/mm.h.orig
+++ xen/include/asm-x86/mm.h
@@ -585,7 +585,9 @@ void audit_domains(void);
void make_cr3(struct vcpu *v, mfn_t mfn);
void update_cr3(struct vcpu *v);
int vcpu_destroy_pagetables(struct vcpu *);
+
void *do_page_walk(struct vcpu *v, unsigned long addr);
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
int __sync_local_execstate(void);
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in guest_get_eff_kern_l1e()
First of all drop guest_get_eff_l1e() entirely - there's no actual user
of it: pv_ro_page_fault() has a guest_kernel_mode() conditional around
its only call site.
Then replace the linear L1 table access by an actual page walk.
This is part of XSA-286.
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index ee08c13881..c70785d0cf 100644
--- xen/arch/x86/pv/mm.c.orig
+++ xen/arch/x86/pv/mm.c
@@ -59,27 +59,6 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
}
/*
- * Read the guest's l1e that maps this address, from the kernel-mode
- * page tables.
- */
-static l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
-{
- struct vcpu *curr = current;
- const bool user_mode = !(curr->arch.flags & TF_kernel_mode);
- l1_pgentry_t l1e;
-
- if ( user_mode )
- toggle_guest_pt(curr);
-
- l1e = guest_get_eff_l1e(linear);
-
- if ( user_mode )
- toggle_guest_pt(curr);
-
- return l1e;
-}
-
-/*
* Map a guest's LDT page (covering the byte at @offset from start of the LDT)
* into Xen's virtual range. Returns true if the mapping changed, false
* otherwise.
diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h
index 976209ba4c..cc4ee1affb 100644
--- xen/arch/x86/pv/mm.h.orig
+++ xen/arch/x86/pv/mm.h
@@ -5,19 +5,19 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn);
int new_guest_cr3(mfn_t mfn);
-/* Read a PV guest's l1e that maps this linear address. */
-static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear)
+/*
+ * Read the guest's l1e that maps this address, from the kernel-mode
+ * page tables.
+ */
+static inline l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
{
- l1_pgentry_t l1e;
+ l1_pgentry_t l1e = l1e_empty();
ASSERT(!paging_mode_translate(current->domain));
ASSERT(!paging_mode_external(current->domain));
- if ( unlikely(!__addr_ok(linear)) ||
- __copy_from_user(&l1e,
- &__linear_l1_table[l1_linear_offset(linear)],
- sizeof(l1_pgentry_t)) )
- l1e = l1e_empty();
+ if ( likely(__addr_ok(linear)) )
+ l1e = page_walk_get_l1e(current->arch.guest_table, linear);
return l1e;
}
diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
index a3c0c2dd19..c9ee5156f8 100644
--- xen/arch/x86/pv/ro-page-fault.c.orig
+++ xen/arch/x86/pv/ro-page-fault.c
@@ -357,7 +357,7 @@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs)
bool mmio_ro;
/* Attempt to read the PTE that maps the VA being accessed. */
- pte = guest_get_eff_l1e(addr);
+ pte = guest_get_eff_kern_l1e(addr);
/* We are only looking for read-only mappings */
if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index dfa33ba894..cca7ea6e9d 100644
--- xen/arch/x86/x86_64/mm.c.orig
+++ xen/arch/x86/x86_64/mm.c
@@ -128,6 +128,62 @@ l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
return l2e;
}
+/*
+ * For now no "set_accessed" parameter, as all callers want it set to true.
+ * For now also no "set_dirty" parameter, as all callers deal with r/o
+ * mappings, and we don't want to set the dirty bit there (conflicts with
+ * CET-SS). However, as there are CPUs which may set the dirty bit on r/o
+ * PTEs, the logic below tolerates the bit becoming set "behind our backs".
+ */
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr)
+{
+ l2_pgentry_t l2e = page_walk_get_l2e(root, addr);
+ mfn_t mfn = l2e_get_mfn(l2e);
+ struct page_info *pg;
+ l1_pgentry_t l1e = l1e_empty();
+
+ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
+ (l2e_get_flags(l2e) & _PAGE_PSE) )
+ return l1e_empty();
+
+ pg = mfn_to_page(mfn);
+ if ( !page_lock(pg) )
+ return l1e_empty();
+
+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+ {
+ l1_pgentry_t *l1t = map_domain_page(mfn);
+
+ l1e = l1t[l1_table_offset(addr)];
+
+ if ( (l1e_get_flags(l1e) & (_PAGE_ACCESSED | _PAGE_PRESENT)) ==
+ _PAGE_PRESENT )
+ {
+ l1_pgentry_t ol1e = l1e;
+
+ l1e_add_flags(l1e, _PAGE_ACCESSED);
+ /*
+ * Best effort only; with the lock held the page shouldn't
+ * change anyway, except for the dirty bit to perhaps become set.
+ */
+ while ( cmpxchg(&l1e_get_intpte(l1t[l1_table_offset(addr)]),
+ l1e_get_intpte(ol1e), l1e_get_intpte(l1e)) !=
+ l1e_get_intpte(ol1e) &&
+ !(l1e_get_flags(l1e) & _PAGE_DIRTY) )
+ {
+ l1e_add_flags(ol1e, _PAGE_DIRTY);
+ l1e_add_flags(l1e, _PAGE_DIRTY);
+ }
+ }
+
+ unmap_domain_page(l1t);
+ }
+
+ page_unlock(pg);
+
+ return l1e;
+}
+
void *do_page_walk(struct vcpu *v, unsigned long addr)
{
l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index afafe87fe7..423313ae3a 100644
--- xen/include/asm-x86/mm.h.orig
+++ xen/include/asm-x86/mm.h
@@ -588,6 +588,7 @@ int vcpu_destroy_pagetables(struct vcpu *);
void *do_page_walk(struct vcpu *v, unsigned long addr);
l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr);
int __sync_local_execstate(void);
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using top level linear page tables in
{,un}map_domain_page()
Move the page table recursion two levels down. This entails not freeing
the recursive mapping prematurely in free_perdomain_mappings().
This is part of XSA-286.
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
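Rough sketch of how MAPCACHE_L1ENT(idx) resolves after this change, assuming
4 KiB pages and 512-entry L2 tables covering 2 MiB each (the BUILD_BUG_ON
added below checks the 1 GiB alignment this relies on):

    /*
     * window = MAPCACHE_VIRT_START | (511 << L2_PAGETABLE_SHIFT);
     * &MAPCACHE_L1ENT(idx) == (l1_pgentry_t *)window + idx
     *
     * L2 slot 511 of the mapcache's L2 table is made to point back at that
     * L2 table itself, so accesses within this last 2 MiB window land in the
     * mapcache's L1 tables, exposing their entries as one flat l1_pgentry_t
     * array indexed by mapcache slot; the L4-level linear mapping is no
     * longer involved in the lookup.
     */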
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 0c24530ed9..d89fa27f8e 100644
--- xen/arch/x86/domain_page.c.orig
+++ xen/arch/x86/domain_page.c
@@ -65,7 +65,8 @@ void __init mapcache_override_current(struct vcpu *v)
#define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
#define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1)
#define MAPCACHE_L1ENT(idx) \
- __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))]
+ ((l1_pgentry_t *)(MAPCACHE_VIRT_START | \
+ ((L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)))[idx]
void *map_domain_page(mfn_t mfn)
{
@@ -235,6 +236,7 @@ int mapcache_domain_init(struct domain *d)
{
struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache;
unsigned int bitmap_pages;
+ int rc;
ASSERT(is_pv_domain(d));
@@ -243,8 +245,10 @@ int mapcache_domain_init(struct domain *d)
return 0;
#endif
+ BUILD_BUG_ON(MAPCACHE_VIRT_START & ((1 << L3_PAGETABLE_SHIFT) - 1));
BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
- 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
+ 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) +
+ (1U << L2_PAGETABLE_SHIFT) >
MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
@@ -253,9 +257,25 @@ int mapcache_domain_init(struct domain *d)
spin_lock_init(&dcache->lock);
- return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
- 2 * bitmap_pages + 1,
- NIL(l1_pgentry_t *), NULL);
+ rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
+ 2 * bitmap_pages + 1,
+ NIL(l1_pgentry_t *), NULL);
+ if ( !rc )
+ {
+ /*
+ * Install mapping of our L2 table into its own last slot, for easy
+ * access to the L1 entries via MAPCACHE_L1ENT().
+ */
+ l3_pgentry_t *l3t = __map_domain_page(d->arch.perdomain_l3_pg);
+ l3_pgentry_t l3e = l3t[l3_table_offset(MAPCACHE_VIRT_END)];
+ l2_pgentry_t *l2t = map_l2t_from_l3e(l3e);
+
+ l2e_get_intpte(l2t[L2_PAGETABLE_ENTRIES - 1]) = l3e_get_intpte(l3e);
+ unmap_domain_page(l2t);
+ unmap_domain_page(l3t);
+ }
+
+ return rc;
}
int mapcache_vcpu_init(struct vcpu *v)
@@ -346,7 +366,7 @@ mfn_t domain_page_map_to_mfn(const void *ptr)
else
{
ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
- pl1e = &__linear_l1_table[l1_linear_offset(va)];
+ pl1e = &MAPCACHE_L1ENT(PFN_DOWN(va - MAPCACHE_VIRT_START));
}
return l1e_get_mfn(*pl1e);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 626768a950..8f975a747d 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -6038,6 +6038,10 @@ void free_perdomain_mappings(struct domain *d)
{
struct page_info *l1pg = l2e_get_page(l2tab[j]);
+ /* mapcache_domain_init() installs a recursive entry. */
+ if ( l1pg == l2pg )
+ continue;
+
if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 )
{
l1_pgentry_t *l1tab = __map_domain_page(l1pg);
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: restrict use of linear page tables to shadow mode code
Other code does not require them to be set up anymore, so restrict when
to populate the respective L4 slot and reduce visibility of the
accessors.
While the removal of all uses actually fixes the vulnerability, removing
the creation of the linear mapping adds an extra layer of protection.
Similarly, reducing visibility of the accessors mostly eliminates the
risk of undue re-introduction of uses of the linear mappings.
This is (not strictly) part of XSA-286.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 8f975a747d..10175764e8 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -1755,9 +1755,10 @@ void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn,
l4t[l4_table_offset(PCI_MCFG_VIRT_START)] =
idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)];
- /* Slot 258: Self linear mappings. */
+ /* Slot 258: Self linear mappings (shadow pt only). */
ASSERT(!mfn_eq(l4mfn, INVALID_MFN));
l4t[l4_table_offset(LINEAR_PT_VIRT_START)] =
+ !shadow_mode_external(d) ? l4e_empty() :
l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW);
/* Slot 259: Shadow linear mappings (if applicable) .*/
diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h
index c7fa18925b..1933a6a2a2 100644
--- xen/arch/x86/mm/shadow/private.h.orig
+++ xen/arch/x86/mm/shadow/private.h
@@ -137,6 +137,15 @@ enum {
# define GUEST_PTE_SIZE 4
#endif
+/* Where to find each level of the linear mapping */
+#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
+#define __linear_l2_table \
+ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l3_table \
+ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l4_table \
+ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
+
/******************************************************************************
* Auditing routines
*/
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index cca7ea6e9d..d7551e594a 100644
--- xen/arch/x86/x86_64/mm.c.orig
+++ xen/arch/x86/x86_64/mm.c
@@ -833,9 +833,6 @@ void __init paging_init(void)
machine_to_phys_mapping_valid = 1;
- /* Set up linear page table mapping. */
- l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
- l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW));
return;
nomem:
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index 9ef9d03ca7..4670ab99f6 100644
--- xen/include/asm-x86/config.h.orig
+++ xen/include/asm-x86/config.h
@@ -193,7 +193,7 @@ extern unsigned char boot_edid_info[128];
*/
#define PCI_MCFG_VIRT_START (PML4_ADDR(257))
#define PCI_MCFG_VIRT_END (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES)
-/* Slot 258: linear page table (guest table). */
+/* Slot 258: linear page table (monitor table, HVM only). */
#define LINEAR_PT_VIRT_START (PML4_ADDR(258))
#define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
/* Slot 259: linear page table (shadow table). */
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index c1e92937c0..e72c277b9f 100644
--- xen/include/asm-x86/page.h.orig
+++ xen/include/asm-x86/page.h
@@ -274,19 +274,6 @@ void copy_page_sse2(void *, const void *);
#define vmap_to_mfn(va) _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va))))
#define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va))
-#endif /* !defined(__ASSEMBLY__) */
-
-/* Where to find each level of the linear mapping */
-#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table \
- ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l3_table \
- ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l4_table \
- ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
-
-
-#ifndef __ASSEMBLY__
extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES];
extern l2_pgentry_t *compat_idle_pg_table_l2;
extern unsigned int m2p_compat_vstart;
From 1d021db3c8712d25e25f078833baa160c90f260f Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Thu, 22 Oct 2020 11:28:58 +0100
Subject: [PATCH 1/2] x86/pv: Drop FLUSH_TLB_GLOBAL in do_mmu_update() for XPTI
c/s 9d1d31ad9498 "x86: slightly reduce Meltdown band-aid overhead" removed the
use of Global TLB flushes on the Xen entry path, but added a FLUSH_TLB_GLOBAL
to the L4 path in do_mmu_update().
However, this was unnecessary.
It is the guest's responsibility to perform appropriate TLB flushing if the L4
modification altered an established mapping in a flush-relevant way. In this
case, an MMUEXT_OP hypercall will follow. The case which Xen needs to cover
is when new mappings are created, and the resync on the exit-to-guest path
covers this correctly.
There is a corner case with multiple vCPUs in hypercalls at the same time,
which 9d1d31ad9498 changed, and this patch changes back to its original XPTI
behaviour.
Architecturally, established TLB entries can continue to be used until the
broadcast flush has completed. Therefore, even with concurrent hypercalls,
the guest cannot depend on older mappings not being used until an MMUEXT_OP
hypercall completes. Xen's implementation of guest-initiated flushes will
take correct effect on top of an in-progress hypercall, picking up new
mappings established before the other vCPU's MMUEXT_OP completes.
Note: The correctness of this change is not impacted by whether XPTI uses
global mappings or not. Correctness there depends on the behaviour of Xen on
the entry/exit paths when switching to/from the XPTI "shadow" pagetables.
This is (not really) XSA-286 (but necessary to simplify the logic).
Fixes: 9d1d31ad9498 ("x86: slightly reduce Meltdown band-aid overhead")
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit 055e1c3a3d95b1e753148369fbc4ba48782dd602)
---
xen/arch/x86/mm.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 5ca5c8c9a2..129da1e648 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -4279,7 +4279,7 @@ long do_mmu_update(
cpumask_andnot(mask, pt_owner->dirty_cpumask, cpumask_of(cpu));
if ( !cpumask_empty(mask) )
- flush_mask(mask, FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL);
+ flush_mask(mask, FLUSH_ROOT_PGTBL);
}
perfc_add(num_page_updates, i);
--
2.20.1
From e274c8bdc12eb596e55233040e8b49da27150f31 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 19 Oct 2020 15:51:22 +0100
Subject: [PATCH 2/2] x86/pv: Flush TLB in response to paging structure changes
With MMU_UPDATE, a PV guest can make changes to higher level pagetables. This
is safe from Xen's point of view (as the update only affects guest mappings),
and the guest is required to flush (if necessary) after making updates.
However, Xen's use of linear pagetables (UPDATE_VA_MAPPING, GNTTABOP_map,
writeable pagetables, etc.) is an implementation detail outside of the
API/ABI.
Changes in the paging structure require invalidations in the linear pagetable
range for subsequent accesses into the linear pagetables to access non-stale
mappings. Xen must provide suitable flushing to prevent intermixed guest
actions from accidentally accessing/modifying the wrong pagetable.
For all L2 and higher modifications, flush the TLB. PV guests cannot create
L2 or higher entries with the Global bit set, so no mappings established in
the linear range can be global. (This could in principle be an order 39 flush
starting at LINEAR_PT_VIRT_START, but no such mechanism exists in practice.)
Express the necessary flushes as a set of booleans which accumulate across the
operation. Comment the flushing logic extensively.
This is XSA-286.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit 16a20963b3209788f2c0d3a3eebb7d92f03f5883)
---
xen/arch/x86/mm.c | 69 ++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 59 insertions(+), 10 deletions(-)
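Condensed view of the flush decisions accumulated below (this only restates
the comments added in the hunk; see the diff for the authoritative logic):

    /*
     * flush_linear_pt      : any successful L2/L3/L4 modification
     *                        (linear pagetable range may now be stale)
     * flush_root_pt_local  : modified L4 is the local CPU's active root
     * flush_root_pt_others : modified L4 may be in use on other CPUs
     *
     * local CPU (if dirty):  flush_local(FLUSH_TLB |
     *                            (flush_root_pt_local ? FLUSH_ROOT_PGTBL : 0));
     * other dirty CPUs:      flush_mask(mask, FLUSH_TLB |
     *                            (flush_root_pt_others ? FLUSH_ROOT_PGTBL : 0));
     */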
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 129da1e648..3528cf6b85 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -3983,7 +3983,8 @@ long do_mmu_update(
struct vcpu *curr = current, *v = curr;
struct domain *d = v->domain, *pt_owner = d, *pg_owner;
mfn_t map_mfn = INVALID_MFN;
- bool sync_guest = false;
+ bool flush_linear_pt = false, flush_root_pt_local = false,
+ flush_root_pt_others = false;
uint32_t xsm_needed = 0;
uint32_t xsm_checked = 0;
int rc = put_old_guest_table(curr);
@@ -4133,6 +4134,8 @@ long do_mmu_update(
break;
rc = mod_l2_entry(va, l2e_from_intpte(req.val), mfn,
cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
+ if ( !rc )
+ flush_linear_pt = true;
break;
case PGT_l3_page_table:
@@ -4140,6 +4143,8 @@ long do_mmu_update(
break;
rc = mod_l3_entry(va, l3e_from_intpte(req.val), mfn,
cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
+ if ( !rc )
+ flush_linear_pt = true;
break;
case PGT_l4_page_table:
@@ -4147,6 +4152,8 @@ long do_mmu_update(
break;
rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
+ if ( !rc )
+ flush_linear_pt = true;
if ( !rc && pt_owner->arch.pv_domain.xpti )
{
bool local_in_use = false;
@@ -4154,7 +4161,7 @@ long do_mmu_update(
if ( pagetable_get_pfn(curr->arch.guest_table) == mfn )
{
local_in_use = true;
- get_cpu_info()->root_pgt_changed = true;
+ flush_root_pt_local = true;
}
/*
@@ -4166,7 +4173,7 @@ long do_mmu_update(
(1 + !!(page->u.inuse.type_info & PGT_pinned) +
(pagetable_get_pfn(curr->arch.guest_table_user) ==
mfn) + local_in_use) )
- sync_guest = true;
+ flush_root_pt_others = true;
}
break;
@@ -4268,19 +4275,61 @@ long do_mmu_update(
if ( va )
unmap_domain_page(va);
- if ( sync_guest )
+ /*
+ * Perform required TLB maintenance.
+ *
+ * This logic currently depend on flush_linear_pt being a superset of the
+ * flush_root_pt_* conditions.
+ *
+ * pt_owner may not be current->domain. This may occur during
+ * construction of 32bit PV guests, or debugging of PV guests. The
+ * behaviour cannot be correct with domain unpaused. We therefore expect
+ * pt_owner->dirty_cpumask to be empty, but it is a waste of effort to
+ * explicitly check for, and exclude, this corner case.
+ *
+ * flush_linear_pt requires a FLUSH_TLB to all dirty CPUs. The flush must
+ * be performed now to maintain correct behaviour across a multicall.
+ * i.e. we cannot relax FLUSH_TLB to FLUSH_ROOT_PGTBL, given that the
+ * former is a side effect of the latter, because the resync (which is in
+ * the return-to-guest path) happens too late.
+ *
+ * flush_root_pt_* requires FLUSH_ROOT_PGTBL on either the local CPU
+ * (implies pt_owner == current->domain and current->processor set in
+ * pt_owner->dirty_cpumask), and/or all *other* dirty CPUs as there are
+ * references we can't account for locally.
+ */
+ if ( flush_linear_pt /* || flush_root_pt_local || flush_root_pt_others */ )
{
+ unsigned int cpu = smp_processor_id();
+ cpumask_t *mask = pt_owner->dirty_cpumask;
+
/*
- * Force other vCPU-s of the affected guest to pick up L4 entry
- * changes (if any).
+ * Always handle local flushing separately (if applicable), to
+ * separate the flush invocations appropriately for scope of the two
+ * flush_root_pt_* variables.
*/
- unsigned int cpu = smp_processor_id();
- cpumask_t *mask = per_cpu(scratch_cpumask, cpu);
+ if ( likely(cpumask_test_cpu(cpu, mask)) )
+ {
+ mask = per_cpu(scratch_cpumask, cpu);
- cpumask_andnot(mask, pt_owner->dirty_cpumask, cpumask_of(cpu));
+ cpumask_copy(mask, pt_owner->dirty_cpumask);
+ __cpumask_clear_cpu(cpu, mask);
+
+ flush_local(FLUSH_TLB |
+ (flush_root_pt_local ? FLUSH_ROOT_PGTBL : 0));
+ }
+ else
+ /* Sanity check. flush_root_pt_local implies local cpu is dirty. */
+ ASSERT(!flush_root_pt_local);
+
+ /* Flush the remote dirty CPUs. Does not include the local CPU. */
if ( !cpumask_empty(mask) )
- flush_mask(mask, FLUSH_ROOT_PGTBL);
+ flush_mask(mask, FLUSH_TLB |
+ (flush_root_pt_others ? FLUSH_ROOT_PGTBL : 0));
}
+ else
+ /* Sanity check. flush_root_pt_* implies flush_linear_pt. */
+ ASSERT(!flush_root_pt_local && !flush_root_pt_others);
perfc_add(num_page_updates, i);
--
2.20.1