Merge tag 'kvm-4.20-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Radim Krčmář:
 "ARM:
   - Improved guest IPA space support (32 to 52 bits)

   - RAS event delivery for 32bit

   - PMU fixes

   - Guest entry hardening

   - Various cleanups

   - Port of dirty_log_test selftest

  PPC:
   - Nested HV KVM support for radix guests on POWER9. The performance
     is much better than with PR KVM. Migration and arbitrary levels of
     nesting are supported.

   - Disable nested HV-KVM on early POWER9 chips that need a particular
     hardware bug workaround

   - One VM per core mode to prevent potential data leaks

   - PCI pass-through optimization

   - merge ppc-kvm topic branch and kvm-ppc-fixes to get a better base

  s390:
   - Initial version of AP crypto virtualization via vfio-mdev

   - Improvement for vfio-ap

   - Set the host program identifier

   - Optimize page table locking

  x86:
   - Enable nested virtualization by default

   - Implement Hyper-V IPI hypercalls

   - Improve #PF and #DB handling

   - Allow guests to use Enlightened VMCS

   - Add migration selftests for VMCS and Enlightened VMCS

   - Allow coalesced PIO accesses

   - Add an option to perform nested VMCS host state consistency check
     through hardware

   - Automatic tuning of lapic_timer_advance_ns

   - Many fixes, minor improvements, and cleanups"

* tag 'kvm-4.20-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (204 commits)
  KVM/nVMX: Do not validate that posted_intr_desc_addr is page aligned
  Revert "kvm: x86: optimize dr6 restore"
  KVM: PPC: Optimize clearing TCEs for sparse tables
  x86/kvm/nVMX: tweak shadow fields
  selftests/kvm: add missing executables to .gitignore
  KVM: arm64: Safety check PSTATE when entering guest and handle IL
  KVM: PPC: Book3S HV: Don't use streamlined entry path on early POWER9 chips
  arm/arm64: KVM: Enable 32 bits kvm vcpu events support
  arm/arm64: KVM: Rename function kvm_arch_dev_ioctl_check_extension()
  KVM: arm64: Fix caching of host MDCR_EL2 value
  KVM: VMX: enable nested virtualization by default
  KVM/x86: Use 32bit xor to clear registers in svm.c
  kvm: x86: Introduce KVM_CAP_EXCEPTION_PAYLOAD
  kvm: vmx: Defer setting of DR6 until #DB delivery
  kvm: x86: Defer setting of CR2 until #PF delivery
  kvm: x86: Add payload operands to kvm_multiple_exception
  kvm: x86: Add exception payload fields to kvm_vcpu_events
  kvm: x86: Add has_payload and payload to kvm_queued_exception
  KVM: Documentation: Fix omission in struct kvm_vcpu_events
  KVM: selftests: add Enlightened VMCS test
  ...
Linus Torvalds, commit 0d1e8b8d2b
100 changed files with 9517 additions and 2422 deletions
  1. Documentation/s390/vfio-ap.txt (+837 -0)
  2. Documentation/virtual/kvm/api.txt (+129 -6)
  3. MAINTAINERS (+12 -0)
  4. arch/arm/include/asm/kvm_arm.h (+1 -2)
  5. arch/arm/include/asm/kvm_host.h (+12 -1)
  6. arch/arm/include/asm/kvm_mmu.h (+10 -5)
  7. arch/arm/include/asm/stage2_pgtable.h (+32 -22)
  8. arch/arm64/include/asm/cpufeature.h (+21 -0)
  9. arch/arm64/include/asm/kvm_arm.h (+125 -30)
  10. arch/arm64/include/asm/kvm_asm.h (+1 -2)
  11. arch/arm64/include/asm/kvm_host.h (+9 -9)
  12. arch/arm64/include/asm/kvm_hyp.h (+10 -0)
  13. arch/arm64/include/asm/kvm_mmu.h (+34 -8)
  14. arch/arm64/include/asm/ptrace.h (+3 -0)
  15. arch/arm64/include/asm/stage2_pgtable-nopmd.h (+0 -42)
  16. arch/arm64/include/asm/stage2_pgtable-nopud.h (+0 -39)
  17. arch/arm64/include/asm/stage2_pgtable.h (+170 -66)
  18. arch/arm64/kvm/guest.c (+3 -3)
  19. arch/arm64/kvm/handle_exit.c (+7 -0)
  20. arch/arm64/kvm/hyp/Makefile (+0 -1)
  21. arch/arm64/kvm/hyp/hyp-entry.S (+15 -1)
  22. arch/arm64/kvm/hyp/s2-setup.c (+0 -90)
  23. arch/arm64/kvm/hyp/switch.c (+2 -2)
  24. arch/arm64/kvm/hyp/sysreg-sr.c (+18 -1)
  25. arch/arm64/kvm/hyp/tlb.c (+2 -2)
  26. arch/arm64/kvm/reset.c (+105 -3)
  27. arch/powerpc/include/asm/asm-prototypes.h (+21 -0)
  28. arch/powerpc/include/asm/book3s/64/mmu-hash.h (+12 -0)
  29. arch/powerpc/include/asm/book3s/64/tlbflush-radix.h (+1 -0)
  30. arch/powerpc/include/asm/hvcall.h (+41 -0)
  31. arch/powerpc/include/asm/iommu.h (+1 -1)
  32. arch/powerpc/include/asm/kvm_asm.h (+1 -3)
  33. arch/powerpc/include/asm/kvm_book3s.h (+40 -5)
  34. arch/powerpc/include/asm/kvm_book3s_64.h (+116 -2)
  35. arch/powerpc/include/asm/kvm_book3s_asm.h (+3 -0)
  36. arch/powerpc/include/asm/kvm_booke.h (+2 -2)
  37. arch/powerpc/include/asm/kvm_host.h (+14 -2)
  38. arch/powerpc/include/asm/kvm_ppc.h (+5 -3)
  39. arch/powerpc/include/asm/ppc-opcode.h (+1 -0)
  40. arch/powerpc/include/asm/reg.h (+2 -0)
  41. arch/powerpc/include/uapi/asm/kvm.h (+1 -0)
  42. arch/powerpc/kernel/asm-offsets.c (+3 -2)
  43. arch/powerpc/kernel/cpu_setup_power.S (+2 -2)
  44. arch/powerpc/kvm/Makefile (+2 -1)
  45. arch/powerpc/kvm/book3s.c (+36 -10)
  46. arch/powerpc/kvm/book3s_64_mmu_hv.c (+3 -4)
  47. arch/powerpc/kvm/book3s_64_mmu_radix.c (+536 -182)
  48. arch/powerpc/kvm/book3s_64_vio.c (+67 -27)
  49. arch/powerpc/kvm/book3s_64_vio_hv.c (+49 -38)
  50. arch/powerpc/kvm/book3s_emulate.c (+6 -7)
  51. arch/powerpc/kvm/book3s_hv.c (+818 -55)
  52. arch/powerpc/kvm/book3s_hv_builtin.c (+84 -8)
  53. arch/powerpc/kvm/book3s_hv_interrupts.S (+49 -46)
  54. arch/powerpc/kvm/book3s_hv_nested.c (+1291 -0)
  55. arch/powerpc/kvm/book3s_hv_ras.c (+10 -0)
  56. arch/powerpc/kvm/book3s_hv_rm_xics.c (+10 -3)
  57. arch/powerpc/kvm/book3s_hv_rmhandlers.S (+460 -363)
  58. arch/powerpc/kvm/book3s_hv_tm.c (+3 -3)
  59. arch/powerpc/kvm/book3s_hv_tm_builtin.c (+3 -2)
  60. arch/powerpc/kvm/book3s_pr.c (+2 -3)
  61. arch/powerpc/kvm/book3s_xics.c (+6 -8)
  62. arch/powerpc/kvm/book3s_xive.c (+63 -0)
  63. arch/powerpc/kvm/book3s_xive_template.c (+0 -8)
  64. arch/powerpc/kvm/bookehv_interrupts.S (+4 -4)
  65. arch/powerpc/kvm/emulate_loadstore.c (+0 -1)
  66. arch/powerpc/kvm/powerpc.c (+14 -1)
  67. arch/powerpc/kvm/tm.S (+136 -114)
  68. arch/powerpc/kvm/trace_book3s.h (+0 -1)
  69. arch/powerpc/mm/tlb-radix.c (+9 -0)
  70. arch/s390/Kconfig (+11 -0)
  71. arch/s390/include/asm/kvm_host.h (+14 -1)
  72. arch/s390/include/uapi/asm/kvm.h (+2 -0)
  73. arch/s390/kvm/kvm-s390.c (+142 -42)
  74. arch/s390/kvm/kvm-s390.h (+1 -0)
  75. arch/s390/kvm/vsie.c (+192 -18)
  76. arch/s390/mm/gmap.c (+8 -2)
  77. arch/s390/tools/gen_facilities.c (+2 -0)
  78. arch/x86/include/asm/kvm_host.h (+61 -9)
  79. arch/x86/include/asm/virtext.h (+1 -1)
  80. arch/x86/include/asm/vmx.h (+0 -13)
  81. arch/x86/include/uapi/asm/kvm.h (+6 -2)
  82. arch/x86/kvm/hyperv.c (+212 -68)
  83. arch/x86/kvm/hyperv.h (+4 -0)
  84. arch/x86/kvm/lapic.c (+38 -7)
  85. arch/x86/kvm/lapic.h (+1 -1)
  86. arch/x86/kvm/mmu.c (+235 -158)
  87. arch/x86/kvm/mmu.h (+4 -9)
  88. arch/x86/kvm/mmu_audit.c (+6 -6)
  89. arch/x86/kvm/paging_tmpl.h (+8 -7)
  90. arch/x86/kvm/svm.c (+35 -29)
  91. arch/x86/kvm/trace.h (+42 -0)
  92. arch/x86/kvm/vmx.c (+1614 -749)
  93. arch/x86/kvm/vmx_shadow_fields.h (+1 -4)
  94. arch/x86/kvm/x86.c (+194 -50)
  95. arch/x86/kvm/x86.h (+2 -0)
  96. drivers/iommu/Kconfig (+8 -0)
  97. drivers/s390/crypto/Makefile (+4 -0)
  98. drivers/s390/crypto/vfio_ap_drv.c (+157 -0)
  99. drivers/s390/crypto/vfio_ap_ops.c (+939 -0)
  100. drivers/s390/crypto/vfio_ap_private.h (+88 -0)

+ 837 - 0
Documentation/s390/vfio-ap.txt

@@ -0,0 +1,837 @@
+Introduction:
+============
+The Adjunct Processor (AP) facility is an IBM Z cryptographic facility comprised
+of three AP instructions and from 1 up to 256 PCIe cryptographic adapter cards.
+The AP devices provide cryptographic functions to all CPUs assigned to a
+linux system running in an IBM Z system LPAR.
+
+The AP adapter cards are exposed via the AP bus. The motivation for vfio-ap
+is to make AP cards available to KVM guests using the VFIO mediated device
+framework. This implementation relies considerably on the s390 virtualization
+facilities which do most of the hard work of providing direct access to AP
+devices.
+
+AP Architectural Overview:
+=========================
+To facilitate the comprehension of the design, let's start with some
+definitions:
+
+* AP adapter
+
+  An AP adapter is an IBM Z adapter card that can perform cryptographic
+  functions. There can be from 0 to 256 adapters assigned to an LPAR. Adapters
+  assigned to the LPAR in which a linux host is running will be available to
+  the linux host. Each adapter is identified by a number from 0 to 255; however,
+  the maximum adapter number is determined by machine model and/or adapter type.
+  When installed, an AP adapter is accessed by AP instructions executed by any
+  CPU.
+
+  The AP adapter cards are assigned to a given LPAR via the system's Activation
+  Profile which can be edited via the HMC. When the linux host system is IPL'd
+  in the LPAR, the AP bus detects the AP adapter cards assigned to the LPAR and
+  creates a sysfs device for each assigned adapter. For example, if AP adapters
+  4 and 10 (0x0a) are assigned to the LPAR, the AP bus will create the following
+  sysfs device entries:
+
+    /sys/devices/ap/card04
+    /sys/devices/ap/card0a
+
+  Symbolic links to these devices will also be created in the AP bus devices
+  sub-directory:
+
+    /sys/bus/ap/devices/[card04]
+    /sys/bus/ap/devices/[card0a]
+
+* AP domain
+
+  An adapter is partitioned into domains. An adapter can hold up to 256 domains
+  depending upon the adapter type and hardware configuration. A domain is
+  identified by a number from 0 to 255; however, the maximum domain number is
+  determined by machine model and/or adapter type. A domain can be thought of
+  as a set of hardware registers and memory used for processing AP commands. A
+  domain can be configured with a secure private key used for clear key
+  encryption. A domain is classified in one of two ways depending upon how it
+  may be accessed:
+
+    * Usage domains are domains that are targeted by an AP instruction to
+      process an AP command.
+
+    * Control domains are domains that are changed by an AP command sent to a
+      usage domain; for example, to set the secure private key for the control
+      domain.
+
+  The AP usage and control domains are assigned to a given LPAR via the system's
+  Activation Profile which can be edited via the HMC. When a linux host system
+  is IPL'd in the LPAR, the AP bus module detects the AP usage and control
+  domains assigned to the LPAR. The domain number of each usage domain and
+  adapter number of each AP adapter are combined to create AP queue devices
+  (see AP Queue section below). The domain number of each control domain will be
+  represented in a bitmask and stored in a sysfs file
+  /sys/bus/ap/ap_control_domain_mask. The bits in the mask, from most to least
+  significant bit, correspond to domains 0-255.
+
+* AP Queue
+
+  An AP queue is the means by which an AP command is sent to a usage domain
+  inside a specific adapter. An AP queue is identified by a tuple
+  comprised of an AP adapter ID (APID) and an AP queue index (APQI). The
+  APQI corresponds to a given usage domain number within the adapter. This tuple
+  forms an AP Queue Number (APQN) uniquely identifying an AP queue. AP
+  instructions include a field containing the APQN to identify the AP queue to
+  which the AP command is to be sent for processing.
+
+  The AP bus will create a sysfs device for each APQN that can be derived from
+  the cross product of the AP adapter and usage domain numbers detected when the
+  AP bus module is loaded. For example, if adapters 4 and 10 (0x0a) and usage
+  domains 6 and 71 (0x47) are assigned to the LPAR, the AP bus will create the
+  following sysfs entries:
+
+    /sys/devices/ap/card04/04.0006
+    /sys/devices/ap/card04/04.0047
+    /sys/devices/ap/card0a/0a.0006
+    /sys/devices/ap/card0a/0a.0047
+
+  The following symbolic links to these devices will be created in the AP bus
+  devices subdirectory:
+
+    /sys/bus/ap/devices/[04.0006]
+    /sys/bus/ap/devices/[04.0047]
+    /sys/bus/ap/devices/[0a.0006]
+    /sys/bus/ap/devices/[0a.0047]
+
+* AP Instructions:
+
+  There are three AP instructions:
+
+  * NQAP: to enqueue an AP command-request message to a queue
+  * DQAP: to dequeue an AP command-reply message from a queue
+  * PQAP: to administer the queues
+
+  AP instructions identify the domain that is targeted to process the AP
+  command; this must be one of the usage domains. An AP command may modify a
+  domain that is not one of the usage domains, but the modified domain
+  must be one of the control domains.
+
+AP and SIE:
+==========
+Let's now take a look at how AP instructions executed on a guest are interpreted
+by the hardware.
+
+A satellite control block called the Crypto Control Block (CRYCB) is attached to
+our main hardware virtualization control block. The CRYCB contains three fields
+to identify the adapters, usage domains and control domains assigned to the KVM
+guest:
+
+* The AP Mask (APM) field is a bit mask that identifies the AP adapters assigned
+  to the KVM guest. Each bit in the mask, from left to right (i.e. from most
+  significant to least significant bit in big endian order), corresponds to
+  an APID from 0-255. If a bit is set, the corresponding adapter is valid for
+  use by the KVM guest.
+
+* The AP Queue Mask (AQM) field is a bit mask identifying the AP usage domains
+  assigned to the KVM guest. Each bit in the mask, from left to right (i.e. from
+  most significant to least significant bit in big endian order), corresponds to
+  an AP queue index (APQI) from 0-255. If a bit is set, the corresponding queue
+  is valid for use by the KVM guest.
+
+* The AP Domain Mask field is a bit mask that identifies the AP control domains
+  assigned to the KVM guest. The ADM bit mask controls which domains can be
+  changed by an AP command-request message sent to a usage domain from the
+  guest. Each bit in the mask, from left to right (i.e. from most significant to
+  least significant bit in big endian order), corresponds to a domain from
+  0-255. If a bit is set, the corresponding domain can be modified by an AP
+  command-request message sent to a usage domain.
+
+If you recall from the description of an AP Queue, AP instructions include
+an APQN to identify the AP queue to which an AP command-request message is to be
+sent (NQAP and PQAP instructions), or from which a command-reply message is to
+be received (DQAP instruction). The validity of an APQN is defined by the matrix
+calculated from the APM and AQM; it is the cross product of all assigned adapter
+numbers (APM) with all assigned queue indexes (AQM). For example, if adapters 1
+and 2 and usage domains 5 and 6 are assigned to a guest, the APQNs (1,5), (1,6),
+(2,5) and (2,6) will be valid for the guest.
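+
+For illustration, a minimal C sketch of this cross-product calculation (the
+helper names are hypothetical, not part of the driver; bit 0 is the most
+significant bit of the first byte, per the numbering above):
+
+    #include <stdio.h>
+
+    /* Test bit 'nr' in a 256-bit mask stored as 32 bytes, MSB-first. */
+    static int ap_test_bit(const unsigned char *mask, int nr)
+    {
+            return (mask[nr / 8] >> (7 - (nr % 8))) & 1;
+    }
+
+    /* Print every APQN (APID, APQI) made valid by an APM/AQM pair. */
+    static void list_valid_apqns(const unsigned char apm[32],
+                                 const unsigned char aqm[32])
+    {
+            int apid, apqi;
+
+            for (apid = 0; apid < 256; apid++) {
+                    if (!ap_test_bit(apm, apid))
+                            continue;
+                    for (apqi = 0; apqi < 256; apqi++)
+                            if (ap_test_bit(aqm, apqi))
+                                    printf("APQN (%d,%d)\n", apid, apqi);
+            }
+    }
+
+    int main(void)
+    {
+            unsigned char apm[32] = { 0 }, aqm[32] = { 0 };
+
+            /* Guest1 from Example 1 below: adapters 1,2 and domains 5,6. */
+            apm[0] = 0x60;          /* bits 1 and 2 */
+            aqm[0] = 0x06;          /* bits 5 and 6 */
+            list_valid_apqns(apm, aqm);
+            return 0;
+    }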
+
+The APQNs can provide secure key functionality - i.e., a private key is stored
+on the adapter card for each of its domains - so each APQN must be assigned to
+at most one guest or to the linux host.
+
+   Example 1: Valid configuration:
+   ------------------------------
+   Guest1: adapters 1,2  domains 5,6
+   Guest2: adapters 1,2  domain 7
+
+   This is valid because both guests have a unique set of APQNs:
+      Guest1 has APQNs (1,5), (1,6), (2,5), (2,6);
+      Guest2 has APQNs (1,7), (2,7)
+
+   Example 2: Valid configuration:
+   ------------------------------
+   Guest1: adapters 1,2 domains 5,6
+   Guest2: adapters 3,4 domains 5,6
+
+   This is also valid because both guests have a unique set of APQNs:
+      Guest1 has APQNs (1,5), (1,6), (2,5), (2,6);
+      Guest2 has APQNs (3,5), (3,6), (4,5), (4,6)
+
+   Example 3: Invalid configuration:
+   --------------------------------
+   Guest1: adapters 1,2  domains 5,6
+   Guest2: adapter  1    domains 6,7
+
+   This is an invalid configuration because both guests have access to
+   APQN (1,6).
+
+The Design:
+===========
+The design introduces three new objects:
+
+1. AP matrix device
+2. VFIO AP device driver (vfio_ap.ko)
+3. VFIO AP mediated matrix pass-through device
+
+The VFIO AP device driver
+-------------------------
+The VFIO AP (vfio_ap) device driver serves the following purposes:
+
+1. Provides the interfaces to secure APQNs for exclusive use of KVM guests.
+
+2. Sets up the VFIO mediated device interfaces to manage a mediated matrix
+   device and creates the sysfs interfaces for assigning adapters, usage
+   domains, and control domains comprising the matrix for a KVM guest.
+
+3. Configures the APM, AQM and ADM in the CRYCB referenced by a KVM guest's
+   SIE state description to grant the guest access to a matrix of AP devices.
+
+Reserve APQNs for exclusive use of KVM guests
+---------------------------------------------
+The following block diagram illustrates the mechanism by which APQNs are
+reserved:
+
+                              +------------------+
+               7 remove       |                  |
+         +--------------------> cex4queue driver |
+         |                    |                  |
+         |                    +------------------+
+         |
+         |
+         |                    +------------------+          +-----------------+
+         |  5 register driver |                  | 3 create |                 |
+         |   +---------------->   Device core    +---------->  matrix device  |
+         |   |                |                  |          |                 |
+         |   |                +--------^---------+          +-----------------+
+         |   |                         |
+         |   |                         +-------------------+
+         |   | +-----------------------------------+       |
+         |   | |      4 register AP driver         |       | 2 register device
+         |   | |                                   |       |
++--------+---+-v---+                      +--------+-------+-+
+|                  |                      |                  |
+|      ap_bus      +--------------------- >  vfio_ap driver  |
+|                  |       8 probe        |                  |
++--------^---------+                      +--^--^------------+
+6 edit   |                                   |  |
+  apmask |     +-----------------------------+  | 9 mdev create
+  aqmask |     |           1 modprobe           |
++--------+-----+---+           +----------------+-+         +------------------+
+|                  |           |                  |8 create |     mediated     |
+|      admin       |           | VFIO device core |--------->     matrix       |
+|                  +           |                  |         |     device       |
++------+-+---------+           +--------^---------+         +--------^---------+
+       | |                              |                            |
+       | | 9 create vfio_ap-passthrough |                            |
+       | +------------------------------+                            |
+       +-------------------------------------------------------------+
+                   10  assign adapter/domain/control domain
+
+The process for reserving an AP queue for use by a KVM guest is:
+
+1. The administrator loads the vfio_ap device driver
+2. The vfio-ap driver during its initialization will register a single 'matrix'
+   device with the device core. This will serve as the parent device for
+   all mediated matrix devices used to configure an AP matrix for a guest.
+3. The /sys/devices/vfio_ap/matrix device is created by the device core
+4. The vfio_ap device driver will register with the AP bus for AP queue devices
+   of type 10 and higher (CEX4 and newer). The driver will provide the vfio_ap
+   driver's probe and remove callback interfaces. Devices older than CEX4 are
+   not supported; supporting older devices that will go out of service in the
+   relatively near future, and for which there are few older systems around on
+   which to test, would needlessly complicate the design.
+5. The AP bus registers the vfio_ap device driver with the device core
+6. The administrator edits the AP adapter and queue masks to reserve AP queues
+   for use by the vfio_ap device driver.
+7. The AP bus removes the AP queues reserved for the vfio_ap driver from the
+   default zcrypt cex4queue driver.
+8. The AP bus probes the vfio_ap device driver to bind the queues reserved for
+   it.
+9. The administrator creates a passthrough type mediated matrix device to be
+   used by a guest
+10. The administrator assigns the adapters, usage domains and control domains
+    to be exclusively used by a guest.
+
+Set up the VFIO mediated device interfaces
+------------------------------------------
+The VFIO AP device driver utilizes the common interface of the VFIO mediated
+device core driver to:
+* Register an AP mediated bus driver to add a mediated matrix device to and
+  remove it from a VFIO group.
+* Create and destroy a mediated matrix device
+* Add a mediated matrix device to and remove it from the AP mediated bus driver
+* Add a mediated matrix device to and remove it from an IOMMU group
+
+The following high-level block diagram shows the main components and interfaces
+of the VFIO AP mediated matrix device driver:
+
+ +-------------+
+ |             |
+ | +---------+ | mdev_register_driver() +--------------+
+ | |  Mdev   | +<-----------------------+              |
+ | |  bus    | |                        | vfio_mdev.ko |
+ | | driver  | +----------------------->+              |<-> VFIO user
+ | +---------+ |    probe()/remove()    +--------------+    APIs
+ |             |
+ |  MDEV CORE  |
+ |   MODULE    |
+ |   mdev.ko   |
+ | +---------+ | mdev_register_device() +--------------+
+ | |Physical | +<-----------------------+              |
+ | | device  | |                        |  vfio_ap.ko  |<-> matrix
+ | |interface| +----------------------->+              |    device
+ | +---------+ |       callback         +--------------+
+ +-------------+
+
+During initialization of the vfio_ap module, the matrix device is registered
+with an 'mdev_parent_ops' structure that provides the sysfs attribute
+structures, mdev functions and callback interfaces for managing the mediated
+matrix device.
+
+* sysfs attribute structures:
+  * supported_type_groups
+    The VFIO mediated device framework supports creation of user-defined
+    mediated device types. These mediated device types are specified
+    via the 'supported_type_groups' structure when a device is registered
+    with the mediated device framework. The registration process creates the
+    sysfs structures for each mediated device type specified in the
+    'mdev_supported_types' sub-directory of the device being registered. Along
+    with the device type, the sysfs attributes of the mediated device type are
+    provided.
+
+    The VFIO AP device driver will register one mediated device type for
+    passthrough devices:
+      /sys/devices/vfio_ap/matrix/mdev_supported_types/vfio_ap-passthrough
+    Only the read-only attributes required by the VFIO mdev framework will
+    be provided:
+        ... name
+        ... device_api
+        ... available_instances
+        Where:
+        * name: specifies the name of the mediated device type
+        * device_api: the mediated device type's API (i.e., the VFIO API it
+                      supports)
+        * available_instances: the number of mediated matrix passthrough devices
+                               that can be created
+  * mdev_attr_groups
+    This attribute group identifies the user-defined sysfs attributes of the
+    mediated device. When a device is registered with the VFIO mediated device
+    framework, the sysfs attribute files identified in the 'mdev_attr_groups'
+    structure will be created in the mediated matrix device's directory. The
+    sysfs attributes for a mediated matrix device are:
+    * assign_adapter:
+    * unassign_adapter:
+      Write-only attributes for assigning/unassigning an AP adapter to/from the
+      mediated matrix device. To assign/unassign an adapter, the APID of the
+      adapter is echoed to the respective attribute file.
+    * assign_domain:
+    * unassign_domain:
+      Write-only attributes for assigning/unassigning an AP usage domain to/from
+      the mediated matrix device. To assign/unassign a domain, the domain
+      number of the usage domain is echoed to the respective attribute
+      file.
+    * matrix:
+      A read-only file for displaying the APQNs derived from the cross product
+      of the adapter and domain numbers assigned to the mediated matrix device.
+    * assign_control_domain:
+    * unassign_control_domain:
+      Write-only attributes for assigning/unassigning an AP control domain
+      to/from the mediated matrix device. To assign/unassign a control domain,
+      the ID of the domain to be assigned/unassigned is echoed to the respective
+      attribute file.
+    * control_domains:
+      A read-only file for displaying the control domain numbers assigned to the
+      mediated matrix device.
+
+* functions:
+  * create:
+    allocates the ap_matrix_mdev structure used by the vfio_ap driver to:
+    * Store the reference to the KVM structure for the guest using the mdev
+    * Store the AP matrix configuration for the adapters, domains, and control
+      domains assigned via the corresponding sysfs attributes files
+  * remove:
+    deallocates the mediated matrix device's ap_matrix_mdev structure. This will
+    be allowed only if a running guest is not using the mdev.
+
+* callback interfaces
+  * open:
+    The vfio_ap driver uses this callback to register a
+    VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the mdev matrix
+    device. The open is invoked when QEMU connects the VFIO iommu group
+    for the mdev matrix device to the MDEV bus. Access to the KVM structure used
+    to configure the KVM guest is provided via this callback. The KVM structure
+    is used to configure the guest's access to the AP matrix defined via the
+    mediated matrix device's sysfs attribute files.
+  * release:
+    unregisters the VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the
+    mdev matrix device and deconfigures the guest's AP matrix.
+
+Configure the APM, AQM and ADM in the CRYCB:
+-------------------------------------------
+Configuring the AP matrix for a KVM guest will be performed when the
+VFIO_GROUP_NOTIFY_SET_KVM notifier callback is invoked. The notifier
+function is called when QEMU connects to KVM. The guest's AP matrix is
+configured via its CRYCB by:
+* Setting the bits in the APM corresponding to the APIDs assigned to the
+  mediated matrix device via its 'assign_adapter' interface.
+* Setting the bits in the AQM corresponding to the domains assigned to the
+  mediated matrix device via its 'assign_domain' interface.
+* Setting the bits in the ADM corresponding to the domain IDs assigned to the
+  mediated matrix device via its 'assign_control_domain' interface.
+
+The CPU model features for AP
+-----------------------------
+The AP stack relies on the presence of the AP instructions as well as two
+facilities: the AP Facilities Test (APFT) facility and the AP Query
+Configuration Information (QCI) facility. These features/facilities are made
+available to a KVM guest via the following CPU model features:
+
+1. ap: Indicates whether the AP instructions are installed on the guest. This
+   feature will be enabled by KVM only if the AP instructions are installed
+   on the host.
+
+2. apft: Indicates the APFT facility is available on the guest. This facility
+   can be made available to the guest only if it is available on the host (i.e.,
+   facility bit 15 is set).
+
+3. apqci: Indicates the AP QCI facility is available on the guest. This facility
+   can be made available to the guest only if it is available on the host (i.e.,
+   facility bit 12 is set).
+
+Note: If the user chooses to specify a CPU model different from the 'host'
+model to QEMU, the CPU model features and facilities need to be turned on
+explicitly; for example:
+
+     /usr/bin/qemu-system-s390x ... -cpu z13,ap=on,apqci=on,apft=on
+
+A guest can be precluded from using AP features/facilities by turning them off
+explicitly; for example:
+
+     /usr/bin/qemu-system-s390x ... -cpu host,ap=off,apqci=off,apft=off
+
+Note: If the APFT facility is turned off (apft=off) for the guest, the guest
+will not see any AP devices. The zcrypt device drivers that register for type 10
+and newer AP devices - i.e., the cex4card and cex4queue device drivers - need
+the APFT facility to ascertain the facilities installed on a given AP device. If
+the APFT facility is not installed on the guest, then the probe of device
+drivers will fail since only type 10 and newer devices can be configured for
+guest use.
+
+Example:
+=======
+Let's now provide an example to illustrate how KVM guests may be given
+access to AP facilities. For this example, we will show how to configure
+three guests such that executing the lszcrypt command on the guests would
+look like this:
+
+Guest1
+------
+CARD.DOMAIN TYPE  MODE
+------------------------------
+05          CEX5C CCA-Coproc
+05.0004     CEX5C CCA-Coproc
+05.00ab     CEX5C CCA-Coproc
+06          CEX5A Accelerator
+06.0004     CEX5A Accelerator
+06.00ab     CEX5A Accelerator
+
+Guest2
+------
+CARD.DOMAIN TYPE  MODE
+------------------------------
+05          CEX5A Accelerator
+05.0047     CEX5A Accelerator
+05.00ff     CEX5A Accelerator
+
+Guest3
+------
+CARD.DOMAIN TYPE  MODE
+------------------------------
+06          CEX5A Accelerator
+06.0047     CEX5A Accelerator
+06.00ff     CEX5A Accelerator
+
+These are the steps:
+
+1. Install the vfio_ap module on the linux host. The dependency chain for the
+   vfio_ap module is:
+   * iommu
+   * s390
+   * zcrypt
+   * vfio
+   * vfio_mdev
+   * vfio_mdev_device
+   * KVM
+
+   To build the vfio_ap module, the kernel build must be configured with the
+   following Kconfig elements selected:
+   * IOMMU_SUPPORT
+   * S390
+   * ZCRYPT
+   * S390_AP_IOMMU
+   * VFIO
+   * VFIO_MDEV
+   * VFIO_MDEV_DEVICE
+   * KVM
+
+   If using make menuconfig select the following to build the vfio_ap module:
+   -> Device Drivers
+      -> IOMMU Hardware Support
+         select S390 AP IOMMU Support
+      -> VFIO Non-Privileged userspace driver framework
+         -> Mediated device driver framework
+            -> VFIO driver for Mediated devices
+   -> I/O subsystem
+      -> VFIO support for AP devices
+
+2. Secure the AP queues to be used by the three guests so that the host cannot
+   access them. To secure them, there are two sysfs files that specify
+   bitmasks marking a subset of the APQN range as 'usable by the default AP
+   queue device drivers' or 'not usable by the default device drivers' and thus
+   available for use by the vfio_ap device driver. The sysfs files containing
+   the masks are located at:
+
+   /sys/bus/ap/apmask
+   /sys/bus/ap/aqmask
+
+   The 'apmask' is a 256-bit mask that identifies a set of AP adapter IDs
+   (APID). Each bit in the mask, from left to right (i.e., from most significant
+   to least significant bit in big endian order), corresponds to an APID from
+   0-255. If a bit is set, the APID is marked as usable only by the default AP
+   queue device drivers; otherwise, the APID is usable by the vfio_ap
+   device driver.
+
+   The 'aqmask' is a 256-bit mask that identifies a set of AP queue indexes
+   (APQI). Each bit in the mask, from left to right (i.e., from most significant
+   to least significant bit in big endian order), corresponds to an APQI from
+   0-255. If a bit is set, the APQI is marked as usable only by the default AP
+   queue device drivers; otherwise, the APQI is usable by the vfio_ap device
+   driver.
+
+   Take, for example, the following mask:
+
+      0x7dffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+
+    It indicates:
+
+      1, 2, 3, 4, 5, and 7-255 belong to the default drivers' pool, and 0 and 6
+      belong to the vfio_ap device driver's pool.
+
+   The APQN of each AP queue device assigned to the linux host is checked by the
+   AP bus against the set of APQNs derived from the cross product of APIDs
+   and APQIs marked as usable only by the default AP queue device drivers. If a
+   match is detected,  only the default AP queue device drivers will be probed;
+   otherwise, the vfio_ap device driver will be probed.
+
+   By default, the two masks are set to reserve all APQNs for use by the default
+   AP queue device drivers. There are two ways the default masks can be changed:
+
+   1. The sysfs mask files can be edited by echoing a string into the
+      respective sysfs mask file in one of two formats:
+
+      * An absolute hex string starting with 0x - like "0x12345678" - sets
+        the mask. If the given string is shorter than the mask, it is padded
+        with 0s on the right; for example, specifying a mask value of 0x41 is
+        the same as specifying:
+
+           0x4100000000000000000000000000000000000000000000000000000000000000
+
+        Keep in mind that the mask reads from left to right (i.e., most
+        significant to least significant bit in big endian order), so the mask
+        above identifies device numbers 1 and 7 (01000001).
+
+        If the string is longer than the mask, the operation is terminated with
+        an error (EINVAL).
+
+      * Individual bits in the mask can be switched on and off by specifying
+        each bit number to be switched in a comma separated list. Each bit
+        number string must be prepended with a plus ('+') or minus ('-') to indicate
+        the corresponding bit is to be switched on ('+') or off ('-'). Some
+        valid values are:
+
+           "+0"    switches bit 0 on
+           "-13"   switches bit 13 off
+           "+0x41" switches bit 65 on
+           "-0xff" switches bit 255 off
+
+           The following example:
+              +0,-6,+0x47,-0xf0
+
+              Switches bits 0 and 71 (0x47) on
+              Switches bits 6 and 240 (0xf0) off
+
+        Note that the bits not specified in the list remain as they were before
+        the operation.
+
+   2. The masks can also be changed at boot time via parameters on the kernel
+      command line like this:
+
+         ap.apmask=0xffff ap.aqmask=0x40
+
+         This would create the following masks:
+
+            apmask:
+            0xffff000000000000000000000000000000000000000000000000000000000000
+
+            aqmask:
+            0x4000000000000000000000000000000000000000000000000000000000000000
+
+         Resulting in these two pools:
+
+            default drivers pool:    adapters 0-15, domain 1
+            alternate drivers pool:  adapters 16-255, domains 0, 2-255
+
+   Securing the APQNs for our example:
+   ----------------------------------
+   To secure the AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, 06.0047,
+   06.00ab, and 06.00ff for use by the vfio_ap device driver, the corresponding
+   APQNs can either be removed from the default masks:
+
+      echo -5,-6 > /sys/bus/ap/apmask
+
+      echo -4,-0x47,-0xab,-0xff > /sys/bus/ap/aqmask
+
+   Or the masks can be set as follows:
+
+      echo 0xf9ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff \
+      > apmask
+
+      echo 0xf7fffffffffffffffeffffffffffffffffffffffffeffffffffffffffffffffe \
+      > aqmask
+
+   This will result in AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004,
+   06.0047, 06.00ab, and 06.00ff getting bound to the vfio_ap device driver. The
+   sysfs directory for the vfio_ap device driver will now contain symbolic links
+   to the AP queue devices bound to it:
+
+   /sys/bus/ap
+   ... [drivers]
+   ...... [vfio_ap]
+   ......... [05.0004]
+   ......... [05.0047]
+   ......... [05.00ab]
+   ......... [05.00ff]
+   ......... [06.0004]
+   ......... [06.0047]
+   ......... [06.00ab]
+   ......... [06.00ff]
+
+   Keep in mind that only type 10 and newer adapters (i.e., CEX4 and later)
+   can be bound to the vfio_ap device driver. The reason for this is to
+   simplify the implementation by not needlessly complicating the design by
+   supporting older devices that will go out of service in the relatively near
+   future and for which there are few older systems on which to test.
+
+   The administrator, therefore, must take care to secure only AP queues that
+   can be bound to the vfio_ap device driver. The device type for a given AP
+   queue device can be read from the parent card's sysfs directory. For example,
+   to see the hardware type of the queue 05.0004:
+
+   cat /sys/bus/ap/devices/card05/hwtype
+
+   The hwtype must be 10 or higher (CEX4 or newer) in order to be bound to the
+   vfio_ap device driver.
+
+3. Create the mediated devices needed to configure the AP matrixes for the
+   three guests and to provide an interface to the vfio_ap driver for
+   use by the guests:
+
+   /sys/devices/vfio_ap/matrix/
+   --- [mdev_supported_types]
+   ------ [vfio_ap-passthrough] (passthrough mediated matrix device type)
+   --------- create
+   --------- [devices]
+
+   To create the mediated devices for the three guests:
+
+	uuidgen > create
+	uuidgen > create
+	uuidgen > create
+
+        or
+
+        echo $uuid1 > create
+        echo $uuid2 > create
+        echo $uuid3 > create
+
+   This will create three mediated devices in the [devices] subdirectory named
+   after the UUID written to the create attribute file. We call them $uuid1,
+   $uuid2 and $uuid3 and this is the sysfs directory structure after creation:
+
+   /sys/devices/vfio_ap/matrix/
+   --- [mdev_supported_types]
+   ------ [vfio_ap-passthrough]
+   --------- [devices]
+   ------------ [$uuid1]
+   --------------- assign_adapter
+   --------------- assign_control_domain
+   --------------- assign_domain
+   --------------- matrix
+   --------------- unassign_adapter
+   --------------- unassign_control_domain
+   --------------- unassign_domain
+
+   ------------ [$uuid2]
+   --------------- assign_adapter
+   --------------- assign_control_domain
+   --------------- assign_domain
+   --------------- matrix
+   --------------- unassign_adapter
+   --------------- unassign_control_domain
+   --------------- unassign_domain
+
+   ------------ [$uuid3]
+   --------------- assign_adapter
+   --------------- assign_control_domain
+   --------------- assign_domain
+   --------------- matrix
+   --------------- unassign_adapter
+   --------------- unassign_control_domain
+   --------------- unassign_domain
+
+4. The administrator now needs to configure the matrixes for the mediated
+   devices $uuid1 (for Guest1), $uuid2 (for Guest2) and $uuid3 (for Guest3).
+
+   This is how the matrix is configured for Guest1:
+
+      echo 5 > assign_adapter
+      echo 6 > assign_adapter
+      echo 4 > assign_domain
+      echo 0xab > assign_domain
+
+      Control domains can similarly be assigned using the assign_control_domain
+      sysfs file.
+
+      If a mistake is made configuring an adapter, domain or control domain,
+      you can use the unassign_xxx files to unassign the adapter, domain or
+      control domain.
+
+      To display the matrix configuration for Guest1:
+
+         cat matrix
+
+   This is how the matrix is configured for Guest2:
+
+      echo 5 > assign_adapter
+      echo 0x47 > assign_domain
+      echo 0xff > assign_domain
+
+   This is how the matrix is configured for Guest3:
+
+      echo 6 > assign_adapter
+      echo 0x47 > assign_domain
+      echo 0xff > assign_domain
+
+   In order to successfully assign an adapter:
+
+   * The adapter number specified must represent a value from 0 up to the
+     maximum adapter number configured for the system. If an adapter number
+     higher than the maximum is specified, the operation will terminate with
+     an error (ENODEV).
+
+   * All APQNs that can be derived from the adapter ID and the IDs of
+     the previously assigned domains must be bound to the vfio_ap device
+     driver. If no domains have yet been assigned, then there must be at least
+     one APQN with the specified APID bound to the vfio_ap driver. If no such
+     APQNs are bound to the driver, the operation will terminate with an
+     error (EADDRNOTAVAIL).
+
+     No APQN that can be derived from the adapter ID and the IDs of the
+     previously assigned domains can be assigned to another mediated matrix
+     device. If an APQN is assigned to another mediated matrix device, the
+     operation will terminate with an error (EADDRINUSE).
+
+   In order to successfully assign a domain:
+
+   * The domain number specified must represent a value from 0 up to the
+     maximum domain number configured for the system. If a domain number
+     higher than the maximum is specified, the operation will terminate with
+     an error (ENODEV).
+
+   * All APQNs that can be derived from the domain ID and the IDs of
+     the previously assigned adapters must be bound to the vfio_ap device
+     driver. If no adapters have yet been assigned, then there must be at least
+     one APQN with the specified APQI bound to the vfio_ap driver. If no such
+     APQNs are bound to the driver, the operation will terminate with an
+     error (EADDRNOTAVAIL).
+
+     No APQN that can be derived from the domain ID and the IDs of the
+     previously assigned adapters can be assigned to another mediated matrix
+     device. If an APQN is assigned to another mediated matrix device, the
+     operation will terminate with an error (EADDRINUSE).
+
+   In order to successfully assign a control domain, the domain number
+   specified must represent a value from 0 up to the maximum domain number
+   configured for the system. If a control domain number higher than the maximum
+   is specified, the operation will terminate with an error (ENODEV).
+
+5. Start Guest1:
+
+   /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
+      -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ...
+
+6. Start Guest2:
+
+   /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
+      -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ...
+
+7. Start Guest3:
+
+   /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
+      -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ...
+
+When the guest is shut down, the mediated matrix devices may be removed.
+
+Using our example again, to remove the mediated matrix device $uuid1:
+
+   /sys/devices/vfio_ap/matrix/
+      --- [mdev_supported_types]
+      ------ [vfio_ap-passthrough]
+      --------- [devices]
+      ------------ [$uuid1]
+      --------------- remove
+
+
+   echo 1 > remove
+
+   This will remove all of the mdev matrix device's sysfs structures including
+   the mdev device itself. To recreate and reconfigure the mdev matrix device,
+   all of the steps starting with step 3 will have to be performed again. Note
+   that the remove will fail if a guest using the mdev is still running.
+
+   It is not necessary to remove an mdev matrix device, but one may want to
+   remove it if no guest will use it during the remaining lifetime of the linux
+   host. If the mdev matrix device is removed, one may want to also reconfigure
+   the pool of adapters and queues reserved for use by the default drivers.
+
+Limitations
+===========
+* The KVM/kernel interfaces do not provide a way to prevent an APQN from being
+  restored to the default drivers' pool while its queue is still assigned to a
+  mediated device in use by a guest. It is incumbent upon the administrator to
+  ensure there is no mediated device in use by a guest to which the APQN is
+  assigned, lest the host be given access to the private data of the AP queue
+  device, such as a private key configured specifically for the guest.
+
+* Dynamically modifying the AP matrix for a running guest (which would amount to
+  hot(un)plug of AP devices for the guest) is currently not supported.
+
+* Live guest migration is not supported for guests using AP devices.

+ 129 - 6
Documentation/virtual/kvm/api.txt

@@ -123,6 +123,37 @@ memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the
 flag KVM_VM_MIPS_VZ.


+On arm64, the physical address size for a VM (IPA Size limit) is limited
+to 40bits by default. The limit can be configured if the host supports the
+extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use
+KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type
+identifier, where IPA_Bits is the maximum width of any physical
+address used by the VM. The IPA_Bits is encoded in bits[7-0] of the
+machine type identifier.
+
+e.g., to configure a guest to use a 48bit physical address size:
+
+    vm_fd = ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48));
+
+The requested size (IPA_Bits) must be:
+  0 - Implies default size, 40bits (for backward compatibility)
+
+  or
+
+  N - Implies N bits, where N is a positive integer such that,
+      32 <= N <= Host_IPA_Limit
+
+Host_IPA_Limit is the maximum possible value for IPA_Bits on the host and
+is dependent on the CPU capability and the kernel configuration. The limit can
+be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION
+ioctl() at run-time.
+
+Please note that configuring the IPA size does not affect the capability
+exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects
+the size of the address translated by the stage2 level (guest physical to
+host physical address translations).
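+
+A minimal userspace sketch of the probe-then-create flow described above
+(error handling elided; KVM_CAP_ARM_VM_IPA_SIZE and KVM_VM_TYPE_ARM_IPA_SIZE
+come from linux/kvm.h):
+
+    #include <fcntl.h>
+    #include <sys/ioctl.h>
+    #include <linux/kvm.h>
+
+    int create_vm_with_48bit_ipa(void)
+    {
+        int dev_fd = open("/dev/kvm", O_RDWR);
+        /* Host_IPA_Limit; 0 means the extension is not supported. */
+        int ipa_max = ioctl(dev_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_VM_IPA_SIZE);
+
+        if (ipa_max >= 48)
+            return ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48));
+
+        /* Fall back to the default 40bit IPA space. */
+        return ioctl(dev_fd, KVM_CREATE_VM, 0);
+    }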
+
+
 4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST

 Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST
@@ -850,7 +881,7 @@ struct kvm_vcpu_events {
 		__u8 injected;
 		__u8 nr;
 		__u8 has_error_code;
-		__u8 pad;
+		__u8 pending;
 		__u32 error_code;
 	} exception;
 	struct {
@@ -873,15 +904,23 @@ struct kvm_vcpu_events {
 		__u8 smm_inside_nmi;
 		__u8 latched_init;
 	} smi;
+	__u8 reserved[27];
+	__u8 exception_has_payload;
+	__u64 exception_payload;
 };

-Only two fields are defined in the flags field:
+The following bits are defined in the flags field:

-- KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
+- KVM_VCPUEVENT_VALID_SHADOW may be set to signal that
   interrupt.shadow contains a valid state.

-- KVM_VCPUEVENT_VALID_SMM may be set in the flags field to signal that
-  smi contains a valid state.
+- KVM_VCPUEVENT_VALID_SMM may be set to signal that smi contains a
+  valid state.
+
+- KVM_VCPUEVENT_VALID_PAYLOAD may be set to signal that the
+  exception_has_payload, exception_payload, and exception.pending
+  fields contain a valid state. This bit will be set whenever
+  KVM_CAP_EXCEPTION_PAYLOAD is enabled.

 ARM/ARM64:

@@ -961,6 +1000,11 @@ shall be written into the VCPU.

 KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available.

+If KVM_CAP_EXCEPTION_PAYLOAD is enabled, KVM_VCPUEVENT_VALID_PAYLOAD
+can be set in the flags field to signal that the
+exception_has_payload, exception_payload, and exception.pending fields
+contain a valid state and shall be written into the VCPU.
+
 ARM/ARM64:

 Set the pending SError exception state for this VCPU. It is not possible to
@@ -1922,6 +1966,7 @@ registers, find a list below:
   PPC   | KVM_REG_PPC_TIDR              | 64
   PPC   | KVM_REG_PPC_PSSCR             | 64
   PPC   | KVM_REG_PPC_DEC_EXPIRY        | 64
+  PPC   | KVM_REG_PPC_PTCR              | 64
   PPC   | KVM_REG_PPC_TM_GPR0           | 64
           ...
   PPC   | KVM_REG_PPC_TM_GPR31          | 64
@@ -2269,6 +2314,10 @@ The supported flags are:
         The emulated MMU supports 1T segments in addition to the
         standard 256M ones.

+    - KVM_PPC_NO_HASH
+	This flag indicates that HPT guests are not supported by KVM,
+	thus all guests must use radix MMU mode.
+
 The "slb_size" field indicates how many SLB entries are supported

 The "sps" array contains 8 entries indicating the supported base
@@ -3676,6 +3725,34 @@ Returns: 0 on success, -1 on error
 This copies the vcpu's kvm_nested_state struct from userspace to the kernel.  For
 the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE.

+4.116 KVM_(UN)REGISTER_COALESCED_MMIO
+
+Capability: KVM_CAP_COALESCED_MMIO (for coalesced mmio)
+	    KVM_CAP_COALESCED_PIO (for coalesced pio)
+Architectures: all
+Type: vm ioctl
+Parameters: struct kvm_coalesced_mmio_zone
+Returns: 0 on success, < 0 on error
+
+Coalesced I/O is a performance optimization that defers hardware
+register write emulation so that userspace exits are avoided.  It is
+typically used to reduce the overhead of emulating frequently accessed
+hardware registers.
+
+When a hardware register is configured for coalesced I/O, write accesses
+do not exit to userspace and their value is recorded in a ring buffer
+that is shared between kernel and userspace.
+
+Coalesced I/O is used if one or more write accesses to a hardware
+register can be deferred until a read or a write to another hardware
+register on the same device.  This last access will cause a vmexit and
+userspace will process accesses from the ring buffer before emulating
+it. That will avoid exiting to userspace on repeated writes.
+
+Coalesced pio is based on coalesced mmio. There is little difference
+between coalesced mmio and pio except that coalesced pio records accesses
+to I/O ports.
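+
+As an illustration, a minimal sketch that registers one coalesced PIO zone
+(error handling elided; assumes the struct kvm_coalesced_mmio_zone layout
+from linux/kvm.h in which the pio flag shares the former pad field):
+
+    #include <string.h>
+    #include <sys/ioctl.h>
+    #include <linux/kvm.h>
+
+    /* Defer exits for single-byte writes to I/O port 0x3f8 on this VM fd. */
+    int register_coalesced_pio(int vm_fd)
+    {
+        struct kvm_coalesced_mmio_zone zone;
+
+        memset(&zone, 0, sizeof(zone));
+        zone.addr = 0x3f8;   /* port number for PIO zones */
+        zone.size = 1;
+        zone.pio  = 1;       /* 1 = PIO, 0 = MMIO */
+
+        return ioctl(vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone);
+    }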
+
 5. The kvm_run structure
 ------------------------

@@ -4522,7 +4599,7 @@ hpage module parameter is not set to 1, -EINVAL is returned.
 While it is generally possible to create a huge page backed VM without
 this capability, the VM will not be able to run.

-7.14 KVM_CAP_MSR_PLATFORM_INFO
+7.15 KVM_CAP_MSR_PLATFORM_INFO

 Architectures: x86
 Parameters: args[0] whether feature should be enabled or not
@@ -4531,6 +4608,45 @@ With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise,
 a #GP would be raised when the guest tries to access. Currently, this
 capability does not enable write permissions of this MSR for the guest.

+7.16 KVM_CAP_PPC_NESTED_HV
+
+Architectures: ppc
+Parameters: none
+Returns: 0 on success, -EINVAL when the implementation doesn't support
+	 nested-HV virtualization.
+
+HV-KVM on POWER9 and later systems allows for "nested-HV"
+virtualization, which provides a way for a guest VM to run guests that
+can run using the CPU's supervisor mode (privileged non-hypervisor
+state).  Enabling this capability on a VM depends on the CPU having
+the necessary functionality and on the facility being enabled with a
+kvm-hv module parameter.
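+
+A minimal sketch of turning the capability on for a VM via KVM_ENABLE_CAP
+(error handling elided):
+
+    #include <string.h>
+    #include <sys/ioctl.h>
+    #include <linux/kvm.h>
+
+    /* The ioctl() fails with EINVAL when nested-HV is unavailable. */
+    int enable_nested_hv(int vm_fd)
+    {
+        struct kvm_enable_cap cap;
+
+        memset(&cap, 0, sizeof(cap));
+        cap.cap = KVM_CAP_PPC_NESTED_HV;
+
+        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
+    }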
+
+7.17 KVM_CAP_EXCEPTION_PAYLOAD
+
+Architectures: x86
+Parameters: args[0] whether feature should be enabled or not
+
+With this capability enabled, CR2 will not be modified prior to the
+emulated VM-exit when L1 intercepts a #PF exception that occurs in
+L2. Similarly, for kvm-intel only, DR6 will not be modified prior to
+the emulated VM-exit when L1 intercepts a #DB exception that occurs in
+L2. As a result, when KVM_GET_VCPU_EVENTS reports a pending #PF (or
+#DB) exception for L2, exception.has_payload will be set and the
+faulting address (or the new DR6 bits*) will be reported in the
+exception_payload field. Similarly, when userspace injects a #PF (or
+#DB) into L2 using KVM_SET_VCPU_EVENTS, it is expected to set
+exception.has_payload and to put the faulting address (or the new DR6
+bits*) in the exception_payload field.
+
+This capability also enables exception.pending in struct
+kvm_vcpu_events, which allows userspace to distinguish between pending
+and injected exceptions.
+
+
+* For the new DR6 bits, note that bit 16 is set iff the #DB exception
+  will clear DR6.RTM.
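+
+A minimal sketch of injecting a #PF with its faulting address carried in the
+payload, as described above (error handling elided; assumes the capability
+has already been enabled on the VM):
+
+    #include <string.h>
+    #include <sys/ioctl.h>
+    #include <linux/kvm.h>
+
+    int inject_pf_with_payload(int vcpu_fd, __u64 fault_addr, __u32 error_code)
+    {
+        struct kvm_vcpu_events events;
+
+        memset(&events, 0, sizeof(events));
+        events.flags = KVM_VCPUEVENT_VALID_PAYLOAD;
+        events.exception.pending = 1;
+        events.exception.nr = 14;              /* #PF */
+        events.exception.has_error_code = 1;
+        events.exception.error_code = error_code;
+        events.exception_has_payload = 1;
+        events.exception_payload = fault_addr; /* becomes CR2 on delivery */
+
+        return ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
+    }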
+
 8. Other capabilities.
 ----------------------

@@ -4772,3 +4888,10 @@ CPU when the exception is taken. If this virtual SError is taken to EL1 using
 AArch64, this value will be reported in the ISS field of ESR_ELx.

 See KVM_CAP_VCPU_EVENTS for more details.
+8.20 KVM_CAP_HYPERV_SEND_IPI
+
+Architectures: x86
+
+This capability indicates that KVM supports paravirtualized Hyper-V IPI send
+hypercalls:
+HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx.

+ 12 - 0
MAINTAINERS

@@ -12800,6 +12800,18 @@ W:	http://www.ibm.com/developerworks/linux/linux390/
 S:	Supported
 F:	drivers/s390/crypto/

+S390 VFIO AP DRIVER
+M:	Tony Krowiak <akrowiak@linux.ibm.com>
+M:	Pierre Morel <pmorel@linux.ibm.com>
+M:	Halil Pasic <pasic@linux.ibm.com>
+L:	linux-s390@vger.kernel.org
+W:	http://www.ibm.com/developerworks/linux/linux390/
+S:	Supported
+F:	drivers/s390/crypto/vfio_ap_drv.c
+F:	drivers/s390/crypto/vfio_ap_private.h
+F:	drivers/s390/crypto/vfio_ap_ops.c
+F:	Documentation/s390/vfio-ap.txt
+
 S390 ZFCP DRIVER
 M:	Steffen Maier <maier@linux.ibm.com>
 M:	Benjamin Block <bblock@linux.ibm.com>

+ 1 - 2
arch/arm/include/asm/kvm_arm.h

@@ -133,8 +133,7 @@
  * space.
  */
 #define KVM_PHYS_SHIFT	(40)
-#define KVM_PHYS_SIZE	(_AC(1, ULL) << KVM_PHYS_SHIFT)
-#define KVM_PHYS_MASK	(KVM_PHYS_SIZE - _AC(1, ULL))
+
 #define PTRS_PER_S2_PGD	(_AC(1, ULL) << (KVM_PHYS_SHIFT - 30))

 /* Virtualization Translation Control Register (VTCR) bits */

+ 12 - 1
arch/arm/include/asm/kvm_host.h

@@ -273,7 +273,7 @@ static inline void __cpu_init_stage2(void)
 	kvm_call_hyp(__init_stage2_translation);
 }

-static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
+static inline int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 {
 	return 0;
 }
@@ -354,4 +354,15 @@ static inline void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) {}
 struct kvm *kvm_arch_alloc_vm(void);
 void kvm_arch_free_vm(struct kvm *kvm);

+static inline int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
+{
+	/*
+	 * On 32bit ARM, VMs get a static 40bit IPA stage2 setup,
+	 * so any non-zero value used as type is illegal.
+	 */
+	if (type)
+		return -EINVAL;
+	return 0;
+}
+
 #endif /* __ARM_KVM_HOST_H__ */

+ 10 - 5
arch/arm/include/asm/kvm_mmu.h

@@ -35,16 +35,12 @@
 		addr;							\
 	})

-/*
- * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels.
- */
-#define KVM_MMU_CACHE_MIN_PAGES	2
-
 #ifndef __ASSEMBLY__

 #include <linux/highmem.h>
 #include <asm/cacheflush.h>
 #include <asm/cputype.h>
+#include <asm/kvm_arm.h>
 #include <asm/kvm_hyp.h>
 #include <asm/kvm_hyp.h>
 #include <asm/pgalloc.h>
 #include <asm/pgalloc.h>
 #include <asm/stage2_pgtable.h>
 #include <asm/stage2_pgtable.h>
@@ -52,6 +48,13 @@
 /* Ensure compatibility with arm64 */
 /* Ensure compatibility with arm64 */
 #define VA_BITS			32
 #define VA_BITS			32
 
 
+#define kvm_phys_shift(kvm)		KVM_PHYS_SHIFT
+#define kvm_phys_size(kvm)		(1ULL << kvm_phys_shift(kvm))
+#define kvm_phys_mask(kvm)		(kvm_phys_size(kvm) - 1ULL)
+#define kvm_vttbr_baddr_mask(kvm)	VTTBR_BADDR_MASK
+
+#define stage2_pgd_size(kvm)		(PTRS_PER_S2_PGD * sizeof(pgd_t))
+
 int create_hyp_mappings(void *from, void *to, pgprot_t prot);
 int create_hyp_mappings(void *from, void *to, pgprot_t prot);
 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
 			   void __iomem **kaddr,
 			   void __iomem **kaddr,
@@ -355,6 +358,8 @@ static inline int hyp_map_aux_data(void)
 
 
 #define kvm_phys_to_vttbr(addr)		(addr)
 #define kvm_phys_to_vttbr(addr)		(addr)
 
 
+static inline void kvm_set_ipa_limit(void) {}
+
 static inline bool kvm_cpu_has_cnp(void)
 static inline bool kvm_cpu_has_cnp(void)
 {
 {
 	return false;
 	return false;

+ 32 - 22
arch/arm/include/asm/stage2_pgtable.h

@@ -19,43 +19,53 @@
 #ifndef __ARM_S2_PGTABLE_H_
 #ifndef __ARM_S2_PGTABLE_H_
 #define __ARM_S2_PGTABLE_H_
 #define __ARM_S2_PGTABLE_H_
 
 
-#define stage2_pgd_none(pgd)			pgd_none(pgd)
-#define stage2_pgd_clear(pgd)			pgd_clear(pgd)
-#define stage2_pgd_present(pgd)			pgd_present(pgd)
-#define stage2_pgd_populate(pgd, pud)		pgd_populate(NULL, pgd, pud)
-#define stage2_pud_offset(pgd, address)		pud_offset(pgd, address)
-#define stage2_pud_free(pud)			pud_free(NULL, pud)
-
-#define stage2_pud_none(pud)			pud_none(pud)
-#define stage2_pud_clear(pud)			pud_clear(pud)
-#define stage2_pud_present(pud)			pud_present(pud)
-#define stage2_pud_populate(pud, pmd)		pud_populate(NULL, pud, pmd)
-#define stage2_pmd_offset(pud, address)		pmd_offset(pud, address)
-#define stage2_pmd_free(pmd)			pmd_free(NULL, pmd)
-
-#define stage2_pud_huge(pud)			pud_huge(pud)
+/*
+ * kvm_mmu_cache_min_pages() is the number of pages required
+ * to install a stage-2 translation. We pre-allocate the entry
+ * level table at VM creation. Since we have a 3 level page-table,
+ * we need only two pages to add a new mapping.
+ */
+#define kvm_mmu_cache_min_pages(kvm)	2
+
+#define stage2_pgd_none(kvm, pgd)		pgd_none(pgd)
+#define stage2_pgd_clear(kvm, pgd)		pgd_clear(pgd)
+#define stage2_pgd_present(kvm, pgd)		pgd_present(pgd)
+#define stage2_pgd_populate(kvm, pgd, pud)	pgd_populate(NULL, pgd, pud)
+#define stage2_pud_offset(kvm, pgd, address)	pud_offset(pgd, address)
+#define stage2_pud_free(kvm, pud)		pud_free(NULL, pud)
+
+#define stage2_pud_none(kvm, pud)		pud_none(pud)
+#define stage2_pud_clear(kvm, pud)		pud_clear(pud)
+#define stage2_pud_present(kvm, pud)		pud_present(pud)
+#define stage2_pud_populate(kvm, pud, pmd)	pud_populate(NULL, pud, pmd)
+#define stage2_pmd_offset(kvm, pud, address)	pmd_offset(pud, address)
+#define stage2_pmd_free(kvm, pmd)		pmd_free(NULL, pmd)
+
+#define stage2_pud_huge(kvm, pud)		pud_huge(pud)
 
 
 /* Open coded p*d_addr_end that can deal with 64bit addresses */
 /* Open coded p*d_addr_end that can deal with 64bit addresses */
-static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline phys_addr_t
+stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 {
 {
 	phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK;
 	phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK;
 
 
 	return (boundary - 1 < end - 1) ? boundary : end;
 	return (boundary - 1 < end - 1) ? boundary : end;
 }
 }
 
 
-#define stage2_pud_addr_end(addr, end)		(end)
+#define stage2_pud_addr_end(kvm, addr, end)	(end)
 
 
-static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline phys_addr_t
+stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 {
 {
 	phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK;
 	phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK;
 
 
 	return (boundary - 1 < end - 1) ? boundary : end;
 	return (boundary - 1 < end - 1) ? boundary : end;
 }
 }
 
 
-#define stage2_pgd_index(addr)				pgd_index(addr)
+#define stage2_pgd_index(kvm, addr)		pgd_index(addr)
 
 
-#define stage2_pte_table_empty(ptep)			kvm_page_empty(ptep)
-#define stage2_pmd_table_empty(pmdp)			kvm_page_empty(pmdp)
-#define stage2_pud_table_empty(pudp)			false
+#define stage2_pte_table_empty(kvm, ptep)	kvm_page_empty(ptep)
+#define stage2_pmd_table_empty(kvm, pmdp)	kvm_page_empty(pmdp)
+#define stage2_pud_table_empty(kvm, pudp)	false
 
 
 #endif	/* __ARM_S2_PGTABLE_H_ */
 #endif	/* __ARM_S2_PGTABLE_H_ */

+ 21 - 0
arch/arm64/include/asm/cpufeature.h

@@ -537,6 +537,27 @@ static inline void arm64_set_ssbd_mitigation(bool state) {}
 #endif

 extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
+
+static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
+{
+	switch (parange) {
+	case 0: return 32;
+	case 1: return 36;
+	case 2: return 40;
+	case 3: return 42;
+	case 4: return 44;
+	case 5: return 48;
+	case 6: return 52;
+	/*
+	 * A future PE could use a value unknown to the kernel.
+	 * However, by the "D10.1.4 Principles of the ID scheme
+	 * for fields in ID registers", ARM DDI 0487C.a, any new
+	 * value is guaranteed to be higher than what we know already.
+	 * As a safe limit, we return the limit supported by the kernel.
+	 */
+	default: return CONFIG_ARM64_PA_BITS;
+	}
+}
 #endif /* __ASSEMBLY__ */

 #endif
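
As a quick illustration (an editorial sketch, not from the patch), the helper above is meant to be fed the sanitised ID_AA64MMFR0_EL1.PARange field, mirroring what kvm_set_ipa_limit() does further down in this series:

  /*
   * Sketch of intended use: PARange is taken from the low bits of the
   * sanitised ID_AA64MMFR0_EL1 value. For example an encoding of 2 maps
   * to a 40bit PA; any encoding the kernel does not know about falls
   * back to CONFIG_ARM64_PA_BITS.
   */
  u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
  u32 pa_bits = id_aa64mmfr0_parange_to_phys_shift(mmfr0 & 0x7);
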

+ 125 - 30
arch/arm64/include/asm/kvm_arm.h

@@ -107,6 +107,7 @@
 #define VTCR_EL2_RES1		(1 << 31)
 #define VTCR_EL2_RES1		(1 << 31)
 #define VTCR_EL2_HD		(1 << 22)
 #define VTCR_EL2_HD		(1 << 22)
 #define VTCR_EL2_HA		(1 << 21)
 #define VTCR_EL2_HA		(1 << 21)
+#define VTCR_EL2_PS_SHIFT	TCR_EL2_PS_SHIFT
 #define VTCR_EL2_PS_MASK	TCR_EL2_PS_MASK
 #define VTCR_EL2_PS_MASK	TCR_EL2_PS_MASK
 #define VTCR_EL2_TG0_MASK	TCR_TG0_MASK
 #define VTCR_EL2_TG0_MASK	TCR_TG0_MASK
 #define VTCR_EL2_TG0_4K		TCR_TG0_4K
 #define VTCR_EL2_TG0_4K		TCR_TG0_4K
@@ -120,63 +121,150 @@
 #define VTCR_EL2_IRGN0_WBWA	TCR_IRGN0_WBWA
 #define VTCR_EL2_IRGN0_WBWA	TCR_IRGN0_WBWA
 #define VTCR_EL2_SL0_SHIFT	6
 #define VTCR_EL2_SL0_SHIFT	6
 #define VTCR_EL2_SL0_MASK	(3 << VTCR_EL2_SL0_SHIFT)
 #define VTCR_EL2_SL0_MASK	(3 << VTCR_EL2_SL0_SHIFT)
-#define VTCR_EL2_SL0_LVL1	(1 << VTCR_EL2_SL0_SHIFT)
 #define VTCR_EL2_T0SZ_MASK	0x3f
 #define VTCR_EL2_T0SZ_MASK	0x3f
-#define VTCR_EL2_T0SZ_40B	24
 #define VTCR_EL2_VS_SHIFT	19
 #define VTCR_EL2_VS_SHIFT	19
 #define VTCR_EL2_VS_8BIT	(0 << VTCR_EL2_VS_SHIFT)
 #define VTCR_EL2_VS_8BIT	(0 << VTCR_EL2_VS_SHIFT)
 #define VTCR_EL2_VS_16BIT	(1 << VTCR_EL2_VS_SHIFT)
 #define VTCR_EL2_VS_16BIT	(1 << VTCR_EL2_VS_SHIFT)
 
 
+#define VTCR_EL2_T0SZ(x)	TCR_T0SZ(x)
+
 /*
 /*
  * We configure the Stage-2 page tables to always restrict the IPA space to be
  * We configure the Stage-2 page tables to always restrict the IPA space to be
  * 40 bits wide (T0SZ = 24).  Systems with a PARange smaller than 40 bits are
  * 40 bits wide (T0SZ = 24).  Systems with a PARange smaller than 40 bits are
  * not known to exist and will break with this configuration.
  * not known to exist and will break with this configuration.
  *
  *
- * VTCR_EL2.PS is extracted from ID_AA64MMFR0_EL1.PARange at boot time
- * (see hyp-init.S).
+ * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2().
  *
  *
  * Note that when using 4K pages, we concatenate two first level page tables
  * Note that when using 4K pages, we concatenate two first level page tables
  * together. With 16K pages, we concatenate 16 first level page tables.
  * together. With 16K pages, we concatenate 16 first level page tables.
  *
  *
- * The magic numbers used for VTTBR_X in this patch can be found in Tables
- * D4-23 and D4-25 in ARM DDI 0487A.b.
  */
  */
 
 
-#define VTCR_EL2_T0SZ_IPA	VTCR_EL2_T0SZ_40B
 #define VTCR_EL2_COMMON_BITS	(VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \
 #define VTCR_EL2_COMMON_BITS	(VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \
 				 VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1)
 				 VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1)
 
 
-#ifdef CONFIG_ARM64_64K_PAGES
 /*
 /*
- * Stage2 translation configuration:
- * 64kB pages (TG0 = 1)
- * 2 level page tables (SL = 1)
+ * VTCR_EL2:SL0 indicates the entry level for Stage2 translation.
+ * Interestingly, it depends on the page size.
+ * See D.10.2.121, VTCR_EL2, in ARM DDI 0487C.a
+ *
+ *	-----------------------------------------
+ *	| Entry level		|  4K  | 16K/64K |
+ *	------------------------------------------
+ *	| Level: 0		|  2   |   -     |
+ *	------------------------------------------
+ *	| Level: 1		|  1   |   2     |
+ *	------------------------------------------
+ *	| Level: 2		|  0   |   1     |
+ *	------------------------------------------
+ *	| Level: 3		|  -   |   0     |
+ *	------------------------------------------
+ *
+ * The table roughly translates to :
+ *
+ *	SL0(PAGE_SIZE, Entry_level) = TGRAN_SL0_BASE - Entry_Level
+ *
+ * Where TGRAN_SL0_BASE is a magic number depending on the page size:
+ * 	TGRAN_SL0_BASE(4K) = 2
+ *	TGRAN_SL0_BASE(16K) = 3
+ *	TGRAN_SL0_BASE(64K) = 3
+ * provided we take care of ruling out the unsupported cases and
+ * Entry_Level = 4 - Number_of_levels.
+ *
  */
  */
-#define VTCR_EL2_TGRAN_FLAGS		(VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1)
-#define VTTBR_X_TGRAN_MAGIC		38
+#ifdef CONFIG_ARM64_64K_PAGES
+
+#define VTCR_EL2_TGRAN			VTCR_EL2_TG0_64K
+#define VTCR_EL2_TGRAN_SL0_BASE		3UL
+
 #elif defined(CONFIG_ARM64_16K_PAGES)
 #elif defined(CONFIG_ARM64_16K_PAGES)
-/*
- * Stage2 translation configuration:
- * 16kB pages (TG0 = 2)
- * 2 level page tables (SL = 1)
- */
-#define VTCR_EL2_TGRAN_FLAGS		(VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1)
-#define VTTBR_X_TGRAN_MAGIC		42
+
+#define VTCR_EL2_TGRAN			VTCR_EL2_TG0_16K
+#define VTCR_EL2_TGRAN_SL0_BASE		3UL
+
 #else	/* 4K */
 #else	/* 4K */
-/*
- * Stage2 translation configuration:
- * 4kB pages (TG0 = 0)
- * 3 level page tables (SL = 1)
- */
-#define VTCR_EL2_TGRAN_FLAGS		(VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1)
-#define VTTBR_X_TGRAN_MAGIC		37
+
+#define VTCR_EL2_TGRAN			VTCR_EL2_TG0_4K
+#define VTCR_EL2_TGRAN_SL0_BASE		2UL
+
 #endif
 #endif
 
 
-#define VTCR_EL2_FLAGS			(VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS)
-#define VTTBR_X				(VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA)
+#define VTCR_EL2_LVLS_TO_SL0(levels)	\
+	((VTCR_EL2_TGRAN_SL0_BASE - (4 - (levels))) << VTCR_EL2_SL0_SHIFT)
+#define VTCR_EL2_SL0_TO_LVLS(sl0)	\
+	((sl0) + 4 - VTCR_EL2_TGRAN_SL0_BASE)
+#define VTCR_EL2_LVLS(vtcr)		\
+	VTCR_EL2_SL0_TO_LVLS(((vtcr) & VTCR_EL2_SL0_MASK) >> VTCR_EL2_SL0_SHIFT)
+
+#define VTCR_EL2_FLAGS			(VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN)
+#define VTCR_EL2_IPA(vtcr)		(64 - ((vtcr) & VTCR_EL2_T0SZ_MASK))
+
+/*
+ * ARM VMSAv8-64 defines an algorithm for finding the translation table
+ * descriptors in section D4.2.8 in ARM DDI 0487C.a.
+ *
+ * The algorithm defines the expectations on the translation table
+ * addresses for each level, based on PAGE_SIZE, entry level
+ * and the translation table size (T0SZ). The variable "x" in the
+ * algorithm determines the alignment of a table base address at a given
+ * level and thus determines the alignment of VTTBR:BADDR for stage2
+ * page table entry level.
+ * Since the number of bits resolved at the entry level could vary
+ * depending on the T0SZ, the value of "x" is defined based on a
+ * Magic constant for a given PAGE_SIZE and Entry Level. The
+ * intermediate levels must be always aligned to the PAGE_SIZE (i.e,
+ * x = PAGE_SHIFT).
+ *
+ * The value of "x" for entry level is calculated as :
+ *    x = Magic_N - T0SZ
+ *
+ * where Magic_N is an integer depending on the page size and the entry
+ * level of the page table as below:
+ *
+ *	--------------------------------------------
+ *	| Entry level		|  4K    16K   64K |
+ *	--------------------------------------------
+ *	| Level: 0 (4 levels)	| 28   |  -  |  -  |
+ *	--------------------------------------------
+ *	| Level: 1 (3 levels)	| 37   | 31  | 25  |
+ *	--------------------------------------------
+ *	| Level: 2 (2 levels)	| 46   | 42  | 38  |
+ *	--------------------------------------------
+ *	| Level: 3 (1 level)	| -    | 53  | 51  |
+ *	--------------------------------------------
+ *
+ * We have a magic formula for the Magic_N below:
+ *
+ *  Magic_N(PAGE_SIZE, Level) = 64 - ((PAGE_SHIFT - 3) * Number_of_levels)
+ *
+ * where Number_of_levels = (4 - Level). We are only interested in the
+ * value for Entry_Level for the stage2 page table.
+ *
+ * So, given that T0SZ = (64 - IPA_SHIFT), we can compute 'x' as follows:
+ *
+ *	x = (64 - ((PAGE_SHIFT - 3) * Number_of_levels)) - (64 - IPA_SHIFT)
+ *	  = IPA_SHIFT - ((PAGE_SHIFT - 3) * Number of levels)
+ *
+ * Here is one way to explain the Magic Formula:
+ *
+ *  x = log2(Size_of_Entry_Level_Table)
+ *
+ * Since, we can resolve (PAGE_SHIFT - 3) bits at each level, and another
+ * PAGE_SHIFT bits in the PTE, we have :
+ *
+ *  Bits_Entry_level = IPA_SHIFT - ((PAGE_SHIFT - 3) * (n - 1) + PAGE_SHIFT)
+ *		     = IPA_SHIFT - (PAGE_SHIFT - 3) * n - 3
+ *  where n = number of levels, and since each pointer is 8bytes, we have:
+ *
+ *  x = Bits_Entry_Level + 3
+ *    = IPA_SHIFT - (PAGE_SHIFT - 3) * n
+ *
+ * The only constraint here is that, we have to find the number of page table
+ * levels for a given IPA size (which we do, see stage2_pt_levels())
+ */
+#define ARM64_VTTBR_X(ipa, levels)	((ipa) - ((levels) * (PAGE_SHIFT - 3)))
 
 
 #define VTTBR_CNP_BIT     (UL(1))
 #define VTTBR_CNP_BIT     (UL(1))
-#define VTTBR_BADDR_MASK  (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_X)
 #define VTTBR_VMID_SHIFT  (UL(48))
 #define VTTBR_VMID_SHIFT  (UL(48))
 #define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT)
 #define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT)
 
 
@@ -224,6 +312,13 @@
 
 
 /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
 /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
 #define HPFAR_MASK	(~UL(0xf))
 #define HPFAR_MASK	(~UL(0xf))
+/*
+ * We have
+ *	PAR	[PA_Shift - 1	: 12] = PA	[PA_Shift - 1 : 12]
+ *	HPFAR	[PA_Shift - 9	: 4]  = FIPA	[PA_Shift - 1 : 12]
+ */
+#define PAR_TO_HPFAR(par)		\
+	(((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8)
 
 
 #define kvm_arm_exception_type	\
 #define kvm_arm_exception_type	\
 	{0, "IRQ" }, 		\
 	{0, "IRQ" }, 		\

+ 1 - 2
arch/arm64/include/asm/kvm_asm.h

@@ -30,6 +30,7 @@
 #define ARM_EXCEPTION_IRQ	  0
 #define ARM_EXCEPTION_IRQ	  0
 #define ARM_EXCEPTION_EL1_SERROR  1
 #define ARM_EXCEPTION_EL1_SERROR  1
 #define ARM_EXCEPTION_TRAP	  2
 #define ARM_EXCEPTION_TRAP	  2
+#define ARM_EXCEPTION_IL	  3
 /* The hyp-stub will return this for any kvm_call_hyp() call */
 /* The hyp-stub will return this for any kvm_call_hyp() call */
 #define ARM_EXCEPTION_HYP_GONE	  HVC_STUB_ERR
 #define ARM_EXCEPTION_HYP_GONE	  HVC_STUB_ERR
 
 
@@ -72,8 +73,6 @@ extern void __vgic_v3_init_lrs(void);
 
 
 extern u32 __kvm_get_mdcr_el2(void);
 extern u32 __kvm_get_mdcr_el2(void);
 
 
-extern u32 __init_stage2_translation(void);
-
 /* Home-grown __this_cpu_{ptr,read} variants that always work at HYP */
 /* Home-grown __this_cpu_{ptr,read} variants that always work at HYP */
 #define __hyp_this_cpu_ptr(sym)						\
 #define __hyp_this_cpu_ptr(sym)						\
 	({								\
 	({								\

+ 9 - 9
arch/arm64/include/asm/kvm_host.h

@@ -53,7 +53,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
 
 
 int __attribute_const__ kvm_target_cpu(void);
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
-int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext);
+int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext);
 void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
 void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
 
 
 struct kvm_arch {
 struct kvm_arch {
@@ -61,11 +61,13 @@ struct kvm_arch {
 	u64    vmid_gen;
 	u64    vmid_gen;
 	u32    vmid;
 	u32    vmid;
 
 
-	/* 1-level 2nd stage table, protected by kvm->mmu_lock */
+	/* stage2 entry level table */
 	pgd_t *pgd;
 	pgd_t *pgd;
 
 
 	/* VTTBR value associated with above pgd and vmid */
 	/* VTTBR value associated with above pgd and vmid */
 	u64    vttbr;
 	u64    vttbr;
+	/* VTCR_EL2 value for this VM */
+	u64    vtcr;
 
 
 	/* The last vcpu id that ran on each physical CPU */
 	/* The last vcpu id that ran on each physical CPU */
 	int __percpu *last_vcpu_ran;
 	int __percpu *last_vcpu_ran;
@@ -451,13 +453,7 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
 int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
 int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
 			       struct kvm_device_attr *attr);
 			       struct kvm_device_attr *attr);
 
 
-static inline void __cpu_init_stage2(void)
-{
-	u32 parange = kvm_call_hyp(__init_stage2_translation);
-
-	WARN_ONCE(parange < 40,
-		  "PARange is %d bits, unsupported configuration!", parange);
-}
+static inline void __cpu_init_stage2(void) {}
 
 
 /* Guest/host FPSIMD coordination helpers */
 /* Guest/host FPSIMD coordination helpers */
 int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
 int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
@@ -520,8 +516,12 @@ static inline int kvm_arm_have_ssbd(void)
 void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu);
 void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu);
 void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu);
 void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu);
 
 
+void kvm_set_ipa_limit(void);
+
 #define __KVM_HAVE_ARCH_VM_ALLOC
 #define __KVM_HAVE_ARCH_VM_ALLOC
 struct kvm *kvm_arch_alloc_vm(void);
 struct kvm *kvm_arch_alloc_vm(void);
 void kvm_arch_free_vm(struct kvm *kvm);
 void kvm_arch_free_vm(struct kvm *kvm);
 
 
+int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
+
 #endif /* __ARM64_KVM_HOST_H__ */
 #endif /* __ARM64_KVM_HOST_H__ */

+ 10 - 0
arch/arm64/include/asm/kvm_hyp.h

@@ -155,5 +155,15 @@ void deactivate_traps_vhe_put(void);
 u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
 u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
 void __noreturn __hyp_do_panic(unsigned long, ...);
 void __noreturn __hyp_do_panic(unsigned long, ...);
 
 
+/*
+ * Must be called from hyp code running at EL2 with an updated VTTBR
+ * and interrupts disabled.
+ */
+static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm)
+{
+	write_sysreg(kvm->arch.vtcr, vtcr_el2);
+	write_sysreg(kvm->arch.vttbr, vttbr_el2);
+}
+
 #endif /* __ARM64_KVM_HYP_H__ */
 #endif /* __ARM64_KVM_HYP_H__ */
 
 

+ 34 - 8
arch/arm64/include/asm/kvm_mmu.h

@@ -141,8 +141,16 @@ static inline unsigned long __kern_hyp_va(unsigned long v)
  * We currently only support a 40bit IPA.
  * We currently only support a 40bit IPA.
  */
  */
 #define KVM_PHYS_SHIFT	(40)
 #define KVM_PHYS_SHIFT	(40)
-#define KVM_PHYS_SIZE	(1UL << KVM_PHYS_SHIFT)
-#define KVM_PHYS_MASK	(KVM_PHYS_SIZE - 1UL)
+
+#define kvm_phys_shift(kvm)		VTCR_EL2_IPA(kvm->arch.vtcr)
+#define kvm_phys_size(kvm)		(_AC(1, ULL) << kvm_phys_shift(kvm))
+#define kvm_phys_mask(kvm)		(kvm_phys_size(kvm) - _AC(1, ULL))
+
+static inline bool kvm_page_empty(void *ptr)
+{
+	struct page *ptr_page = virt_to_page(ptr);
+	return page_count(ptr_page) == 1;
+}
 
 
 #include <asm/stage2_pgtable.h>
 #include <asm/stage2_pgtable.h>
 
 
@@ -238,12 +246,6 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp)
 	return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
 	return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
 }
 }
 
 
-static inline bool kvm_page_empty(void *ptr)
-{
-	struct page *ptr_page = virt_to_page(ptr);
-	return page_count(ptr_page) == 1;
-}
-
 #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
 #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
 
 
 #ifdef __PAGETABLE_PMD_FOLDED
 #ifdef __PAGETABLE_PMD_FOLDED
@@ -517,6 +519,30 @@ static inline int hyp_map_aux_data(void)
 
 
 #define kvm_phys_to_vttbr(addr)		phys_to_ttbr(addr)
 #define kvm_phys_to_vttbr(addr)		phys_to_ttbr(addr)
 
 
+/*
+ * Get the magic number 'x' for VTTBR:BADDR of this KVM instance.
+ * With v8.2 LVA extensions, 'x' should be a minimum of 6 with
+ * 52bit IPS.
+ */
+static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels)
+{
+	int x = ARM64_VTTBR_X(ipa_shift, levels);
+
+	return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x;
+}
+
+static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels)
+{
+	unsigned int x = arm64_vttbr_x(ipa_shift, levels);
+
+	return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x);
+}
+
+static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm)
+{
+	return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm));
+}
+
 static inline bool kvm_cpu_has_cnp(void)
 static inline bool kvm_cpu_has_cnp(void)
 {
 {
 	return system_supports_cnp();
 	return system_supports_cnp();

+ 3 - 0
arch/arm64/include/asm/ptrace.h

@@ -25,6 +25,9 @@
 #define CurrentEL_EL1		(1 << 2)
 #define CurrentEL_EL1		(1 << 2)
 #define CurrentEL_EL2		(2 << 2)
 #define CurrentEL_EL2		(2 << 2)
 
 
+/* Additional SPSR bits not exposed in the UABI */
+#define PSR_IL_BIT		(1 << 20)
+
 /* AArch32-specific ptrace requests */
 /* AArch32-specific ptrace requests */
 #define COMPAT_PTRACE_GETREGS		12
 #define COMPAT_PTRACE_GETREGS		12
 #define COMPAT_PTRACE_SETREGS		13
 #define COMPAT_PTRACE_SETREGS		13

+ 0 - 42
arch/arm64/include/asm/stage2_pgtable-nopmd.h

@@ -1,42 +0,0 @@
-/*
- * Copyright (C) 2016 - ARM Ltd
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __ARM64_S2_PGTABLE_NOPMD_H_
-#define __ARM64_S2_PGTABLE_NOPMD_H_
-
-#include <asm/stage2_pgtable-nopud.h>
-
-#define __S2_PGTABLE_PMD_FOLDED
-
-#define S2_PMD_SHIFT		S2_PUD_SHIFT
-#define S2_PTRS_PER_PMD		1
-#define S2_PMD_SIZE		(1UL << S2_PMD_SHIFT)
-#define S2_PMD_MASK		(~(S2_PMD_SIZE-1))
-
-#define stage2_pud_none(pud)			(0)
-#define stage2_pud_present(pud)			(1)
-#define stage2_pud_clear(pud)			do { } while (0)
-#define stage2_pud_populate(pud, pmd)		do { } while (0)
-#define stage2_pmd_offset(pud, address)		((pmd_t *)(pud))
-
-#define stage2_pmd_free(pmd)			do { } while (0)
-
-#define stage2_pmd_addr_end(addr, end)		(end)
-
-#define stage2_pud_huge(pud)			(0)
-#define stage2_pmd_table_empty(pmdp)		(0)
-
-#endif

+ 0 - 39
arch/arm64/include/asm/stage2_pgtable-nopud.h

@@ -1,39 +0,0 @@
-/*
- * Copyright (C) 2016 - ARM Ltd
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __ARM64_S2_PGTABLE_NOPUD_H_
-#define __ARM64_S2_PGTABLE_NOPUD_H_
-
-#define __S2_PGTABLE_PUD_FOLDED
-
-#define S2_PUD_SHIFT		S2_PGDIR_SHIFT
-#define S2_PTRS_PER_PUD		1
-#define S2_PUD_SIZE		(_AC(1, UL) << S2_PUD_SHIFT)
-#define S2_PUD_MASK		(~(S2_PUD_SIZE-1))
-
-#define stage2_pgd_none(pgd)			(0)
-#define stage2_pgd_present(pgd)			(1)
-#define stage2_pgd_clear(pgd)			do { } while (0)
-#define stage2_pgd_populate(pgd, pud)	do { } while (0)
-
-#define stage2_pud_offset(pgd, address)		((pud_t *)(pgd))
-
-#define stage2_pud_free(x)			do { } while (0)
-
-#define stage2_pud_addr_end(addr, end)		(end)
-#define stage2_pud_table_empty(pmdp)		(0)
-
-#endif

+ 170 - 66
arch/arm64/include/asm/stage2_pgtable.h

@@ -19,8 +19,16 @@
 #ifndef __ARM64_S2_PGTABLE_H_
 #ifndef __ARM64_S2_PGTABLE_H_
 #define __ARM64_S2_PGTABLE_H_
 #define __ARM64_S2_PGTABLE_H_
 
 
+#include <linux/hugetlb.h>
 #include <asm/pgtable.h>
 #include <asm/pgtable.h>
 
 
+/*
+ * PGDIR_SHIFT determines the size a top-level page table entry can map
+ * and depends on the number of levels in the page table. Compute the
+ * PGDIR_SHIFT for a given number of levels.
+ */
+#define pt_levels_pgdir_shift(lvls)	ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls))
+
 /*
 /*
  * The hardware supports concatenation of up to 16 tables at stage2 entry level
  * The hardware supports concatenation of up to 16 tables at stage2 entry level
  * and we use the feature whenever possible.
  * and we use the feature whenever possible.
@@ -29,112 +37,208 @@
  * On arm64, the smallest PAGE_SIZE supported is 4k, which means
  * On arm64, the smallest PAGE_SIZE supported is 4k, which means
  *             (PAGE_SHIFT - 3) > 4 holds for all page sizes.
  *             (PAGE_SHIFT - 3) > 4 holds for all page sizes.
  * This implies, the total number of page table levels at stage2 expected
  * This implies, the total number of page table levels at stage2 expected
- * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4)
+ * by the hardware is actually the number of levels required for (IPA_SHIFT - 4)
  * in normal translations(e.g, stage1), since we cannot have another level in
  * in normal translations(e.g, stage1), since we cannot have another level in
- * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4).
+ * the range (IPA_SHIFT, IPA_SHIFT - 4).
  */
  */
-#define STAGE2_PGTABLE_LEVELS		ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4)
+#define stage2_pgtable_levels(ipa)	ARM64_HW_PGTABLE_LEVELS((ipa) - 4)
+#define kvm_stage2_levels(kvm)		VTCR_EL2_LVLS(kvm->arch.vtcr)
 
 
-/*
- * With all the supported VA_BITs and 40bit guest IPA, the following condition
- * is always true:
- *
- *       STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS
- *
- * We base our stage-2 page table walker helpers on this assumption and
- * fall back to using the host version of the helper wherever possible.
- * i.e, if a particular level is not folded (e.g, PUD) at stage2, we fall back
- * to using the host version, since it is guaranteed it is not folded at host.
- *
- * If the condition breaks in the future, we can rearrange the host level
- * definitions and reuse them for stage2. Till then...
- */
-#if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS
-#error "Unsupported combination of guest IPA and host VA_BITS."
-#endif
-
-/* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */
-#define S2_PGDIR_SHIFT			ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS)
-#define S2_PGDIR_SIZE			(_AC(1, UL) << S2_PGDIR_SHIFT)
-#define S2_PGDIR_MASK			(~(S2_PGDIR_SIZE - 1))
+/* stage2_pgdir_shift() is the size mapped by top-level stage2 entry for the VM */
+#define stage2_pgdir_shift(kvm)		pt_levels_pgdir_shift(kvm_stage2_levels(kvm))
+#define stage2_pgdir_size(kvm)		(1ULL << stage2_pgdir_shift(kvm))
+#define stage2_pgdir_mask(kvm)		~(stage2_pgdir_size(kvm) - 1)
 
 
 /*
 /*
  * The number of PTRS across all concatenated stage2 tables given by the
  * The number of PTRS across all concatenated stage2 tables given by the
  * number of bits resolved at the initial level.
  * number of bits resolved at the initial level.
+ * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA),
+ * in which case, stage2_pgd_ptrs will have one entry.
  */
  */
-#define PTRS_PER_S2_PGD			(1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT))
+#define pgd_ptrs_shift(ipa, pgdir_shift)	\
+	((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0)
+#define __s2_pgd_ptrs(ipa, lvls)		\
+	(1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls))))
+#define __s2_pgd_size(ipa, lvls)	(__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t))
+
+#define stage2_pgd_ptrs(kvm)		__s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
+#define stage2_pgd_size(kvm)		__s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
 
 
 /*
 /*
- * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation
- * levels in addition to the PGD.
+ * kvm_mmu_cache_min_pages() is the number of pages required to install
+ * a stage-2 translation. We pre-allocate the entry level page table at
+ * the VM creation.
  */
  */
-#define KVM_MMU_CACHE_MIN_PAGES		(STAGE2_PGTABLE_LEVELS - 1)
+#define kvm_mmu_cache_min_pages(kvm)	(kvm_stage2_levels(kvm) - 1)
 
 
-
-#if STAGE2_PGTABLE_LEVELS > 3
+/* Stage2 PUD definitions when the level is present */
+static inline bool kvm_stage2_has_pud(struct kvm *kvm)
+{
+	return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3);
+}
 
 
 #define S2_PUD_SHIFT			ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
 #define S2_PUD_SHIFT			ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
-#define S2_PUD_SIZE			(_AC(1, UL) << S2_PUD_SHIFT)
+#define S2_PUD_SIZE			(1UL << S2_PUD_SHIFT)
 #define S2_PUD_MASK			(~(S2_PUD_SIZE - 1))
 #define S2_PUD_MASK			(~(S2_PUD_SIZE - 1))
 
 
-#define stage2_pgd_none(pgd)				pgd_none(pgd)
-#define stage2_pgd_clear(pgd)				pgd_clear(pgd)
-#define stage2_pgd_present(pgd)				pgd_present(pgd)
-#define stage2_pgd_populate(pgd, pud)			pgd_populate(NULL, pgd, pud)
-#define stage2_pud_offset(pgd, address)			pud_offset(pgd, address)
-#define stage2_pud_free(pud)				pud_free(NULL, pud)
+static inline bool stage2_pgd_none(struct kvm *kvm, pgd_t pgd)
+{
+	if (kvm_stage2_has_pud(kvm))
+		return pgd_none(pgd);
+	else
+		return 0;
+}
 
 
-#define stage2_pud_table_empty(pudp)			kvm_page_empty(pudp)
+static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp)
+{
+	if (kvm_stage2_has_pud(kvm))
+		pgd_clear(pgdp);
+}
 
 
-static inline phys_addr_t stage2_pud_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline bool stage2_pgd_present(struct kvm *kvm, pgd_t pgd)
 {
 {
-	phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
+	if (kvm_stage2_has_pud(kvm))
+		return pgd_present(pgd);
+	else
+		return 1;
+}
 
 
-	return (boundary - 1 < end - 1) ? boundary : end;
+static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t *pgd, pud_t *pud)
+{
+	if (kvm_stage2_has_pud(kvm))
+		pgd_populate(NULL, pgd, pud);
+}
+
+static inline pud_t *stage2_pud_offset(struct kvm *kvm,
+				       pgd_t *pgd, unsigned long address)
+{
+	if (kvm_stage2_has_pud(kvm))
+		return pud_offset(pgd, address);
+	else
+		return (pud_t *)pgd;
 }
 }
 
 
-#endif		/* STAGE2_PGTABLE_LEVELS > 3 */
+static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud)
+{
+	if (kvm_stage2_has_pud(kvm))
+		pud_free(NULL, pud);
+}
 
 
+static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp)
+{
+	if (kvm_stage2_has_pud(kvm))
+		return kvm_page_empty(pudp);
+	else
+		return false;
+}
 
 
-#if STAGE2_PGTABLE_LEVELS > 2
+static inline phys_addr_t
+stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+{
+	if (kvm_stage2_has_pud(kvm)) {
+		phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
+
+		return (boundary - 1 < end - 1) ? boundary : end;
+	} else {
+		return end;
+	}
+}
+
+/* Stage2 PMD definitions when the level is present */
+static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
+{
+	return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2);
+}
 
 
 #define S2_PMD_SHIFT			ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
 #define S2_PMD_SHIFT			ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
-#define S2_PMD_SIZE			(_AC(1, UL) << S2_PMD_SHIFT)
+#define S2_PMD_SIZE			(1UL << S2_PMD_SHIFT)
 #define S2_PMD_MASK			(~(S2_PMD_SIZE - 1))
 #define S2_PMD_MASK			(~(S2_PMD_SIZE - 1))
 
 
-#define stage2_pud_none(pud)				pud_none(pud)
-#define stage2_pud_clear(pud)				pud_clear(pud)
-#define stage2_pud_present(pud)				pud_present(pud)
-#define stage2_pud_populate(pud, pmd)			pud_populate(NULL, pud, pmd)
-#define stage2_pmd_offset(pud, address)			pmd_offset(pud, address)
-#define stage2_pmd_free(pmd)				pmd_free(NULL, pmd)
+static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud)
+{
+	if (kvm_stage2_has_pmd(kvm))
+		return pud_none(pud);
+	else
+		return 0;
+}
+
+static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud)
+{
+	if (kvm_stage2_has_pmd(kvm))
+		pud_clear(pud);
+}
 
 
-#define stage2_pud_huge(pud)				pud_huge(pud)
-#define stage2_pmd_table_empty(pmdp)			kvm_page_empty(pmdp)
+static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud)
+{
+	if (kvm_stage2_has_pmd(kvm))
+		return pud_present(pud);
+	else
+		return 1;
+}
 
 
-static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd)
 {
 {
-	phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
+	if (kvm_stage2_has_pmd(kvm))
+		pud_populate(NULL, pud, pmd);
+}
 
 
-	return (boundary - 1 < end - 1) ? boundary : end;
+static inline pmd_t *stage2_pmd_offset(struct kvm *kvm,
+				       pud_t *pud, unsigned long address)
+{
+	if (kvm_stage2_has_pmd(kvm))
+		return pmd_offset(pud, address);
+	else
+		return (pmd_t *)pud;
 }
 }
 
 
-#endif		/* STAGE2_PGTABLE_LEVELS > 2 */
+static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd)
+{
+	if (kvm_stage2_has_pmd(kvm))
+		pmd_free(NULL, pmd);
+}
+
+static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud)
+{
+	if (kvm_stage2_has_pmd(kvm))
+		return pud_huge(pud);
+	else
+		return 0;
+}
+
+static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp)
+{
+	if (kvm_stage2_has_pmd(kvm))
+		return kvm_page_empty(pmdp);
+	else
+		return 0;
+}
 
 
-#define stage2_pte_table_empty(ptep)			kvm_page_empty(ptep)
+static inline phys_addr_t
+stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+{
+	if (kvm_stage2_has_pmd(kvm)) {
+		phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
 
 
-#if STAGE2_PGTABLE_LEVELS == 2
-#include <asm/stage2_pgtable-nopmd.h>
-#elif STAGE2_PGTABLE_LEVELS == 3
-#include <asm/stage2_pgtable-nopud.h>
-#endif
+		return (boundary - 1 < end - 1) ? boundary : end;
+	} else {
+		return end;
+	}
+}
 
 
+static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep)
+{
+	return kvm_page_empty(ptep);
+}
 
 
-#define stage2_pgd_index(addr)				(((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1))
+static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr)
+{
+	return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1));
+}
 
 
-static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline phys_addr_t
+stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 {
 {
-	phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK;
+	phys_addr_t boundary = (addr + stage2_pgdir_size(kvm)) & stage2_pgdir_mask(kvm);
 
 
 	return (boundary - 1 < end - 1) ? boundary : end;
 	return (boundary - 1 < end - 1) ? boundary : end;
 }
 }
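
For concreteness, here is what the per-VM helpers above evaluate to for the default configuration (an editorial sketch assuming 4K pages and the 40bit KVM_PHYS_SHIFT default):

  /*
   * Worked example: 4K pages, 40bit IPA.
   *
   *   kvm_stage2_levels(kvm)       = stage2_pgtable_levels(40) = 3
   *   stage2_pgdir_shift(kvm)      = ARM64_HW_PGTABLE_LEVEL_SHIFT(1) = 30
   *   stage2_pgd_ptrs(kvm)         = 1 << (40 - 30) = 1024 entries
   *   stage2_pgd_size(kvm)         = 1024 * sizeof(pgd_t) = 8K
   *                                  (i.e. two concatenated entry-level pages)
   *   kvm_mmu_cache_min_pages(kvm) = 3 - 1 = 2
   */
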

+ 3 - 3
arch/arm64/kvm/guest.c

@@ -391,15 +391,15 @@ int __attribute_const__ kvm_target_cpu(void)
 			return KVM_ARM_TARGET_CORTEX_A53;
 			return KVM_ARM_TARGET_CORTEX_A53;
 		case ARM_CPU_PART_CORTEX_A57:
 		case ARM_CPU_PART_CORTEX_A57:
 			return KVM_ARM_TARGET_CORTEX_A57;
 			return KVM_ARM_TARGET_CORTEX_A57;
-		};
+		}
 		break;
 		break;
 	case ARM_CPU_IMP_APM:
 	case ARM_CPU_IMP_APM:
 		switch (part_number) {
 		switch (part_number) {
 		case APM_CPU_PART_POTENZA:
 		case APM_CPU_PART_POTENZA:
 			return KVM_ARM_TARGET_XGENE_POTENZA;
 			return KVM_ARM_TARGET_XGENE_POTENZA;
-		};
+		}
 		break;
 		break;
-	};
+	}
 
 
 	/* Return a default generic target */
 	/* Return a default generic target */
 	return KVM_ARM_TARGET_GENERIC_V8;
 	return KVM_ARM_TARGET_GENERIC_V8;

+ 7 - 0
arch/arm64/kvm/handle_exit.c

@@ -284,6 +284,13 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		 */
 		 */
 		run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		return 0;
 		return 0;
+	case ARM_EXCEPTION_IL:
+		/*
+		 * We attempted an illegal exception return.  Guest state must
+		 * have been corrupted somehow.  Give up.
+		 */
+		run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+		return -EINVAL;
 	default:
 	default:
 		kvm_pr_unimpl("Unsupported exception type: %d",
 		kvm_pr_unimpl("Unsupported exception type: %d",
 			      exception_index);
 			      exception_index);

+ 0 - 1
arch/arm64/kvm/hyp/Makefile

@@ -19,7 +19,6 @@ obj-$(CONFIG_KVM_ARM_HOST) += switch.o
 obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
 obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
 obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
 obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
 obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
 obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
-obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o
 
 
 # KVM code is run at a different exception code with a different map, so
 # KVM code is run at a different exception code with a different map, so
 # compiler instrumentation that inserts callbacks or checks into the code may
 # compiler instrumentation that inserts callbacks or checks into the code may

+ 15 - 1
arch/arm64/kvm/hyp/hyp-entry.S

@@ -162,6 +162,20 @@ el1_error:
 	mov	x0, #ARM_EXCEPTION_EL1_SERROR
 	mov	x0, #ARM_EXCEPTION_EL1_SERROR
 	b	__guest_exit
 	b	__guest_exit
 
 
+el2_sync:
+	/* Check for illegal exception return, otherwise panic */
+	mrs	x0, spsr_el2
+
+	/* if this was something else, then panic! */
+	tst	x0, #PSR_IL_BIT
+	b.eq	__hyp_panic
+
+	/* Let's attempt a recovery from the illegal exception return */
+	get_vcpu_ptr	x1, x0
+	mov	x0, #ARM_EXCEPTION_IL
+	b	__guest_exit
+
+
 el2_error:
 el2_error:
 	ldp	x0, x1, [sp], #16
 	ldp	x0, x1, [sp], #16
 
 
@@ -240,7 +254,7 @@ ENTRY(__kvm_hyp_vector)
 	invalid_vect	el2t_fiq_invalid	// FIQ EL2t
 	invalid_vect	el2t_fiq_invalid	// FIQ EL2t
 	invalid_vect	el2t_error_invalid	// Error EL2t
 	invalid_vect	el2t_error_invalid	// Error EL2t
 
 
-	invalid_vect	el2h_sync_invalid	// Synchronous EL2h
+	valid_vect	el2_sync		// Synchronous EL2h
 	invalid_vect	el2h_irq_invalid	// IRQ EL2h
 	invalid_vect	el2h_irq_invalid	// IRQ EL2h
 	invalid_vect	el2h_fiq_invalid	// FIQ EL2h
 	invalid_vect	el2h_fiq_invalid	// FIQ EL2h
 	valid_vect	el2_error		// Error EL2h
 	valid_vect	el2_error		// Error EL2h

+ 0 - 90
arch/arm64/kvm/hyp/s2-setup.c

@@ -1,90 +0,0 @@
-/*
- * Copyright (C) 2016 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/types.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_hyp.h>
-
-u32 __hyp_text __init_stage2_translation(void)
-{
-	u64 val = VTCR_EL2_FLAGS;
-	u64 parange;
-	u64 tmp;
-
-	/*
-	 * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS
-	 * bits in VTCR_EL2. Amusingly, the PARange is 4 bits, while
-	 * PS is only 3. Fortunately, bit 19 is RES0 in VTCR_EL2...
-	 */
-	parange = read_sysreg(id_aa64mmfr0_el1) & 7;
-	if (parange > ID_AA64MMFR0_PARANGE_MAX)
-		parange = ID_AA64MMFR0_PARANGE_MAX;
-	val |= parange << 16;
-
-	/* Compute the actual PARange... */
-	switch (parange) {
-	case 0:
-		parange = 32;
-		break;
-	case 1:
-		parange = 36;
-		break;
-	case 2:
-		parange = 40;
-		break;
-	case 3:
-		parange = 42;
-		break;
-	case 4:
-		parange = 44;
-		break;
-	case 5:
-	default:
-		parange = 48;
-		break;
-	}
-
-	/*
-	 * ... and clamp it to 40 bits, unless we have some braindead
-	 * HW that implements less than that. In all cases, we'll
-	 * return that value for the rest of the kernel to decide what
-	 * to do.
-	 */
-	val |= 64 - (parange > 40 ? 40 : parange);
-
-	/*
-	 * Check the availability of Hardware Access Flag / Dirty Bit
-	 * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2.
-	 */
-	tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf;
-	if (tmp)
-		val |= VTCR_EL2_HA;
-
-	/*
-	 * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS
-	 * bit in VTCR_EL2.
-	 */
-	tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_VMIDBITS_SHIFT) & 0xf;
-	val |= (tmp == ID_AA64MMFR1_VMIDBITS_16) ?
-			VTCR_EL2_VS_16BIT :
-			VTCR_EL2_VS_8BIT;
-
-	write_sysreg(val, vtcr_el2);
-
-	return parange;
-}

+ 2 - 2
arch/arm64/kvm/hyp/switch.c

@@ -198,7 +198,7 @@ void deactivate_traps_vhe_put(void)
 
 
 static void __hyp_text __activate_vm(struct kvm *kvm)
 static void __hyp_text __activate_vm(struct kvm *kvm)
 {
 {
-	write_sysreg(kvm->arch.vttbr, vttbr_el2);
+	__load_guest_stage2(kvm);
 }
 }
 
 
 static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
 static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
@@ -263,7 +263,7 @@ static bool __hyp_text __translate_far_to_hpfar(u64 far, u64 *hpfar)
 		return false; /* Translation failed, back to guest */
 		return false; /* Translation failed, back to guest */
 
 
 	/* Convert PAR to HPFAR format */
 	/* Convert PAR to HPFAR format */
-	*hpfar = ((tmp >> 12) & ((1UL << 36) - 1)) << 4;
+	*hpfar = PAR_TO_HPFAR(tmp);
 	return true;
 	return true;
 }
 }
 
 

+ 18 - 1
arch/arm64/kvm/hyp/sysreg-sr.c

@@ -152,8 +152,25 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
 static void __hyp_text
 static void __hyp_text
 __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt)
 __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt)
 {
 {
+	u64 pstate = ctxt->gp_regs.regs.pstate;
+	u64 mode = pstate & PSR_AA32_MODE_MASK;
+
+	/*
+	 * Safety check to ensure we're setting the CPU up to enter the guest
+	 * in a less privileged mode.
+	 *
+	 * If we are attempting a return to EL2 or higher in AArch64 state,
+	 * program SPSR_EL2 with M=EL2h and the IL bit set which ensures that
+	 * we'll take an illegal exception state exception immediately after
+	 * the ERET to the guest.  Attempts to return to AArch32 Hyp will
+	 * result in an illegal exception return because EL2's execution state
+	 * is determined by SCR_EL3.RW.
+	 */
+	if (!(mode & PSR_MODE32_BIT) && mode >= PSR_MODE_EL2t)
+		pstate = PSR_MODE_EL2h | PSR_IL_BIT;
+
 	write_sysreg_el2(ctxt->gp_regs.regs.pc,		elr);
 	write_sysreg_el2(ctxt->gp_regs.regs.pc,		elr);
-	write_sysreg_el2(ctxt->gp_regs.regs.pstate,	spsr);
+	write_sysreg_el2(pstate,			spsr);
 
 
 	if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN))
 	if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN))
 		write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2);
 		write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2);

+ 2 - 2
arch/arm64/kvm/hyp/tlb.c

@@ -30,7 +30,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm)
 	 * bits. Changing E2H is impossible (goodbye TTBR1_EL2), so
 	 * bits. Changing E2H is impossible (goodbye TTBR1_EL2), so
 	 * let's flip TGE before executing the TLB operation.
 	 * let's flip TGE before executing the TLB operation.
 	 */
 	 */
-	write_sysreg(kvm->arch.vttbr, vttbr_el2);
+	__load_guest_stage2(kvm);
 	val = read_sysreg(hcr_el2);
 	val = read_sysreg(hcr_el2);
 	val &= ~HCR_TGE;
 	val &= ~HCR_TGE;
 	write_sysreg(val, hcr_el2);
 	write_sysreg(val, hcr_el2);
@@ -39,7 +39,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm)
 
 
 static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm)
 static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm)
 {
 {
-	write_sysreg(kvm->arch.vttbr, vttbr_el2);
+	__load_guest_stage2(kvm);
 	isb();
 	isb();
 }
 }
 
 

+ 105 - 3
arch/arm64/kvm/reset.c

@@ -26,6 +26,7 @@
 
 
 #include <kvm/arm_arch_timer.h>
 #include <kvm/arm_arch_timer.h>
 
 
+#include <asm/cpufeature.h>
 #include <asm/cputype.h>
 #include <asm/cputype.h>
 #include <asm/ptrace.h>
 #include <asm/ptrace.h>
 #include <asm/kvm_arm.h>
 #include <asm/kvm_arm.h>
@@ -33,6 +34,9 @@
 #include <asm/kvm_coproc.h>
 #include <asm/kvm_coproc.h>
 #include <asm/kvm_mmu.h>
 #include <asm/kvm_mmu.h>
 
 
+/* Maximum phys_shift supported for any VM on this host */
+static u32 kvm_ipa_limit;
+
 /*
 /*
  * ARMv8 Reset Values
  * ARMv8 Reset Values
  */
  */
@@ -55,12 +59,12 @@ static bool cpu_has_32bit_el1(void)
 }
 }
 
 
 /**
 /**
- * kvm_arch_dev_ioctl_check_extension
+ * kvm_arch_vm_ioctl_check_extension
  *
  *
  * We currently assume that the number of HW registers is uniform
  * We currently assume that the number of HW registers is uniform
  * across all CPUs (see cpuinfo_sanity_check).
  * across all CPUs (see cpuinfo_sanity_check).
  */
  */
-int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
+int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 {
 {
 	int r;
 	int r;
 
 
@@ -82,9 +86,11 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
 		break;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG:
 	case KVM_CAP_SET_GUEST_DEBUG:
 	case KVM_CAP_VCPU_ATTRIBUTES:
 	case KVM_CAP_VCPU_ATTRIBUTES:
-	case KVM_CAP_VCPU_EVENTS:
 		r = 1;
 		r = 1;
 		break;
 		break;
+	case KVM_CAP_ARM_VM_IPA_SIZE:
+		r = kvm_ipa_limit;
+		break;
 	default:
 	default:
 		r = 0;
 		r = 0;
 	}
 	}
@@ -133,3 +139,99 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	/* Reset timer */
 	/* Reset timer */
 	return kvm_timer_vcpu_reset(vcpu);
 	return kvm_timer_vcpu_reset(vcpu);
 }
 }
+
+void kvm_set_ipa_limit(void)
+{
+	unsigned int ipa_max, pa_max, va_max, parange;
+
+	parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 0x7;
+	pa_max = id_aa64mmfr0_parange_to_phys_shift(parange);
+
+	/* Clamp the IPA limit to the PA size supported by the kernel */
+	ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max;
+	/*
+	 * Since our stage2 table is dependent on the stage1 page table code,
+	 * we must always honor the following condition:
+	 *
+	 *  Number of levels in Stage1 >= Number of levels in Stage2.
+	 *
+	 * So clamp the ipa limit further down to limit the number of levels.
+	 * Since we can concatenate up to 16 tables at entry level, we could
+	 * go up to 4 bits above the maximum VA addressable with the current
+	 * number of levels.
+	 */
+	va_max = PGDIR_SHIFT + PAGE_SHIFT - 3;
+	va_max += 4;
+
+	if (va_max < ipa_max)
+		ipa_max = va_max;
+
+	/*
+	 * If the final limit is lower than the real physical address
+	 * limit of the CPUs, report the reason.
+	 */
+	if (ipa_max < pa_max)
+		pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n",
+			(va_max < pa_max) ? "Virtual" : "Physical");
+
+	WARN(ipa_max < KVM_PHYS_SHIFT,
+	     "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max);
+	kvm_ipa_limit = ipa_max;
+	kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit);
+}
+
+/*
+ * Configure the VTCR_EL2 for this VM. The VTCR value is common
+ * across all the physical CPUs on the system. We use system wide
+ * sanitised values to fill in different fields, except for Hardware
+ * Management of Access Flags. HA Flag is set unconditionally on
+ * all CPUs, as it is safe to run with or without the feature and
+ * the bit is RES0 on CPUs that don't support it.
+ */
+int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
+{
+	u64 vtcr = VTCR_EL2_FLAGS;
+	u32 parange, phys_shift;
+	u8 lvls;
+
+	if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
+		return -EINVAL;
+
+	phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
+	if (phys_shift) {
+		if (phys_shift > kvm_ipa_limit ||
+		    phys_shift < 32)
+			return -EINVAL;
+	} else {
+		phys_shift = KVM_PHYS_SHIFT;
+	}
+
+	parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 7;
+	if (parange > ID_AA64MMFR0_PARANGE_MAX)
+		parange = ID_AA64MMFR0_PARANGE_MAX;
+	vtcr |= parange << VTCR_EL2_PS_SHIFT;
+
+	vtcr |= VTCR_EL2_T0SZ(phys_shift);
+	/*
+	 * Use a minimum 2 level page table to prevent splitting
+	 * host PMD huge pages at stage2.
+	 */
+	lvls = stage2_pgtable_levels(phys_shift);
+	if (lvls < 2)
+		lvls = 2;
+	vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
+
+	/*
+	 * Enable the Hardware Access Flag management, unconditionally
+	 * on all CPUs. The bit is RES0 on CPUs that don't support the
+	 * feature and must be ignored by them.
+	 */
+	vtcr |= VTCR_EL2_HA;
+
+	/* Set the vmid bits */
+	vtcr |= (kvm_get_vmid_bits() == 16) ?
+		VTCR_EL2_VS_16BIT :
+		VTCR_EL2_VS_8BIT;
+	kvm->arch.vtcr = vtcr;
+	return 0;
+}
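
From userspace, the new VM type argument validated above can be exercised roughly as follows (a hedged sketch; the KVM_CAP_ARM_VM_IPA_SIZE check and the KVM_VM_TYPE_ARM_IPA_SIZE() encoding are the interfaces kvm_arm_setup_stage2() consumes above):

  /*
   * Sketch: ask the host for its IPA limit and create the VM with it.
   * Passing 0 as the type keeps the legacy 40bit default, which is the
   * fallback kvm_arm_setup_stage2() applies.
   */
  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  static int create_vm_with_max_ipa(int kvm_fd)
  {
  	int ipa = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_VM_IPA_SIZE);

  	if (ipa <= 0)	/* capability absent: fall back to the default type */
  		return ioctl(kvm_fd, KVM_CREATE_VM, 0);

  	return ioctl(kvm_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(ipa));
  }
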

+ 21 - 0
arch/powerpc/include/asm/asm-prototypes.h

@@ -150,4 +150,25 @@ extern s32 patch__memset_nocache, patch__memcpy_nocache;
 
 
 extern long flush_count_cache;
 extern long flush_count_cache;
 
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv);
+void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv);
+#else
+static inline void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
+				     bool preserve_nv) { }
+static inline void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
+					bool preserve_nv) { }
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+void kvmhv_save_host_pmu(void);
+void kvmhv_load_host_pmu(void);
+void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
+void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
+
+int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);
+
+long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr);
+long kvmppc_h_set_xdabr(struct kvm_vcpu *vcpu, unsigned long dabr,
+			unsigned long dabrx);
+
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */

+ 12 - 0
arch/powerpc/include/asm/book3s/64/mmu-hash.h

@@ -203,6 +203,18 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
 	BUG();
 	BUG();
 }
 }
 
 
+static inline unsigned int ap_to_shift(unsigned long ap)
+{
+	int psize;
+
+	for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+		if (mmu_psize_defs[psize].ap == ap)
+			return mmu_psize_defs[psize].shift;
+	}
+
+	return -1;
+}
+
 static inline unsigned long get_sllp_encoding(int psize)
 static inline unsigned long get_sllp_encoding(int psize)
 {
 {
 	unsigned long sllp;
 	unsigned long sllp;

+ 1 - 0
arch/powerpc/include/asm/book3s/64/tlbflush-radix.h

@@ -53,6 +53,7 @@ extern void radix__flush_tlb_lpid_page(unsigned int lpid,
 					unsigned long addr,
 					unsigned long addr,
 					unsigned long page_size);
 					unsigned long page_size);
 extern void radix__flush_pwc_lpid(unsigned int lpid);
 extern void radix__flush_pwc_lpid(unsigned int lpid);
+extern void radix__flush_tlb_lpid(unsigned int lpid);
 extern void radix__local_flush_tlb_lpid(unsigned int lpid);
 extern void radix__local_flush_tlb_lpid(unsigned int lpid);
 extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);
 extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);
 
 

+ 41 - 0
arch/powerpc/include/asm/hvcall.h

@@ -322,6 +322,11 @@
 #define H_GET_24X7_DATA		0xF07C
 #define H_GET_24X7_DATA		0xF07C
 #define H_GET_PERF_COUNTER_INFO	0xF080
 #define H_GET_PERF_COUNTER_INFO	0xF080
 
 
+/* Platform-specific hcalls used for nested HV KVM */
+#define H_SET_PARTITION_TABLE	0xF800
+#define H_ENTER_NESTED		0xF804
+#define H_TLB_INVALIDATE	0xF808
+
 /* Values for 2nd argument to H_SET_MODE */
 /* Values for 2nd argument to H_SET_MODE */
 #define H_SET_MODE_RESOURCE_SET_CIABR		1
 #define H_SET_MODE_RESOURCE_SET_CIABR		1
 #define H_SET_MODE_RESOURCE_SET_DAWR		2
 #define H_SET_MODE_RESOURCE_SET_DAWR		2
@@ -461,6 +466,42 @@ struct h_cpu_char_result {
 	u64 behaviour;
 	u64 behaviour;
 };
 };
 
 
+/* Register state for entering a nested guest with H_ENTER_NESTED */
+struct hv_guest_state {
+	u64 version;		/* version of this structure layout */
+	u32 lpid;
+	u32 vcpu_token;
+	/* These registers are hypervisor privileged (at least for writing) */
+	u64 lpcr;
+	u64 pcr;
+	u64 amor;
+	u64 dpdes;
+	u64 hfscr;
+	s64 tb_offset;
+	u64 dawr0;
+	u64 dawrx0;
+	u64 ciabr;
+	u64 hdec_expiry;
+	u64 purr;
+	u64 spurr;
+	u64 ic;
+	u64 vtb;
+	u64 hdar;
+	u64 hdsisr;
+	u64 heir;
+	u64 asdr;
+	/* These are OS privileged but need to be set late in guest entry */
+	u64 srr0;
+	u64 srr1;
+	u64 sprg[4];
+	u64 pidr;
+	u64 cfar;
+	u64 ppr;
+};
+
+/* Latest version of hv_guest_state structure */
+#define HV_GUEST_STATE_VERSION	1
+
 #endif /* __ASSEMBLY__ */
 #endif /* __ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_HVCALL_H */
 #endif /* _ASM_POWERPC_HVCALL_H */

+ 1 - 1
arch/powerpc/include/asm/iommu.h

@@ -126,7 +126,7 @@ struct iommu_table {
 	int it_nid;
 	int it_nid;
 };
 };
 
 
-#define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \
+#define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \
 		((tbl)->it_ops->useraddrptr((tbl), (entry), false))
 		((tbl)->it_ops->useraddrptr((tbl), (entry), false))
 #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
 #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
 		((tbl)->it_ops->useraddrptr((tbl), (entry), true))
 		((tbl)->it_ops->useraddrptr((tbl), (entry), true))

+ 1 - 3
arch/powerpc/include/asm/kvm_asm.h

@@ -84,7 +84,6 @@
 #define BOOK3S_INTERRUPT_INST_STORAGE	0x400
 #define BOOK3S_INTERRUPT_INST_STORAGE	0x400
 #define BOOK3S_INTERRUPT_INST_SEGMENT	0x480
 #define BOOK3S_INTERRUPT_INST_SEGMENT	0x480
 #define BOOK3S_INTERRUPT_EXTERNAL	0x500
 #define BOOK3S_INTERRUPT_EXTERNAL	0x500
-#define BOOK3S_INTERRUPT_EXTERNAL_LEVEL	0x501
 #define BOOK3S_INTERRUPT_EXTERNAL_HV	0x502
 #define BOOK3S_INTERRUPT_EXTERNAL_HV	0x502
 #define BOOK3S_INTERRUPT_ALIGNMENT	0x600
 #define BOOK3S_INTERRUPT_ALIGNMENT	0x600
 #define BOOK3S_INTERRUPT_PROGRAM	0x700
 #define BOOK3S_INTERRUPT_PROGRAM	0x700
@@ -134,8 +133,7 @@
 #define BOOK3S_IRQPRIO_EXTERNAL			14
 #define BOOK3S_IRQPRIO_EXTERNAL			14
 #define BOOK3S_IRQPRIO_DECREMENTER		15
 #define BOOK3S_IRQPRIO_DECREMENTER		15
 #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR	16
 #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR	16
-#define BOOK3S_IRQPRIO_EXTERNAL_LEVEL		17
-#define BOOK3S_IRQPRIO_MAX			18
+#define BOOK3S_IRQPRIO_MAX			17
 
 
 #define BOOK3S_HFLAG_DCBZ32			0x1
 #define BOOK3S_HFLAG_DCBZ32			0x1
 #define BOOK3S_HFLAG_SLB			0x2
 #define BOOK3S_HFLAG_SLB			0x2

+ 40 - 5
arch/powerpc/include/asm/kvm_book3s.h

@@ -188,14 +188,37 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc);
 extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
 extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
 			struct kvm_vcpu *vcpu,
 			struct kvm_vcpu *vcpu,
 			unsigned long ea, unsigned long dsisr);
 			unsigned long ea, unsigned long dsisr);
+extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
+				      struct kvmppc_pte *gpte, u64 root,
+				      u64 *pte_ret_p);
+extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
+			struct kvmppc_pte *gpte, u64 table,
+			int table_index, u64 *pte_ret_p);
 extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 			struct kvmppc_pte *gpte, bool data, bool iswrite);
 			struct kvmppc_pte *gpte, bool data, bool iswrite);
+extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
+			unsigned int shift, struct kvm_memory_slot *memslot,
+			unsigned int lpid);
+extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
+				    bool writing, unsigned long gpa,
+				    unsigned int lpid);
+extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
+				unsigned long gpa,
+				struct kvm_memory_slot *memslot,
+				bool writing, bool kvm_ro,
+				pte_t *inserted_pte, unsigned int *levelp);
 extern int kvmppc_init_vm_radix(struct kvm *kvm);
 extern int kvmppc_init_vm_radix(struct kvm *kvm);
 extern void kvmppc_free_radix(struct kvm *kvm);
 extern void kvmppc_free_radix(struct kvm *kvm);
+extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
+				      unsigned int lpid);
 extern int kvmppc_radix_init(void);
 extern int kvmppc_radix_init(void);
 extern void kvmppc_radix_exit(void);
 extern void kvmppc_radix_exit(void);
 extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			unsigned long gfn);
 			unsigned long gfn);
+extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
+			     unsigned long gpa, unsigned int shift,
+			     struct kvm_memory_slot *memslot,
+			     unsigned int lpid);
 extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			unsigned long gfn);
 			unsigned long gfn);
 extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
@@ -271,6 +294,21 @@ static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) {}
 static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
 static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
 #endif
 #endif
 
 
+long kvmhv_nested_init(void);
+void kvmhv_nested_exit(void);
+void kvmhv_vm_nested_init(struct kvm *kvm);
+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
+void kvmhv_release_all_nested(struct kvm *kvm);
+long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
+long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
+int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu,
+			  u64 time_limit, unsigned long lpcr);
+void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
+void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
+				   struct hv_guest_state *hr);
+long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu);
+
 void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
 void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
 
 
 extern int kvm_irq_bypass;
 extern int kvm_irq_bypass;
@@ -301,12 +339,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
 
 
 static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
 static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
 {
 {
-	vcpu->arch.cr = val;
+	vcpu->arch.regs.ccr = val;
 }
 }
 
 
 static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
 static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
 {
 {
-	return vcpu->arch.cr;
+	return vcpu->arch.regs.ccr;
 }
 }
 
 
 static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
 static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
@@ -384,9 +422,6 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
 /* TO = 31 for unconditional trap */
 /* TO = 31 for unconditional trap */
 #define INS_TW				0x7fe00008
 #define INS_TW				0x7fe00008
 
 
-/* LPIDs we support with this build -- runtime limit may be lower */
-#define KVMPPC_NR_LPIDS			(LPID_RSVD + 1)
-
 #define SPLIT_HACK_MASK			0xff000000
 #define SPLIT_HACK_MASK			0xff000000
 #define SPLIT_HACK_OFFS			0xfb000000
 #define SPLIT_HACK_OFFS			0xfb000000
 
 

+ 116 - 2
arch/powerpc/include/asm/kvm_book3s_64.h

@@ -23,6 +23,108 @@
 #include <linux/string.h>
 #include <asm/bitops.h>
 #include <asm/book3s/64/mmu-hash.h>
+#include <asm/cpu_has_feature.h>
+#include <asm/ppc-opcode.h>
+
+#ifdef CONFIG_PPC_PSERIES
+static inline bool kvmhv_on_pseries(void)
+{
+	return !cpu_has_feature(CPU_FTR_HVMODE);
+}
+#else
+static inline bool kvmhv_on_pseries(void)
+{
+	return false;
+}
+#endif
+
+/*
+ * Structure for a nested guest, that is, for a guest that is managed by
+ * one of our guests.
+ */
+struct kvm_nested_guest {
+	struct kvm *l1_host;		/* L1 VM that owns this nested guest */
+	int l1_lpid;			/* lpid L1 guest thinks this guest is */
+	int shadow_lpid;		/* real lpid of this nested guest */
+	pgd_t *shadow_pgtable;		/* our page table for this guest */
+	u64 l1_gr_to_hr;		/* L1's addr of part'n-scoped table */
+	u64 process_table;		/* process table entry for this guest */
+	long refcnt;			/* number of pointers to this struct */
+	struct mutex tlb_lock;		/* serialize page faults and tlbies */
+	struct kvm_nested_guest *next;
+	cpumask_t need_tlb_flush;
+	cpumask_t cpu_in_guest;
+	short prev_cpu[NR_CPUS];
+};
+
+/*
+ * We define a nested rmap entry as a single 64-bit quantity
+ * 0xFFF0000000000000	12-bit lpid field
+ * 0x000FFFFFFFFFF000	40-bit guest 4k page frame number
+ * 0x0000000000000001	1-bit  single entry flag
+ */
+#define RMAP_NESTED_LPID_MASK		0xFFF0000000000000UL
+#define RMAP_NESTED_LPID_SHIFT		(52)
+#define RMAP_NESTED_GPA_MASK		0x000FFFFFFFFFF000UL
+#define RMAP_NESTED_IS_SINGLE_ENTRY	0x0000000000000001UL
+
+/* Structure for a nested guest rmap entry */
+struct rmap_nested {
+	struct llist_node list;
+	u64 rmap;
+};
+
+/*
+ * for_each_nest_rmap_safe - iterate over the list of nested rmap entries
+ *			     safe against removal of the list entry or NULL list
+ * @pos:	a (struct rmap_nested *) to use as a loop cursor
+ * @node:	pointer to the first entry
+ *		NOTE: this can be NULL
+ * @rmapp:	an (unsigned long *) in which to return the rmap entries on each
+ *		iteration
+ *		NOTE: this must point to already allocated memory
+ *
+ * The nested_rmap is a llist of (struct rmap_nested) entries pointed to by the
+ * rmap entry in the memslot. The list is always terminated by a "single entry"
+ * stored in the list element of the final entry of the llist. If there is ONLY
+ * a single entry then this is itself in the rmap entry of the memslot, not a
+ * llist head pointer.
+ *
+ * Note that the iterator below assumes that a nested rmap entry is always
+ * non-zero.  This is true for our usage because the LPID field is always
+ * non-zero (zero is reserved for the host).
+ *
+ * This should be used to iterate over the list of rmap_nested entries with
+ * processing done on the u64 rmap value given by each iteration. This is safe
+ * against removal of list entries and it is always safe to call free on (pos).
+ *
+ * e.g.
+ * struct rmap_nested *cursor;
+ * struct llist_node *first;
+ * unsigned long rmap;
+ * for_each_nest_rmap_safe(cursor, first, &rmap) {
+ *	do_something(rmap);
+ *	free(cursor);
+ * }
+ */
+#define for_each_nest_rmap_safe(pos, node, rmapp)			       \
+	for ((pos) = llist_entry((node), typeof(*(pos)), list);		       \
+	     (node) &&							       \
+	     (*(rmapp) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ?     \
+			  ((u64) (node)) : ((pos)->rmap))) &&		       \
+	     (((node) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ?      \
+			 ((struct llist_node *) ((pos) = NULL)) :	       \
+			 (pos)->list.next)), true);			       \
+	     (pos) = llist_entry((node), typeof(*(pos)), list))
+
+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
+					  bool create);
+void kvmhv_put_nested(struct kvm_nested_guest *gp);
+int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid);
+
+/* Encoding of first parameter for H_TLB_INVALIDATE */
+#define H_TLBIE_P1_ENC(ric, prs, r)	(___PPC_RIC(ric) | ___PPC_PRS(prs) | \
+					 ___PPC_R(r))
 
 
 /* Power architecture requires HPT is at least 256kiB, at most 64TiB */
 #define PPC_MIN_HPT_ORDER	18
@@ -435,6 +537,7 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
 }

 extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
+extern void kvmhv_radix_debugfs_init(struct kvm *kvm);

 extern void kvmhv_rm_send_ipi(int cpu);

@@ -482,7 +585,7 @@ static inline u64 sanitize_msr(u64 msr)
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.cr  = vcpu->arch.cr_tm;
+	vcpu->arch.regs.ccr  = vcpu->arch.cr_tm;
 	vcpu->arch.regs.xer = vcpu->arch.xer_tm;
 	vcpu->arch.regs.link  = vcpu->arch.lr_tm;
 	vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
@@ -499,7 +602,7 @@ static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu)

 static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.cr_tm  = vcpu->arch.cr;
+	vcpu->arch.cr_tm  = vcpu->arch.regs.ccr;
 	vcpu->arch.xer_tm = vcpu->arch.regs.xer;
 	vcpu->arch.lr_tm  = vcpu->arch.regs.link;
 	vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
@@ -515,6 +618,17 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */

+extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
+			     unsigned long gpa, unsigned int level,
+			     unsigned long mmu_seq, unsigned int lpid,
+			     unsigned long *rmapp, struct rmap_nested **n_rmap);
+extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
+				   struct rmap_nested **n_rmap);
+extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
+				struct kvm_memory_slot *memslot,
+				unsigned long gpa, unsigned long hpa,
+				unsigned long nbytes);
+
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */

 #endif /* __ASM_KVM_BOOK3S_64_H__ */
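A quick aside on the nested rmap encoding introduced in the hunk above (an
illustrative sketch only, not part of the commit; the helper names are made
up): a single rmap value packs the L1 LPID into the top 12 bits and the
4k-aligned guest physical address into bits 12-51, so packing and unpacking
look roughly like this:

	/* Hypothetical helpers built on the RMAP_NESTED_* masks above. */
	static inline u64 nest_rmap_pack(unsigned int lpid, unsigned long gpa)
	{
		return (((u64)lpid << RMAP_NESTED_LPID_SHIFT) & RMAP_NESTED_LPID_MASK) |
		       (gpa & RMAP_NESTED_GPA_MASK);
	}

	static inline unsigned int nest_rmap_lpid(u64 rmap)
	{
		return (rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
	}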

+ 3 - 0
arch/powerpc/include/asm/kvm_book3s_asm.h

@@ -25,6 +25,9 @@
 #define XICS_MFRR		0xc
 #define XICS_IPI		2	/* interrupt source # for IPIs */

+/* LPIDs we support with this build -- runtime limit may be lower */
+#define KVMPPC_NR_LPIDS			(LPID_RSVD + 1)
+
 /* Maximum number of threads per physical core */
 #define MAX_SMT_THREADS		8


+ 2 - 2
arch/powerpc/include/asm/kvm_booke.h

@@ -46,12 +46,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
 
 
 static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
 {
-	vcpu->arch.cr = val;
+	vcpu->arch.regs.ccr = val;
 }

 static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
 {
-	return vcpu->arch.cr;
+	return vcpu->arch.regs.ccr;
 }

 static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)

+ 14 - 2
arch/powerpc/include/asm/kvm_host.h

@@ -46,6 +46,7 @@
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 #include <asm/kvm_book3s_asm.h>		/* for MAX_SMT_THREADS */
 #include <asm/kvm_book3s_asm.h>		/* for MAX_SMT_THREADS */
 #define KVM_MAX_VCPU_ID		(MAX_SMT_THREADS * KVM_MAX_VCORES)
 #define KVM_MAX_VCPU_ID		(MAX_SMT_THREADS * KVM_MAX_VCORES)
+#define KVM_MAX_NESTED_GUESTS	KVMPPC_NR_LPIDS
 
 
 #else
 #else
 #define KVM_MAX_VCPU_ID		KVM_MAX_VCPUS
 #define KVM_MAX_VCPU_ID		KVM_MAX_VCPUS
@@ -94,6 +95,7 @@ struct dtl_entry;
 
 
 struct kvmppc_vcpu_book3s;
 struct kvmppc_vcpu_book3s;
 struct kvmppc_book3s_shadow_vcpu;
 struct kvmppc_book3s_shadow_vcpu;
+struct kvm_nested_guest;
 
 
 struct kvm_vm_stat {
 struct kvm_vm_stat {
 	ulong remote_tlb_flush;
 	ulong remote_tlb_flush;
@@ -287,10 +289,12 @@ struct kvm_arch {
 	u8 radix;
 	u8 radix;
 	u8 fwnmi_enabled;
 	u8 fwnmi_enabled;
 	bool threads_indep;
 	bool threads_indep;
+	bool nested_enable;
 	pgd_t *pgtable;
 	pgd_t *pgtable;
 	u64 process_table;
 	u64 process_table;
 	struct dentry *debugfs_dir;
 	struct dentry *debugfs_dir;
 	struct dentry *htab_dentry;
 	struct dentry *htab_dentry;
+	struct dentry *radix_dentry;
 	struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */
 	struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
@@ -311,6 +315,9 @@ struct kvm_arch {
 #endif
 #endif
 	struct kvmppc_ops *kvm_ops;
 	struct kvmppc_ops *kvm_ops;
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	u64 l1_ptcr;
+	int max_nested_lpid;
+	struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];
 	/* This array can grow quite large, keep it at the end */
 	/* This array can grow quite large, keep it at the end */
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 #endif
 #endif
@@ -360,7 +367,9 @@ struct kvmppc_pte {
 	bool may_write		: 1;
 	bool may_write		: 1;
 	bool may_execute	: 1;
 	bool may_execute	: 1;
 	unsigned long wimg;
 	unsigned long wimg;
+	unsigned long rc;
 	u8 page_size;		/* MMU_PAGE_xxx */
 	u8 page_size;		/* MMU_PAGE_xxx */
+	u8 page_shift;
 };
 };
 
 
 struct kvmppc_mmu {
 struct kvmppc_mmu {
@@ -537,8 +546,6 @@ struct kvm_vcpu_arch {
 	ulong tar;
 	ulong tar;
 #endif
 #endif
 
 
-	u32 cr;
-
 #ifdef CONFIG_PPC_BOOK3S
 #ifdef CONFIG_PPC_BOOK3S
 	ulong hflags;
 	ulong hflags;
 	ulong guest_owned_ext;
 	ulong guest_owned_ext;
@@ -707,6 +714,7 @@ struct kvm_vcpu_arch {
 	u8 hcall_needed;
 	u8 hcall_needed;
 	u8 epr_flags; /* KVMPPC_EPR_xxx */
 	u8 epr_flags; /* KVMPPC_EPR_xxx */
 	u8 epr_needed;
 	u8 epr_needed;
+	u8 external_oneshot;	/* clear external irq after delivery */
 
 
 	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
 	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
 
 
@@ -781,6 +789,10 @@ struct kvm_vcpu_arch {
 	u32 emul_inst;
 	u32 emul_inst;
 
 
 	u32 online;
 	u32 online;
+
+	/* For support of nested guests */
+	struct kvm_nested_guest *nested;
+	u32 nested_vcpu_id;
 #endif
 #endif
 
 
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING

+ 5 - 3
arch/powerpc/include/asm/kvm_ppc.h

@@ -194,9 +194,7 @@ extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
 		(iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \
 		(iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \
 				(stt)->size, (ioba), (npages)) ?        \
 				(stt)->size, (ioba), (npages)) ?        \
 				H_PARAMETER : H_SUCCESS)
 				H_PARAMETER : H_SUCCESS)
-extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt,
-		unsigned long tce);
-extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+extern long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
 		unsigned long *ua, unsigned long **prmap);
 		unsigned long *ua, unsigned long **prmap);
 extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
 extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
 		unsigned long idx, unsigned long tce);
 		unsigned long idx, unsigned long tce);
@@ -327,6 +325,7 @@ struct kvmppc_ops {
 	int (*set_smt_mode)(struct kvm *kvm, unsigned long mode,
 	int (*set_smt_mode)(struct kvm *kvm, unsigned long mode,
 			    unsigned long flags);
 			    unsigned long flags);
 	void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr);
 	void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr);
+	int (*enable_nested)(struct kvm *kvm);
 };
 };
 
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
 extern struct kvmppc_ops *kvmppc_hv_ops;
@@ -585,6 +584,7 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
 
 
 extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
 extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
 			       int level, bool line_status);
 			       int level, bool line_status);
+extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
 #else
 #else
 static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
 static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
 				       u32 priority) { return -1; }
 				       u32 priority) { return -1; }
@@ -607,6 +607,7 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur
 
 
 static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
 static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
 				      int level, bool line_status) { return -ENODEV; }
 				      int level, bool line_status) { return -ENODEV; }
+static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
 #endif /* CONFIG_KVM_XIVE */
 #endif /* CONFIG_KVM_XIVE */
 
 
 /*
 /*
@@ -652,6 +653,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
                     unsigned long mfrr);
                     unsigned long mfrr);
 int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
 int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
 int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
 int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
+void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu);
 
 
 /*
 /*
  * Host-side operations we want to set up while running in real
  * Host-side operations we want to set up while running in real
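The new enable_nested hook in kvmppc_ops above is what backs the VM capability
that turns nested HV on. A minimal userspace sketch (not from this diff; it
assumes the KVM_CAP_PPC_NESTED_HV capability added in the same series and an
open VM file descriptor vm_fd):

	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_PPC_NESTED_HV,	/* routed to kvm_ops->enable_nested() */
	};
	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);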

+ 1 - 0
arch/powerpc/include/asm/ppc-opcode.h

@@ -104,6 +104,7 @@
 #define OP_31_XOP_LHZUX     311
 #define OP_31_XOP_MSGSNDP   142
 #define OP_31_XOP_MSGCLRP   174
+#define OP_31_XOP_TLBIE     306
 #define OP_31_XOP_MFSPR     339
 #define OP_31_XOP_LWAX      341
 #define OP_31_XOP_LHAX      343

+ 2 - 0
arch/powerpc/include/asm/reg.h

@@ -415,6 +415,7 @@
 #define   HFSCR_DSCR	__MASK(FSCR_DSCR_LG)
 #define   HFSCR_VECVSX	__MASK(FSCR_VECVSX_LG)
 #define   HFSCR_FP	__MASK(FSCR_FP_LG)
+#define   HFSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56)	/* interrupt cause */
 #define SPRN_TAR	0x32f	/* Target Address Register */
 #define SPRN_LPCR	0x13E	/* LPAR Control Register */
 #define   LPCR_VPM0		ASM_CONST(0x8000000000000000)
@@ -766,6 +767,7 @@
 #define SPRN_HSRR0	0x13A	/* Save/Restore Register 0 */
 #define SPRN_HSRR1	0x13B	/* Save/Restore Register 1 */
 #define   HSRR1_DENORM		0x00100000 /* Denorm exception */
+#define   HSRR1_HISI_WRITE	0x00010000 /* HISI bcs couldn't update mem */

 #define SPRN_TBCTL	0x35f	/* PA6T Timebase control register */
 #define   TBCTL_FREEZE		0x0000000000000000ull /* Freeze all tbs */
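The HFSCR_INTR_CAUSE mask added above selects the top byte of the HFSCR, which
holds the cause code of a Hypervisor Facility Unavailable interrupt. A one-line
sketch of pulling it out (illustration only, not from the commit):

	unsigned long cause = (hfscr & HFSCR_INTR_CAUSE) >> 56;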

+ 1 - 0
arch/powerpc/include/uapi/asm/kvm.h

@@ -634,6 +634,7 @@ struct kvm_ppc_cpu_char {
 
 
 #define KVM_REG_PPC_DEC_EXPIRY	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe)
 #define KVM_REG_PPC_ONLINE	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf)
+#define KVM_REG_PPC_PTCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0)

 /* Transactional Memory checkpointed state:
  * This is all GPRs, all VSX regs and a subset of SPRs
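KVM_REG_PPC_PTCR above exposes the guest's partition-table control register to
userspace for nested-HV migration. A hedged sketch of writing it through the
generic ONE_REG interface (vcpu_fd and ptcr_val are placeholders, not from the
commit):

	__u64 ptcr_val = 0;	/* value to load into the guest PTCR */
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_PTCR,
		.addr = (__u64)(unsigned long)&ptcr_val,
	};
	ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);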

+ 3 - 2
arch/powerpc/kernel/asm-offsets.c

@@ -438,7 +438,7 @@ int main(void)
 #ifdef CONFIG_PPC_BOOK3S
 	OFFSET(VCPU_TAR, kvm_vcpu, arch.tar);
 #endif
-	OFFSET(VCPU_CR, kvm_vcpu, arch.cr);
+	OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
 	OFFSET(VCPU_PC, kvm_vcpu, arch.regs.nip);
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	OFFSET(VCPU_MSR, kvm_vcpu, arch.shregs.msr);
@@ -503,6 +503,7 @@ int main(void)
 	OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
 	OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty);
 	OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst);
+	OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested);
 	OFFSET(VCPU_CPU, kvm_vcpu, cpu);
 	OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu);
 #endif
@@ -695,7 +696,7 @@ int main(void)
 #endif /* CONFIG_PPC_BOOK3S_64 */

 #else /* CONFIG_PPC_BOOK3S */
-	OFFSET(VCPU_CR, kvm_vcpu, arch.cr);
+	OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
 	OFFSET(VCPU_XER, kvm_vcpu, arch.regs.xer);
 	OFFSET(VCPU_LR, kvm_vcpu, arch.regs.link);
 	OFFSET(VCPU_CTR, kvm_vcpu, arch.regs.ctr);

+ 2 - 2
arch/powerpc/kernel/cpu_setup_power.S

@@ -147,8 +147,8 @@ __init_hvmode_206:
 	rldicl.	r0,r3,4,63
 	bnelr
 	ld	r5,CPU_SPEC_FEATURES(r4)
-	LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
-	xor	r5,r5,r6
+	LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE | CPU_FTR_P9_TM_HV_ASSIST)
+	andc	r5,r5,r6
 	std	r5,CPU_SPEC_FEATURES(r4)
 	blr


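A note on the cpu_setup_power.S change above (explanation only, not part of the
commit): xor merely toggles the selected feature bits, which is only safe when
they are known to be set, whereas andc (AND with complement) clears them
unconditionally; the mask now also covers CPU_FTR_P9_TM_HV_ASSIST, which may or
may not be set. In C terms, roughly:

	/* old:  features ^=   CPU_FTR_HVMODE;                             toggle */
	/* new:  features &= ~(CPU_FTR_HVMODE | CPU_FTR_P9_TM_HV_ASSIST);  clear  */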
+ 2 - 1
arch/powerpc/kvm/Makefile

@@ -75,7 +75,8 @@ kvm-hv-y += \
 	book3s_hv.o \
 	book3s_hv_interrupts.o \
 	book3s_64_mmu_hv.o \
-	book3s_64_mmu_radix.o
+	book3s_64_mmu_radix.o \
+	book3s_hv_nested.o

 kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
 	book3s_hv_tm.o

+ 36 - 10
arch/powerpc/kvm/book3s.c

@@ -78,8 +78,11 @@ void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu)
 {
 {
 	if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) {
 	if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) {
 		ulong pc = kvmppc_get_pc(vcpu);
 		ulong pc = kvmppc_get_pc(vcpu);
+		ulong lr = kvmppc_get_lr(vcpu);
 		if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
 		if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
 			kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK);
 			kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK);
+		if ((lr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
+			kvmppc_set_lr(vcpu, lr & ~SPLIT_HACK_MASK);
 		vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK;
 		vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK;
 	}
 	}
 }
 }
@@ -150,7 +153,6 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
 	case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE;		break;
 	case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE;		break;
 	case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT;		break;
 	case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT;		break;
 	case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL;		break;
 	case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL;		break;
-	case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL;	break;
 	case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT;		break;
 	case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT;		break;
 	case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM;		break;
 	case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM;		break;
 	case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL;		break;
 	case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL;		break;
@@ -236,18 +238,35 @@ EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec);
 void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
 void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                 struct kvm_interrupt *irq)
                                 struct kvm_interrupt *irq)
 {
 {
-	unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL;
-
-	if (irq->irq == KVM_INTERRUPT_SET_LEVEL)
-		vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL;
+	/*
+	 * This case (KVM_INTERRUPT_SET) should never actually arise for
+	 * a pseries guest (because pseries guests expect their interrupt
+	 * controllers to continue asserting an external interrupt request
+	 * until it is acknowledged at the interrupt controller), but is
+	 * included to avoid ABI breakage and potentially for other
+	 * sorts of guest.
+	 *
+	 * There is a subtlety here: HV KVM does not test the
+	 * external_oneshot flag in the code that synthesizes
+	 * external interrupts for the guest just before entering
+	 * the guest.  That is OK even if userspace did do a
+	 * KVM_INTERRUPT_SET on a pseries guest vcpu, because the
+	 * caller (kvm_vcpu_ioctl_interrupt) does a kvm_vcpu_kick()
+	 * which ends up doing a smp_send_reschedule(), which will
+	 * pull the guest all the way out to the host, meaning that
+	 * we will call kvmppc_core_prepare_to_enter() before entering
+	 * the guest again, and that will handle the external_oneshot
+	 * flag correctly.
+	 */
+	if (irq->irq == KVM_INTERRUPT_SET)
+		vcpu->arch.external_oneshot = 1;
 
 
-	kvmppc_book3s_queue_irqprio(vcpu, vec);
+	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
 }
 }
 
 
 void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
 void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
 {
 {
 	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
 	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
-	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
 }
 }
 
 
 void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar,
 void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar,
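For context on the external-interrupt hunk above (an illustrative sketch, not
part of the commit): the new external_oneshot flag is only set when userspace
injects an edge-style external interrupt with the KVM_INTERRUPT ioctl, roughly:

	struct kvm_interrupt irq = { .irq = KVM_INTERRUPT_SET };
	ioctl(vcpu_fd, KVM_INTERRUPT, &irq);	/* vcpu_fd: open vcpu descriptor */

With KVM_INTERRUPT_SET the interrupt is meant to be delivered once and then
cleared, which is what the flag arranges in clear_irqprio() further down;
KVM_INTERRUPT_SET_LEVEL keeps behaving as a level-triggered source that
userspace clears.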
@@ -278,7 +297,6 @@ static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu,
 		vec = BOOK3S_INTERRUPT_DECREMENTER;
 		vec = BOOK3S_INTERRUPT_DECREMENTER;
 		break;
 		break;
 	case BOOK3S_IRQPRIO_EXTERNAL:
 	case BOOK3S_IRQPRIO_EXTERNAL:
-	case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
 		deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;
 		deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;
 		vec = BOOK3S_INTERRUPT_EXTERNAL;
 		vec = BOOK3S_INTERRUPT_EXTERNAL;
 		break;
 		break;
@@ -352,8 +370,16 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority)
 		case BOOK3S_IRQPRIO_DECREMENTER:
 		case BOOK3S_IRQPRIO_DECREMENTER:
 			/* DEC interrupts get cleared by mtdec */
 			/* DEC interrupts get cleared by mtdec */
 			return false;
 			return false;
-		case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
-			/* External interrupts get cleared by userspace */
+		case BOOK3S_IRQPRIO_EXTERNAL:
+			/*
+			 * External interrupts get cleared by userspace
+			 * except when set by the KVM_INTERRUPT ioctl with
+			 * KVM_INTERRUPT_SET (not KVM_INTERRUPT_SET_LEVEL).
+			 */
+			if (vcpu->arch.external_oneshot) {
+				vcpu->arch.external_oneshot = 0;
+				return true;
+			}
 			return false;
 			return false;
 	}
 	}
 
 

+ 3 - 4
arch/powerpc/kvm/book3s_64_mmu_hv.c

@@ -268,14 +268,13 @@ int kvmppc_mmu_hv_init(void)
 {
 {
 	unsigned long host_lpid, rsvd_lpid;
 	unsigned long host_lpid, rsvd_lpid;
 
 
-	if (!cpu_has_feature(CPU_FTR_HVMODE))
-		return -EINVAL;
-
 	if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
 	if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
 		return -EINVAL;
 		return -EINVAL;
 
 
 	/* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
 	/* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
-	host_lpid = mfspr(SPRN_LPID);
+	host_lpid = 0;
+	if (cpu_has_feature(CPU_FTR_HVMODE))
+		host_lpid = mfspr(SPRN_LPID);
 	rsvd_lpid = LPID_RSVD;
 	rsvd_lpid = LPID_RSVD;
 
 
 	kvmppc_init_lpid(rsvd_lpid + 1);
 	kvmppc_init_lpid(rsvd_lpid + 1);

+ 536 - 182
arch/powerpc/kvm/book3s_64_mmu_radix.c

@@ -10,6 +10,9 @@
 #include <linux/string.h>
 #include <linux/string.h>
 #include <linux/kvm.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <linux/kvm_host.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/debugfs.h>
 
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
 #include <asm/kvm_book3s.h>
@@ -26,87 +29,74 @@
  */
  */
 static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
 static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
 
 
-int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
-			   struct kvmppc_pte *gpte, bool data, bool iswrite)
+int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
+			       struct kvmppc_pte *gpte, u64 root,
+			       u64 *pte_ret_p)
 {
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm *kvm = vcpu->kvm;
-	u32 pid;
 	int ret, level, ps;
 	int ret, level, ps;
-	__be64 prte, rpte;
-	unsigned long ptbl;
-	unsigned long root, pte, index;
-	unsigned long rts, bits, offset;
-	unsigned long gpa;
-	unsigned long proc_tbl_size;
-
-	/* Work out effective PID */
-	switch (eaddr >> 62) {
-	case 0:
-		pid = vcpu->arch.pid;
-		break;
-	case 3:
-		pid = 0;
-		break;
-	default:
-		return -EINVAL;
-	}
-	proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12);
-	if (pid * 16 >= proc_tbl_size)
-		return -EINVAL;
-
-	/* Read partition table to find root of tree for effective PID */
-	ptbl = (kvm->arch.process_table & PRTB_MASK) + (pid * 16);
-	ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte));
-	if (ret)
-		return ret;
+	unsigned long rts, bits, offset, index;
+	u64 pte, base, gpa;
+	__be64 rpte;
 
 
-	root = be64_to_cpu(prte);
 	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
 	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
 		((root & RTS2_MASK) >> RTS2_SHIFT);
 		((root & RTS2_MASK) >> RTS2_SHIFT);
 	bits = root & RPDS_MASK;
 	bits = root & RPDS_MASK;
-	root = root & RPDB_MASK;
+	base = root & RPDB_MASK;
 
 
 	offset = rts + 31;
 	offset = rts + 31;
 
 
-	/* current implementations only support 52-bit space */
+	/* Current implementations only support 52-bit space */
 	if (offset != 52)
 	if (offset != 52)
 		return -EINVAL;
 		return -EINVAL;
 
 
+	/* Walk each level of the radix tree */
 	for (level = 3; level >= 0; --level) {
 	for (level = 3; level >= 0; --level) {
+		u64 addr;
+		/* Check a valid size */
 		if (level && bits != p9_supported_radix_bits[level])
 		if (level && bits != p9_supported_radix_bits[level])
 			return -EINVAL;
 			return -EINVAL;
 		if (level == 0 && !(bits == 5 || bits == 9))
 		if (level == 0 && !(bits == 5 || bits == 9))
 			return -EINVAL;
 			return -EINVAL;
 		offset -= bits;
 		offset -= bits;
 		index = (eaddr >> offset) & ((1UL << bits) - 1);
 		index = (eaddr >> offset) & ((1UL << bits) - 1);
-		/* check that low bits of page table base are zero */
-		if (root & ((1UL << (bits + 3)) - 1))
+		/* Check that low bits of page table base are zero */
+		if (base & ((1UL << (bits + 3)) - 1))
 			return -EINVAL;
 			return -EINVAL;
-		ret = kvm_read_guest(kvm, root + index * 8,
-				     &rpte, sizeof(rpte));
-		if (ret)
+		/* Read the entry from guest memory */
+		addr = base + (index * sizeof(rpte));
+		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
+		if (ret) {
+			if (pte_ret_p)
+				*pte_ret_p = addr;
 			return ret;
 			return ret;
+		}
 		pte = __be64_to_cpu(rpte);
 		pte = __be64_to_cpu(rpte);
 		if (!(pte & _PAGE_PRESENT))
 		if (!(pte & _PAGE_PRESENT))
 			return -ENOENT;
 			return -ENOENT;
+		/* Check if a leaf entry */
 		if (pte & _PAGE_PTE)
 		if (pte & _PAGE_PTE)
 			break;
 			break;
-		bits = pte & 0x1f;
-		root = pte & 0x0fffffffffffff00ul;
+		/* Get ready to walk the next level */
+		base = pte & RPDB_MASK;
+		bits = pte & RPDS_MASK;
 	}
 	}
-	/* need a leaf at lowest level; 512GB pages not supported */
+
+	/* Need a leaf at lowest level; 512GB pages not supported */
 	if (level < 0 || level == 3)
 	if (level < 0 || level == 3)
 		return -EINVAL;
 		return -EINVAL;
 
 
-	/* offset is now log base 2 of the page size */
+	/* We found a valid leaf PTE */
+	/* Offset is now log base 2 of the page size */
 	gpa = pte & 0x01fffffffffff000ul;
 	gpa = pte & 0x01fffffffffff000ul;
 	if (gpa & ((1ul << offset) - 1))
 	if (gpa & ((1ul << offset) - 1))
 		return -EINVAL;
 		return -EINVAL;
-	gpa += eaddr & ((1ul << offset) - 1);
+	gpa |= eaddr & ((1ul << offset) - 1);
 	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
 	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
 		if (offset == mmu_psize_defs[ps].shift)
 		if (offset == mmu_psize_defs[ps].shift)
 			break;
 			break;
 	gpte->page_size = ps;
 	gpte->page_size = ps;
+	gpte->page_shift = offset;
 
 
 	gpte->eaddr = eaddr;
 	gpte->eaddr = eaddr;
 	gpte->raddr = gpa;
 	gpte->raddr = gpa;
@@ -115,6 +105,77 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	gpte->may_read = !!(pte & _PAGE_READ);
 	gpte->may_read = !!(pte & _PAGE_READ);
 	gpte->may_write = !!(pte & _PAGE_WRITE);
 	gpte->may_write = !!(pte & _PAGE_WRITE);
 	gpte->may_execute = !!(pte & _PAGE_EXEC);
 	gpte->may_execute = !!(pte & _PAGE_EXEC);
+
+	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
+
+	if (pte_ret_p)
+		*pte_ret_p = pte;
+
+	return 0;
+}
+
+/*
+ * Used to walk a partition or process table radix tree in guest memory
+ * Note: We exploit the fact that a partition table and a process
+ * table have the same layout, a partition-scoped page table and a
+ * process-scoped page table have the same layout, and the 2nd
+ * doubleword of a partition table entry has the same layout as
+ * the PTCR register.
+ */
+int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
+				     struct kvmppc_pte *gpte, u64 table,
+				     int table_index, u64 *pte_ret_p)
+{
+	struct kvm *kvm = vcpu->kvm;
+	int ret;
+	unsigned long size, ptbl, root;
+	struct prtb_entry entry;
+
+	if ((table & PRTS_MASK) > 24)
+		return -EINVAL;
+	size = 1ul << ((table & PRTS_MASK) + 12);
+
+	/* Is the table big enough to contain this entry? */
+	if ((table_index * sizeof(entry)) >= size)
+		return -EINVAL;
+
+	/* Read the table to find the root of the radix tree */
+	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
+	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
+	if (ret)
+		return ret;
+
+	/* Root is stored in the first double word */
+	root = be64_to_cpu(entry.prtb0);
+
+	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
+}
+
+int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+			   struct kvmppc_pte *gpte, bool data, bool iswrite)
+{
+	u32 pid;
+	u64 pte;
+	int ret;
+
+	/* Work out effective PID */
+	switch (eaddr >> 62) {
+	case 0:
+		pid = vcpu->arch.pid;
+		break;
+	case 3:
+		pid = 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
+				vcpu->kvm->arch.process_table, pid, &pte);
+	if (ret)
+		return ret;
+
+	/* Check privilege (applies only to process scoped translations) */
 	if (kvmppc_get_msr(vcpu) & MSR_PR) {
 	if (kvmppc_get_msr(vcpu) & MSR_PR) {
 		if (pte & _PAGE_PRIVILEGED) {
 		if (pte & _PAGE_PRIVILEGED) {
 			gpte->may_read = 0;
 			gpte->may_read = 0;
@@ -137,20 +198,46 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 }
 }
 
 
 static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
 static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
-				    unsigned int pshift)
+				    unsigned int pshift, unsigned int lpid)
 {
 {
 	unsigned long psize = PAGE_SIZE;
 	unsigned long psize = PAGE_SIZE;
+	int psi;
+	long rc;
+	unsigned long rb;
 
 
 	if (pshift)
 	if (pshift)
 		psize = 1UL << pshift;
 		psize = 1UL << pshift;
+	else
+		pshift = PAGE_SHIFT;
 
 
 	addr &= ~(psize - 1);
 	addr &= ~(psize - 1);
-	radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize);
+
+	if (!kvmhv_on_pseries()) {
+		radix__flush_tlb_lpid_page(lpid, addr, psize);
+		return;
+	}
+
+	psi = shift_to_mmu_psize(pshift);
+	rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
+	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
+				lpid, rb);
+	if (rc)
+		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
 }
 }
 
 
-static void kvmppc_radix_flush_pwc(struct kvm *kvm)
+static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
 {
 {
-	radix__flush_pwc_lpid(kvm->arch.lpid);
+	long rc;
+
+	if (!kvmhv_on_pseries()) {
+		radix__flush_pwc_lpid(lpid);
+		return;
+	}
+
+	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
+				lpid, TLBIEL_INVAL_SET_LPID);
+	if (rc)
+		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
 }
 }
 
 
 static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
 static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
@@ -195,23 +282,38 @@ static void kvmppc_pmd_free(pmd_t *pmdp)
 	kmem_cache_free(kvm_pmd_cache, pmdp);
 	kmem_cache_free(kvm_pmd_cache, pmdp);
 }
 }
 
 
-static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
-			     unsigned long gpa, unsigned int shift)
+/* Called with kvm->mmu_lock held */
+void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
+		      unsigned int shift, struct kvm_memory_slot *memslot,
+		      unsigned int lpid)
 
 
 {
 {
-	unsigned long page_size = 1ul << shift;
 	unsigned long old;
 	unsigned long old;
+	unsigned long gfn = gpa >> PAGE_SHIFT;
+	unsigned long page_size = PAGE_SIZE;
+	unsigned long hpa;
 
 
 	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
 	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
-	kvmppc_radix_tlbie_page(kvm, gpa, shift);
-	if (old & _PAGE_DIRTY) {
-		unsigned long gfn = gpa >> PAGE_SHIFT;
-		struct kvm_memory_slot *memslot;
+	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
+
+	/* The following only applies to L1 entries */
+	if (lpid != kvm->arch.lpid)
+		return;
 
 
+	if (!memslot) {
 		memslot = gfn_to_memslot(kvm, gfn);
 		memslot = gfn_to_memslot(kvm, gfn);
-		if (memslot && memslot->dirty_bitmap)
-			kvmppc_update_dirty_map(memslot, gfn, page_size);
+		if (!memslot)
+			return;
 	}
 	}
+	if (shift)
+		page_size = 1ul << shift;
+
+	gpa &= ~(page_size - 1);
+	hpa = old & PTE_RPN_MASK;
+	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
+
+	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
+		kvmppc_update_dirty_map(memslot, gfn, page_size);
 }
 }
 
 
 /*
 /*
@@ -224,7 +326,8 @@ static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
  * and emit a warning if encountered, but there may already be data
  * and emit a warning if encountered, but there may already be data
  * corruption due to the unexpected mappings.
  * corruption due to the unexpected mappings.
  */
  */
-static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
+static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
+				  unsigned int lpid)
 {
 {
 	if (full) {
 	if (full) {
 		memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
 		memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
@@ -238,14 +341,15 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
 			WARN_ON_ONCE(1);
 			WARN_ON_ONCE(1);
 			kvmppc_unmap_pte(kvm, p,
 			kvmppc_unmap_pte(kvm, p,
 					 pte_pfn(*p) << PAGE_SHIFT,
 					 pte_pfn(*p) << PAGE_SHIFT,
-					 PAGE_SHIFT);
+					 PAGE_SHIFT, NULL, lpid);
 		}
 		}
 	}
 	}
 
 
 	kvmppc_pte_free(pte);
 	kvmppc_pte_free(pte);
 }
 }
 
 
-static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
+static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
+				  unsigned int lpid)
 {
 {
 	unsigned long im;
 	unsigned long im;
 	pmd_t *p = pmd;
 	pmd_t *p = pmd;
@@ -260,20 +364,21 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
 				WARN_ON_ONCE(1);
 				WARN_ON_ONCE(1);
 				kvmppc_unmap_pte(kvm, (pte_t *)p,
 				kvmppc_unmap_pte(kvm, (pte_t *)p,
 					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
 					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
-					 PMD_SHIFT);
+					 PMD_SHIFT, NULL, lpid);
 			}
 			}
 		} else {
 		} else {
 			pte_t *pte;
 			pte_t *pte;
 
 
 			pte = pte_offset_map(p, 0);
 			pte = pte_offset_map(p, 0);
-			kvmppc_unmap_free_pte(kvm, pte, full);
+			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
 			pmd_clear(p);
 			pmd_clear(p);
 		}
 		}
 	}
 	}
 	kvmppc_pmd_free(pmd);
 	kvmppc_pmd_free(pmd);
 }
 }
 
 
-static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
+static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
+				  unsigned int lpid)
 {
 {
 	unsigned long iu;
 	unsigned long iu;
 	pud_t *p = pud;
 	pud_t *p = pud;
@@ -287,36 +392,40 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
 			pmd_t *pmd;
 			pmd_t *pmd;
 
 
 			pmd = pmd_offset(p, 0);
 			pmd = pmd_offset(p, 0);
-			kvmppc_unmap_free_pmd(kvm, pmd, true);
+			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
 			pud_clear(p);
 			pud_clear(p);
 		}
 		}
 	}
 	}
 	pud_free(kvm->mm, pud);
 	pud_free(kvm->mm, pud);
 }
 }
 
 
-void kvmppc_free_radix(struct kvm *kvm)
+void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
 {
 {
 	unsigned long ig;
 	unsigned long ig;
-	pgd_t *pgd;
 
 
-	if (!kvm->arch.pgtable)
-		return;
-	pgd = kvm->arch.pgtable;
 	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
 	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
 		pud_t *pud;
 		pud_t *pud;
 
 
 		if (!pgd_present(*pgd))
 		if (!pgd_present(*pgd))
 			continue;
 			continue;
 		pud = pud_offset(pgd, 0);
 		pud = pud_offset(pgd, 0);
-		kvmppc_unmap_free_pud(kvm, pud);
+		kvmppc_unmap_free_pud(kvm, pud, lpid);
 		pgd_clear(pgd);
 		pgd_clear(pgd);
 	}
 	}
-	pgd_free(kvm->mm, kvm->arch.pgtable);
-	kvm->arch.pgtable = NULL;
+}
+
+void kvmppc_free_radix(struct kvm *kvm)
+{
+	if (kvm->arch.pgtable) {
+		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
+					  kvm->arch.lpid);
+		pgd_free(kvm->mm, kvm->arch.pgtable);
+		kvm->arch.pgtable = NULL;
+	}
 }
 }
 
 
 static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
 static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
-					      unsigned long gpa)
+					unsigned long gpa, unsigned int lpid)
 {
 {
 	pte_t *pte = pte_offset_kernel(pmd, 0);
 	pte_t *pte = pte_offset_kernel(pmd, 0);
 
 
@@ -326,13 +435,13 @@ static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
 	 * flushing the PWC again.
 	 * flushing the PWC again.
 	 */
 	 */
 	pmd_clear(pmd);
 	pmd_clear(pmd);
-	kvmppc_radix_flush_pwc(kvm);
+	kvmppc_radix_flush_pwc(kvm, lpid);
 
 
-	kvmppc_unmap_free_pte(kvm, pte, false);
+	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
 }
 }
 
 
 static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
 static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
-					unsigned long gpa)
+					unsigned long gpa, unsigned int lpid)
 {
 {
 	pmd_t *pmd = pmd_offset(pud, 0);
 	pmd_t *pmd = pmd_offset(pud, 0);
 
 
@@ -342,9 +451,9 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
 	 * so can be freed without flushing the PWC again.
 	 * so can be freed without flushing the PWC again.
 	 */
 	 */
 	pud_clear(pud);
 	pud_clear(pud);
-	kvmppc_radix_flush_pwc(kvm);
+	kvmppc_radix_flush_pwc(kvm, lpid);
 
 
-	kvmppc_unmap_free_pmd(kvm, pmd, false);
+	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
 }
 }
 
 
 /*
 /*
@@ -356,8 +465,10 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
  */
  */
 #define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
 #define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
 
 
-static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
-			     unsigned int level, unsigned long mmu_seq)
+int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
+		      unsigned long gpa, unsigned int level,
+		      unsigned long mmu_seq, unsigned int lpid,
+		      unsigned long *rmapp, struct rmap_nested **n_rmap)
 {
 {
 	pgd_t *pgd;
 	pgd_t *pgd;
 	pud_t *pud, *new_pud = NULL;
 	pud_t *pud, *new_pud = NULL;
@@ -366,7 +477,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 	int ret;
 	int ret;
 
 
 	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
 	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
-	pgd = kvm->arch.pgtable + pgd_index(gpa);
+	pgd = pgtable + pgd_index(gpa);
 	pud = NULL;
 	pud = NULL;
 	if (pgd_present(*pgd))
 	if (pgd_present(*pgd))
 		pud = pud_offset(pgd, gpa);
 		pud = pud_offset(pgd, gpa);
@@ -423,7 +534,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 			goto out_unlock;
 			goto out_unlock;
 		}
 		}
 		/* Valid 1GB page here already, remove it */
 		/* Valid 1GB page here already, remove it */
-		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT);
+		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
+				 lpid);
 	}
 	}
 	if (level == 2) {
 	if (level == 2) {
 		if (!pud_none(*pud)) {
 		if (!pud_none(*pud)) {
@@ -432,9 +544,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 			 * install a large page, so remove and free the page
 			 * install a large page, so remove and free the page
 			 * table page.
 			 * table page.
 			 */
 			 */
-			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa);
+			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
 		}
 		}
 		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
 		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
+		if (rmapp && n_rmap)
+			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
 		ret = 0;
 		ret = 0;
 		goto out_unlock;
 		goto out_unlock;
 	}
 	}
@@ -458,7 +572,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
 			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
 							PTE_BITS_MUST_MATCH);
 							PTE_BITS_MUST_MATCH);
 			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
 			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
-					      0, pte_val(pte), lgpa, PMD_SHIFT);
+					0, pte_val(pte), lgpa, PMD_SHIFT);
 			ret = 0;
 			ret = 0;
 			goto out_unlock;
 			goto out_unlock;
 		}
 		}
@@ -472,7 +586,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 			goto out_unlock;
 			goto out_unlock;
 		}
 		}
 		/* Valid 2MB page here already, remove it */
 		/* Valid 2MB page here already, remove it */
-		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT);
+		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
+				 lpid);
 	}
 	}
 	if (level == 1) {
 	if (level == 1) {
 		if (!pmd_none(*pmd)) {
 		if (!pmd_none(*pmd)) {
@@ -481,9 +596,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 			 * install a large page, so remove and free the page
 			 * install a large page, so remove and free the page
 			 * table page.
 			 * table page.
 			 */
 			 */
-			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa);
+			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
 		}
 		}
 		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
 		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
+		if (rmapp && n_rmap)
+			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
 		ret = 0;
 		ret = 0;
 		goto out_unlock;
 		goto out_unlock;
 	}
 	}
@@ -508,6 +625,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 		goto out_unlock;
 		goto out_unlock;
 	}
 	}
 	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
 	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
+	if (rmapp && n_rmap)
+		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
 	ret = 0;
 	ret = 0;
 
 
  out_unlock:
  out_unlock:
@@ -521,95 +640,49 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 	return ret;
 	return ret;
 }
 }
 
 
-int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
-				   unsigned long ea, unsigned long dsisr)
+bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
+			     unsigned long gpa, unsigned int lpid)
+{
+	unsigned long pgflags;
+	unsigned int shift;
+	pte_t *ptep;
+
+	/*
+	 * Need to set an R or C bit in the 2nd-level tables;
+	 * since we are just helping out the hardware here,
+	 * it is sufficient to do what the hardware does.
+	 */
+	pgflags = _PAGE_ACCESSED;
+	if (writing)
+		pgflags |= _PAGE_DIRTY;
+	/*
+	 * We are walking the secondary (partition-scoped) page table here.
+	 * We can do this without disabling irq because the Linux MM
+	 * subsystem doesn't do THP splits and collapses on this tree.
+	 */
+	ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
+	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
+		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
+		return true;
+	}
+	return false;
+}
+
+int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
+				   unsigned long gpa,
+				   struct kvm_memory_slot *memslot,
+				   bool writing, bool kvm_ro,
+				   pte_t *inserted_pte, unsigned int *levelp)
 {
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm *kvm = vcpu->kvm;
-	unsigned long mmu_seq;
-	unsigned long gpa, gfn, hva;
-	struct kvm_memory_slot *memslot;
 	struct page *page = NULL;
 	struct page *page = NULL;
-	long ret;
-	bool writing;
+	unsigned long mmu_seq;
+	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
 	bool upgrade_write = false;
 	bool upgrade_write = false;
 	bool *upgrade_p = &upgrade_write;
 	bool *upgrade_p = &upgrade_write;
 	pte_t pte, *ptep;
 	pte_t pte, *ptep;
-	unsigned long pgflags;
 	unsigned int shift, level;
 	unsigned int shift, level;
-
-	/* Check for unusual errors */
-	if (dsisr & DSISR_UNSUPP_MMU) {
-		pr_err("KVM: Got unsupported MMU fault\n");
-		return -EFAULT;
-	}
-	if (dsisr & DSISR_BADACCESS) {
-		/* Reflect to the guest as DSI */
-		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
-		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
-		return RESUME_GUEST;
-	}
-
-	/* Translate the logical address and get the page */
-	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
-	gpa &= ~0xF000000000000000ul;
-	gfn = gpa >> PAGE_SHIFT;
-	if (!(dsisr & DSISR_PRTABLE_FAULT))
-		gpa |= ea & 0xfff;
-	memslot = gfn_to_memslot(kvm, gfn);
-
-	/* No memslot means it's an emulated MMIO region */
-	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
-		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
-			     DSISR_SET_RC)) {
-			/*
-			 * Bad address in guest page table tree, or other
-			 * unusual error - reflect it to the guest as DSI.
-			 */
-			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
-			return RESUME_GUEST;
-		}
-		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
-					      dsisr & DSISR_ISSTORE);
-	}
-
-	writing = (dsisr & DSISR_ISSTORE) != 0;
-	if (memslot->flags & KVM_MEM_READONLY) {
-		if (writing) {
-			/* give the guest a DSI */
-			dsisr = DSISR_ISSTORE | DSISR_PROTFAULT;
-			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
-			return RESUME_GUEST;
-		}
-		upgrade_p = NULL;
-	}
-
-	if (dsisr & DSISR_SET_RC) {
-		/*
-		 * Need to set an R or C bit in the 2nd-level tables;
-		 * since we are just helping out the hardware here,
-		 * it is sufficient to do what the hardware does.
-		 */
-		pgflags = _PAGE_ACCESSED;
-		if (writing)
-			pgflags |= _PAGE_DIRTY;
-		/*
-		 * We are walking the secondary page table here. We can do this
-		 * without disabling irq.
-		 */
-		spin_lock(&kvm->mmu_lock);
-		ptep = __find_linux_pte(kvm->arch.pgtable,
-					gpa, NULL, &shift);
-		if (ptep && pte_present(*ptep) &&
-		    (!writing || pte_write(*ptep))) {
-			kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
-						gpa, shift);
-			dsisr &= ~DSISR_SET_RC;
-		}
-		spin_unlock(&kvm->mmu_lock);
-		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
-			       DSISR_PROTFAULT | DSISR_SET_RC)))
-			return RESUME_GUEST;
-	}
+	int ret;
 
 
 	/* used to check for invalidations in progress */
 	/* used to check for invalidations in progress */
 	mmu_seq = kvm->mmu_notifier_seq;
 	mmu_seq = kvm->mmu_notifier_seq;
@@ -622,7 +695,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	 * is that the page is writable.
 	 * is that the page is writable.
 	 */
 	 */
 	hva = gfn_to_hva_memslot(memslot, gfn);
 	hva = gfn_to_hva_memslot(memslot, gfn);
-	if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
+	if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
 		upgrade_write = true;
 		upgrade_write = true;
 	} else {
 	} else {
 		unsigned long pfn;
 		unsigned long pfn;
@@ -690,7 +763,12 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	}
 	}
 
 
 	/* Allocate space in the tree and write the PTE */
 	/* Allocate space in the tree and write the PTE */
-	ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
+	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
+				mmu_seq, kvm->arch.lpid, NULL, NULL);
+	if (inserted_pte)
+		*inserted_pte = pte;
+	if (levelp)
+		*levelp = level;
 
 
 	if (page) {
 	if (page) {
 		if (!ret && (pte_val(pte) & _PAGE_WRITE))
 		if (!ret && (pte_val(pte) & _PAGE_WRITE))
@@ -698,6 +776,82 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		put_page(page);
 		put_page(page);
 	}
 	}
 
 
+	return ret;
+}
+
+int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				   unsigned long ea, unsigned long dsisr)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long gpa, gfn;
+	struct kvm_memory_slot *memslot;
+	long ret;
+	bool writing = !!(dsisr & DSISR_ISSTORE);
+	bool kvm_ro = false;
+
+	/* Check for unusual errors */
+	if (dsisr & DSISR_UNSUPP_MMU) {
+		pr_err("KVM: Got unsupported MMU fault\n");
+		return -EFAULT;
+	}
+	if (dsisr & DSISR_BADACCESS) {
+		/* Reflect to the guest as DSI */
+		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
+		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+		return RESUME_GUEST;
+	}
+
+	/* Translate the logical address */
+	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
+	gpa &= ~0xF000000000000000ul;
+	gfn = gpa >> PAGE_SHIFT;
+	if (!(dsisr & DSISR_PRTABLE_FAULT))
+		gpa |= ea & 0xfff;
+
+	/* Get the corresponding memslot */
+	memslot = gfn_to_memslot(kvm, gfn);
+
+	/* No memslot means it's an emulated MMIO region */
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
+		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
+			     DSISR_SET_RC)) {
+			/*
+			 * Bad address in guest page table tree, or other
+			 * unusual error - reflect it to the guest as DSI.
+			 */
+			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+			return RESUME_GUEST;
+		}
+		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
+	}
+
+	if (memslot->flags & KVM_MEM_READONLY) {
+		if (writing) {
+			/* give the guest a DSI */
+			kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
+						       DSISR_PROTFAULT);
+			return RESUME_GUEST;
+		}
+		kvm_ro = true;
+	}
+
+	/* Failed to set the reference/change bits */
+	if (dsisr & DSISR_SET_RC) {
+		spin_lock(&kvm->mmu_lock);
+		if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
+					    writing, gpa, kvm->arch.lpid))
+			dsisr &= ~DSISR_SET_RC;
+		spin_unlock(&kvm->mmu_lock);
+
+		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
+			       DSISR_PROTFAULT | DSISR_SET_RC)))
+			return RESUME_GUEST;
+	}
+
+	/* Try to insert a pte */
+	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
+					     kvm_ro, NULL, NULL);
+
 	if (ret == 0 || ret == -EAGAIN)
 	if (ret == 0 || ret == -EAGAIN)
 		ret = RESUME_GUEST;
 		ret = RESUME_GUEST;
 	return ret;
 	return ret;
@@ -710,20 +864,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	pte_t *ptep;
 	pte_t *ptep;
 	unsigned long gpa = gfn << PAGE_SHIFT;
 	unsigned long gpa = gfn << PAGE_SHIFT;
 	unsigned int shift;
 	unsigned int shift;
-	unsigned long old;
 
 
 	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
 	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
-	if (ptep && pte_present(*ptep)) {
-		old = kvmppc_radix_update_pte(kvm, ptep, ~0UL, 0,
-					      gpa, shift);
-		kvmppc_radix_tlbie_page(kvm, gpa, shift);
-		if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
-			unsigned long psize = PAGE_SIZE;
-			if (shift)
-				psize = 1ul << shift;
-			kvmppc_update_dirty_map(memslot, gfn, psize);
-		}
-	}
+	if (ptep && pte_present(*ptep))
+		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
+				 kvm->arch.lpid);
 	return 0;				
 	return 0;				
 }
 }
 
 
@@ -778,7 +923,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm,
 			ret = 1 << (shift - PAGE_SHIFT);
 			ret = 1 << (shift - PAGE_SHIFT);
 		kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
 		kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
 					gpa, shift);
 					gpa, shift);
-		kvmppc_radix_tlbie_page(kvm, gpa, shift);
+		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
 	}
 	return ret;
 }
@@ -863,6 +1008,215 @@ static void pmd_ctor(void *addr)
 	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
 }

+struct debugfs_radix_state {
+	struct kvm	*kvm;
+	struct mutex	mutex;
+	unsigned long	gpa;
+	int		lpid;
+	int		chars_left;
+	int		buf_index;
+	char		buf[128];
+	u8		hdr;
+};
+
+static int debugfs_radix_open(struct inode *inode, struct file *file)
+{
+	struct kvm *kvm = inode->i_private;
+	struct debugfs_radix_state *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	kvm_get_kvm(kvm);
+	p->kvm = kvm;
+	mutex_init(&p->mutex);
+	file->private_data = p;
+
+	return nonseekable_open(inode, file);
+}
+
+static int debugfs_radix_release(struct inode *inode, struct file *file)
+{
+	struct debugfs_radix_state *p = file->private_data;
+
+	kvm_put_kvm(p->kvm);
+	kfree(p);
+	return 0;
+}
+
+static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
+				 size_t len, loff_t *ppos)
+{
+	struct debugfs_radix_state *p = file->private_data;
+	ssize_t ret, r;
+	unsigned long n;
+	struct kvm *kvm;
+	unsigned long gpa;
+	pgd_t *pgt;
+	struct kvm_nested_guest *nested;
+	pgd_t pgd, *pgdp;
+	pud_t pud, *pudp;
+	pmd_t pmd, *pmdp;
+	pte_t *ptep;
+	int shift;
+	unsigned long pte;
+
+	kvm = p->kvm;
+	if (!kvm_is_radix(kvm))
+		return 0;
+
+	ret = mutex_lock_interruptible(&p->mutex);
+	if (ret)
+		return ret;
+
+	if (p->chars_left) {
+		n = p->chars_left;
+		if (n > len)
+			n = len;
+		r = copy_to_user(buf, p->buf + p->buf_index, n);
+		n -= r;
+		p->chars_left -= n;
+		p->buf_index += n;
+		buf += n;
+		len -= n;
+		ret = n;
+		if (r) {
+			if (!n)
+				ret = -EFAULT;
+			goto out;
+		}
+	}
+
+	gpa = p->gpa;
+	nested = NULL;
+	pgt = NULL;
+	while (len != 0 && p->lpid >= 0) {
+		if (gpa >= RADIX_PGTABLE_RANGE) {
+			gpa = 0;
+			pgt = NULL;
+			if (nested) {
+				kvmhv_put_nested(nested);
+				nested = NULL;
+			}
+			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
+			p->hdr = 0;
+			if (p->lpid < 0)
+				break;
+		}
+		if (!pgt) {
+			if (p->lpid == 0) {
+				pgt = kvm->arch.pgtable;
+			} else {
+				nested = kvmhv_get_nested(kvm, p->lpid, false);
+				if (!nested) {
+					gpa = RADIX_PGTABLE_RANGE;
+					continue;
+				}
+				pgt = nested->shadow_pgtable;
+			}
+		}
+		n = 0;
+		if (!p->hdr) {
+			if (p->lpid > 0)
+				n = scnprintf(p->buf, sizeof(p->buf),
+					      "\nNested LPID %d: ", p->lpid);
+			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
+				      "pgdir: %lx\n", (unsigned long)pgt);
+			p->hdr = 1;
+			goto copy;
+		}
+
+		pgdp = pgt + pgd_index(gpa);
+		pgd = READ_ONCE(*pgdp);
+		if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
+			gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
+			continue;
+		}
+
+		pudp = pud_offset(&pgd, gpa);
+		pud = READ_ONCE(*pudp);
+		if (!(pud_val(pud) & _PAGE_PRESENT)) {
+			gpa = (gpa & PUD_MASK) + PUD_SIZE;
+			continue;
+		}
+		if (pud_val(pud) & _PAGE_PTE) {
+			pte = pud_val(pud);
+			shift = PUD_SHIFT;
+			goto leaf;
+		}
+
+		pmdp = pmd_offset(&pud, gpa);
+		pmd = READ_ONCE(*pmdp);
+		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
+			gpa = (gpa & PMD_MASK) + PMD_SIZE;
+			continue;
+		}
+		if (pmd_val(pmd) & _PAGE_PTE) {
+			pte = pmd_val(pmd);
+			shift = PMD_SHIFT;
+			goto leaf;
+		}
+
+		ptep = pte_offset_kernel(&pmd, gpa);
+		pte = pte_val(READ_ONCE(*ptep));
+		if (!(pte & _PAGE_PRESENT)) {
+			gpa += PAGE_SIZE;
+			continue;
+		}
+		shift = PAGE_SHIFT;
+	leaf:
+		n = scnprintf(p->buf, sizeof(p->buf),
+			      " %lx: %lx %d\n", gpa, pte, shift);
+		gpa += 1ul << shift;
+	copy:
+		p->chars_left = n;
+		if (n > len)
+			n = len;
+		r = copy_to_user(buf, p->buf, n);
+		n -= r;
+		p->chars_left -= n;
+		p->buf_index = n;
+		buf += n;
+		len -= n;
+		ret += n;
+		if (r) {
+			if (!ret)
+				ret = -EFAULT;
+			break;
+		}
+	}
+	p->gpa = gpa;
+	if (nested)
+		kvmhv_put_nested(nested);
+
+ out:
+	mutex_unlock(&p->mutex);
+	return ret;
+}
+
+static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
+			   size_t len, loff_t *ppos)
+{
+	return -EACCES;
+}
+
+static const struct file_operations debugfs_radix_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = debugfs_radix_open,
+	.release = debugfs_radix_release,
+	.read	 = debugfs_radix_read,
+	.write	 = debugfs_radix_write,
+	.llseek	 = generic_file_llseek,
+};
+
+void kvmhv_radix_debugfs_init(struct kvm *kvm)
+{
+	kvm->arch.radix_dentry = debugfs_create_file("radix", 0400,
+						     kvm->arch.debugfs_dir, kvm,
+						     &debugfs_radix_fops);
+}
+
 int kvmppc_radix_init(void)
 {
 	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;

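For reference, the fault handler at the top of this file derives the guest physical address from vcpu->arch.fault_gpa, masks off the quadrant bits in the top nibble, and only re-applies the low 12 offset bits when the fault did not come from the partition-table walk. A minimal sketch of that derivation, with PAGE_SHIFT_4K as an assumed stand-in for the kernel's PAGE_SHIFT:

/* Sketch only: mirrors the gpa/gfn computation in the fault path above. */
#define PAGE_SHIFT_4K 12

static unsigned long fault_to_gfn(unsigned long fault_gpa, unsigned long ea,
				  int prtable_fault, unsigned long *gpa_out)
{
	unsigned long gpa = fault_gpa & ~0xfffUL;	/* page-aligned GPA */

	gpa &= ~0xF000000000000000UL;			/* drop quadrant bits */
	if (!prtable_fault)
		gpa |= ea & 0xfff;			/* keep the byte offset */
	*gpa_out = gpa;
	return gpa >> PAGE_SHIFT_4K;			/* frame for the memslot lookup */
}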
+ 67 - 27
arch/powerpc/kvm/book3s_64_vio.c

@@ -363,6 +363,40 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 	return ret;
 }

+static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
+		unsigned long tce)
+{
+	unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+	enum dma_data_direction dir = iommu_tce_direction(tce);
+	struct kvmppc_spapr_tce_iommu_table *stit;
+	unsigned long ua = 0;
+
+	/* Allow userspace to poison TCE table */
+	if (dir == DMA_NONE)
+		return H_SUCCESS;
+
+	if (iommu_tce_check_gpa(stt->page_shift, gpa))
+		return H_TOO_HARD;
+
+	if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL))
+		return H_TOO_HARD;
+
+	list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
+		unsigned long hpa = 0;
+		struct mm_iommu_table_group_mem_t *mem;
+		long shift = stit->tbl->it_page_shift;
+
+		mem = mm_iommu_lookup(stt->kvm->mm, ua, 1ULL << shift);
+		if (!mem)
+			return H_TOO_HARD;
+
+		if (mm_iommu_ua_to_hpa(mem, ua, shift, &hpa))
+			return H_TOO_HARD;
+	}
+
+	return H_SUCCESS;
+}
+
 static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry)
 {
 	unsigned long hpa = 0;
@@ -376,11 +410,10 @@ static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
 {
 	struct mm_iommu_table_group_mem_t *mem = NULL;
 	const unsigned long pgsize = 1ULL << tbl->it_page_shift;
-	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);

 	if (!pua)
-		/* it_userspace allocation might be delayed */
-		return H_TOO_HARD;
+		return H_SUCCESS;

 	mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize);
 	if (!mem)
@@ -401,7 +434,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
 	long ret;

 	if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir)))
-		return H_HARDWARE;
+		return H_TOO_HARD;

 	if (dir == DMA_NONE)
 		return H_SUCCESS;
@@ -449,15 +482,15 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
 		return H_TOO_HARD;

 	if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa)))
-		return H_HARDWARE;
+		return H_TOO_HARD;

 	if (mm_iommu_mapped_inc(mem))
-		return H_CLOSED;
+		return H_TOO_HARD;

 	ret = iommu_tce_xchg(tbl, entry, &hpa, &dir);
 	if (WARN_ON_ONCE(ret)) {
 		mm_iommu_mapped_dec(mem);
-		return H_HARDWARE;
+		return H_TOO_HARD;
 	}

 	if (dir != DMA_NONE)
@@ -517,8 +550,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,

 	idx = srcu_read_lock(&vcpu->kvm->srcu);

-	if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
-			tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) {
+	if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
 		ret = H_PARAMETER;
 		goto unlock_exit;
 	}
@@ -533,14 +565,10 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 			ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl,
 					entry, ua, dir);

-		if (ret == H_SUCCESS)
-			continue;
-
-		if (ret == H_TOO_HARD)
+		if (ret != H_SUCCESS) {
+			kvmppc_clear_tce(stit->tbl, entry);
 			goto unlock_exit;
-
-		WARN_ON_ONCE(1);
-		kvmppc_clear_tce(stit->tbl, entry);
+		}
 	}

 	kvmppc_tce_put(stt, entry, tce);
@@ -583,7 +611,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 		return ret;

 	idx = srcu_read_lock(&vcpu->kvm->srcu);
-	if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
+	if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
 		ret = H_TOO_HARD;
 		goto unlock_exit;
 	}
@@ -599,10 +627,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 		ret = kvmppc_tce_validate(stt, tce);
 		if (ret != H_SUCCESS)
 			goto unlock_exit;
+	}
+
+	for (i = 0; i < npages; ++i) {
+		/*
+		 * This looks unsafe, because we validate, then regrab
+		 * the TCE from userspace which could have been changed by
+		 * another thread.
+		 *
+		 * But it actually is safe, because the relevant checks will be
+		 * re-executed in the following code.  If userspace tries to
+		 * change this dodgily it will result in a messier failure mode
+		 * but won't threaten the host.
+		 */
+		if (get_user(tce, tces + i)) {
+			ret = H_TOO_HARD;
+			goto unlock_exit;
+		}
+		tce = be64_to_cpu(tce);
 
-				tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
-				&ua, NULL))
+		if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
 			return H_PARAMETER;

 		list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -610,14 +654,10 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 					stit->tbl, entry + i, ua,
 					iommu_tce_direction(tce));

-			if (ret == H_SUCCESS)
-				continue;
-
-			if (ret == H_TOO_HARD)
+			if (ret != H_SUCCESS) {
+				kvmppc_clear_tce(stit->tbl, entry);
 				goto unlock_exit;
-
-			WARN_ON_ONCE(1);
-			kvmppc_clear_tce(stit->tbl, entry);
+			}
 		}

 		kvmppc_tce_put(stt, entry + i, tce);

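The indirect H_PUT_TCE path above now validates every TCE first and only then re-reads each entry with get_user() before mapping it; the new comment in the hunk explains why the second fetch is tolerable. A minimal sketch of that two-pass shape, with validate() and commit() as hypothetical stand-ins for kvmppc_tce_validate() and the IOMMU mapping calls:

/* Sketch only: validate everything first, then re-read and apply.
 * A TCE changed between the two passes only produces a messier guest-visible
 * failure, never a host problem, because commit() re-checks what it needs.
 */
long validate(u64 tce);	/* hypothetical */
void commit(u64 tce);	/* hypothetical */

static long put_list(u64 __user *tces, unsigned long npages)
{
	unsigned long i;
	u64 tce;

	for (i = 0; i < npages; ++i) {
		if (get_user(tce, tces + i))
			return H_TOO_HARD;
		if (validate(be64_to_cpu(tce)) != H_SUCCESS)
			return H_PARAMETER;
	}
	for (i = 0; i < npages; ++i) {
		if (get_user(tce, tces + i))	/* deliberate second fetch */
			return H_TOO_HARD;
		commit(be64_to_cpu(tce));
	}
	return H_SUCCESS;
}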
+ 49 - 38
arch/powerpc/kvm/book3s_64_vio_hv.c

@@ -87,6 +87,7 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm,
 }
 EXPORT_SYMBOL_GPL(kvmppc_find_table);

+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 /*
  * Validates TCE address.
  * At the moment flags and page mask are validated.
@@ -94,14 +95,14 @@ EXPORT_SYMBOL_GPL(kvmppc_find_table);
  * to the table and user space is supposed to process them), we can skip
  * checking other things (such as TCE is a guest RAM address or the page
  * was actually allocated).
- *
- * WARNING: This will be called in real-mode on HV KVM and virtual
- *          mode on PR KVM
  */
-long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
+static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt,
+		unsigned long tce)
 {
 	unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
 	enum dma_data_direction dir = iommu_tce_direction(tce);
+	struct kvmppc_spapr_tce_iommu_table *stit;
+	unsigned long ua = 0;

 	/* Allow userspace to poison TCE table */
 	if (dir == DMA_NONE)
@@ -110,9 +111,25 @@ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
 	if (iommu_tce_check_gpa(stt->page_shift, gpa))
 		return H_PARAMETER;

+	if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL))
+		return H_TOO_HARD;
+
+	list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+		unsigned long hpa = 0;
+		struct mm_iommu_table_group_mem_t *mem;
+		long shift = stit->tbl->it_page_shift;
+
+		mem = mm_iommu_lookup_rm(stt->kvm->mm, ua, 1ULL << shift);
+		if (!mem)
+			return H_TOO_HARD;
+
+		if (mm_iommu_ua_to_hpa_rm(mem, ua, shift, &hpa))
+			return H_TOO_HARD;
+	}
+
 	return H_SUCCESS;
 }
-EXPORT_SYMBOL_GPL(kvmppc_tce_validate);
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */

 /* Note on the use of page_address() in real mode,
  *
@@ -164,10 +181,10 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
 }
 EXPORT_SYMBOL_GPL(kvmppc_tce_put);

-long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
 		unsigned long *ua, unsigned long **prmap)
 {
-	unsigned long gfn = gpa >> PAGE_SHIFT;
+	unsigned long gfn = tce >> PAGE_SHIFT;
 	struct kvm_memory_slot *memslot;

 	memslot = search_memslots(kvm_memslots(kvm), gfn);
@@ -175,7 +192,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
 		return -EINVAL;

 	*ua = __gfn_to_hva_memslot(memslot, gfn) |
-		(gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
+		(tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));

 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	if (prmap)
@@ -184,7 +201,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,

 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
+EXPORT_SYMBOL_GPL(kvmppc_tce_to_ua);

 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
@@ -197,7 +214,7 @@ static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,

 	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
 				(*direction == DMA_BIDIRECTIONAL))) {
-		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 		/*
 		 * kvmppc_rm_tce_iommu_do_map() updates the UA cache after
 		 * calling this so we still get here a valid UA.
@@ -223,7 +240,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
 {
 	struct mm_iommu_table_group_mem_t *mem = NULL;
 	const unsigned long pgsize = 1ULL << tbl->it_page_shift;
-	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);

 	if (!pua)
 		/* it_userspace allocation might be delayed */
@@ -287,7 +304,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
 {
 	long ret;
 	unsigned long hpa = 0;
-	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 	struct mm_iommu_table_group_mem_t *mem;

 	if (!pua)
@@ -300,10 +317,10 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,

 	if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift,
 			&hpa)))
-		return H_HARDWARE;
+		return H_TOO_HARD;

 	if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
-		return H_CLOSED;
+		return H_TOO_HARD;

 	ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
 	if (ret) {
@@ -368,13 +385,12 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 	if (ret != H_SUCCESS)
 		return ret;

-	ret = kvmppc_tce_validate(stt, tce);
+	ret = kvmppc_rm_tce_validate(stt, tce);
 	if (ret != H_SUCCESS)
 		return ret;

 	dir = iommu_tce_direction(tce);
-	if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
-			tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL))
+	if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
 		return H_PARAMETER;

 	entry = ioba >> stt->page_shift;
@@ -387,14 +403,10 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 			ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt,
 					stit->tbl, entry, ua, dir);

-		if (ret == H_SUCCESS)
-			continue;
-
-		if (ret == H_TOO_HARD)
+		if (ret != H_SUCCESS) {
+			kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
 			return ret;
-
-		WARN_ON_ONCE_RM(1);
-		kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
+		}
 	}

 	kvmppc_tce_put(stt, entry, tce);
@@ -480,7 +492,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 		 */
 		struct mm_iommu_table_group_mem_t *mem;

-		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL))
+		if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL))
 			return H_TOO_HARD;

 		mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
@@ -496,12 +508,12 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 		 * We do not require memory to be preregistered in this case
 		 * so lock rmap and do __find_linux_pte_or_hugepte().
 		 */
-		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
+		if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
 			return H_TOO_HARD;

 		rmap = (void *) vmalloc_to_phys(rmap);
 		if (WARN_ON_ONCE_RM(!rmap))
-			return H_HARDWARE;
+			return H_TOO_HARD;

 		/*
 		 * Synchronize with the MMU notifier callbacks in
@@ -521,14 +533,16 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 	for (i = 0; i < npages; ++i) {
 		unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);

-		ret = kvmppc_tce_validate(stt, tce);
+		ret = kvmppc_rm_tce_validate(stt, tce);
 		if (ret != H_SUCCESS)
 			goto unlock_exit;
+	}
+
+	for (i = 0; i < npages; ++i) {
+		unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);

 		ua = 0;
-		if (kvmppc_gpa_to_ua(vcpu->kvm,
-				tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
-				&ua, NULL))
+		if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
 			return H_PARAMETER;

 		list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -536,14 +550,11 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 					stit->tbl, entry + i, ua,
 					iommu_tce_direction(tce));

-			if (ret == H_SUCCESS)
-				continue;
-
-			if (ret == H_TOO_HARD)
+			if (ret != H_SUCCESS) {
+				kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl,
+						entry);
 				goto unlock_exit;
-
-			WARN_ON_ONCE_RM(1);
-			kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
+			}
 		}

 		kvmppc_tce_put(stt, entry + i, tce);

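Throughout this file the real-mode paths that used to report H_HARDWARE or H_CLOSED now report H_TOO_HARD, which by convention means "retry this hcall from the virtual-mode handler" rather than "fail the guest". A minimal sketch of that convention, with rm_handler() and vm_handler() as hypothetical stand-ins for the real-mode and virtual-mode implementations of the same hcall:

/* Sketch only: H_TOO_HARD from the fast real-mode handler causes the
 * same hypercall to be redone by the slower virtual-mode handler.
 */
long rm_handler(struct kvm_vcpu *vcpu);	/* hypothetical */
long vm_handler(struct kvm_vcpu *vcpu);	/* hypothetical */

static long handle_hcall(struct kvm_vcpu *vcpu)
{
	long ret = rm_handler(vcpu);		/* fast path, real mode */

	if (ret == H_TOO_HARD)
		ret = vm_handler(vcpu);		/* slow path, virtual mode */
	return ret;
}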
+ 6 - 7
arch/powerpc/kvm/book3s_emulate.c

@@ -36,7 +36,6 @@
 #define OP_31_XOP_MTSR		210
 #define OP_31_XOP_MTSRIN	242
 #define OP_31_XOP_TLBIEL	274
-#define OP_31_XOP_TLBIE		306
 /* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */
 #define OP_31_XOP_FAKE_SC1	308
 #define OP_31_XOP_SLBMTE	402
@@ -110,7 +109,7 @@ static inline void kvmppc_copyto_vcpu_tm(struct kvm_vcpu *vcpu)
 	vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
 	vcpu->arch.tar_tm = vcpu->arch.tar;
 	vcpu->arch.lr_tm = vcpu->arch.regs.link;
-	vcpu->arch.cr_tm = vcpu->arch.cr;
+	vcpu->arch.cr_tm = vcpu->arch.regs.ccr;
 	vcpu->arch.xer_tm = vcpu->arch.regs.xer;
 	vcpu->arch.vrsave_tm = vcpu->arch.vrsave;
 }
@@ -129,7 +128,7 @@ static inline void kvmppc_copyfrom_vcpu_tm(struct kvm_vcpu *vcpu)
 	vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
 	vcpu->arch.tar = vcpu->arch.tar_tm;
 	vcpu->arch.regs.link = vcpu->arch.lr_tm;
-	vcpu->arch.cr = vcpu->arch.cr_tm;
+	vcpu->arch.regs.ccr = vcpu->arch.cr_tm;
 	vcpu->arch.regs.xer = vcpu->arch.xer_tm;
 	vcpu->arch.vrsave = vcpu->arch.vrsave_tm;
 }
@@ -141,7 +140,7 @@ static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, int ra_val)
 	uint64_t texasr;

 	/* CR0 = 0 | MSR[TS] | 0 */
-	vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
+	vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
 		(((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
 		 << CR0_SHIFT);

@@ -220,7 +219,7 @@ void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val)
 	tm_abort(ra_val);

 	/* CR0 = 0 | MSR[TS] | 0 */
-	vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
+	vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
 		(((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
 		 << CR0_SHIFT);

@@ -494,8 +493,8 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,

 			if (!(kvmppc_get_msr(vcpu) & MSR_PR)) {
 				preempt_disable();
-				vcpu->arch.cr = (CR0_TBEGIN_FAILURE |
-				  (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)));
+				vcpu->arch.regs.ccr = (CR0_TBEGIN_FAILURE |
+				  (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)));

 				vcpu->arch.texasr = (TEXASR_FS | TEXASR_EXACT |
 					(((u64)(TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT))

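The CR0 updates in this file switch from vcpu->arch.cr to vcpu->arch.regs.ccr but keep the same read-modify-write: clear the 4-bit CR0 field and insert a value derived from MSR[TS]. A minimal sketch of that field update; the CR0_SHIFT/CR0_MASK values here are assumptions standing in for the kernel's definitions:

/* Sketch only: read-modify-write of the CR0 field inside the CR image. */
#define CR0_SHIFT 28
#define CR0_MASK  0xf

static unsigned long set_cr0(unsigned long ccr, unsigned long field)
{
	ccr &= ~((unsigned long)CR0_MASK << CR0_SHIFT);	/* clear old CR0 */
	ccr |= (field & CR0_MASK) << CR0_SHIFT;		/* insert new CR0 */
	return ccr;
}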
+ 818 - 55
arch/powerpc/kvm/book3s_hv.c

@@ -50,6 +50,7 @@
 #include <asm/reg.h>
 #include <asm/ppc-opcode.h>
 #include <asm/asm-prototypes.h>
+#include <asm/archrandom.h>
 #include <asm/debug.h>
 #include <asm/disassemble.h>
 #include <asm/cputable.h>
@@ -104,6 +105,10 @@ static bool indep_threads_mode = true;
 module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");

+static bool one_vm_per_core;
+module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)");
+
 #ifdef CONFIG_KVM_XICS
 static struct kernel_param_ops module_param_ops = {
 	.set = param_set_int,
@@ -117,6 +122,16 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 0644);
 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
 #endif

+/* If set, guests are allowed to create and control nested guests */
+static bool nested = true;
+module_param(nested, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");
+
+static inline bool nesting_enabled(struct kvm *kvm)
+{
+	return kvm->arch.nested_enable && kvm_is_radix(kvm);
+}
+
 /* If set, the threads on each CPU core have to be in the same MMU mode */
 static bool no_mixing_hpt_and_radix;

@@ -173,6 +188,10 @@ static bool kvmppc_ipi_thread(int cpu)
 {
 	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);

+	/* If we're a nested hypervisor, fall back to ordinary IPIs for now */
+	if (kvmhv_on_pseries())
+		return false;
+
 	/* On POWER9 we can use msgsnd to IPI any cpu */
 	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
 		msg |= get_hard_smp_processor_id(cpu);
@@ -410,8 +429,8 @@ static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
 	       vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
 	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
 	       vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
-	pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
-	       vcpu->arch.cr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
+	pr_err("cr = %.8lx  xer = %.16lx  dsisr = %.8x\n",
+	       vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
 	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
 	pr_err("fault dar = %.16lx dsisr = %.8x\n",
 	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
@@ -730,8 +749,7 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
 	/*
 	 * Ensure that the read of vcore->dpdes comes after the read
 	 * of vcpu->doorbell_request.  This barrier matches the
-	 * lwsync in book3s_hv_rmhandlers.S just before the
-	 * fast_guest_return label.
+	 * smp_wmb() in kvmppc_guest_entry_inject().
 	 */
 	smp_rmb();
 	vc = vcpu->arch.vcore;
@@ -912,6 +930,19 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 			break;
 		}
 		return RESUME_HOST;
+	case H_SET_DABR:
+		ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4));
+		break;
+	case H_SET_XDABR:
+		ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
+						kvmppc_get_gpr(vcpu, 5));
+		break;
+	case H_GET_TCE:
+		ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+						kvmppc_get_gpr(vcpu, 5));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
 	case H_PUT_TCE:
 	case H_PUT_TCE:
 		ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
 						kvmppc_get_gpr(vcpu, 5),
 		if (ret == H_TOO_HARD)
 		if (ret == H_TOO_HARD)
 			return RESUME_HOST;
 		break;
+		if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
+			ret = H_HARDWARE;
+		break;
+
+	case H_SET_PARTITION_TABLE:
+		ret = H_FUNCTION;
+		if (nesting_enabled(vcpu->kvm))
+			ret = kvmhv_set_partition_table(vcpu);
+		break;
+	case H_ENTER_NESTED:
+		ret = H_FUNCTION;
+		if (!nesting_enabled(vcpu->kvm))
+			break;
+		ret = kvmhv_enter_nested_guest(vcpu);
+		if (ret == H_INTERRUPT) {
+			kvmppc_set_gpr(vcpu, 3, 0);
+			return -EINTR;
+		}
+		break;
+	case H_TLB_INVALIDATE:
+		ret = H_FUNCTION;
+		if (nesting_enabled(vcpu->kvm))
+			ret = kvmhv_do_nested_tlbie(vcpu);
+		break;
+
 	default:
 		return RESUME_HOST;
 	}
@@ -943,6 +1000,24 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 	return RESUME_GUEST;
 }

+/*
+ * Handle H_CEDE in the nested virtualization case where we haven't
+ * called the real-mode hcall handlers in book3s_hv_rmhandlers.S.
+ * This has to be done early, not in kvmppc_pseries_do_hcall(), so
+ * that the cede logic in kvmppc_run_single_vcpu() works properly.
+ */
+static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.shregs.msr |= MSR_EE;
+	vcpu->arch.ceded = 1;
+	smp_mb();
+	if (vcpu->arch.prodded) {
+		vcpu->arch.prodded = 0;
+		smp_mb();
+		vcpu->arch.ceded = 0;
+	}
+}
+
 static int kvmppc_hcall_impl_hv(unsigned long cmd)
 {
 	switch (cmd) {
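kvmppc_nested_cede() above sets MSR_EE and ceded, then re-checks prodded with a full barrier in between so a concurrent H_PROD is never lost: either the ceding side sees prodded and cancels the cede, or the prodding side sees ceded and wakes the vcpu. A minimal sketch of that pairing, with wake_vcpu() as a hypothetical stand-in for the wakeup path:

/* Sketch only: the two smp_mb() calls order ceded against prodded. */
static void wake_vcpu(struct kvm_vcpu *vcpu);	/* hypothetical */

static void cede_side(struct kvm_vcpu *vcpu)
{
	vcpu->arch.ceded = 1;
	smp_mb();				/* publish ceded before reading prodded */
	if (vcpu->arch.prodded) {
		vcpu->arch.prodded = 0;
		smp_mb();
		vcpu->arch.ceded = 0;		/* prod arrived first: don't sleep */
	}
}

static void prod_side(struct kvm_vcpu *vcpu)
{
	vcpu->arch.prodded = 1;
	smp_mb();				/* pairs with the barrier above */
	if (vcpu->arch.ceded)
		wake_vcpu(vcpu);
}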
@@ -1085,7 +1160,6 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
 	return RESUME_GUEST;
 }

-/* Called with vcpu->arch.vcore->lock held */
 static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				 struct task_struct *tsk)
 {
@@ -1190,7 +1264,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 	case BOOK3S_INTERRUPT_H_INST_STORAGE:
 		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
-		vcpu->arch.fault_dsisr = 0;
+		vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr &
+			DSISR_SRR1_MATCH_64S;
+		if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
+			vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
 		r = RESUME_PAGE_FAULT;
 		break;
 	/*
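For an instruction-side fault there is no hardware DSISR, so the hunk above synthesizes one from the SRR1-style bits held in the guest MSR image and folds in a store flag when the HISI was caused by a store. The same construction is reused in the nested exit handler later in this file; condensed:

/* Sketch only: build a DSISR value for an instruction storage fault,
 * using the same bit names as the hunk above.
 */
static unsigned int hisi_to_dsisr(unsigned long msr)
{
	unsigned int dsisr = msr & DSISR_SRR1_MATCH_64S;

	if (msr & HSRR1_HISI_WRITE)
		dsisr |= DSISR_ISSTORE;
	return dsisr;
}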
@@ -1206,10 +1283,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				swab32(vcpu->arch.emul_inst) :
 				vcpu->arch.emul_inst;
 		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
-			/* Need vcore unlocked to call kvmppc_get_last_inst */
-			spin_unlock(&vcpu->arch.vcore->lock);
 			r = kvmppc_emulate_debug_inst(run, vcpu);
-			spin_lock(&vcpu->arch.vcore->lock);
 		} else {
 			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
 			r = RESUME_GUEST;
@@ -1225,12 +1299,8 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
 		r = EMULATE_FAIL;
 		if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) &&
-		    cpu_has_feature(CPU_FTR_ARCH_300)) {
-			/* Need vcore unlocked to call kvmppc_get_last_inst */
-			spin_unlock(&vcpu->arch.vcore->lock);
+		    cpu_has_feature(CPU_FTR_ARCH_300))
 			r = kvmppc_emulate_doorbell_instr(vcpu);
-			spin_lock(&vcpu->arch.vcore->lock);
-		}
 		if (r == EMULATE_FAIL) {
 			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
 			r = RESUME_GUEST;
@@ -1265,6 +1335,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	return r;
 }

+static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
+{
+	int r;
+	int srcu_idx;
+
+	vcpu->stat.sum_exits++;
+
+	/*
+	 * This can happen if an interrupt occurs in the last stages
+	 * of guest entry or the first stages of guest exit (i.e. after
+	 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
+	 * and before setting it to KVM_GUEST_MODE_HOST_HV).
+	 * That can happen due to a bug, or due to a machine check
+	 * occurring at just the wrong time.
+	 */
+	if (vcpu->arch.shregs.msr & MSR_HV) {
+		pr_emerg("KVM trap in HV mode while nested!\n");
+		pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
+			 vcpu->arch.trap, kvmppc_get_pc(vcpu),
+			 vcpu->arch.shregs.msr);
+		kvmppc_dump_regs(vcpu);
+		return RESUME_HOST;
+	}
+	switch (vcpu->arch.trap) {
+	/* We're good on these - the host merely wanted to get our attention */
+	case BOOK3S_INTERRUPT_HV_DECREMENTER:
+		vcpu->stat.dec_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_EXTERNAL:
+		vcpu->stat.ext_intr_exits++;
+		r = RESUME_HOST;
+		break;
+	case BOOK3S_INTERRUPT_H_DOORBELL:
+	case BOOK3S_INTERRUPT_H_VIRT:
+		vcpu->stat.ext_intr_exits++;
+		r = RESUME_GUEST;
+		break;
+	/* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
+	case BOOK3S_INTERRUPT_HMI:
+	case BOOK3S_INTERRUPT_PERFMON:
+	case BOOK3S_INTERRUPT_SYSTEM_RESET:
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_MACHINE_CHECK:
+		/* Pass the machine check to the L1 guest */
+		r = RESUME_HOST;
+		/* Print the MCE event to host console. */
+		machine_check_print_event_info(&vcpu->arch.mce_evt, false);
+		break;
+	/*
+	 * We get these next two if the guest accesses a page which it thinks
+	 * it has mapped but which is not actually present, either because
+	 * it is for an emulated I/O device or because the corresponding
+	 * host page has been paged out.
+	 */
+	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+		r = kvmhv_nested_page_fault(vcpu);
+		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+		break;
+	case BOOK3S_INTERRUPT_H_INST_STORAGE:
+		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
+		vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
+					 DSISR_SRR1_MATCH_64S;
+		if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
+			vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
+		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+		r = kvmhv_nested_page_fault(vcpu);
+		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+		break;
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	case BOOK3S_INTERRUPT_HV_SOFTPATCH:
+		/*
+		 * This occurs for various TM-related instructions that
+		 * we need to emulate on POWER9 DD2.2.  We have already
+		 * handled the cases where the guest was in real-suspend
+		 * mode and was transitioning to transactional state.
+		 */
+		r = kvmhv_p9_tm_emulation(vcpu);
+		break;
+#endif
+
+	case BOOK3S_INTERRUPT_HV_RM_HARD:
+		vcpu->arch.trap = 0;
+		r = RESUME_GUEST;
+		if (!xive_enabled())
+			kvmppc_xics_rm_complete(vcpu, 0);
+		break;
+	default:
+		r = RESUME_HOST;
+		break;
+	}
+
+	return r;
+}
+
 static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
 					    struct kvm_sregs *sregs)
 {
@@ -1555,6 +1723,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_ONLINE:
 		*val = get_reg_val(id, vcpu->arch.online);
 		break;
+	case KVM_REG_PPC_PTCR:
+		*val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
+		break;
 	default:
 		r = -EINVAL;
 		break;
@@ -1786,6 +1957,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 			atomic_dec(&vcpu->arch.vcore->online_count);
 		vcpu->arch.online = i;
 		break;
+	case KVM_REG_PPC_PTCR:
+		vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
+		break;
 	default:
 		r = -EINVAL;
 		break;
@@ -2019,15 +2193,18 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 	 * Set the default HFSCR for the guest from the host value.
 	 * This value is only used on POWER9.
 	 * On POWER9, we want to virtualize the doorbell facility, so we
-	 * turn off the HFSCR bit, which causes those instructions to trap.
+	 * don't set the HFSCR_MSGP bit, and that causes those instructions
+	 * to trap and then we emulate them.
 	 */
-	vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
-	if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+	vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
+		HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP;
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		vcpu->arch.hfscr &= mfspr(SPRN_HFSCR);
+		if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+			vcpu->arch.hfscr |= HFSCR_TM;
+	}
+	if (cpu_has_feature(CPU_FTR_TM_COMP))
 		vcpu->arch.hfscr |= HFSCR_TM;
-	else if (!cpu_has_feature(CPU_FTR_TM_COMP))
-		vcpu->arch.hfscr &= ~HFSCR_TM;
-	if (cpu_has_feature(CPU_FTR_ARCH_300))
-		vcpu->arch.hfscr &= ~HFSCR_MSGP;

 	kvmppc_mmu_book3s_hv_init(vcpu);

@@ -2242,10 +2419,18 @@ static void kvmppc_release_hwthread(int cpu)

 static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
 {
+	struct kvm_nested_guest *nested = vcpu->arch.nested;
+	cpumask_t *cpu_in_guest;
 	int i;

 	cpu = cpu_first_thread_sibling(cpu);
-	cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
+	if (nested) {
+		cpumask_set_cpu(cpu, &nested->need_tlb_flush);
+		cpu_in_guest = &nested->cpu_in_guest;
+	} else {
+		cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
+		cpu_in_guest = &kvm->arch.cpu_in_guest;
+	}
 	/*
 	 * Make sure setting of bit in need_tlb_flush precedes
 	 * testing of cpu_in_guest bits.  The matching barrier on
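radix_flush_cpu() above records the flush request against the first thread of the target core (now in either the VM's or the nested guest's mask) and then IPIs any sibling thread currently in the guest, with smp_mb() ordering the cpumask update against the cpu_in_guest test. A condensed sketch of that request-then-kick pattern; THREADS_PER_CORE is an assumed constant standing in for threads_per_core:

/* Sketch only: publish the need_tlb_flush bit, then force any thread
 * already in the guest to exit so it will notice the bit on re-entry.
 */
#define THREADS_PER_CORE 4

static void request_flush(struct kvm *kvm, int cpu, cpumask_t *in_guest)
{
	int first = cpu_first_thread_sibling(cpu);
	int i;

	cpumask_set_cpu(first, &kvm->arch.need_tlb_flush);
	smp_mb();	/* set the bit before testing cpu_in_guest */
	for (i = 0; i < THREADS_PER_CORE; ++i)
		if (cpumask_test_cpu(first + i, in_guest))
			smp_call_function_single(first + i, do_nothing, NULL, 1);
}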
@@ -2253,13 +2438,23 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
 	 */
 	smp_mb();
 	for (i = 0; i < threads_per_core; ++i)
-		if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
+		if (cpumask_test_cpu(cpu + i, cpu_in_guest))
 			smp_call_function_single(cpu + i, do_nothing, NULL, 1);
 }

 static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
 {
+	struct kvm_nested_guest *nested = vcpu->arch.nested;
 	struct kvm *kvm = vcpu->kvm;
+	int prev_cpu;
+
+	if (!cpu_has_feature(CPU_FTR_HVMODE))
+		return;
+
+	if (nested)
+		prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
+	else
+		prev_cpu = vcpu->arch.prev_cpu;

 	/*
 	 * With radix, the guest can do TLB invalidations itself,
@@ -2273,12 +2468,46 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
 	 * ran to flush the TLB.  The TLB is shared between threads,
 	 * so we use a single bit in .need_tlb_flush for all 4 threads.
 	 */
-	if (vcpu->arch.prev_cpu != pcpu) {
-		if (vcpu->arch.prev_cpu >= 0 &&
-		    cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
+	if (prev_cpu != pcpu) {
+		if (prev_cpu >= 0 &&
+		    cpu_first_thread_sibling(prev_cpu) !=
 		    cpu_first_thread_sibling(pcpu))
-			radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
-		vcpu->arch.prev_cpu = pcpu;
+			radix_flush_cpu(kvm, prev_cpu, vcpu);
+		if (nested)
+			nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
+		else
+			vcpu->arch.prev_cpu = pcpu;
+	}
+}
+
+static void kvmppc_radix_check_need_tlb_flush(struct kvm *kvm, int pcpu,
+					      struct kvm_nested_guest *nested)
+{
+	cpumask_t *need_tlb_flush;
+	int lpid;
+
+	if (!cpu_has_feature(CPU_FTR_HVMODE))
+		return;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		pcpu &= ~0x3UL;
+
+	if (nested) {
+		lpid = nested->shadow_lpid;
+		need_tlb_flush = &nested->need_tlb_flush;
+	} else {
+		lpid = kvm->arch.lpid;
+		need_tlb_flush = &kvm->arch.need_tlb_flush;
+	}
+
+	mtspr(SPRN_LPID, lpid);
+	isync();
+	smp_mb();
+
+	if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
+		radix__local_flush_tlb_lpid_guest(lpid);
+		/* Clear the bit after the TLB flush */
+		cpumask_clear_cpu(pcpu, need_tlb_flush);
 	}
 }

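On the entry side, kvmppc_radix_check_need_tlb_flush() above switches LPIDR to the guest (or nested shadow) LPID and, if this core's bit is set, does a local guest-scoped TLB flush before clearing it. The pcpu &= ~0x3UL step collapses the four threads of a POWER9 core onto a single bit; condensed:

/* Sketch only: all four threads of a POWER9 core share one flush bit,
 * so the cpu number is rounded down to the core's first thread.
 */
static int core_flush_bit(int pcpu)
{
	return pcpu & ~0x3;	/* 4 threads per core assumed */
}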
@@ -2493,6 +2722,10 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
 	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
 		return false;

+	/* In one_vm_per_core mode, require all vcores to be from the same vm */
+	if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
+		return false;
+
 	/* Some POWER9 chips require all threads to be in the same MMU mode */
 	if (no_mixing_hpt_and_radix &&
 	    kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
@@ -2600,6 +2833,14 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 	spin_lock(&vc->lock);
 	now = get_tb();
 	for_each_runnable_thread(i, vcpu, vc) {
+		/*
+		 * It's safe to unlock the vcore in the loop here, because
+		 * for_each_runnable_thread() is safe against removal of
+		 * the vcpu, and the vcore state is VCORE_EXITING here,
+		 * so any vcpus becoming runnable will have their arch.trap
+		 * set to zero and can't actually run in the guest.
+		 */
+		spin_unlock(&vc->lock);
 		/* cancel pending dec exception if dec is positive */
 		if (now < vcpu->arch.dec_expires &&
 		    kvmppc_core_pending_dec(vcpu))
@@ -2615,6 +2856,7 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 		vcpu->arch.ret = ret;
 		vcpu->arch.trap = 0;

+		spin_lock(&vc->lock);
 		if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
 			if (vcpu->arch.pending_exceptions)
 				kvmppc_core_prepare_to_enter(vcpu);
@@ -2963,8 +3205,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		spin_unlock(&core_info.vc[sub]->lock);

 	if (kvm_is_radix(vc->kvm)) {
-		int tmp = pcpu;
-
 		/*
 		 * Do we need to flush the process scoped TLB for the LPAR?
 		 *
@@ -2975,17 +3215,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		 *
 		 * Hash must be flushed in realmode in order to use tlbiel.
 		 */
-		mtspr(SPRN_LPID, vc->kvm->arch.lpid);
-		isync();
-
-		if (cpu_has_feature(CPU_FTR_ARCH_300))
-			tmp &= ~0x3UL;
-
-		if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) {
-			radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid);
-			/* Clear the bit after the TLB flush */
-			cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush);
-		}
+		kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, NULL);
 	}

 	/*
@@ -3079,6 +3309,300 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	trace_kvmppc_run_core(vc, 1);
 }

+/*
+ * Load up hypervisor-mode registers on P9.
+ */
+static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
+				     unsigned long lpcr)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	s64 hdec;
+	u64 tb, purr, spurr;
+	int trap;
+	unsigned long host_hfscr = mfspr(SPRN_HFSCR);
+	unsigned long host_ciabr = mfspr(SPRN_CIABR);
+	unsigned long host_dawr = mfspr(SPRN_DAWR);
+	unsigned long host_dawrx = mfspr(SPRN_DAWRX);
+	unsigned long host_psscr = mfspr(SPRN_PSSCR);
+	unsigned long host_pidr = mfspr(SPRN_PID);
+
+	hdec = time_limit - mftb();
+	if (hdec < 0)
+		return BOOK3S_INTERRUPT_HV_DECREMENTER;
+	mtspr(SPRN_HDEC, hdec);
+
+	if (vc->tb_offset) {
+		u64 new_tb = mftb() + vc->tb_offset;
+		mtspr(SPRN_TBU40, new_tb);
+		tb = mftb();
+		if ((tb & 0xffffff) < (new_tb & 0xffffff))
+			mtspr(SPRN_TBU40, new_tb + 0x1000000);
+		vc->tb_offset_applied = vc->tb_offset;
+	}
+
+	if (vc->pcr)
+		mtspr(SPRN_PCR, vc->pcr);
+	mtspr(SPRN_DPDES, vc->dpdes);
+	mtspr(SPRN_VTB, vc->vtb);
+
+	local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
+	local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
+	mtspr(SPRN_PURR, vcpu->arch.purr);
+	mtspr(SPRN_SPURR, vcpu->arch.spurr);
+
+	if (cpu_has_feature(CPU_FTR_DAWR)) {
+		mtspr(SPRN_DAWR, vcpu->arch.dawr);
+		mtspr(SPRN_DAWRX, vcpu->arch.dawrx);
+	}
+	mtspr(SPRN_CIABR, vcpu->arch.ciabr);
+	mtspr(SPRN_IC, vcpu->arch.ic);
+	mtspr(SPRN_PID, vcpu->arch.pid);
+
+	mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
+	      (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
+
+	mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
+
+	mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
+	mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
+	mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
+	mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
+
+	mtspr(SPRN_AMOR, ~0UL);
+
+	mtspr(SPRN_LPCR, lpcr);
+	isync();
+
+	kvmppc_xive_push_vcpu(vcpu);
+
+	mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
+	mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
+
+	trap = __kvmhv_vcpu_entry_p9(vcpu);
+
+	/* Advance host PURR/SPURR by the amount used by guest */
+	purr = mfspr(SPRN_PURR);
+	spurr = mfspr(SPRN_SPURR);
+	mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
+	      purr - vcpu->arch.purr);
+	mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
+	      spurr - vcpu->arch.spurr);
+	vcpu->arch.purr = purr;
+	vcpu->arch.spurr = spurr;
+
+	vcpu->arch.ic = mfspr(SPRN_IC);
+	vcpu->arch.pid = mfspr(SPRN_PID);
+	vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
+
+	vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
+	vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
+	vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
+	vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
+
+	mtspr(SPRN_PSSCR, host_psscr);
+	mtspr(SPRN_HFSCR, host_hfscr);
+	mtspr(SPRN_CIABR, host_ciabr);
+	mtspr(SPRN_DAWR, host_dawr);
+	mtspr(SPRN_DAWRX, host_dawrx);
+	mtspr(SPRN_PID, host_pidr);
+
+	/*
+	 * Since this is radix, do a eieio; tlbsync; ptesync sequence in
+	 * case we interrupted the guest between a tlbie and a ptesync.
+	 */
+	asm volatile("eieio; tlbsync; ptesync");
+
+	mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid);	/* restore host LPID */
+	isync();
+
+	vc->dpdes = mfspr(SPRN_DPDES);
+	vc->vtb = mfspr(SPRN_VTB);
+	mtspr(SPRN_DPDES, 0);
+	if (vc->pcr)
+		mtspr(SPRN_PCR, 0);
+
+	if (vc->tb_offset_applied) {
+		u64 new_tb = mftb() - vc->tb_offset_applied;
+		mtspr(SPRN_TBU40, new_tb);
+		tb = mftb();
+		if ((tb & 0xffffff) < (new_tb & 0xffffff))
+			mtspr(SPRN_TBU40, new_tb + 0x1000000);
+		vc->tb_offset_applied = 0;
+	}
+
+	mtspr(SPRN_HDEC, 0x7fffffff);
+	mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
+
+	return trap;
+}
+
+/*
+ * Virtual-mode guest entry for POWER9 and later when the host and
+ * guest are both using the radix MMU.  The LPIDR has already been set.
+ */
+int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
+			 unsigned long lpcr)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	unsigned long host_dscr = mfspr(SPRN_DSCR);
+	unsigned long host_tidr = mfspr(SPRN_TIDR);
+	unsigned long host_iamr = mfspr(SPRN_IAMR);
+	s64 dec;
+	u64 tb;
+	int trap, save_pmu;
+
+	dec = mfspr(SPRN_DEC);
+	tb = mftb();
+	if (dec < 512)
+		return BOOK3S_INTERRUPT_HV_DECREMENTER;
+	local_paca->kvm_hstate.dec_expires = dec + tb;
+	if (local_paca->kvm_hstate.dec_expires < time_limit)
+		time_limit = local_paca->kvm_hstate.dec_expires;
+
+	vcpu->arch.ceded = 0;
+
+	kvmhv_save_host_pmu();		/* saves it to PACA kvm_hstate */
+
+	kvmppc_subcore_enter_guest();
+
+	vc->entry_exit_map = 1;
+	vc->in_guest = 1;
+
+	if (vcpu->arch.vpa.pinned_addr) {
+		struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
+		u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
+		lp->yield_count = cpu_to_be32(yield_count);
+		vcpu->arch.vpa.dirty = 1;
+	}
+
+	if (cpu_has_feature(CPU_FTR_TM) ||
+	    cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+		kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
+
+	kvmhv_load_guest_pmu(vcpu);
+
+	msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
+	load_fp_state(&vcpu->arch.fp);
+#ifdef CONFIG_ALTIVEC
+	load_vr_state(&vcpu->arch.vr);
+#endif
+
+	mtspr(SPRN_DSCR, vcpu->arch.dscr);
+	mtspr(SPRN_IAMR, vcpu->arch.iamr);
+	mtspr(SPRN_PSPB, vcpu->arch.pspb);
+	mtspr(SPRN_FSCR, vcpu->arch.fscr);
+	mtspr(SPRN_TAR, vcpu->arch.tar);
+	mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
+	mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
+	mtspr(SPRN_BESCR, vcpu->arch.bescr);
+	mtspr(SPRN_WORT, vcpu->arch.wort);
+	mtspr(SPRN_TIDR, vcpu->arch.tid);
+	mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
+	mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
+	mtspr(SPRN_AMR, vcpu->arch.amr);
+	mtspr(SPRN_UAMOR, vcpu->arch.uamor);
+
+	if (!(vcpu->arch.ctrl & 1))
+		mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
+
+	mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
+
+	if (kvmhv_on_pseries()) {
+		/* call our hypervisor to load up HV regs and go */
+		struct hv_guest_state hvregs;
+
+		kvmhv_save_hv_regs(vcpu, &hvregs);
+		hvregs.lpcr = lpcr;
+		vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
+		hvregs.version = HV_GUEST_STATE_VERSION;
+		if (vcpu->arch.nested) {
+			hvregs.lpid = vcpu->arch.nested->shadow_lpid;
+			hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
+		} else {
+			hvregs.lpid = vcpu->kvm->arch.lpid;
+			hvregs.vcpu_token = vcpu->vcpu_id;
+		}
+		hvregs.hdec_expiry = time_limit;
+		trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
+					  __pa(&vcpu->arch.regs));
+		kvmhv_restore_hv_return_state(vcpu, &hvregs);
+		vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
+		vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
+		vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
+
+		/* H_CEDE has to be handled now, not later */
+		if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
+		    kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
+			kvmppc_nested_cede(vcpu);
+			trap = 0;
+		}
+	} else {
+		trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
+	}
+
+	vcpu->arch.slb_max = 0;
+	dec = mfspr(SPRN_DEC);
+	tb = mftb();
+	vcpu->arch.dec_expires = dec + tb;
+	vcpu->cpu = -1;
+	vcpu->arch.thread_cpu = -1;
+	vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
+
+	vcpu->arch.iamr = mfspr(SPRN_IAMR);
+	vcpu->arch.pspb = mfspr(SPRN_PSPB);
+	vcpu->arch.fscr = mfspr(SPRN_FSCR);
+	vcpu->arch.tar = mfspr(SPRN_TAR);
+	vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
+	vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
+	vcpu->arch.bescr = mfspr(SPRN_BESCR);
+	vcpu->arch.wort = mfspr(SPRN_WORT);
+	vcpu->arch.tid = mfspr(SPRN_TIDR);
+	vcpu->arch.amr = mfspr(SPRN_AMR);
+	vcpu->arch.uamor = mfspr(SPRN_UAMOR);
+	vcpu->arch.dscr = mfspr(SPRN_DSCR);
+
+	mtspr(SPRN_PSPB, 0);
+	mtspr(SPRN_WORT, 0);
+	mtspr(SPRN_AMR, 0);
+	mtspr(SPRN_UAMOR, 0);
+	mtspr(SPRN_DSCR, host_dscr);
+	mtspr(SPRN_TIDR, host_tidr);
+	mtspr(SPRN_IAMR, host_iamr);
+	mtspr(SPRN_PSPB, 0);
+
+	msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
+	store_fp_state(&vcpu->arch.fp);
+#ifdef CONFIG_ALTIVEC
+	store_vr_state(&vcpu->arch.vr);
+#endif
+
+	if (cpu_has_feature(CPU_FTR_TM) ||
+	    cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+		kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
+
+	save_pmu = 1;
+	if (vcpu->arch.vpa.pinned_addr) {
+		struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
+		u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
+		lp->yield_count = cpu_to_be32(yield_count);
+		vcpu->arch.vpa.dirty = 1;
+		save_pmu = lp->pmcregs_in_use;
+	}
+
+	kvmhv_save_guest_pmu(vcpu, save_pmu);
+
+	vc->entry_exit_map = 0x101;
+	vc->in_guest = 0;
+
+	mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
+
+	kvmhv_load_host_pmu();
+
+	kvmppc_subcore_exit_guest();
+
+	return trap;
+}
+
 /*
  * Wait for some other vcpu thread to execute us, and
  * wake us up when we need to handle something in the host.
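In kvmhv_load_hv_regs_and_go() above, the timebase offset is applied by writing SPRN_TBU40, which only takes the upper 40 bits of the value written; if the low 24 bits of the target were ahead of where the timebase restarted, the write lands 2^24 ticks short, so the code re-reads the TB and bumps the upper bits by one. Condensed from the code above:

/* Sketch only: mtspr(SPRN_TBU40) keeps just the upper 40 bits, so check
 * whether the low 24 bits wrapped behind the target and compensate.
 */
static void apply_tb_offset(u64 offset)
{
	u64 new_tb = mftb() + offset;
	u64 tb;

	mtspr(SPRN_TBU40, new_tb);
	tb = mftb();
	if ((tb & 0xffffff) < (new_tb & 0xffffff))
		mtspr(SPRN_TBU40, new_tb + 0x1000000);	/* 1 << 24 */
}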
@@ -3256,6 +3780,11 @@ out:
 	trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
 }

+/*
+ * This never fails for a radix guest, as none of the operations it does
+ * for a radix guest can fail or have a way to report failure.
+ * kvmhv_run_single_vcpu() relies on this fact.
+ */
 static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
 {
 	int r = 0;
@@ -3405,6 +3934,171 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	return vcpu->arch.ret;
 }

+int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
+			  struct kvm_vcpu *vcpu, u64 time_limit,
+			  unsigned long lpcr)
+{
+	int trap, r, pcpu;
+	int srcu_idx;
+	struct kvmppc_vcore *vc;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_nested_guest *nested = vcpu->arch.nested;
+
+	trace_kvmppc_run_vcpu_enter(vcpu);
+
+	kvm_run->exit_reason = 0;
+	vcpu->arch.ret = RESUME_GUEST;
+	vcpu->arch.trap = 0;
+
+	vc = vcpu->arch.vcore;
+	vcpu->arch.ceded = 0;
+	vcpu->arch.run_task = current;
+	vcpu->arch.kvm_run = kvm_run;
+	vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
+	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
+	vcpu->arch.busy_preempt = TB_NIL;
+	vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
+	vc->runnable_threads[0] = vcpu;
+	vc->n_runnable = 1;
+	vc->runner = vcpu;
+
+	/* See if the MMU is ready to go */
+	if (!kvm->arch.mmu_ready)
+		kvmhv_setup_mmu(vcpu);
+
+	if (need_resched())
+		cond_resched();
+
+	kvmppc_update_vpas(vcpu);
+
+	init_vcore_to_run(vc);
+	vc->preempt_tb = TB_NIL;
+
+	preempt_disable();
+	pcpu = smp_processor_id();
+	vc->pcpu = pcpu;
+	kvmppc_prepare_radix_vcpu(vcpu, pcpu);
+
+	local_irq_disable();
+	hard_irq_disable();
+	if (signal_pending(current))
+		goto sigpend;
+	if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready)
+		goto out;
+
+	if (!nested) {
+		kvmppc_core_prepare_to_enter(vcpu);
+		if (vcpu->arch.doorbell_request) {
+			vc->dpdes = 1;
+			smp_wmb();
+			vcpu->arch.doorbell_request = 0;
+		}
+		if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
+			     &vcpu->arch.pending_exceptions))
+			lpcr |= LPCR_MER;
+	} else if (vcpu->arch.pending_exceptions ||
+		   vcpu->arch.doorbell_request ||
+		   xive_interrupt_pending(vcpu)) {
+		vcpu->arch.ret = RESUME_HOST;
+		goto out;
+	}
+
+	kvmppc_clear_host_core(pcpu);
+
+	local_paca->kvm_hstate.tid = 0;
+	local_paca->kvm_hstate.napping = 0;
+	local_paca->kvm_hstate.kvm_split_mode = NULL;
+	kvmppc_start_thread(vcpu, vc);
+	kvmppc_create_dtl_entry(vcpu, vc);
+	trace_kvm_guest_enter(vcpu);
+
+	vc->vcore_state = VCORE_RUNNING;
+	trace_kvmppc_run_core(vc, 0);
+
+	if (cpu_has_feature(CPU_FTR_HVMODE))
+		kvmppc_radix_check_need_tlb_flush(kvm, pcpu, nested);
+
+	trace_hardirqs_on();
+	guest_enter_irqoff();
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+
+	this_cpu_disable_ftrace();
+
+	trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr);
+	vcpu->arch.trap = trap;
+
+	this_cpu_enable_ftrace();
+
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		mtspr(SPRN_LPID, kvm->arch.host_lpid);
+		isync();
+	}
+
+	trace_hardirqs_off();
+	set_irq_happened(trap);
+
+	kvmppc_set_host_core(pcpu);
+
+	local_irq_enable();
+	guest_exit();
+
+	cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest);
+
+	preempt_enable();
+
+	/* cancel pending decrementer exception if DEC is now positive */
+	if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
+		kvmppc_core_dequeue_dec(vcpu);
+
+	trace_kvm_guest_exit(vcpu);
+	r = RESUME_GUEST;
+	if (trap) {
+		if (!nested)
+			r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
+		else
+			r = kvmppc_handle_nested_exit(vcpu);
+	}
+	vcpu->arch.ret = r;
+
+	if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded &&
+	    !kvmppc_vcpu_woken(vcpu)) {
+		kvmppc_set_timer(vcpu);
+		while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) {
+			if (signal_pending(current)) {
+				vcpu->stat.signal_exits++;
+				kvm_run->exit_reason = KVM_EXIT_INTR;
+				vcpu->arch.ret = -EINTR;
+				break;
+			}
+			spin_lock(&vc->lock);
+			kvmppc_vcore_blocked(vc);
+			spin_unlock(&vc->lock);
+		}
+	}
+	vcpu->arch.ceded = 0;
+
+	vc->vcore_state = VCORE_INACTIVE;
+	trace_kvmppc_run_core(vc, 1);
+
+ done:
+	kvmppc_remove_runnable(vc, vcpu);
+	trace_kvmppc_run_vcpu_exit(vcpu, kvm_run);
+
+	return vcpu->arch.ret;
+
+ sigpend:
+	vcpu->stat.signal_exits++;
+	kvm_run->exit_reason = KVM_EXIT_INTR;
+	vcpu->arch.ret = -EINTR;
+ out:
+	local_irq_enable();
+	preempt_enable();
+	goto done;
+}
+
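
kvmhv_run_single_vcpu() above leaves a RESUME_* code in vcpu->arch.ret; callers re-enter the guest until that code stops being a "resume guest" value. A hedged sketch of such a caller loop (the ~(u64)0 time limit and vcore LPCR follow the call made further down in this hunk; kvmhv_enter_nested_guest() in book3s_hv_nested.c uses the same pattern with a real HDEC expiry):

	/* Illustrative caller loop, not part of the patch. */
	static int run_until_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
	{
		int r;

		do {
			r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0,
						  vcpu->arch.vcore->lpcr);
		} while (is_kvmppc_resume_guest(r));

		return r;
	}
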
 static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -3480,7 +4174,20 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
 
 	do {
-		r = kvmppc_run_vcpu(run, vcpu);
+		/*
+		 * The early POWER9 chips that can't mix radix and HPT threads
+		 * on the same core also need the workaround for the problem
+		 * where the TLB would prefetch entries in the guest exit path
+		 * for radix guests using the guest PIDR value and LPID 0.
+		 * The workaround is in the old path (kvmppc_run_vcpu())
+		 * but not the new path (kvmhv_run_single_vcpu()).
+		 */
+		if (kvm->arch.threads_indep && kvm_is_radix(kvm) &&
+		    !no_mixing_hpt_and_radix)
+			r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0,
+						  vcpu->arch.vcore->lpcr);
+		else
+			r = kvmppc_run_vcpu(run, vcpu);
 
 
 		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
 		    !(vcpu->arch.shregs.msr & MSR_PR)) {
@@ -3559,6 +4266,10 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
 	kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
 	kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
 
 
+	/* If running as a nested hypervisor, we don't support HPT guests */
+	if (kvmhv_on_pseries())
+		info->flags |= KVM_PPC_NO_HASH;
+
 	return 0;
 }
 
 
@@ -3723,8 +4434,7 @@ void kvmppc_setup_partition_table(struct kvm *kvm)
 			__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
 		dw1 = PATB_GR | kvm->arch.process_table;
 	}
-
-	mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
+	kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
 }
 
 
 /*
@@ -3820,6 +4530,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 /* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
 int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
 {
+	if (nesting_enabled(kvm))
+		kvmhv_release_all_nested(kvm);
 	kvmppc_free_radix(kvm);
 	kvmppc_update_lpcr(kvm, LPCR_VPM1,
 			   LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
@@ -3841,6 +4553,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
 	kvmppc_free_hpt(&kvm->arch.hpt);
 	kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
 			   LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+	kvmppc_rmap_reset(kvm);
 	kvm->arch.radix = 1;
 	return 0;
 }
@@ -3940,6 +4653,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 
 
 	kvmppc_alloc_host_rm_ops();
 
+	kvmhv_vm_nested_init(kvm);
+
 	/*
 	 * Since we don't flush the TLB when tearing down a VM,
 	 * and this lpid might have previously been used,
@@ -3958,9 +4673,13 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 		kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
 
 	/* Init LPCR for virtual RMA mode */
-	kvm->arch.host_lpid = mfspr(SPRN_LPID);
-	kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
-	lpcr &= LPCR_PECE | LPCR_LPES;
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		kvm->arch.host_lpid = mfspr(SPRN_LPID);
+		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
+		lpcr &= LPCR_PECE | LPCR_LPES;
+	} else {
+		lpcr = 0;
+	}
 	lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
 		LPCR_VPM0 | LPCR_VPM1;
 	kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
@@ -4027,8 +4746,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 	 * On POWER9, we only need to do this if the "indep_threads_mode"
 	 * module parameter has been set to N.
 	 */
-	if (cpu_has_feature(CPU_FTR_ARCH_300))
-		kvm->arch.threads_indep = indep_threads_mode;
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
+			pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
+			kvm->arch.threads_indep = true;
+		} else {
+			kvm->arch.threads_indep = indep_threads_mode;
+		}
+	}
 	if (!kvm->arch.threads_indep)
 		kvm_hv_vm_activated();
 
 
@@ -4051,6 +4776,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 	snprintf(buf, sizeof(buf), "vm%d", current->pid);
 	kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
 	kvmppc_mmu_debugfs_init(kvm);
+	if (radix_enabled())
+		kvmhv_radix_debugfs_init(kvm);
 
 	return 0;
 }
@@ -4073,13 +4800,21 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 
 
 	kvmppc_free_vcores(kvm);
 
-	kvmppc_free_lpid(kvm->arch.lpid);
 
 	if (kvm_is_radix(kvm))
 		kvmppc_free_radix(kvm);
 	else
 		kvmppc_free_hpt(&kvm->arch.hpt);
 
 
+	/* Perform global invalidation and return lpid to the pool */
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		if (nesting_enabled(kvm))
+			kvmhv_release_all_nested(kvm);
+		kvm->arch.process_table = 0;
+		kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
+	}
+	kvmppc_free_lpid(kvm->arch.lpid);
+
 	kvmppc_free_pimap(kvm);
 }
 
 
@@ -4104,11 +4839,15 @@ static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
 
 
 static int kvmppc_core_check_processor_compat_hv(void)
 {
-	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
-	    !cpu_has_feature(CPU_FTR_ARCH_206))
-		return -EIO;
+	if (cpu_has_feature(CPU_FTR_HVMODE) &&
+	    cpu_has_feature(CPU_FTR_ARCH_206))
+		return 0;
 
-	return 0;
+	/* POWER9 in radix mode is capable of being a nested hypervisor. */
+	if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
+		return 0;
+
+	return -EIO;
 }
 
 #ifdef CONFIG_KVM_XICS
@@ -4426,6 +5165,10 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
 	if (radix && !radix_enabled())
 		return -EINVAL;
 
+	/* If we're a nested hypervisor, we currently only support radix */
+	if (kvmhv_on_pseries() && !radix)
+		return -EINVAL;
+
 	mutex_lock(&kvm->lock);
 	if (radix != kvm_is_radix(kvm)) {
 		if (kvm->arch.mmu_ready) {
@@ -4458,6 +5201,19 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
 	return err;
 }
 
 
+static int kvmhv_enable_nested(struct kvm *kvm)
+{
+	if (!nested)
+		return -EPERM;
+	if (!cpu_has_feature(CPU_FTR_ARCH_300) || no_mixing_hpt_and_radix)
+		return -ENODEV;
+
+	/* kvm == NULL means the caller is testing if the capability exists */
+	if (kvm)
+		kvm->arch.nested_enable = true;
+	return 0;
+}
+
 static struct kvmppc_ops kvm_ops_hv = {
 	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
 	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -4497,6 +5253,7 @@ static struct kvmppc_ops kvm_ops_hv = {
 	.configure_mmu = kvmhv_configure_mmu,
 	.get_rmmu_info = kvmhv_get_rmmu_info,
 	.set_smt_mode = kvmhv_set_smt_mode,
+	.enable_nested = kvmhv_enable_nested,
 };
 
 static int kvm_init_subcore_bitmap(void)
@@ -4547,6 +5304,10 @@ static int kvmppc_book3s_init_hv(void)
 	if (r < 0)
 		return -ENODEV;
 
+	r = kvmhv_nested_init();
+	if (r)
+		return r;
+
 	r = kvm_init_subcore_bitmap();
 	if (r)
 		return r;
@@ -4557,7 +5318,8 @@ static int kvmppc_book3s_init_hv(void)
 	 * indirectly, via OPAL.
 	 */
 #ifdef CONFIG_SMP
-	if (!xive_enabled() && !local_paca->kvm_hstate.xics_phys) {
+	if (!xive_enabled() && !kvmhv_on_pseries() &&
+	    !local_paca->kvm_hstate.xics_phys) {
 		struct device_node *np;
 
 		np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
@@ -4605,6 +5367,7 @@ static void kvmppc_book3s_exit_hv(void)
 	if (kvmppc_radix_possible())
 		kvmppc_radix_exit();
 	kvmppc_hv_ops = NULL;
+	kvmhv_nested_exit();
 }
 
 module_init(kvmppc_book3s_init_hv);

+ 84 - 8
arch/powerpc/kvm/book3s_hv_builtin.c

@@ -231,6 +231,15 @@ void kvmhv_rm_send_ipi(int cpu)
 	void __iomem *xics_phys;
 	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
 
+	/* For a nested hypervisor, use the XICS via hcall */
+	if (kvmhv_on_pseries()) {
+		unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+		plpar_hcall_raw(H_IPI, retbuf, get_hard_smp_processor_id(cpu),
+				IPI_PRIORITY);
+		return;
+	}
+
 	/* On POWER9 we can use msgsnd for any destination cpu. */
 	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
 		msg |= get_hard_smp_processor_id(cpu);
@@ -460,12 +469,19 @@ static long kvmppc_read_one_intr(bool *again)
 		return 1;
 
 	/* Now read the interrupt from the ICP */
-	xics_phys = local_paca->kvm_hstate.xics_phys;
-	rc = 0;
-	if (!xics_phys)
-		rc = opal_int_get_xirr(&xirr, false);
-	else
-		xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
+	if (kvmhv_on_pseries()) {
+		unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+		rc = plpar_hcall_raw(H_XIRR, retbuf, 0xFF);
+		xirr = cpu_to_be32(retbuf[0]);
+	} else {
+		xics_phys = local_paca->kvm_hstate.xics_phys;
+		rc = 0;
+		if (!xics_phys)
+			rc = opal_int_get_xirr(&xirr, false);
+		else
+			xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
+	}
 	if (rc < 0)
 		return 1;
 
 
@@ -494,7 +510,13 @@ static long kvmppc_read_one_intr(bool *again)
 	 */
 	if (xisr == XICS_IPI) {
 		rc = 0;
-		if (xics_phys) {
+		if (kvmhv_on_pseries()) {
+			unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+			plpar_hcall_raw(H_IPI, retbuf,
+					hard_smp_processor_id(), 0xff);
+			plpar_hcall_raw(H_EOI, retbuf, h_xirr);
+		} else if (xics_phys) {
 			__raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
 			__raw_rm_writel(xirr, xics_phys + XICS_XIRR);
 		} else {
@@ -520,7 +542,13 @@ static long kvmppc_read_one_intr(bool *again)
 			/* We raced with the host,
 			 * we need to resend that IPI, bummer
 			 */
-			if (xics_phys)
+			if (kvmhv_on_pseries()) {
+				unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+				plpar_hcall_raw(H_IPI, retbuf,
+						hard_smp_processor_id(),
+						IPI_PRIORITY);
+			} else if (xics_phys)
 				__raw_rm_writeb(IPI_PRIORITY,
 						xics_phys + XICS_MFRR);
 			else
@@ -729,3 +757,51 @@ void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
 	smp_mb();
 	local_paca->kvm_hstate.kvm_split_mode = NULL;
 }
+
+/*
+ * Is there a PRIV_DOORBELL pending for the guest (on POWER9)?
+ * Can we inject a Decrementer or an External interrupt?
+ */
+void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
+{
+	int ext;
+	unsigned long vec = 0;
+	unsigned long lpcr;
+
+	/* Insert EXTERNAL bit into LPCR at the MER bit position */
+	ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1;
+	lpcr = mfspr(SPRN_LPCR);
+	lpcr |= ext << LPCR_MER_SH;
+	mtspr(SPRN_LPCR, lpcr);
+	isync();
+
+	if (vcpu->arch.shregs.msr & MSR_EE) {
+		if (ext) {
+			vec = BOOK3S_INTERRUPT_EXTERNAL;
+		} else {
+			long int dec = mfspr(SPRN_DEC);
+			if (!(lpcr & LPCR_LD))
+				dec = (int) dec;
+			if (dec < 0)
+				vec = BOOK3S_INTERRUPT_DECREMENTER;
+		}
+	}
+	if (vec) {
+		unsigned long msr, old_msr = vcpu->arch.shregs.msr;
+
+		kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu));
+		kvmppc_set_srr1(vcpu, old_msr);
+		kvmppc_set_pc(vcpu, vec);
+		msr = vcpu->arch.intr_msr;
+		if (MSR_TM_ACTIVE(old_msr))
+			msr |= MSR_TS_S;
+		vcpu->arch.shregs.msr = msr;
+	}
+
+	if (vcpu->arch.doorbell_request) {
+		mtspr(SPRN_DPDES, 1);
+		vcpu->arch.vcore->dpdes = 1;
+		smp_wmb();
+		vcpu->arch.doorbell_request = 0;
+	}
+}
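
One subtlety in the decrementer check above: unless LPCR[LD] (large decrementer) is set, DEC is architecturally a 32-bit signed quantity, so the value read with mfspr must be truncated and sign-extended before the negative test. A stand-alone sketch of that check (illustration only, not part of the patch):

	/* Illustrative only: has the guest decrementer expired? */
	static bool guest_dec_expired(unsigned long lpcr)
	{
		long dec = mfspr(SPRN_DEC);

		if (!(lpcr & LPCR_LD))
			dec = (int)dec;	/* truncate and sign-extend to 32 bits */
		return dec < 0;
	}
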

+ 49 - 46
arch/powerpc/kvm/book3s_hv_interrupts.S

@@ -64,52 +64,7 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 
 	/* Save host PMU registers */
-BEGIN_FTR_SECTION
-	/* Work around P8 PMAE bug */
-	li	r3, -1
-	clrrdi	r3, r3, 10
-	mfspr	r8, SPRN_MMCR2
-	mtspr	SPRN_MMCR2, r3		/* freeze all counters using MMCR2 */
-	isync
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-	li	r3, 1
-	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
-	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
-	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable interrupts */
-	mfspr	r6, SPRN_MMCRA
-	/* Clear MMCRA in order to disable SDAR updates */
-	li	r5, 0
-	mtspr	SPRN_MMCRA, r5
-	isync
-	lbz	r5, PACA_PMCINUSE(r13)	/* is the host using the PMU? */
-	cmpwi	r5, 0
-	beq	31f			/* skip if not */
-	mfspr	r5, SPRN_MMCR1
-	mfspr	r9, SPRN_SIAR
-	mfspr	r10, SPRN_SDAR
-	std	r7, HSTATE_MMCR0(r13)
-	std	r5, HSTATE_MMCR1(r13)
-	std	r6, HSTATE_MMCRA(r13)
-	std	r9, HSTATE_SIAR(r13)
-	std	r10, HSTATE_SDAR(r13)
-BEGIN_FTR_SECTION
-	mfspr	r9, SPRN_SIER
-	std	r8, HSTATE_MMCR2(r13)
-	std	r9, HSTATE_SIER(r13)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-	mfspr	r3, SPRN_PMC1
-	mfspr	r5, SPRN_PMC2
-	mfspr	r6, SPRN_PMC3
-	mfspr	r7, SPRN_PMC4
-	mfspr	r8, SPRN_PMC5
-	mfspr	r9, SPRN_PMC6
-	stw	r3, HSTATE_PMC1(r13)
-	stw	r5, HSTATE_PMC2(r13)
-	stw	r6, HSTATE_PMC3(r13)
-	stw	r7, HSTATE_PMC4(r13)
-	stw	r8, HSTATE_PMC5(r13)
-	stw	r9, HSTATE_PMC6(r13)
-31:
+	bl	kvmhv_save_host_pmu
 
 
 	/*
 	 * Put whatever is in the decrementer into the
@@ -161,3 +116,51 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	ld	r0, PPC_LR_STKOFF(r1)
 	mtlr	r0
 	blr
+
+_GLOBAL(kvmhv_save_host_pmu)
+BEGIN_FTR_SECTION
+	/* Work around P8 PMAE bug */
+	li	r3, -1
+	clrrdi	r3, r3, 10
+	mfspr	r8, SPRN_MMCR2
+	mtspr	SPRN_MMCR2, r3		/* freeze all counters using MMCR2 */
+	isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable interrupts */
+	mfspr	r6, SPRN_MMCRA
+	/* Clear MMCRA in order to disable SDAR updates */
+	li	r5, 0
+	mtspr	SPRN_MMCRA, r5
+	isync
+	lbz	r5, PACA_PMCINUSE(r13)	/* is the host using the PMU? */
+	cmpwi	r5, 0
+	beq	31f			/* skip if not */
+	mfspr	r5, SPRN_MMCR1
+	mfspr	r9, SPRN_SIAR
+	mfspr	r10, SPRN_SDAR
+	std	r7, HSTATE_MMCR0(r13)
+	std	r5, HSTATE_MMCR1(r13)
+	std	r6, HSTATE_MMCRA(r13)
+	std	r9, HSTATE_SIAR(r13)
+	std	r10, HSTATE_SDAR(r13)
+BEGIN_FTR_SECTION
+	mfspr	r9, SPRN_SIER
+	std	r8, HSTATE_MMCR2(r13)
+	std	r9, HSTATE_SIER(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	mfspr	r3, SPRN_PMC1
+	mfspr	r5, SPRN_PMC2
+	mfspr	r6, SPRN_PMC3
+	mfspr	r7, SPRN_PMC4
+	mfspr	r8, SPRN_PMC5
+	mfspr	r9, SPRN_PMC6
+	stw	r3, HSTATE_PMC1(r13)
+	stw	r5, HSTATE_PMC2(r13)
+	stw	r6, HSTATE_PMC3(r13)
+	stw	r7, HSTATE_PMC4(r13)
+	stw	r8, HSTATE_PMC5(r13)
+	stw	r9, HSTATE_PMC6(r13)
+31:	blr

+ 1291 - 0
arch/powerpc/kvm/book3s_hv_nested.c

@@ -0,0 +1,1291 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2018
+ * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
+ *	   Paul Mackerras <paulus@ozlabs.org>
+ *
+ * Description: KVM functions specific to running nested KVM-HV guests
+ * on Book3S processors (specifically POWER9 and later).
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/llist.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/pte-walk.h>
+#include <asm/reg.h>
+
+static struct patb_entry *pseries_partition_tb;
+
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
+static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free);
+
+void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	hr->pcr = vc->pcr;
+	hr->dpdes = vc->dpdes;
+	hr->hfscr = vcpu->arch.hfscr;
+	hr->tb_offset = vc->tb_offset;
+	hr->dawr0 = vcpu->arch.dawr;
+	hr->dawrx0 = vcpu->arch.dawrx;
+	hr->ciabr = vcpu->arch.ciabr;
+	hr->purr = vcpu->arch.purr;
+	hr->spurr = vcpu->arch.spurr;
+	hr->ic = vcpu->arch.ic;
+	hr->vtb = vc->vtb;
+	hr->srr0 = vcpu->arch.shregs.srr0;
+	hr->srr1 = vcpu->arch.shregs.srr1;
+	hr->sprg[0] = vcpu->arch.shregs.sprg0;
+	hr->sprg[1] = vcpu->arch.shregs.sprg1;
+	hr->sprg[2] = vcpu->arch.shregs.sprg2;
+	hr->sprg[3] = vcpu->arch.shregs.sprg3;
+	hr->pidr = vcpu->arch.pid;
+	hr->cfar = vcpu->arch.cfar;
+	hr->ppr = vcpu->arch.ppr;
+}
+
+static void byteswap_pt_regs(struct pt_regs *regs)
+{
+	unsigned long *addr = (unsigned long *) regs;
+
+	for (; addr < ((unsigned long *) (regs + 1)); addr++)
+		*addr = swab64(*addr);
+}
+
+static void byteswap_hv_regs(struct hv_guest_state *hr)
+{
+	hr->version = swab64(hr->version);
+	hr->lpid = swab32(hr->lpid);
+	hr->vcpu_token = swab32(hr->vcpu_token);
+	hr->lpcr = swab64(hr->lpcr);
+	hr->pcr = swab64(hr->pcr);
+	hr->amor = swab64(hr->amor);
+	hr->dpdes = swab64(hr->dpdes);
+	hr->hfscr = swab64(hr->hfscr);
+	hr->tb_offset = swab64(hr->tb_offset);
+	hr->dawr0 = swab64(hr->dawr0);
+	hr->dawrx0 = swab64(hr->dawrx0);
+	hr->ciabr = swab64(hr->ciabr);
+	hr->hdec_expiry = swab64(hr->hdec_expiry);
+	hr->purr = swab64(hr->purr);
+	hr->spurr = swab64(hr->spurr);
+	hr->ic = swab64(hr->ic);
+	hr->vtb = swab64(hr->vtb);
+	hr->hdar = swab64(hr->hdar);
+	hr->hdsisr = swab64(hr->hdsisr);
+	hr->heir = swab64(hr->heir);
+	hr->asdr = swab64(hr->asdr);
+	hr->srr0 = swab64(hr->srr0);
+	hr->srr1 = swab64(hr->srr1);
+	hr->sprg[0] = swab64(hr->sprg[0]);
+	hr->sprg[1] = swab64(hr->sprg[1]);
+	hr->sprg[2] = swab64(hr->sprg[2]);
+	hr->sprg[3] = swab64(hr->sprg[3]);
+	hr->pidr = swab64(hr->pidr);
+	hr->cfar = swab64(hr->cfar);
+	hr->ppr = swab64(hr->ppr);
+}
+
+static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
+				 struct hv_guest_state *hr)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	hr->dpdes = vc->dpdes;
+	hr->hfscr = vcpu->arch.hfscr;
+	hr->purr = vcpu->arch.purr;
+	hr->spurr = vcpu->arch.spurr;
+	hr->ic = vcpu->arch.ic;
+	hr->vtb = vc->vtb;
+	hr->srr0 = vcpu->arch.shregs.srr0;
+	hr->srr1 = vcpu->arch.shregs.srr1;
+	hr->sprg[0] = vcpu->arch.shregs.sprg0;
+	hr->sprg[1] = vcpu->arch.shregs.sprg1;
+	hr->sprg[2] = vcpu->arch.shregs.sprg2;
+	hr->sprg[3] = vcpu->arch.shregs.sprg3;
+	hr->pidr = vcpu->arch.pid;
+	hr->cfar = vcpu->arch.cfar;
+	hr->ppr = vcpu->arch.ppr;
+	switch (trap) {
+	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+		hr->hdar = vcpu->arch.fault_dar;
+		hr->hdsisr = vcpu->arch.fault_dsisr;
+		hr->asdr = vcpu->arch.fault_gpa;
+		break;
+	case BOOK3S_INTERRUPT_H_INST_STORAGE:
+		hr->asdr = vcpu->arch.fault_gpa;
+		break;
+	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+		hr->heir = vcpu->arch.emul_inst;
+		break;
+	}
+}
+
+static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+{
+	/*
+	 * Don't let L1 enable features for L2 which we've disabled for L1,
+	 * but preserve the interrupt cause field.
+	 */
+	hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr);
+
+	/* Don't let data address watchpoint match in hypervisor state */
+	hr->dawrx0 &= ~DAWRX_HYP;
+
+	/* Don't let completed instruction address breakpt match in HV state */
+	if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
+		hr->ciabr &= ~CIABR_PRIV;
+}
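
To make the HFSCR masking above concrete: L1 can only pass on to L2 the facilities that L0 has left enabled for L1, while the interrupt-cause field is preserved verbatim. A worked example (facility bit names assumed from the usual reg.h definitions, values purely illustrative):

	/*
	 * Worked example, not part of the patch:
	 *   vcpu->arch.hfscr = HFSCR_EBB             -- what L0 granted to L1
	 *   hr->hfscr        = HFSCR_EBB | HFSCR_TM  -- what L1 requests for L2
	 *   result = (HFSCR_EBB | HFSCR_TM) & (HFSCR_INTR_CAUSE | HFSCR_EBB)
	 *          = HFSCR_EBB
	 * i.e. the TM bit is silently dropped and the interrupt-cause field,
	 * had it been set, would survive unchanged.
	 */
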
+
+static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	vc->pcr = hr->pcr;
+	vc->dpdes = hr->dpdes;
+	vcpu->arch.hfscr = hr->hfscr;
+	vcpu->arch.dawr = hr->dawr0;
+	vcpu->arch.dawrx = hr->dawrx0;
+	vcpu->arch.ciabr = hr->ciabr;
+	vcpu->arch.purr = hr->purr;
+	vcpu->arch.spurr = hr->spurr;
+	vcpu->arch.ic = hr->ic;
+	vc->vtb = hr->vtb;
+	vcpu->arch.shregs.srr0 = hr->srr0;
+	vcpu->arch.shregs.srr1 = hr->srr1;
+	vcpu->arch.shregs.sprg0 = hr->sprg[0];
+	vcpu->arch.shregs.sprg1 = hr->sprg[1];
+	vcpu->arch.shregs.sprg2 = hr->sprg[2];
+	vcpu->arch.shregs.sprg3 = hr->sprg[3];
+	vcpu->arch.pid = hr->pidr;
+	vcpu->arch.cfar = hr->cfar;
+	vcpu->arch.ppr = hr->ppr;
+}
+
+void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
+				   struct hv_guest_state *hr)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	vc->dpdes = hr->dpdes;
+	vcpu->arch.hfscr = hr->hfscr;
+	vcpu->arch.purr = hr->purr;
+	vcpu->arch.spurr = hr->spurr;
+	vcpu->arch.ic = hr->ic;
+	vc->vtb = hr->vtb;
+	vcpu->arch.fault_dar = hr->hdar;
+	vcpu->arch.fault_dsisr = hr->hdsisr;
+	vcpu->arch.fault_gpa = hr->asdr;
+	vcpu->arch.emul_inst = hr->heir;
+	vcpu->arch.shregs.srr0 = hr->srr0;
+	vcpu->arch.shregs.srr1 = hr->srr1;
+	vcpu->arch.shregs.sprg0 = hr->sprg[0];
+	vcpu->arch.shregs.sprg1 = hr->sprg[1];
+	vcpu->arch.shregs.sprg2 = hr->sprg[2];
+	vcpu->arch.shregs.sprg3 = hr->sprg[3];
+	vcpu->arch.pid = hr->pidr;
+	vcpu->arch.cfar = hr->cfar;
+	vcpu->arch.ppr = hr->ppr;
+}
+
+long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
+{
+	long int err, r;
+	struct kvm_nested_guest *l2;
+	struct pt_regs l2_regs, saved_l1_regs;
+	struct hv_guest_state l2_hv, saved_l1_hv;
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	u64 hv_ptr, regs_ptr;
+	u64 hdec_exp;
+	s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
+	u64 mask;
+	unsigned long lpcr;
+
+	if (vcpu->kvm->arch.l1_ptcr == 0)
+		return H_NOT_AVAILABLE;
+
+	/* copy parameters in */
+	hv_ptr = kvmppc_get_gpr(vcpu, 4);
+	err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv,
+				  sizeof(struct hv_guest_state));
+	if (err)
+		return H_PARAMETER;
+	if (kvmppc_need_byteswap(vcpu))
+		byteswap_hv_regs(&l2_hv);
+	if (l2_hv.version != HV_GUEST_STATE_VERSION)
+		return H_P2;
+
+	regs_ptr = kvmppc_get_gpr(vcpu, 5);
+	err = kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs,
+				  sizeof(struct pt_regs));
+	if (err)
+		return H_PARAMETER;
+	if (kvmppc_need_byteswap(vcpu))
+		byteswap_pt_regs(&l2_regs);
+	if (l2_hv.vcpu_token >= NR_CPUS)
+		return H_PARAMETER;
+
+	/* translate lpid */
+	l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true);
+	if (!l2)
+		return H_PARAMETER;
+	if (!l2->l1_gr_to_hr) {
+		mutex_lock(&l2->tlb_lock);
+		kvmhv_update_ptbl_cache(l2);
+		mutex_unlock(&l2->tlb_lock);
+	}
+
+	/* save l1 values of things */
+	vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
+	saved_l1_regs = vcpu->arch.regs;
+	kvmhv_save_hv_regs(vcpu, &saved_l1_hv);
+
+	/* convert TB values/offsets to host (L0) values */
+	hdec_exp = l2_hv.hdec_expiry - vc->tb_offset;
+	vc->tb_offset += l2_hv.tb_offset;
+
+	/* set L1 state to L2 state */
+	vcpu->arch.nested = l2;
+	vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token;
+	vcpu->arch.regs = l2_regs;
+	vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
+	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
+		LPCR_LPES | LPCR_MER;
+	lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask);
+	sanitise_hv_regs(vcpu, &l2_hv);
+	restore_hv_regs(vcpu, &l2_hv);
+
+	vcpu->arch.ret = RESUME_GUEST;
+	vcpu->arch.trap = 0;
+	do {
+		if (mftb() >= hdec_exp) {
+			vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
+			r = RESUME_HOST;
+			break;
+		}
+		r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp,
+					  lpcr);
+	} while (is_kvmppc_resume_guest(r));
+
+	/* save L2 state for return */
+	l2_regs = vcpu->arch.regs;
+	l2_regs.msr = vcpu->arch.shregs.msr;
+	delta_purr = vcpu->arch.purr - l2_hv.purr;
+	delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
+	delta_ic = vcpu->arch.ic - l2_hv.ic;
+	delta_vtb = vc->vtb - l2_hv.vtb;
+	save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv);
+
+	/* restore L1 state */
+	vcpu->arch.nested = NULL;
+	vcpu->arch.regs = saved_l1_regs;
+	vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK;
+	/* set L1 MSR TS field according to L2 transaction state */
+	if (l2_regs.msr & MSR_TS_MASK)
+		vcpu->arch.shregs.msr |= MSR_TS_S;
+	vc->tb_offset = saved_l1_hv.tb_offset;
+	restore_hv_regs(vcpu, &saved_l1_hv);
+	vcpu->arch.purr += delta_purr;
+	vcpu->arch.spurr += delta_spurr;
+	vcpu->arch.ic += delta_ic;
+	vc->vtb += delta_vtb;
+
+	kvmhv_put_nested(l2);
+
+	/* copy l2_hv_state and regs back to guest */
+	if (kvmppc_need_byteswap(vcpu)) {
+		byteswap_hv_regs(&l2_hv);
+		byteswap_pt_regs(&l2_regs);
+	}
+	err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv,
+				   sizeof(struct hv_guest_state));
+	if (err)
+		return H_AUTHORITY;
+	err = kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs,
+				   sizeof(struct pt_regs));
+	if (err)
+		return H_AUTHORITY;
+
+	if (r == -EINTR)
+		return H_INTERRUPT;
+
+	return vcpu->arch.trap;
+}
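
From the L1 side, the hcall handled above takes two guest-real-address parameters -- r4 points at a struct hv_guest_state and r5 at a pt_regs image for the L2 vcpu -- and returns either the intercepted trap number or an H_* error (H_P2 for a version mismatch, H_PARAMETER for bad pointers, H_INTERRUPT if the run loop was interrupted). A hedged sketch of the caller; enter_nested_hcall() is a hypothetical wrapper that loads r4/r5 and issues the hcall, and the real L1 code lives in the guest kernel, not in this patch:

	/* Illustrative L1-side caller, assumptions noted above. */
	static long l1_enter_l2(struct hv_guest_state *hv, struct pt_regs *regs)
	{
		hv->version = HV_GUEST_STATE_VERSION;	/* otherwise rejected with H_P2 */

		/* r4 = guest real address of hv state, r5 = guest real address of regs */
		return enter_nested_hcall(__pa(hv), __pa(regs));
	}
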
+
+long kvmhv_nested_init(void)
+{
+	long int ptb_order;
+	unsigned long ptcr;
+	long rc;
+
+	if (!kvmhv_on_pseries())
+		return 0;
+	if (!radix_enabled())
+		return -ENODEV;
+
+	/* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
+	ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
+	if (ptb_order < 8)
+		ptb_order = 8;
+	pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
+				       GFP_KERNEL);
+	if (!pseries_partition_tb) {
+		pr_err("kvm-hv: failed to allocate nested partition table\n");
+		return -ENOMEM;
+	}
+
+	ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
+	rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
+	if (rc != H_SUCCESS) {
+		pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
+		       rc);
+		kfree(pseries_partition_tb);
+		pseries_partition_tb = NULL;
+		return -ENODEV;
+	}
+
+	return 0;
+}
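
To make the sizing above concrete: each patb_entry is 16 bytes, so a table of 2^ptb_order entries occupies 2^(ptb_order+4) bytes, and the low bits of the registered PTCR encode that size as ptb_order - 8 (i.e. log2(size in bytes) - 12). A worked example, with KVMPPC_NR_LPIDS == 1024 as an assumed value:

	/*
	 * Worked example (assumed KVMPPC_NR_LPIDS == 1024), not part of the patch:
	 *   ptb_order = __ilog2(1023) + 1 = 10       (already >= the minimum of 8)
	 *   table size = (1 << 10) * sizeof(struct patb_entry) = 16 KiB
	 *   PTCR size field = ptb_order - 8 = 2 = log2(16 KiB) - 12
	 * so the value passed to H_SET_PARTITION_TABLE is
	 *   __pa(pseries_partition_tb) | 2.
	 */
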
+
+void kvmhv_nested_exit(void)
+{
+	/*
+	 * N.B. the kvmhv_on_pseries() test is there because it enables
+	 * the compiler to remove the call to plpar_hcall_norets()
+	 * when CONFIG_PPC_PSERIES=n.
+	 */
+	if (kvmhv_on_pseries() && pseries_partition_tb) {
+		plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
+		kfree(pseries_partition_tb);
+		pseries_partition_tb = NULL;
+	}
+}
+
+static void kvmhv_flush_lpid(unsigned int lpid)
+{
+	long rc;
+
+	if (!kvmhv_on_pseries()) {
+		radix__flush_tlb_lpid(lpid);
+		return;
+	}
+
+	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
+				lpid, TLBIEL_INVAL_SET_LPID);
+	if (rc)
+		pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc);
+}
+
+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
+{
+	if (!kvmhv_on_pseries()) {
+		mmu_partition_table_set_entry(lpid, dw0, dw1);
+		return;
+	}
+
+	pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+	pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+	/* L0 will do the necessary barriers */
+	kvmhv_flush_lpid(lpid);
+}
+
+static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
+{
+	unsigned long dw0;
+
+	dw0 = PATB_HR | radix__get_tree_size() |
+		__pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
+	kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
+}
+
+void kvmhv_vm_nested_init(struct kvm *kvm)
+{
+	kvm->arch.max_nested_lpid = -1;
+}
+
+/*
+ * Handle the H_SET_PARTITION_TABLE hcall.
+ * r4 = guest real address of partition table + log_2(size) - 12
+ * (formatted as for the PTCR).
+ */
+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
+	int srcu_idx;
+	long ret = H_SUCCESS;
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	/*
+	 * Limit the partition table to 4096 entries (because that's what
+	 * hardware supports), and check the base address.
+	 */
+	if ((ptcr & PRTS_MASK) > 12 - 8 ||
+	    !kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT))
+		ret = H_PARAMETER;
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	if (ret == H_SUCCESS)
+		kvm->arch.l1_ptcr = ptcr;
+	return ret;
+}
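
The PRTS check above follows from the same encoding: the hardware partition table holds at most 4096 entries of 16 bytes each, i.e. 64 KiB, and the PRTS field encodes the size as log2(size) - 12, so the largest acceptable value is 4 = 12 - 8. A short worked check (illustration only):

	/*
	 * Worked example, not part of the patch:
	 *   4096 entries * 16 bytes = 65536 bytes = 2^16
	 *   PRTS = log2(65536) - 12 = 4, and 12 - 8 == 4 is exactly the limit,
	 * so any L1 request for a bigger table fails with H_PARAMETER above.
	 */
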
+
+/*
+ * Reload the partition table entry for a guest.
+ * Caller must hold gp->tlb_lock.
+ */
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
+{
+	int ret;
+	struct patb_entry ptbl_entry;
+	unsigned long ptbl_addr;
+	struct kvm *kvm = gp->l1_host;
+
+	ret = -EFAULT;
+	ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4);
+	if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 8)))
+		ret = kvm_read_guest(kvm, ptbl_addr,
+				     &ptbl_entry, sizeof(ptbl_entry));
+	if (ret) {
+		gp->l1_gr_to_hr = 0;
+		gp->process_table = 0;
+	} else {
+		gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0);
+		gp->process_table = be64_to_cpu(ptbl_entry.patb1);
+	}
+	kvmhv_set_nested_ptbl(gp);
+}
+
+struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
+{
+	struct kvm_nested_guest *gp;
+	long shadow_lpid;
+
+	gp = kzalloc(sizeof(*gp), GFP_KERNEL);
+	if (!gp)
+		return NULL;
+	gp->l1_host = kvm;
+	gp->l1_lpid = lpid;
+	mutex_init(&gp->tlb_lock);
+	gp->shadow_pgtable = pgd_alloc(kvm->mm);
+	if (!gp->shadow_pgtable)
+		goto out_free;
+	shadow_lpid = kvmppc_alloc_lpid();
+	if (shadow_lpid < 0)
+		goto out_free2;
+	gp->shadow_lpid = shadow_lpid;
+
+	memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu));
+
+	return gp;
+
+ out_free2:
+	pgd_free(kvm->mm, gp->shadow_pgtable);
+ out_free:
+	kfree(gp);
+	return NULL;
+}
+
+/*
+ * Free up any resources allocated for a nested guest.
+ */
+static void kvmhv_release_nested(struct kvm_nested_guest *gp)
+{
+	struct kvm *kvm = gp->l1_host;
+
+	if (gp->shadow_pgtable) {
+		/*
+		 * No vcpu is using this struct and no call to
+		 * kvmhv_get_nested can find this struct,
+		 * so we don't need to hold kvm->mmu_lock.
+		 */
+		kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
+					  gp->shadow_lpid);
+		pgd_free(kvm->mm, gp->shadow_pgtable);
+	}
+	kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
+	kvmppc_free_lpid(gp->shadow_lpid);
+	kfree(gp);
+}
+
+static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
+{
+	struct kvm *kvm = gp->l1_host;
+	int lpid = gp->l1_lpid;
+	long ref;
+
+	spin_lock(&kvm->mmu_lock);
+	if (gp == kvm->arch.nested_guests[lpid]) {
+		kvm->arch.nested_guests[lpid] = NULL;
+		if (lpid == kvm->arch.max_nested_lpid) {
+			while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
+				;
+			kvm->arch.max_nested_lpid = lpid;
+		}
+		--gp->refcnt;
+	}
+	ref = gp->refcnt;
+	spin_unlock(&kvm->mmu_lock);
+	if (ref == 0)
+		kvmhv_release_nested(gp);
+}
+
+/*
+ * Free up all nested resources allocated for this guest.
+ * This is called with no vcpus of the guest running, when
+ * switching the guest to HPT mode or when destroying the
+ * guest.
+ */
+void kvmhv_release_all_nested(struct kvm *kvm)
+{
+	int i;
+	struct kvm_nested_guest *gp;
+	struct kvm_nested_guest *freelist = NULL;
+	struct kvm_memory_slot *memslot;
+	int srcu_idx;
+
+	spin_lock(&kvm->mmu_lock);
+	for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
+		gp = kvm->arch.nested_guests[i];
+		if (!gp)
+			continue;
+		kvm->arch.nested_guests[i] = NULL;
+		if (--gp->refcnt == 0) {
+			gp->next = freelist;
+			freelist = gp;
+		}
+	}
+	kvm->arch.max_nested_lpid = -1;
+	spin_unlock(&kvm->mmu_lock);
+	while ((gp = freelist) != NULL) {
+		freelist = gp->next;
+		kvmhv_release_nested(gp);
+	}
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	kvm_for_each_memslot(memslot, kvm_memslots(kvm))
+		kvmhv_free_memslot_nest_rmap(memslot);
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+}
+
+/* caller must hold gp->tlb_lock */
+static void kvmhv_flush_nested(struct kvm_nested_guest *gp)
+{
+	struct kvm *kvm = gp->l1_host;
+
+	spin_lock(&kvm->mmu_lock);
+	kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, gp->shadow_lpid);
+	spin_unlock(&kvm->mmu_lock);
+	kvmhv_flush_lpid(gp->shadow_lpid);
+	kvmhv_update_ptbl_cache(gp);
+	if (gp->l1_gr_to_hr == 0)
+		kvmhv_remove_nested(gp);
+}
+
+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
+					  bool create)
+{
+	struct kvm_nested_guest *gp, *newgp;
+
+	if (l1_lpid >= KVM_MAX_NESTED_GUESTS ||
+	    l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
+		return NULL;
+
+	spin_lock(&kvm->mmu_lock);
+	gp = kvm->arch.nested_guests[l1_lpid];
+	if (gp)
+		++gp->refcnt;
+	spin_unlock(&kvm->mmu_lock);
+
+	if (gp || !create)
+		return gp;
+
+	newgp = kvmhv_alloc_nested(kvm, l1_lpid);
+	if (!newgp)
+		return NULL;
+	spin_lock(&kvm->mmu_lock);
+	if (kvm->arch.nested_guests[l1_lpid]) {
+		/* someone else beat us to it */
+		gp = kvm->arch.nested_guests[l1_lpid];
+	} else {
+		kvm->arch.nested_guests[l1_lpid] = newgp;
+		++newgp->refcnt;
+		gp = newgp;
+		newgp = NULL;
+		if (l1_lpid > kvm->arch.max_nested_lpid)
+			kvm->arch.max_nested_lpid = l1_lpid;
+	}
+	++gp->refcnt;
+	spin_unlock(&kvm->mmu_lock);
+
+	if (newgp)
+		kvmhv_release_nested(newgp);
+
+	return gp;
+}
+
+void kvmhv_put_nested(struct kvm_nested_guest *gp)
+{
+	struct kvm *kvm = gp->l1_host;
+	long ref;
+
+	spin_lock(&kvm->mmu_lock);
+	ref = --gp->refcnt;
+	spin_unlock(&kvm->mmu_lock);
+	if (ref == 0)
+		kvmhv_release_nested(gp);
+}
+
+static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
+{
+	if (lpid > kvm->arch.max_nested_lpid)
+		return NULL;
+	return kvm->arch.nested_guests[lpid];
+}
+
+static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
+{
+	return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
+				       RMAP_NESTED_GPA_MASK));
+}
+
+void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
+			    struct rmap_nested **n_rmap)
+{
+	struct llist_node *entry = ((struct llist_head *) rmapp)->first;
+	struct rmap_nested *cursor;
+	u64 rmap, new_rmap = (*n_rmap)->rmap;
+
+	/* Are there any existing entries? */
+	if (!(*rmapp)) {
+		/* No -> use the rmap as a single entry */
+		*rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY;
+		return;
+	}
+
+	/* Do any entries match what we're trying to insert? */
+	for_each_nest_rmap_safe(cursor, entry, &rmap) {
+		if (kvmhv_n_rmap_is_equal(rmap, new_rmap))
+			return;
+	}
+
+	/* Do we need to create a list or just add the new entry? */
+	rmap = *rmapp;
+	if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
+		*rmapp = 0UL;
+	llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp);
+	if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
+		(*n_rmap)->list.next = (struct llist_node *) rmap;
+
+	/* Set NULL so not freed by caller */
+	*n_rmap = NULL;
+}
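
The encoding used above keeps the common case cheap: a slot holding a single translation stores the packed rmap value directly (with RMAP_NESTED_IS_SINGLE_ENTRY set) and only grows a real llist once a second L2 mapping of the same L1 page appears. For reference, a sketch of how one such value is packed before insertion -- the same construction appears in the nested page fault path further down (illustration only):

	/* Illustrative only: pack one nested-rmap value for kvmhv_insert_nest_rmap(). */
	static unsigned long pack_nest_rmap(unsigned int l1_lpid, unsigned long n_gpa)
	{
		return (n_gpa & RMAP_NESTED_GPA_MASK) |
		       ((unsigned long)l1_lpid << RMAP_NESTED_LPID_SHIFT);
	}
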
+
+static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap,
+				   unsigned long hpa, unsigned long mask)
+{
+	struct kvm_nested_guest *gp;
+	unsigned long gpa;
+	unsigned int shift, lpid;
+	pte_t *ptep;
+
+	gpa = n_rmap & RMAP_NESTED_GPA_MASK;
+	lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
+	gp = kvmhv_find_nested(kvm, lpid);
+	if (!gp)
+		return;
+
+	/* Find and invalidate the pte */
+	ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+	/* Don't spuriously invalidate ptes if the pfn has changed */
+	if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
+		kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
+}
+
+static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp,
+					unsigned long hpa, unsigned long mask)
+{
+	struct llist_node *entry = llist_del_all((struct llist_head *) rmapp);
+	struct rmap_nested *cursor;
+	unsigned long rmap;
+
+	for_each_nest_rmap_safe(cursor, entry, &rmap) {
+		kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask);
+		kfree(cursor);
+	}
+}
+
+/* called with kvm->mmu_lock held */
+void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
+				  struct kvm_memory_slot *memslot,
+				  unsigned long gpa, unsigned long hpa,
+				  unsigned long nbytes)
+{
+	unsigned long gfn, end_gfn;
+	unsigned long addr_mask;
+
+	if (!memslot)
+		return;
+	gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn;
+	end_gfn = gfn + (nbytes >> PAGE_SHIFT);
+
+	addr_mask = PTE_RPN_MASK & ~(nbytes - 1);
+	hpa &= addr_mask;
+
+	for (; gfn < end_gfn; gfn++) {
+		unsigned long *rmap = &memslot->arch.rmap[gfn];
+		kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask);
+	}
+}
+
+static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free)
+{
+	unsigned long page;
+
+	for (page = 0; page < free->npages; page++) {
+		unsigned long rmap, *rmapp = &free->arch.rmap[page];
+		struct rmap_nested *cursor;
+		struct llist_node *entry;
+
+		entry = llist_del_all((struct llist_head *) rmapp);
+		for_each_nest_rmap_safe(cursor, entry, &rmap)
+			kfree(cursor);
+	}
+}
+
+static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
+					struct kvm_nested_guest *gp,
+					long gpa, int *shift_ret)
+{
+	struct kvm *kvm = vcpu->kvm;
+	bool ret = false;
+	pte_t *ptep;
+	int shift;
+
+	spin_lock(&kvm->mmu_lock);
+	ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+	if (!shift)
+		shift = PAGE_SHIFT;
+	if (ptep && pte_present(*ptep)) {
+		kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
+		ret = true;
+	}
+	spin_unlock(&kvm->mmu_lock);
+
+	if (shift_ret)
+		*shift_ret = shift;
+	return ret;
+}
+
+static inline int get_ric(unsigned int instr)
+{
+	return (instr >> 18) & 0x3;
+}
+
+static inline int get_prs(unsigned int instr)
+{
+	return (instr >> 17) & 0x1;
+}
+
+static inline int get_r(unsigned int instr)
+{
+	return (instr >> 16) & 0x1;
+}
+
+static inline int get_lpid(unsigned long r_val)
+{
+	return r_val & 0xffffffff;
+}
+
+static inline int get_is(unsigned long r_val)
+{
+	return (r_val >> 10) & 0x3;
+}
+
+static inline int get_ap(unsigned long r_val)
+{
+	return (r_val >> 5) & 0x7;
+}
+
+static inline long get_epn(unsigned long r_val)
+{
+	return r_val >> 12;
+}
+
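
Putting the field extractors above together, one emulated tlbie can be decoded as follows; the meanings in the comments are taken from the handling further down, and the values themselves are whatever L1 passed in (illustration only, not part of the patch):

	/* Illustrative only: decode the fields of one emulated tlbie. */
	static void decode_tlbie(unsigned int instr, unsigned long rsval,
				 unsigned long rbval)
	{
		int ric  = get_ric(instr);   /* 0 = TLB, 1 = PWC, 2 = all (3 is rejected) */
		int prs  = get_prs(instr);   /* must be 0: partition-scoped only           */
		int r    = get_r(instr);     /* must be 1: radix only                      */
		int lpid = get_lpid(rsval);  /* low 32 bits of rS                          */
		int is   = get_is(rbval);    /* 0 = one address, 2 = one LPID, 3 = all     */
		int ap   = get_ap(rbval);    /* page-size encoding, used when is == 0      */
		long epn = get_epn(rbval);   /* effective page number, used when is == 0   */

		pr_debug("tlbie: ric=%d prs=%d r=%d lpid=%d is=%d ap=%d epn=%lx\n",
			 ric, prs, r, lpid, is, ap, epn);
	}
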
+static int kvmhv_emulate_tlbie_tlb_addr(struct kvm_vcpu *vcpu, int lpid,
+					int ap, long epn)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_nested_guest *gp;
+	long npages;
+	int shift, shadow_shift;
+	unsigned long addr;
+
+	shift = ap_to_shift(ap);
+	addr = epn << 12;
+	if (shift < 0)
+		/* Invalid ap encoding */
+		return -EINVAL;
+
+	addr &= ~((1UL << shift) - 1);
+	npages = 1UL << (shift - PAGE_SHIFT);
+
+	gp = kvmhv_get_nested(kvm, lpid, false);
+	if (!gp) /* No such guest -> nothing to do */
+		return 0;
+	mutex_lock(&gp->tlb_lock);
+
+	/* There may be more than one host page backing this single guest pte */
+	do {
+		kvmhv_invalidate_shadow_pte(vcpu, gp, addr, &shadow_shift);
+
+		npages -= 1UL << (shadow_shift - PAGE_SHIFT);
+		addr += 1UL << shadow_shift;
+	} while (npages > 0);
+
+	mutex_unlock(&gp->tlb_lock);
+	kvmhv_put_nested(gp);
+	return 0;
+}
+
+static void kvmhv_emulate_tlbie_lpid(struct kvm_vcpu *vcpu,
+				     struct kvm_nested_guest *gp, int ric)
+{
+	struct kvm *kvm = vcpu->kvm;
+
+	mutex_lock(&gp->tlb_lock);
+	switch (ric) {
+	case 0:
+		/* Invalidate TLB */
+		spin_lock(&kvm->mmu_lock);
+		kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
+					  gp->shadow_lpid);
+		kvmhv_flush_lpid(gp->shadow_lpid);
+		spin_unlock(&kvm->mmu_lock);
+		break;
+	case 1:
+		/*
+		 * Invalidate PWC
+		 * We don't cache this -> nothing to do
+		 */
+		break;
+	case 2:
+		/* Invalidate TLB, PWC and caching of partition table entries */
+		kvmhv_flush_nested(gp);
+		break;
+	default:
+		break;
+	}
+	mutex_unlock(&gp->tlb_lock);
+}
+
+static void kvmhv_emulate_tlbie_all_lpid(struct kvm_vcpu *vcpu, int ric)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_nested_guest *gp;
+	int i;
+
+	spin_lock(&kvm->mmu_lock);
+	for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
+		gp = kvm->arch.nested_guests[i];
+		if (gp) {
+			spin_unlock(&kvm->mmu_lock);
+			kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
+			spin_lock(&kvm->mmu_lock);
+		}
+	}
+	spin_unlock(&kvm->mmu_lock);
+}
+
+static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr,
+				    unsigned long rsval, unsigned long rbval)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_nested_guest *gp;
+	int r, ric, prs, is, ap;
+	int lpid;
+	long epn;
+	int ret = 0;
+
+	ric = get_ric(instr);
+	prs = get_prs(instr);
+	r = get_r(instr);
+	lpid = get_lpid(rsval);
+	is = get_is(rbval);
+
+	/*
+	 * These cases are invalid and are not handled:
+	 * r   != 1 -> Only radix supported
+	 * prs == 1 -> Not HV privileged
+	 * ric == 3 -> No cluster bombs for radix
+	 * is  == 1 -> Partition scoped translations not associated with pid
+	 * (!is) && (ric == 1 || ric == 2) -> Not supported by ISA
+	 */
+	if ((!r) || (prs) || (ric == 3) || (is == 1) ||
+	    ((!is) && (ric == 1 || ric == 2)))
+		return -EINVAL;
+
+	switch (is) {
+	case 0:
+		/*
+		 * We know ric == 0
+		 * Invalidate TLB for a given target address
+		 */
+		epn = get_epn(rbval);
+		ap = get_ap(rbval);
+		ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap, epn);
+		break;
+	case 2:
+		/* Invalidate matching LPID */
+		gp = kvmhv_get_nested(kvm, lpid, false);
+		if (gp) {
+			kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
+			kvmhv_put_nested(gp);
+		}
+		break;
+	case 3:
+		/* Invalidate ALL LPIDs */
+		kvmhv_emulate_tlbie_all_lpid(vcpu, ric);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * This handles the H_TLB_INVALIDATE hcall.
+ * Parameters are (r4) tlbie instruction code, (r5) rS contents,
+ * (r6) rB contents.
+ */
+long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu)
+{
+	int ret;
+
+	ret = kvmhv_emulate_priv_tlbie(vcpu, kvmppc_get_gpr(vcpu, 4),
+			kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6));
+	if (ret)
+		return H_PARAMETER;
+	return H_SUCCESS;
+}
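
As the comment above says, the hcall carries the tlbie instruction image in r4 and the rS/rB register contents in r5/r6. kvmhv_flush_lpid() earlier in this file already shows the most common use -- a nested hypervisor asking its parent to drop everything cached for one of its LPIDs, with ric = 2 encoded via H_TLBIE_P1_ENC(2, 0, 1). A sketch mirroring that call (illustration only):

	/* Illustrative only: flush a whole nested LPID via the parent hypervisor. */
	static void flush_nested_lpid_via_hcall(unsigned int lpid)
	{
		long rc;

		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
					lpid, TLBIEL_INVAL_SET_LPID);
		if (rc)
			pr_err("H_TLB_INVALIDATE failed, rc=%ld\n", rc);
	}
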
+
+/* Used to convert a nested guest real address to an L1 guest real address */
+static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
+				       struct kvm_nested_guest *gp,
+				       unsigned long n_gpa, unsigned long dsisr,
+				       struct kvmppc_pte *gpte_p)
+{
+	u64 fault_addr, flags = dsisr & DSISR_ISSTORE;
+	int ret;
+
+	ret = kvmppc_mmu_walk_radix_tree(vcpu, n_gpa, gpte_p, gp->l1_gr_to_hr,
+					 &fault_addr);
+
+	if (ret) {
+		/* We didn't find a pte */
+		if (ret == -EINVAL) {
+			/* Unsupported mmu config */
+			flags |= DSISR_UNSUPP_MMU;
+		} else if (ret == -ENOENT) {
+			/* No translation found */
+			flags |= DSISR_NOHPTE;
+		} else if (ret == -EFAULT) {
+			/* Couldn't access L1 real address */
+			flags |= DSISR_PRTABLE_FAULT;
+			vcpu->arch.fault_gpa = fault_addr;
+		} else {
+			/* Unknown error */
+			return ret;
+		}
+		goto forward_to_l1;
+	} else {
+		/* We found a pte -> check permissions */
+		if (dsisr & DSISR_ISSTORE) {
+			/* Can we write? */
+			if (!gpte_p->may_write) {
+				flags |= DSISR_PROTFAULT;
+				goto forward_to_l1;
+			}
+		} else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
+			/* Can we execute? */
+			if (!gpte_p->may_execute) {
+				flags |= SRR1_ISI_N_OR_G;
+				goto forward_to_l1;
+			}
+		} else {
+			/* Can we read? */
+			if (!gpte_p->may_read && !gpte_p->may_write) {
+				flags |= DSISR_PROTFAULT;
+				goto forward_to_l1;
+			}
+		}
+	}
+
+	return 0;
+
+forward_to_l1:
+	vcpu->arch.fault_dsisr = flags;
+	if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
+		vcpu->arch.shregs.msr &= ~0x783f0000ul;
+		vcpu->arch.shregs.msr |= flags;
+	}
+	return RESUME_HOST;
+}
+
+static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu,
+				       struct kvm_nested_guest *gp,
+				       unsigned long n_gpa,
+				       struct kvmppc_pte gpte,
+				       unsigned long dsisr)
+{
+	struct kvm *kvm = vcpu->kvm;
+	bool writing = !!(dsisr & DSISR_ISSTORE);
+	u64 pgflags;
+	bool ret;
+
+	/* Are the rc bits set in the L1 partition scoped pte? */
+	pgflags = _PAGE_ACCESSED;
+	if (writing)
+		pgflags |= _PAGE_DIRTY;
+	if (pgflags & ~gpte.rc)
+		return RESUME_HOST;
+
+	spin_lock(&kvm->mmu_lock);
+	/* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */
+	ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing,
+				     gpte.raddr, kvm->arch.lpid);
+	spin_unlock(&kvm->mmu_lock);
+	if (!ret)
+		return -EINVAL;
+
+	/* Set the rc bit in the pte of the shadow_pgtable for the nested guest */
+	ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa,
+				      gp->shadow_lpid);
+	if (!ret)
+		return -EINVAL;
+	return 0;
+}
+
+static inline int kvmppc_radix_level_to_shift(int level)
+{
+	switch (level) {
+	case 2:
+		return PUD_SHIFT;
+	case 1:
+		return PMD_SHIFT;
+	default:
+		return PAGE_SHIFT;
+	}
+}
+
+static inline int kvmppc_radix_shift_to_level(int shift)
+{
+	if (shift == PUD_SHIFT)
+		return 2;
+	if (shift == PMD_SHIFT)
+		return 1;
+	if (shift == PAGE_SHIFT)
+		return 0;
+	WARN_ON_ONCE(1);
+	return 0;
+}
+
+/* called with gp->tlb_lock held */
+static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
+					  struct kvm_nested_guest *gp)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_memory_slot *memslot;
+	struct rmap_nested *n_rmap;
+	struct kvmppc_pte gpte;
+	pte_t pte, *pte_p;
+	unsigned long mmu_seq;
+	unsigned long dsisr = vcpu->arch.fault_dsisr;
+	unsigned long ea = vcpu->arch.fault_dar;
+	unsigned long *rmapp;
+	unsigned long n_gpa, gpa, gfn, perm = 0UL;
+	unsigned int shift, l1_shift, level;
+	bool writing = !!(dsisr & DSISR_ISSTORE);
+	bool kvm_ro = false;
+	long int ret;
+
+	if (!gp->l1_gr_to_hr) {
+		kvmhv_update_ptbl_cache(gp);
+		if (!gp->l1_gr_to_hr)
+			return RESUME_HOST;
+	}
+
+	/* Convert the nested guest real address into a L1 guest real address */
+
+	n_gpa = vcpu->arch.fault_gpa & ~0xF000000000000FFFULL;
+	if (!(dsisr & DSISR_PRTABLE_FAULT))
+		n_gpa |= ea & 0xFFF;
+	ret = kvmhv_translate_addr_nested(vcpu, gp, n_gpa, dsisr, &gpte);
+
+	/*
+	 * If the hardware found a translation but we don't now have a usable
+	 * translation in the l1 partition-scoped tree, remove the shadow pte
+	 * and let the guest retry.
+	 */
+	if (ret == RESUME_HOST &&
+	    (dsisr & (DSISR_PROTFAULT | DSISR_BADACCESS | DSISR_NOEXEC_OR_G |
+		      DSISR_BAD_COPYPASTE)))
+		goto inval;
+	if (ret)
+		return ret;
+
+	/* Failed to set the reference/change bits */
+	if (dsisr & DSISR_SET_RC) {
+		ret = kvmhv_handle_nested_set_rc(vcpu, gp, n_gpa, gpte, dsisr);
+		if (ret == RESUME_HOST)
+			return ret;
+		if (ret)
+			goto inval;
+		dsisr &= ~DSISR_SET_RC;
+		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
+			       DSISR_PROTFAULT)))
+			return RESUME_GUEST;
+	}
+
+	/*
+	 * We took an HISI or HDSI while we were running a nested guest which
+	 * means we have no partition scoped translation for that. This means
+	 * we need to insert a pte for the mapping into our shadow_pgtable.
+	 */
+
+	l1_shift = gpte.page_shift;
+	if (l1_shift < PAGE_SHIFT) {
+		/* We don't support l1 using a page size smaller than our own */
+		pr_err("KVM: L1 guest page shift (%d) less than our own (%d)\n",
+			l1_shift, PAGE_SHIFT);
+		return -EINVAL;
+	}
+	gpa = gpte.raddr;
+	gfn = gpa >> PAGE_SHIFT;
+
+	/* 1. Get the corresponding host memslot */
+
+	memslot = gfn_to_memslot(kvm, gfn);
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
+		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS)) {
+			/* unusual error -> reflect to the guest as a DSI */
+			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+			return RESUME_GUEST;
+		}
+		/* passthrough of emulated MMIO case... */
+		pr_err("emulated MMIO passthrough?\n");
+		return -EINVAL;
+	}
+	if (memslot->flags & KVM_MEM_READONLY) {
+		if (writing) {
+			/* Give the guest a DSI */
+			kvmppc_core_queue_data_storage(vcpu, ea,
+					DSISR_ISSTORE | DSISR_PROTFAULT);
+			return RESUME_GUEST;
+		}
+		kvm_ro = true;
+	}
+
+	/* 2. Find the host pte for this L1 guest real address */
+
+	/* Used to check for invalidations in progress */
+	mmu_seq = kvm->mmu_notifier_seq;
+	smp_rmb();
+
+	/* See if we can find a translation in our partition scoped tables for L1 */
+	pte = __pte(0);
+	spin_lock(&kvm->mmu_lock);
+	pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+	if (!shift)
+		shift = PAGE_SHIFT;
+	if (pte_p)
+		pte = *pte_p;
+	spin_unlock(&kvm->mmu_lock);
+
+	if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) {
+		/* No suitable pte found -> try to insert a mapping */
+		ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot,
+					writing, kvm_ro, &pte, &level);
+		if (ret == -EAGAIN)
+			return RESUME_GUEST;
+		else if (ret)
+			return ret;
+		shift = kvmppc_radix_level_to_shift(level);
+	}
+
+	/* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */
+
+	/* The permissions are the combination of the host and L1 guest ptes */
+	perm |= gpte.may_read ? 0UL : _PAGE_READ;
+	perm |= gpte.may_write ? 0UL : _PAGE_WRITE;
+	perm |= gpte.may_execute ? 0UL : _PAGE_EXEC;
+	pte = __pte(pte_val(pte) & ~perm);
+
+	/* What size pte can we insert? */
+	if (shift > l1_shift) {
+		u64 mask;
+		unsigned int actual_shift = PAGE_SHIFT;
+		if (PMD_SHIFT < l1_shift)
+			actual_shift = PMD_SHIFT;
+		mask = (1UL << shift) - (1UL << actual_shift);
+		pte = __pte(pte_val(pte) | (gpa & mask));
+		shift = actual_shift;
+	}
+	level = kvmppc_radix_shift_to_level(shift);
+	n_gpa &= ~((1UL << shift) - 1);
+
+	/* 4. Insert the pte into our shadow_pgtable */
+
+	n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL);
+	if (!n_rmap)
+		return RESUME_GUEST; /* Let the guest try again */
+	n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) |
+		(((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT);
+	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
+	ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
+				mmu_seq, gp->shadow_lpid, rmapp, &n_rmap);
+	if (n_rmap)
+		kfree(n_rmap);
+	if (ret == -EAGAIN)
+		ret = RESUME_GUEST;	/* Let the guest try again */
+
+	return ret;
+
+ inval:
+	kvmhv_invalidate_shadow_pte(vcpu, gp, n_gpa, NULL);
+	return RESUME_GUEST;
+}
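
The fault handler above follows the four numbered steps marked in its comments. A condensed outline, using the helper names from the code above with error handling and RC-bit processing elided (illustration only, not a working replacement):

	/* Illustrative outline of __kvmhv_nested_page_fault() above. */
	static long nested_fault_outline(struct kvm_vcpu *vcpu,
					 struct kvm_nested_guest *gp)
	{
		struct kvmppc_pte gpte;
		unsigned long n_gpa = vcpu->arch.fault_gpa & ~0xF000000000000FFFULL;

		/* 1. L2 real address -> L1 real address via L1's partition-scoped tree */
		if (kvmhv_translate_addr_nested(vcpu, gp, n_gpa,
						vcpu->arch.fault_dsisr, &gpte))
			return RESUME_HOST;	/* reflect the fault to L1 */

		/* 2. L1 real address -> host real address via L0's own tables        */
		/* 3. combine host and L1 permissions, pick the largest usable shift  */
		/* 4. insert the pte into gp->shadow_pgtable and record the nest rmap */
		return RESUME_GUEST;
	}
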
+
+long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu)
+{
+	struct kvm_nested_guest *gp = vcpu->arch.nested;
+	long int ret;
+
+	mutex_lock(&gp->tlb_lock);
+	ret = __kvmhv_nested_page_fault(vcpu, gp);
+	mutex_unlock(&gp->tlb_lock);
+	return ret;
+}
+
+int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid)
+{
+	int ret = -1;
+
+	spin_lock(&kvm->mmu_lock);
+	while (++lpid <= kvm->arch.max_nested_lpid) {
+		if (kvm->arch.nested_guests[lpid]) {
+			ret = lpid;
+			break;
+		}
+	}
+	spin_unlock(&kvm->mmu_lock);
+	return ret;
+}

+ 10 - 0
arch/powerpc/kvm/book3s_hv_ras.c

@@ -177,6 +177,7 @@ void kvmppc_subcore_enter_guest(void)
 
 
 	local_paca->sibling_subcore_state->in_guest[subcore_id] = 1;
 }
+EXPORT_SYMBOL_GPL(kvmppc_subcore_enter_guest);
 
 void kvmppc_subcore_exit_guest(void)
 {
@@ -187,6 +188,7 @@ void kvmppc_subcore_exit_guest(void)
 
 
 	local_paca->sibling_subcore_state->in_guest[subcore_id] = 0;
 }
+EXPORT_SYMBOL_GPL(kvmppc_subcore_exit_guest);
 
 static bool kvmppc_tb_resync_required(void)
 {
@@ -331,5 +333,13 @@ long kvmppc_realmode_hmi_handler(void)
 	} else {
 		wait_for_tb_resync();
 	}
+
+	/*
+	 * Reset tb_offset_applied so the guest exit code won't try
+	 * to subtract the previous timebase offset from the timebase.
+	 */
+	if (local_paca->kvm_hstate.kvm_vcore)
+		local_paca->kvm_hstate.kvm_vcore->tb_offset_applied = 0;
+
 	return 0;
 }

+ 10 - 3
arch/powerpc/kvm/book3s_hv_rm_xics.c

@@ -136,7 +136,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 
 
 	/* Mark the target VCPU as having an interrupt pending */
 	vcpu->stat.queue_intr++;
-	set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
+	set_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
 
 	/* Kick self ? Just set MER and return */
 	if (vcpu == this_vcpu) {
@@ -170,8 +170,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
 {
 	/* Note: Only called on self ! */
-	clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
-		  &vcpu->arch.pending_exceptions);
+	clear_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
 	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER);
 }
 
 
@@ -768,6 +767,14 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
 	void __iomem *xics_phys;
 	int64_t rc;
 
 
+	if (kvmhv_on_pseries()) {
+		unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+		iosync();
+		plpar_hcall_raw(H_EOI, retbuf, hwirq);
+		return;
+	}
+
 	rc = pnv_opal_pci_msi_eoi(c, hwirq);
 
 	if (rc)

+ 460 - 363
arch/powerpc/kvm/book3s_hv_rmhandlers.S

@@ -28,6 +28,7 @@
 #include <asm/exception-64s.h>
 #include <asm/kvm_book3s_asm.h>
 #include <asm/book3s/64/mmu-hash.h>
+#include <asm/export.h>
 #include <asm/tm.h>
 #include <asm/opal.h>
 #include <asm/xive-regs.h>
@@ -46,8 +47,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 #define NAPPING_NOVCPU	2
 
 /* Stack frame offsets for kvmppc_hv_entry */
-#define SFS			160
+#define SFS			208
 #define STACK_SLOT_TRAP		(SFS-4)
+#define STACK_SLOT_SHORT_PATH	(SFS-8)
 #define STACK_SLOT_TID		(SFS-16)
 #define STACK_SLOT_PSSCR	(SFS-24)
 #define STACK_SLOT_PID		(SFS-32)
@@ -56,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 #define STACK_SLOT_DAWR		(SFS-56)
 #define STACK_SLOT_DAWRX	(SFS-64)
 #define STACK_SLOT_HFSCR	(SFS-72)
+/* the following is used by the P9 short path */
+#define STACK_SLOT_NVGPRS	(SFS-152)	/* 18 gprs */
 
 /*
  * Call kvmppc_hv_entry in real mode.
@@ -113,45 +117,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	mtspr	SPRN_SPRG_VDSO_WRITE,r3
 
 	/* Reload the host's PMU registers */
-	lbz	r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
-	cmpwi	r4, 0
-	beq	23f			/* skip if not */
-BEGIN_FTR_SECTION
-	ld	r3, HSTATE_MMCR0(r13)
-	andi.	r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
-	cmpwi	r4, MMCR0_PMAO
-	beql	kvmppc_fix_pmao
-END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
-	lwz	r3, HSTATE_PMC1(r13)
-	lwz	r4, HSTATE_PMC2(r13)
-	lwz	r5, HSTATE_PMC3(r13)
-	lwz	r6, HSTATE_PMC4(r13)
-	lwz	r8, HSTATE_PMC5(r13)
-	lwz	r9, HSTATE_PMC6(r13)
-	mtspr	SPRN_PMC1, r3
-	mtspr	SPRN_PMC2, r4
-	mtspr	SPRN_PMC3, r5
-	mtspr	SPRN_PMC4, r6
-	mtspr	SPRN_PMC5, r8
-	mtspr	SPRN_PMC6, r9
-	ld	r3, HSTATE_MMCR0(r13)
-	ld	r4, HSTATE_MMCR1(r13)
-	ld	r5, HSTATE_MMCRA(r13)
-	ld	r6, HSTATE_SIAR(r13)
-	ld	r7, HSTATE_SDAR(r13)
-	mtspr	SPRN_MMCR1, r4
-	mtspr	SPRN_MMCRA, r5
-	mtspr	SPRN_SIAR, r6
-	mtspr	SPRN_SDAR, r7
-BEGIN_FTR_SECTION
-	ld	r8, HSTATE_MMCR2(r13)
-	ld	r9, HSTATE_SIER(r13)
-	mtspr	SPRN_MMCR2, r8
-	mtspr	SPRN_SIER, r9
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-	mtspr	SPRN_MMCR0, r3
-	isync
-23:
+	bl	kvmhv_load_host_pmu
 
 	/*
 	 * Reload DEC.  HDEC interrupts were disabled when
@@ -796,66 +762,23 @@ BEGIN_FTR_SECTION
 	b	91f
 END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 	/*
-	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
 	 */
 	mr      r3, r4
 	ld      r4, VCPU_MSR(r3)
+	li	r5, 0			/* don't preserve non-vol regs */
 	bl	kvmppc_restore_tm_hv
+	nop
 	ld	r4, HSTATE_KVM_VCPU(r13)
 91:
 #endif
 
-	/* Load guest PMU registers */
-	/* R4 is live here (vcpu pointer) */
-	li	r3, 1
-	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
-	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
-	isync
-BEGIN_FTR_SECTION
-	ld	r3, VCPU_MMCR(r4)
-	andi.	r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
-	cmpwi	r5, MMCR0_PMAO
-	beql	kvmppc_fix_pmao
-END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
-	lwz	r3, VCPU_PMC(r4)	/* always load up guest PMU registers */
-	lwz	r5, VCPU_PMC + 4(r4)	/* to prevent information leak */
-	lwz	r6, VCPU_PMC + 8(r4)
-	lwz	r7, VCPU_PMC + 12(r4)
-	lwz	r8, VCPU_PMC + 16(r4)
-	lwz	r9, VCPU_PMC + 20(r4)
-	mtspr	SPRN_PMC1, r3
-	mtspr	SPRN_PMC2, r5
-	mtspr	SPRN_PMC3, r6
-	mtspr	SPRN_PMC4, r7
-	mtspr	SPRN_PMC5, r8
-	mtspr	SPRN_PMC6, r9
-	ld	r3, VCPU_MMCR(r4)
-	ld	r5, VCPU_MMCR + 8(r4)
-	ld	r6, VCPU_MMCR + 16(r4)
-	ld	r7, VCPU_SIAR(r4)
-	ld	r8, VCPU_SDAR(r4)
-	mtspr	SPRN_MMCR1, r5
-	mtspr	SPRN_MMCRA, r6
-	mtspr	SPRN_SIAR, r7
-	mtspr	SPRN_SDAR, r8
-BEGIN_FTR_SECTION
-	ld	r5, VCPU_MMCR + 24(r4)
-	ld	r6, VCPU_SIER(r4)
-	mtspr	SPRN_MMCR2, r5
-	mtspr	SPRN_SIER, r6
-BEGIN_FTR_SECTION_NESTED(96)
-	lwz	r7, VCPU_PMC + 24(r4)
-	lwz	r8, VCPU_PMC + 28(r4)
-	ld	r9, VCPU_MMCR + 32(r4)
-	mtspr	SPRN_SPMC1, r7
-	mtspr	SPRN_SPMC2, r8
-	mtspr	SPRN_MMCRS, r9
-END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-	mtspr	SPRN_MMCR0, r3
-	isync
+	/* Load guest PMU registers; r4 = vcpu pointer here */
+	mr	r3, r4
+	bl	kvmhv_load_guest_pmu
 
 	/* Load up FP, VMX and VSX registers */
+	ld	r4, HSTATE_KVM_VCPU(r13)
 	bl	kvmppc_load_fp
 
 	ld	r14, VCPU_GPR(R14)(r4)
@@ -1100,73 +1023,40 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 no_xive:
 #endif /* CONFIG_KVM_XICS */
 
-deliver_guest_interrupt:
-	ld	r6, VCPU_CTR(r4)
-	ld	r7, VCPU_XER(r4)
-
-	mtctr	r6
-	mtxer	r7
+	li	r0, 0
+	stw	r0, STACK_SLOT_SHORT_PATH(r1)
 
-kvmppc_cede_reentry:		/* r4 = vcpu, r13 = paca */
-	ld	r10, VCPU_PC(r4)
-	ld	r11, VCPU_MSR(r4)
+deliver_guest_interrupt:	/* r4 = vcpu, r13 = paca */
+	/* Check if we can deliver an external or decrementer interrupt now */
+	ld	r0, VCPU_PENDING_EXC(r4)
+BEGIN_FTR_SECTION
+	/* On POWER9, also check for emulated doorbell interrupt */
+	lbz	r3, VCPU_DBELL_REQ(r4)
+	or	r0, r0, r3
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+	cmpdi	r0, 0
+	beq	71f
+	mr	r3, r4
+	bl	kvmppc_guest_entry_inject_int
+	ld	r4, HSTATE_KVM_VCPU(r13)
+71:
 	ld	r6, VCPU_SRR0(r4)
 	ld	r7, VCPU_SRR1(r4)
 	mtspr	SPRN_SRR0, r6
 	mtspr	SPRN_SRR1, r7
 
+fast_guest_entry_c:
+	ld	r10, VCPU_PC(r4)
+	ld	r11, VCPU_MSR(r4)
 	/* r11 = vcpu->arch.msr & ~MSR_HV */
 	rldicl	r11, r11, 63 - MSR_HV_LG, 1
 	rotldi	r11, r11, 1 + MSR_HV_LG
 	ori	r11, r11, MSR_ME
 
-	/* Check if we can deliver an external or decrementer interrupt now */
-	ld	r0, VCPU_PENDING_EXC(r4)
-	rldicl	r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63
-	cmpdi	cr1, r0, 0
-	andi.	r8, r11, MSR_EE
-	mfspr	r8, SPRN_LPCR
-	/* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */
-	rldimi	r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH
-	mtspr	SPRN_LPCR, r8
-	isync
-	beq	5f
-	li	r0, BOOK3S_INTERRUPT_EXTERNAL
-	bne	cr1, 12f
-	mfspr	r0, SPRN_DEC
-BEGIN_FTR_SECTION
-	/* On POWER9 check whether the guest has large decrementer enabled */
-	andis.	r8, r8, LPCR_LD@h
-	bne	15f
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-	extsw	r0, r0
-15:	cmpdi	r0, 0
-	li	r0, BOOK3S_INTERRUPT_DECREMENTER
-	bge	5f
-
-12:	mtspr	SPRN_SRR0, r10
-	mr	r10,r0
-	mtspr	SPRN_SRR1, r11
-	mr	r9, r4
-	bl	kvmppc_msr_interrupt
-5:
-BEGIN_FTR_SECTION
-	b	fast_guest_return
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
-	/* On POWER9, check for pending doorbell requests */
-	lbz	r0, VCPU_DBELL_REQ(r4)
-	cmpwi	r0, 0
-	beq	fast_guest_return
-	ld	r5, HSTATE_KVM_VCORE(r13)
-	/* Set DPDES register so the CPU will take a doorbell interrupt */
-	li	r0, 1
-	mtspr	SPRN_DPDES, r0
-	std	r0, VCORE_DPDES(r5)
-	/* Make sure other cpus see vcore->dpdes set before dbell req clear */
-	lwsync
-	/* Clear the pending doorbell request */
-	li	r0, 0
-	stb	r0, VCPU_DBELL_REQ(r4)
+	ld	r6, VCPU_CTR(r4)
+	ld	r7, VCPU_XER(r4)
+	mtctr	r6
+	mtxer	r7
 
 /*
  * Required state:
@@ -1202,7 +1092,7 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
 	ld	r5, VCPU_LR(r4)
-	lwz	r6, VCPU_CR(r4)
+	ld	r6, VCPU_CR(r4)
 	mtlr	r5
 	mtcr	r6
 
@@ -1234,6 +1124,83 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	HRFI_TO_GUEST
 	b	.
 
+/*
+ * Enter the guest on a P9 or later system where we have exactly
+ * one vcpu per vcore and we don't need to go to real mode
+ * (which implies that host and guest are both using radix MMU mode).
+ * r3 = vcpu pointer
+ * Most SPRs and all the VSRs have been loaded already.
+ */
+_GLOBAL(__kvmhv_vcpu_entry_p9)
+EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9)
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+	stdu	r1, -SFS(r1)
+
+	li	r0, 1
+	stw	r0, STACK_SLOT_SHORT_PATH(r1)
+
+	std	r3, HSTATE_KVM_VCPU(r13)
+	mfcr	r4
+	stw	r4, SFS+8(r1)
+
+	std	r1, HSTATE_HOST_R1(r13)
+
+	reg = 14
+	.rept	18
+	std	reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
+	reg = reg + 1
+	.endr
+
+	reg = 14
+	.rept	18
+	ld	reg, __VCPU_GPR(reg)(r3)
+	reg = reg + 1
+	.endr
+
+	mfmsr	r10
+	std	r10, HSTATE_HOST_MSR(r13)
+
+	mr	r4, r3
+	b	fast_guest_entry_c
+guest_exit_short_path:
+
+	li	r0, KVM_GUEST_MODE_NONE
+	stb	r0, HSTATE_IN_GUEST(r13)
+
+	reg = 14
+	.rept	18
+	std	reg, __VCPU_GPR(reg)(r9)
+	reg = reg + 1
+	.endr
+
+	reg = 14
+	.rept	18
+	ld	reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
+	reg = reg + 1
+	.endr
+
+	lwz	r4, SFS+8(r1)
+	mtcr	r4
+
+	mr	r3, r12		/* trap number */
+
+	addi	r1, r1, SFS
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+
+	/* If we are in real mode, do a rfid to get back to the caller */
+	mfmsr	r4
+	andi.	r5, r4, MSR_IR
+	bnelr
+	rldicl	r5, r4, 64 - MSR_TS_S_LG, 62	/* extract TS field */
+	mtspr	SPRN_SRR0, r0
+	ld	r10, HSTATE_HOST_MSR(r13)
+	rldimi	r10, r5, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+	mtspr	SPRN_SRR1, r10
+	RFI_TO_KERNEL
+	b	.
+
 secondary_too_late:
 	li	r12, 0
 	stw	r12, STACK_SLOT_TRAP(r1)
@@ -1313,7 +1280,7 @@ kvmppc_interrupt_hv:
 	std	r3, VCPU_GPR(R12)(r9)
 	/* CR is in the high half of r12 */
 	srdi	r4, r12, 32
-	stw	r4, VCPU_CR(r9)
+	std	r4, VCPU_CR(r9)
 BEGIN_FTR_SECTION
 	ld	r3, HSTATE_CFAR(r13)
 	std	r3, VCPU_CFAR(r9)
@@ -1387,18 +1354,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	std	r3, VCPU_CTR(r9)
 	std	r4, VCPU_XER(r9)
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	/* For softpatch interrupt, go off and do TM instruction emulation */
-	cmpwi	r12, BOOK3S_INTERRUPT_HV_SOFTPATCH
-	beq	kvmppc_tm_emul
-#endif
+	/* Save more register state  */
+	mfdar	r3
+	mfdsisr	r4
+	std	r3, VCPU_DAR(r9)
+	stw	r4, VCPU_DSISR(r9)
 
 	/* If this is a page table miss then see if it's theirs or ours */
 	cmpwi	r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
 	beq	kvmppc_hdsi
+	std	r3, VCPU_FAULT_DAR(r9)
+	stw	r4, VCPU_FAULT_DSISR(r9)
 	cmpwi	r12, BOOK3S_INTERRUPT_H_INST_STORAGE
 	beq	kvmppc_hisi
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/* For softpatch interrupt, go off and do TM instruction emulation */
+	cmpwi	r12, BOOK3S_INTERRUPT_HV_SOFTPATCH
+	beq	kvmppc_tm_emul
+#endif
+
 	/* See if this is a leftover HDEC interrupt */
 	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
 	bne	2f
@@ -1418,10 +1393,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 BEGIN_FTR_SECTION
 	PPC_MSGSYNC
 	lwsync
+	/* always exit if we're running a nested guest */
+	ld	r0, VCPU_NESTED(r9)
+	cmpdi	r0, 0
+	bne	guest_exit_cont
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	lbz	r0, HSTATE_HOST_IPI(r13)
 	cmpwi	r0, 0
-	beq	4f
+	beq	maybe_reenter_guest
 	b	guest_exit_cont
 3:
 	/* If it's a hypervisor facility unavailable interrupt, save HFSCR */
@@ -1433,82 +1412,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 14:
 	/* External interrupt ? */
 	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
-	bne+	guest_exit_cont
-
-	/* External interrupt, first check for host_ipi. If this is
-	 * set, we know the host wants us out so let's do it now
-	 */
-	bl	kvmppc_read_intr
-
-	/*
-	 * Restore the active volatile registers after returning from
-	 * a C function.
-	 */
-	ld	r9, HSTATE_KVM_VCPU(r13)
-	li	r12, BOOK3S_INTERRUPT_EXTERNAL
-
-	/*
-	 * kvmppc_read_intr return codes:
-	 *
-	 * Exit to host (r3 > 0)
-	 *   1 An interrupt is pending that needs to be handled by the host
-	 *     Exit guest and return to host by branching to guest_exit_cont
-	 *
-	 *   2 Passthrough that needs completion in the host
-	 *     Exit guest and return to host by branching to guest_exit_cont
-	 *     However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
-	 *     to indicate to the host to complete handling the interrupt
-	 *
-	 * Before returning to guest, we check if any CPU is heading out
-	 * to the host and if so, we head out also. If no CPUs are heading
-	 * check return values <= 0.
-	 *
-	 * Return to guest (r3 <= 0)
-	 *  0 No external interrupt is pending
-	 * -1 A guest wakeup IPI (which has now been cleared)
-	 *    In either case, we return to guest to deliver any pending
-	 *    guest interrupts.
-	 *
-	 * -2 A PCI passthrough external interrupt was handled
-	 *    (interrupt was delivered directly to guest)
-	 *    Return to guest to deliver any pending guest interrupts.
-	 */
-
-	cmpdi	r3, 1
-	ble	1f
-
-	/* Return code = 2 */
-	li	r12, BOOK3S_INTERRUPT_HV_RM_HARD
-	stw	r12, VCPU_TRAP(r9)
-	b	guest_exit_cont
-
-1:	/* Return code <= 1 */
-	cmpdi	r3, 0
-	bgt	guest_exit_cont
-
-	/* Return code <= 0 */
-4:	ld	r5, HSTATE_KVM_VCORE(r13)
-	lwz	r0, VCORE_ENTRY_EXIT(r5)
-	cmpwi	r0, 0x100
-	mr	r4, r9
-	blt	deliver_guest_interrupt
-
-guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
-	/* Save more register state  */
-	mfdar	r6
-	mfdsisr	r7
-	std	r6, VCPU_DAR(r9)
-	stw	r7, VCPU_DSISR(r9)
-	/* don't overwrite fault_dar/fault_dsisr if HDSI */
-	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
-	beq	mc_cont
-	std	r6, VCPU_FAULT_DAR(r9)
-	stw	r7, VCPU_FAULT_DSISR(r9)
-
+	beq	kvmppc_guest_external
 	/* See if it is a machine check */
 	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
 	beq	machine_check_realmode
-mc_cont:
+	/* Or a hypervisor maintenance interrupt */
+	cmpwi	r12, BOOK3S_INTERRUPT_HMI
+	beq	hmi_realmode
+
+guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
+
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
 	addi	r3, r9, VCPU_TB_RMEXIT
 	mr	r4, r9
@@ -1552,6 +1465,11 @@ mc_cont:
 1:
 #endif /* CONFIG_KVM_XICS */
 
+	/* If we came in through the P9 short path, go back out to C now */
+	lwz	r0, STACK_SLOT_SHORT_PATH(r1)
+	cmpwi	r0, 0
+	bne	guest_exit_short_path
+
 	/* For hash guest, read the guest SLB and save it away */
 	ld	r5, VCPU_KVM(r9)
 	lbz	r0, KVM_RADIX(r5)
@@ -1780,11 +1698,13 @@ BEGIN_FTR_SECTION
 	b	91f
 END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 	/*
-	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
 	 */
 	mr      r3, r9
 	ld      r4, VCPU_MSR(r3)
+	li	r5, 0			/* don't preserve non-vol regs */
 	bl	kvmppc_save_tm_hv
+	nop
 	ld	r9, HSTATE_KVM_VCPU(r13)
 91:
 #endif
@@ -1802,90 +1722,19 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 25:
 	/* Save PMU registers if requested */
 	/* r8 and cr0.eq are live here */
+	mr	r3, r9
+	li	r4, 1
+	beq	21f			/* if no VPA, save PMU stuff anyway */
+	lbz	r4, LPPACA_PMCINUSE(r8)
+21:	bl	kvmhv_save_guest_pmu
+	ld	r9, HSTATE_KVM_VCPU(r13)
+
+	/* Restore host values of some registers */
 BEGIN_FTR_SECTION
-	/*
-	 * POWER8 seems to have a hardware bug where setting
-	 * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
-	 * when some counters are already negative doesn't seem
-	 * to cause a performance monitor alert (and hence interrupt).
-	 * The effect of this is that when saving the PMU state,
-	 * if there is no PMU alert pending when we read MMCR0
-	 * before freezing the counters, but one becomes pending
-	 * before we read the counters, we lose it.
-	 * To work around this, we need a way to freeze the counters
-	 * before reading MMCR0.  Normally, freezing the counters
-	 * is done by writing MMCR0 (to set MMCR0[FC]) which
-	 * unavoidably writes MMCR0[PMA0] as well.  On POWER8,
-	 * we can also freeze the counters using MMCR2, by writing
-	 * 1s to all the counter freeze condition bits (there are
-	 * 9 bits each for 6 counters).
-	 */
-	li	r3, -1			/* set all freeze bits */
-	clrrdi	r3, r3, 10
-	mfspr	r10, SPRN_MMCR2
-	mtspr	SPRN_MMCR2, r3
-	isync
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-	li	r3, 1
-	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
-	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
-	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
-	mfspr	r6, SPRN_MMCRA
-	/* Clear MMCRA in order to disable SDAR updates */
-	li	r7, 0
-	mtspr	SPRN_MMCRA, r7
-	isync
-	beq	21f			/* if no VPA, save PMU stuff anyway */
-	lbz	r7, LPPACA_PMCINUSE(r8)
-	cmpwi	r7, 0			/* did they ask for PMU stuff to be saved? */
-	bne	21f
-	std	r3, VCPU_MMCR(r9)	/* if not, set saved MMCR0 to FC */
-	b	22f
-21:	mfspr	r5, SPRN_MMCR1
-	mfspr	r7, SPRN_SIAR
-	mfspr	r8, SPRN_SDAR
-	std	r4, VCPU_MMCR(r9)
-	std	r5, VCPU_MMCR + 8(r9)
-	std	r6, VCPU_MMCR + 16(r9)
-BEGIN_FTR_SECTION
-	std	r10, VCPU_MMCR + 24(r9)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-	std	r7, VCPU_SIAR(r9)
-	std	r8, VCPU_SDAR(r9)
-	mfspr	r3, SPRN_PMC1
-	mfspr	r4, SPRN_PMC2
-	mfspr	r5, SPRN_PMC3
-	mfspr	r6, SPRN_PMC4
-	mfspr	r7, SPRN_PMC5
-	mfspr	r8, SPRN_PMC6
-	stw	r3, VCPU_PMC(r9)
-	stw	r4, VCPU_PMC + 4(r9)
-	stw	r5, VCPU_PMC + 8(r9)
-	stw	r6, VCPU_PMC + 12(r9)
-	stw	r7, VCPU_PMC + 16(r9)
-	stw	r8, VCPU_PMC + 20(r9)
-BEGIN_FTR_SECTION
-	mfspr	r5, SPRN_SIER
-	std	r5, VCPU_SIER(r9)
-BEGIN_FTR_SECTION_NESTED(96)
-	mfspr	r6, SPRN_SPMC1
-	mfspr	r7, SPRN_SPMC2
-	mfspr	r8, SPRN_MMCRS
-	stw	r6, VCPU_PMC + 24(r9)
-	stw	r7, VCPU_PMC + 28(r9)
-	std	r8, VCPU_MMCR + 32(r9)
-	lis	r4, 0x8000
-	mtspr	SPRN_MMCRS, r4
-END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-22:
-
-	/* Restore host values of some registers */
-BEGIN_FTR_SECTION
-	ld	r5, STACK_SLOT_CIABR(r1)
-	ld	r6, STACK_SLOT_DAWR(r1)
-	ld	r7, STACK_SLOT_DAWRX(r1)
-	mtspr	SPRN_CIABR, r5
+	ld	r5, STACK_SLOT_CIABR(r1)
+	ld	r6, STACK_SLOT_DAWR(r1)
+	ld	r7, STACK_SLOT_DAWRX(r1)
+	mtspr	SPRN_CIABR, r5
 	/*
 	 * If the DAWR doesn't work, it's ok to write these here as
 	 * this value should always be zero
@@ -2010,24 +1859,6 @@ BEGIN_FTR_SECTION
 	mtspr	SPRN_DPDES, r8
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
-	/* If HMI, call kvmppc_realmode_hmi_handler() */
-	lwz	r12, STACK_SLOT_TRAP(r1)
-	cmpwi	r12, BOOK3S_INTERRUPT_HMI
-	bne	27f
-	bl	kvmppc_realmode_hmi_handler
-	nop
-	cmpdi	r3, 0
-	/*
-	 * At this point kvmppc_realmode_hmi_handler may have resync-ed
-	 * the TB, and if it has, we must not subtract the guest timebase
-	 * offset from the timebase. So, skip it.
-	 *
-	 * Also, do not call kvmppc_subcore_exit_guest() because it has
-	 * been invoked as part of kvmppc_realmode_hmi_handler().
-	 */
-	beq	30f
-
-27:
 	/* Subtract timebase offset from timebase */
 	ld	r8, VCORE_TB_OFFSET_APPL(r5)
 	cmpdi	r8,0
@@ -2045,7 +1876,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	addis	r8,r8,0x100		/* if so, increment upper 40 bits */
 	mtspr	SPRN_TBU40,r8
 
-17:	bl	kvmppc_subcore_exit_guest
+17:
+	/*
+	 * If this is an HMI, we called kvmppc_realmode_hmi_handler
+	 * above, which may or may not have already called
+	 * kvmppc_subcore_exit_guest.  Fortunately, all that
+	 * kvmppc_subcore_exit_guest does is clear a flag, so calling
+	 * it again here is benign even if kvmppc_realmode_hmi_handler
+	 * has already called it.
+	 */
+	bl	kvmppc_subcore_exit_guest
 	nop
 30:	ld	r5,HSTATE_KVM_VCORE(r13)
 	ld	r4,VCORE_KVM(r5)	/* pointer to struct kvm */
@@ -2099,6 +1939,67 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	mtlr	r0
 	blr
 
+kvmppc_guest_external:
+	/* External interrupt, first check for host_ipi. If this is
+	 * set, we know the host wants us out so let's do it now
+	 */
+	bl	kvmppc_read_intr
+
+	/*
+	 * Restore the active volatile registers after returning from
+	 * a C function.
+	 */
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	li	r12, BOOK3S_INTERRUPT_EXTERNAL
+
+	/*
+	 * kvmppc_read_intr return codes:
+	 *
+	 * Exit to host (r3 > 0)
+	 *   1 An interrupt is pending that needs to be handled by the host
+	 *     Exit guest and return to host by branching to guest_exit_cont
+	 *
+	 *   2 Passthrough that needs completion in the host
+	 *     Exit guest and return to host by branching to guest_exit_cont
+	 *     However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
+	 *     to indicate to the host to complete handling the interrupt
+	 *
+	 * Before returning to guest, we check if any CPU is heading out
+	 * to the host and if so, we head out also. If no CPUs are heading
+	 * check return values <= 0.
+	 *
+	 * Return to guest (r3 <= 0)
+	 *  0 No external interrupt is pending
+	 * -1 A guest wakeup IPI (which has now been cleared)
+	 *    In either case, we return to guest to deliver any pending
+	 *    guest interrupts.
+	 *
+	 * -2 A PCI passthrough external interrupt was handled
+	 *    (interrupt was delivered directly to guest)
+	 *    Return to guest to deliver any pending guest interrupts.
+	 */
+
+	cmpdi	r3, 1
+	ble	1f
+
+	/* Return code = 2 */
+	li	r12, BOOK3S_INTERRUPT_HV_RM_HARD
+	stw	r12, VCPU_TRAP(r9)
+	b	guest_exit_cont
+
+1:	/* Return code <= 1 */
+	cmpdi	r3, 0
+	bgt	guest_exit_cont
+
+	/* Return code <= 0 */
+maybe_reenter_guest:
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lwz	r0, VCORE_ENTRY_EXIT(r5)
+	cmpwi	r0, 0x100
+	mr	r4, r9
+	blt	deliver_guest_interrupt
+	b	guest_exit_cont
+
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 /*
  * Softpatch interrupt for transactional memory emulation cases
@@ -2302,6 +2203,10 @@ hcall_try_real_mode:
 	andi.	r0,r11,MSR_PR
 	/* sc 1 from userspace - reflect to guest syscall */
 	bne	sc_1_fast_return
+	/* sc 1 from nested guest - give it to L1 to handle */
+	ld	r0, VCPU_NESTED(r9)
+	cmpdi	r0, 0
+	bne	guest_exit_cont
 	clrrdi	r3,r3,2
 	cmpldi	r3,hcall_real_table_end - hcall_real_table
 	bge	guest_exit_cont
@@ -2561,6 +2466,7 @@ hcall_real_table:
 hcall_real_table_end:
 
 _GLOBAL(kvmppc_h_set_xdabr)
+EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr)
 	andi.	r0, r5, DABRX_USER | DABRX_KERNEL
 	beq	6f
 	li	r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI
@@ -2570,6 +2476,7 @@ _GLOBAL(kvmppc_h_set_xdabr)
 	blr
 
 _GLOBAL(kvmppc_h_set_dabr)
+EXPORT_SYMBOL_GPL(kvmppc_h_set_dabr)
 	li	r5, DABRX_USER | DABRX_KERNEL
 3:
 BEGIN_FTR_SECTION
@@ -2682,11 +2589,13 @@ BEGIN_FTR_SECTION
 	b	91f
 END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 	/*
-	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
 	 */
 	ld	r3, HSTATE_KVM_VCPU(r13)
 	ld      r4, VCPU_MSR(r3)
+	li	r5, 0			/* don't preserve non-vol regs */
 	bl	kvmppc_save_tm_hv
+	nop
 91:
 #endif
 
@@ -2802,11 +2711,13 @@ BEGIN_FTR_SECTION
 	b	91f
 END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 	/*
-	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
 	 */
 	mr      r3, r4
 	ld      r4, VCPU_MSR(r3)
+	li	r5, 0			/* don't preserve non-vol regs */
 	bl	kvmppc_restore_tm_hv
+	nop
 	ld	r4, HSTATE_KVM_VCPU(r13)
 91:
 #endif
@@ -2874,13 +2785,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 	mr	r9, r4
 	cmpdi	r3, 0
 	bgt	guest_exit_cont
-
-	/* see if any other thread is already exiting */
-	lwz	r0,VCORE_ENTRY_EXIT(r5)
-	cmpwi	r0,0x100
-	bge	guest_exit_cont
-
-	b	kvmppc_cede_reentry	/* if not go back to guest */
+	b	maybe_reenter_guest
 
 	/* cede when already previously prodded case */
 kvm_cede_prodded:
@@ -2947,12 +2852,12 @@ machine_check_realmode:
 	 */
 	ld	r11, VCPU_MSR(r9)
 	rldicl.	r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */
-	bne	mc_cont			/* if so, exit to host */
+	bne	guest_exit_cont		/* if so, exit to host */
 	/* Check if guest is capable of handling NMI exit */
 	ld	r10, VCPU_KVM(r9)
 	lbz	r10, KVM_FWNMI(r10)
 	cmpdi	r10, 1			/* FWNMI capable? */
-	beq	mc_cont			/* if so, exit with KVM_EXIT_NMI. */
+	beq	guest_exit_cont		/* if so, exit with KVM_EXIT_NMI. */
 
 	/* if not, fall through for backward compatibility. */
 	andi.	r10, r11, MSR_RI	/* check for unrecoverable exception */
@@ -2965,6 +2870,21 @@ machine_check_realmode:
 	bl	kvmppc_msr_interrupt
 2:	b	fast_interrupt_c_return
 
+/*
+ * Call C code to handle a HMI in real mode.
+ * Only the primary thread does the call, secondary threads are handled
+ * by calling hmi_exception_realmode() after kvmppc_hv_entry returns.
+ * r9 points to the vcpu on entry
+ */
+hmi_realmode:
+	lbz	r0, HSTATE_PTID(r13)
+	cmpwi	r0, 0
+	bne	guest_exit_cont
+	bl	kvmppc_realmode_hmi_handler
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	li	r12, BOOK3S_INTERRUPT_HMI
+	b	guest_exit_cont
+
 /*
  * Check the reason we woke from nap, and take appropriate action.
  * Returns (in r3):
@@ -3130,10 +3050,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
  * Save transactional state and TM-related registers.
  * Called with r3 pointing to the vcpu struct and r4 containing
  * the guest MSR value.
- * This can modify all checkpointed registers, but
+ * r5 is non-zero iff non-volatile register state needs to be maintained.
+ * If r5 == 0, this can modify all checkpointed registers, but
  * restores r1 and r2 before exit.
  */
-kvmppc_save_tm_hv:
+_GLOBAL_TOC(kvmppc_save_tm_hv)
+EXPORT_SYMBOL_GPL(kvmppc_save_tm_hv)
 	/* See if we need to handle fake suspend mode */
 BEGIN_FTR_SECTION
 	b	__kvmppc_save_tm
@@ -3161,12 +3083,6 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
 	nop
 
-	std	r1, HSTATE_HOST_R1(r13)
-
-	/* Clear the MSR RI since r1, r13 may be foobar. */
-	li	r5, 0
-	mtmsrd	r5, 1
-
 	/* We have to treclaim here because that's the only way to do S->N */
 	li	r3, TM_CAUSE_KVM_RESCHED
 	TRECLAIM(R3)
@@ -3175,22 +3091,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
 	 * We were in fake suspend, so we are not going to save the
 	 * register state as the guest checkpointed state (since
 	 * we already have it), therefore we can now use any volatile GPR.
+	 * In fact treclaim in fake suspend state doesn't modify
+	 * any registers.
 	 */
-	/* Reload PACA pointer, stack pointer and TOC. */
-	GET_PACA(r13)
-	ld	r1, HSTATE_HOST_R1(r13)
-	ld	r2, PACATOC(r13)
 
 
-	/* Set MSR RI now we have r1 and r13 back. */
-	li	r5, MSR_RI
-	mtmsrd	r5, 1
-
-	HMT_MEDIUM
-	ld	r6, HSTATE_DSCR(r13)
-	mtspr	SPRN_DSCR, r6
-BEGIN_FTR_SECTION_NESTED(96)
+BEGIN_FTR_SECTION
 	bl	pnv_power9_force_smt4_release
-END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
+END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
 	nop
 
 4:
@@ -3216,10 +3123,12 @@ END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
  * Restore transactional state and TM-related registers.
  * Called with r3 pointing to the vcpu struct
  * and r4 containing the guest MSR value.
+ * r5 is non-zero iff non-volatile register state needs to be maintained.
  * This potentially modifies all checkpointed registers.
  * It restores r1 and r2 from the PACA.
  */
-kvmppc_restore_tm_hv:
+_GLOBAL_TOC(kvmppc_restore_tm_hv)
+EXPORT_SYMBOL_GPL(kvmppc_restore_tm_hv)
 	/*
 	 * If we are doing TM emulation for the guest on a POWER9 DD2,
 	 * then we don't actually do a trechkpt -- we either set up
@@ -3423,6 +3332,194 @@ kvmppc_msr_interrupt:
 1:	rldimi	r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
 	blr
 
+/*
+ * Load up guest PMU state.  R3 points to the vcpu struct.
+ */
+_GLOBAL(kvmhv_load_guest_pmu)
+EXPORT_SYMBOL_GPL(kvmhv_load_guest_pmu)
+	mr	r4, r3
+	mflr	r0
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	isync
+BEGIN_FTR_SECTION
+	ld	r3, VCPU_MMCR(r4)
+	andi.	r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
+	cmpwi	r5, MMCR0_PMAO
+	beql	kvmppc_fix_pmao
+END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
+	lwz	r3, VCPU_PMC(r4)	/* always load up guest PMU registers */
+	lwz	r5, VCPU_PMC + 4(r4)	/* to prevent information leak */
+	lwz	r6, VCPU_PMC + 8(r4)
+	lwz	r7, VCPU_PMC + 12(r4)
+	lwz	r8, VCPU_PMC + 16(r4)
+	lwz	r9, VCPU_PMC + 20(r4)
+	mtspr	SPRN_PMC1, r3
+	mtspr	SPRN_PMC2, r5
+	mtspr	SPRN_PMC3, r6
+	mtspr	SPRN_PMC4, r7
+	mtspr	SPRN_PMC5, r8
+	mtspr	SPRN_PMC6, r9
+	ld	r3, VCPU_MMCR(r4)
+	ld	r5, VCPU_MMCR + 8(r4)
+	ld	r6, VCPU_MMCR + 16(r4)
+	ld	r7, VCPU_SIAR(r4)
+	ld	r8, VCPU_SDAR(r4)
+	mtspr	SPRN_MMCR1, r5
+	mtspr	SPRN_MMCRA, r6
+	mtspr	SPRN_SIAR, r7
+	mtspr	SPRN_SDAR, r8
+BEGIN_FTR_SECTION
+	ld	r5, VCPU_MMCR + 24(r4)
+	ld	r6, VCPU_SIER(r4)
+	mtspr	SPRN_MMCR2, r5
+	mtspr	SPRN_SIER, r6
+BEGIN_FTR_SECTION_NESTED(96)
+	lwz	r7, VCPU_PMC + 24(r4)
+	lwz	r8, VCPU_PMC + 28(r4)
+	ld	r9, VCPU_MMCR + 32(r4)
+	mtspr	SPRN_SPMC1, r7
+	mtspr	SPRN_SPMC2, r8
+	mtspr	SPRN_MMCRS, r9
+END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	mtspr	SPRN_MMCR0, r3
+	isync
+	mtlr	r0
+	blr
+
+/*
+ * Reload host PMU state saved in the PACA by kvmhv_save_host_pmu.
+ */
+_GLOBAL(kvmhv_load_host_pmu)
+EXPORT_SYMBOL_GPL(kvmhv_load_host_pmu)
+	mflr	r0
+	lbz	r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
+	cmpwi	r4, 0
+	beq	23f			/* skip if not */
+BEGIN_FTR_SECTION
+	ld	r3, HSTATE_MMCR0(r13)
+	andi.	r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
+	cmpwi	r4, MMCR0_PMAO
+	beql	kvmppc_fix_pmao
+END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
+	lwz	r3, HSTATE_PMC1(r13)
+	lwz	r4, HSTATE_PMC2(r13)
+	lwz	r5, HSTATE_PMC3(r13)
+	lwz	r6, HSTATE_PMC4(r13)
+	lwz	r8, HSTATE_PMC5(r13)
+	lwz	r9, HSTATE_PMC6(r13)
+	mtspr	SPRN_PMC1, r3
+	mtspr	SPRN_PMC2, r4
+	mtspr	SPRN_PMC3, r5
+	mtspr	SPRN_PMC4, r6
+	mtspr	SPRN_PMC5, r8
+	mtspr	SPRN_PMC6, r9
+	ld	r3, HSTATE_MMCR0(r13)
+	ld	r4, HSTATE_MMCR1(r13)
+	ld	r5, HSTATE_MMCRA(r13)
+	ld	r6, HSTATE_SIAR(r13)
+	ld	r7, HSTATE_SDAR(r13)
+	mtspr	SPRN_MMCR1, r4
+	mtspr	SPRN_MMCRA, r5
+	mtspr	SPRN_SIAR, r6
+	mtspr	SPRN_SDAR, r7
+BEGIN_FTR_SECTION
+	ld	r8, HSTATE_MMCR2(r13)
+	ld	r9, HSTATE_SIER(r13)
+	mtspr	SPRN_MMCR2, r8
+	mtspr	SPRN_SIER, r9
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	mtspr	SPRN_MMCR0, r3
+	isync
+	mtlr	r0
+23:	blr
+
+/*
+ * Save guest PMU state into the vcpu struct.
+ * r3 = vcpu, r4 = full save flag (PMU in use flag set in VPA)
+ */
+_GLOBAL(kvmhv_save_guest_pmu)
+EXPORT_SYMBOL_GPL(kvmhv_save_guest_pmu)
+	mr	r9, r3
+	mr	r8, r4
+BEGIN_FTR_SECTION
+	/*
+	 * POWER8 seems to have a hardware bug where setting
+	 * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
+	 * when some counters are already negative doesn't seem
+	 * to cause a performance monitor alert (and hence interrupt).
+	 * The effect of this is that when saving the PMU state,
+	 * if there is no PMU alert pending when we read MMCR0
+	 * before freezing the counters, but one becomes pending
+	 * before we read the counters, we lose it.
+	 * To work around this, we need a way to freeze the counters
+	 * before reading MMCR0.  Normally, freezing the counters
+	 * is done by writing MMCR0 (to set MMCR0[FC]) which
+	 * unavoidably writes MMCR0[PMA0] as well.  On POWER8,
+	 * we can also freeze the counters using MMCR2, by writing
+	 * 1s to all the counter freeze condition bits (there are
+	 * 9 bits each for 6 counters).
+	 */
+	li	r3, -1			/* set all freeze bits */
+	clrrdi	r3, r3, 10
+	mfspr	r10, SPRN_MMCR2
+	mtspr	SPRN_MMCR2, r3
+	isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	mfspr	r6, SPRN_MMCRA
+	/* Clear MMCRA in order to disable SDAR updates */
+	li	r7, 0
+	mtspr	SPRN_MMCRA, r7
+	isync
+	cmpwi	r8, 0			/* did they ask for PMU stuff to be saved? */
+	bne	21f
+	std	r3, VCPU_MMCR(r9)	/* if not, set saved MMCR0 to FC */
+	b	22f
+21:	mfspr	r5, SPRN_MMCR1
+	mfspr	r7, SPRN_SIAR
+	mfspr	r8, SPRN_SDAR
+	std	r4, VCPU_MMCR(r9)
+	std	r5, VCPU_MMCR + 8(r9)
+	std	r6, VCPU_MMCR + 16(r9)
+BEGIN_FTR_SECTION
+	std	r10, VCPU_MMCR + 24(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	std	r7, VCPU_SIAR(r9)
+	std	r8, VCPU_SDAR(r9)
+	mfspr	r3, SPRN_PMC1
+	mfspr	r4, SPRN_PMC2
+	mfspr	r5, SPRN_PMC3
+	mfspr	r6, SPRN_PMC4
+	mfspr	r7, SPRN_PMC5
+	mfspr	r8, SPRN_PMC6
+	stw	r3, VCPU_PMC(r9)
+	stw	r4, VCPU_PMC + 4(r9)
+	stw	r5, VCPU_PMC + 8(r9)
+	stw	r6, VCPU_PMC + 12(r9)
+	stw	r7, VCPU_PMC + 16(r9)
+	stw	r8, VCPU_PMC + 20(r9)
+BEGIN_FTR_SECTION
+	mfspr	r5, SPRN_SIER
+	std	r5, VCPU_SIER(r9)
+BEGIN_FTR_SECTION_NESTED(96)
+	mfspr	r6, SPRN_SPMC1
+	mfspr	r7, SPRN_SPMC2
+	mfspr	r8, SPRN_MMCRS
+	stw	r6, VCPU_PMC + 24(r9)
+	stw	r7, VCPU_PMC + 28(r9)
+	std	r8, VCPU_MMCR + 32(r9)
+	lis	r4, 0x8000
+	mtspr	SPRN_MMCRS, r4
+END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+22:	blr
+
 /*
  * This works around a hardware bug on POWER8E processors, where
  * writing a 1 to the MMCR0[PMAO] bit doesn't generate a

+ 3 - 3
arch/powerpc/kvm/book3s_hv_tm.c

@@ -130,7 +130,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
 			return RESUME_GUEST;
 		}
 		/* Set CR0 to indicate previous transactional state */
-		vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
+		vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
 			(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
 		/* L=1 => tresume, L=0 => tsuspend */
 		if (instr & (1 << 21)) {
@@ -174,7 +174,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
 		copy_from_checkpoint(vcpu);
 
 		/* Set CR0 to indicate previous transactional state */
-		vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
+		vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
 			(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
 		vcpu->arch.shregs.msr &= ~MSR_TS_MASK;
 		return RESUME_GUEST;
@@ -204,7 +204,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
 		copy_to_checkpoint(vcpu);
 
 		/* Set CR0 to indicate previous transactional state */
-		vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
+		vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
 			(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
 		vcpu->arch.shregs.msr = msr | MSR_TS_S;
 		return RESUME_GUEST;
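All three kvmhv_p9_tm_emulation() hunks above rewrite CR0 the same way: the two MSR[TS] bits are shifted into the top nibble of the CR image so the guest sees its previous transactional state, while CR1-CR7 are left untouched. A small worked sketch of that bit manipulation, using assumed stand-in values for the kernel's MSR_TS_* constants (illustrative only, not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	#define MSR_TS_S_LG	33			/* assumed bit position of the 2-bit TS field */
	#define MSR_TS_MASK	(3ULL << MSR_TS_S_LG)

	int main(void)
	{
		uint64_t msr = 1ULL << (MSR_TS_S_LG + 1);	/* TS = 0b10: transactional */
		uint32_t ccr = 0x40001234;			/* old CR image with CR0 bits set */

		/* Same expression as the patch: keep CR1-CR7, rewrite CR0 from MSR[TS]. */
		ccr = (ccr & 0x0fffffff) |
		      (uint32_t)(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);

		printf("new ccr = 0x%08x\n", ccr);	/* prints 0x20001234: CR0 = 0b0010 */
		return 0;
	}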

+ 3 - 2
arch/powerpc/kvm/book3s_hv_tm_builtin.c

@@ -89,7 +89,8 @@ int kvmhv_p9_tm_emulation_early(struct kvm_vcpu *vcpu)
 		if (instr & (1 << 21))
 			vcpu->arch.shregs.msr = (msr & ~MSR_TS_MASK) | MSR_TS_T;
 		/* Set CR0 to 0b0010 */
-		vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0x20000000;
+		vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
+			0x20000000;
 		return 1;
 	}
 
@@ -105,5 +106,5 @@ void kvmhv_emulate_tm_rollback(struct kvm_vcpu *vcpu)
 	vcpu->arch.shregs.msr &= ~MSR_TS_MASK;	/* go to N state */
 	vcpu->arch.regs.nip = vcpu->arch.tfhar;
 	copy_from_checkpoint(vcpu);
-	vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0xa0000000;
+	vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | 0xa0000000;
 }

+ 2 - 3
arch/powerpc/kvm/book3s_pr.c

@@ -167,7 +167,7 @@ void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu)
 	svcpu->gpr[11] = vcpu->arch.regs.gpr[11];
 	svcpu->gpr[12] = vcpu->arch.regs.gpr[12];
 	svcpu->gpr[13] = vcpu->arch.regs.gpr[13];
-	svcpu->cr  = vcpu->arch.cr;
+	svcpu->cr  = vcpu->arch.regs.ccr;
 	svcpu->xer = vcpu->arch.regs.xer;
 	svcpu->ctr = vcpu->arch.regs.ctr;
 	svcpu->lr  = vcpu->arch.regs.link;
@@ -249,7 +249,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
 	vcpu->arch.regs.gpr[11] = svcpu->gpr[11];
 	vcpu->arch.regs.gpr[12] = svcpu->gpr[12];
 	vcpu->arch.regs.gpr[13] = svcpu->gpr[13];
-	vcpu->arch.cr  = svcpu->cr;
+	vcpu->arch.regs.ccr  = svcpu->cr;
 	vcpu->arch.regs.xer = svcpu->xer;
 	vcpu->arch.regs.ctr = svcpu->ctr;
 	vcpu->arch.regs.link  = svcpu->lr;
@@ -1246,7 +1246,6 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		r = RESUME_GUEST;
 		break;
 	case BOOK3S_INTERRUPT_EXTERNAL:
-	case BOOK3S_INTERRUPT_EXTERNAL_LEVEL:
 	case BOOK3S_INTERRUPT_EXTERNAL_HV:
 	case BOOK3S_INTERRUPT_H_VIRT:
 		vcpu->stat.ext_intr_exits++;

+ 6 - 8
arch/powerpc/kvm/book3s_xics.c

@@ -310,7 +310,7 @@ static inline bool icp_try_update(struct kvmppc_icp *icp,
 	 */
 	if (new.out_ee) {
 		kvmppc_book3s_queue_irqprio(icp->vcpu,
-					    BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+					    BOOK3S_INTERRUPT_EXTERNAL);
 		if (!change_self)
 			kvmppc_fast_vcpu_kick(icp->vcpu);
 	}
@@ -593,8 +593,7 @@ static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu)
 	u32 xirr;
 
 	/* First, remove EE from the processor */
-	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
-				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
 
 	/*
 	 * ICP State: Accept_Interrupt
@@ -754,8 +753,7 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
 	 * We can remove EE from the current processor, the update
 	 * transaction will set it again if needed
 	 */
-	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
-				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
 
 	do {
 		old_state = new_state = READ_ONCE(icp->state);
@@ -1167,8 +1165,7 @@ int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
 	 * Deassert the CPU interrupt request.
 	 * icp_try_update will reassert it if necessary.
 	 */
-	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
-				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
 
 	/*
 	 * Note that if we displace an interrupt from old_state.xisr,
@@ -1393,7 +1390,8 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
 	}
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+	if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+	    cpu_has_feature(CPU_FTR_HVMODE)) {
 		/* Enable real mode support */
 		xics->real_mode = ENABLE_REALMODE;
 		xics->real_mode_dbg = DEBUG_REALMODE;

+ 63 - 0
arch/powerpc/kvm/book3s_xive.c

@@ -61,6 +61,69 @@
  */
 #define XIVE_Q_GAP	2
 
+/*
+ * Push a vcpu's context to the XIVE on guest entry.
+ * This assumes we are in virtual mode (MMU on)
+ */
+void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
+{
+	void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
+	u64 pq;
+
+	if (!tima)
+		return;
+	eieio();
+	__raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
+	__raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
+	vcpu->arch.xive_pushed = 1;
+	eieio();
+
+	/*
+	 * We clear the irq_pending flag. There is a small chance of a
+	 * race vs. the escalation interrupt happening on another
+	 * processor setting it again, but the only consequence is to
+	 * cause a spurious wakeup on the next H_CEDE, which is not an
+	 * issue.
+	 */
+	vcpu->arch.irq_pending = 0;
+
+	/*
+	 * In single escalation mode, if the escalation interrupt is
+	 * on, we mask it.
+	 */
+	if (vcpu->arch.xive_esc_on) {
+		pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
+						  XIVE_ESB_SET_PQ_01));
+		mb();
+
+		/*
+		 * We have a possible subtle race here: The escalation
+		 * interrupt might have fired and be on its way to the
+		 * host queue while we mask it, and if we unmask it
+		 * early enough (re-cede right away), there is a
+		 * theoretical possibility that it fires again, thus
+		 * landing in the target queue more than once which is
+		 * a big no-no.
+		 *
+		 * Fortunately, solving this is rather easy. If the
+		 * above load setting PQ to 01 returns a previous
+		 * value where P is set, then we know the escalation
+		 * interrupt is somewhere on its way to the host. In
+		 * that case we simply don't clear the xive_esc_on
+		 * flag below. It will be eventually cleared by the
+		 * handler for the escalation interrupt.
+		 *
+		 * Then, when doing a cede, we check that flag again
+		 * before re-enabling the escalation interrupt, and if
+		 * set, we abort the cede.
+		 */
+		if (!(pq & XIVE_ESB_VAL_P))
+			/* Now P is 0, we can clear the flag */
+			vcpu->arch.xive_esc_on = 0;
+	}
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
+
 /*
  * This is a simple trigger for a generic XIVE IRQ. This must
  * only be called for interrupts that support a trigger page

+ 0 - 8
arch/powerpc/kvm/book3s_xive_template.c

@@ -280,14 +280,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu)
 	/* First collect pending bits from HW */
 	GLUE(X_PFX,ack_pending)(xc);
 
-	/*
-	 * Cleanup the old-style bits if needed (they may have been
-	 * set by pull or an escalation interrupts).
-	 */
-	if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions))
-		clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
-			  &vcpu->arch.pending_exceptions);
-
 	pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n",
 		 xc->pending, xc->hw_cppr, xc->cppr);
 

+ 4 - 4
arch/powerpc/kvm/bookehv_interrupts.S

@@ -182,7 +182,7 @@
 	 */
 	PPC_LL	r4, PACACURRENT(r13)
 	PPC_LL	r4, (THREAD + THREAD_KVM_VCPU)(r4)
-	stw	r10, VCPU_CR(r4)
+	PPC_STL	r10, VCPU_CR(r4)
 	PPC_STL r11, VCPU_GPR(R4)(r4)
 	PPC_STL	r5, VCPU_GPR(R5)(r4)
 	PPC_STL	r6, VCPU_GPR(R6)(r4)
@@ -292,7 +292,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
 	PPC_STL	r4, VCPU_GPR(R4)(r11)
 	PPC_LL	r4, THREAD_NORMSAVE(0)(r10)
 	PPC_STL	r5, VCPU_GPR(R5)(r11)
-	stw	r13, VCPU_CR(r11)
+	PPC_STL	r13, VCPU_CR(r11)
 	mfspr	r5, \srr0
 	PPC_STL	r3, VCPU_GPR(R10)(r11)
 	PPC_LL	r3, THREAD_NORMSAVE(2)(r10)
@@ -319,7 +319,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
 	PPC_STL	r4, VCPU_GPR(R4)(r11)
 	PPC_LL	r4, GPR9(r8)
 	PPC_STL	r5, VCPU_GPR(R5)(r11)
-	stw	r9, VCPU_CR(r11)
+	PPC_STL	r9, VCPU_CR(r11)
 	mfspr	r5, \srr0
 	PPC_STL	r3, VCPU_GPR(R8)(r11)
 	PPC_LL	r3, GPR10(r8)
@@ -643,7 +643,7 @@ lightweight_exit:
 	PPC_LL	r3, VCPU_LR(r4)
 	PPC_LL	r5, VCPU_XER(r4)
 	PPC_LL	r6, VCPU_CTR(r4)
-	lwz	r7, VCPU_CR(r4)
+	PPC_LL	r7, VCPU_CR(r4)
 	PPC_LL	r8, VCPU_PC(r4)
 	PPC_LD(r9, VCPU_SHARED_MSR, r11)
 	PPC_LL	r0, VCPU_GPR(R0)(r4)

+ 0 - 1
arch/powerpc/kvm/emulate_loadstore.c

@@ -117,7 +117,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
 	emulated = EMULATE_FAIL;
 	vcpu->arch.regs.msr = vcpu->arch.shared->msr;
-	vcpu->arch.regs.ccr = vcpu->arch.cr;
 	if (analyse_instr(&op, &vcpu->arch.regs, inst) == 0) {
 		int type = op.type & INSTR_TYPE_MASK;
 		int size = GETSIZE(op.type);

+ 14 - 1
arch/powerpc/kvm/powerpc.c

@@ -594,7 +594,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = !!(hv_enabled && radix_enabled());
 		break;
 	case KVM_CAP_PPC_MMU_HASH_V3:
-		r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300));
+		r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) &&
+		       cpu_has_feature(CPU_FTR_HVMODE));
+		break;
+	case KVM_CAP_PPC_NESTED_HV:
+		r = !!(hv_enabled && kvmppc_hv_ops->enable_nested &&
+		       !kvmppc_hv_ops->enable_nested(NULL));
 		break;
 #endif
 	case KVM_CAP_SYNC_MMU:
@@ -2114,6 +2119,14 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 			r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags);
 		break;
 	}
+
+	case KVM_CAP_PPC_NESTED_HV:
+		r = -EINVAL;
+		if (!is_kvmppc_hv_enabled(kvm) ||
+		    !kvm->arch.kvm_ops->enable_nested)
+			break;
+		r = kvm->arch.kvm_ops->enable_nested(kvm);
+		break;
 #endif
 	default:
 		r = -EINVAL;
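The two powerpc.c hunks above wire KVM_CAP_PPC_NESTED_HV into both the capability query and the per-VM enable path. From userspace this is the usual check-then-enable sequence on the VM file descriptor; a minimal sketch (error handling trimmed; vm_fd is assumed to be an already-open VM fd, and the capability constant requires kernel headers from this release or later):

	#include <linux/kvm.h>
	#include <string.h>
	#include <sys/ioctl.h>

	/* Sketch: probe for nested HV support and enable it for one VM. */
	static int enable_nested_hv(int vm_fd)
	{
		struct kvm_enable_cap cap;

		/* KVM_CHECK_EXTENSION returns > 0 when nested HV can be enabled. */
		if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_NESTED_HV) <= 0)
			return -1;

		memset(&cap, 0, sizeof(cap));
		cap.cap = KVM_CAP_PPC_NESTED_HV;
		return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	}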

+ 136 - 114
arch/powerpc/kvm/tm.S

@@ -28,17 +28,25 @@
  * Save transactional state and TM-related registers.
  * Called with:
  * - r3 pointing to the vcpu struct
- * - r4 points to the MSR with current TS bits:
+ * - r4 containing the MSR with current TS bits:
  * 	(For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR).
- * This can modify all checkpointed registers, but
- * restores r1, r2 before exit.
+ * - r5 containing a flag indicating that non-volatile registers
+ *	must be preserved.
+ * If r5 == 0, this can modify all checkpointed registers, but
+ * restores r1, r2 before exit.  If r5 != 0, this restores the
+ * MSR TM/FP/VEC/VSX bits to their state on entry.
  */
 _GLOBAL(__kvmppc_save_tm)
 	mflr	r0
 	std	r0, PPC_LR_STKOFF(r1)
+	stdu    r1, -SWITCH_FRAME_SIZE(r1)
+
+	mr	r9, r3
+	cmpdi	cr7, r5, 0
 
 	/* Turn on TM. */
 	mfmsr	r8
+	mr	r10, r8
 	li	r0, 1
 	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
 	ori     r8, r8, MSR_FP
@@ -51,6 +59,27 @@ _GLOBAL(__kvmppc_save_tm)
 	std	r1, HSTATE_SCRATCH2(r13)
 	std	r3, HSTATE_SCRATCH1(r13)
 
+	/* Save CR on the stack - even if r5 == 0 we need to get cr7 back. */
+	mfcr	r6
+	SAVE_GPR(6, r1)
+
+	/* Save DSCR so we can restore it to avoid running with user value */
+	mfspr	r7, SPRN_DSCR
+	SAVE_GPR(7, r1)
+
+	/*
+	 * We are going to do treclaim., which will modify all checkpointed
+	 * registers.  Save the non-volatile registers on the stack if
+	 * preservation of non-volatile state has been requested.
+	 */
+	beq	cr7, 3f
+	SAVE_NVGPRS(r1)
+
+	/* MSR[TS] will be 0 (non-transactional) once we do treclaim. */
+	li	r0, 0
+	rldimi	r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+	SAVE_GPR(10, r1)	/* final MSR value */
+3:
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 BEGIN_FTR_SECTION
 	/* Emulation of the treclaim instruction needs TEXASR before treclaim */
@@ -74,22 +103,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 	std	r9, PACATMSCRATCH(r13)
 	ld	r9, HSTATE_SCRATCH1(r13)
 
-	/* Get a few more GPRs free. */
-	std	r29, VCPU_GPRS_TM(29)(r9)
-	std	r30, VCPU_GPRS_TM(30)(r9)
-	std	r31, VCPU_GPRS_TM(31)(r9)
-
-	/* Save away PPR and DSCR soon so don't run with user values. */
-	mfspr	r31, SPRN_PPR
+	/* Save away PPR soon so we don't run with user value. */
+	std	r0, VCPU_GPRS_TM(0)(r9)
+	mfspr	r0, SPRN_PPR
 	HMT_MEDIUM
-	mfspr	r30, SPRN_DSCR
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	ld	r29, HSTATE_DSCR(r13)
-	mtspr	SPRN_DSCR, r29
-#endif
 
 
-	/* Save all but r9, r13 & r29-r31 */
-	reg = 0
+	/* Reload stack pointer. */
+	std	r1, VCPU_GPRS_TM(1)(r9)
+	ld	r1, HSTATE_SCRATCH2(r13)
+
+	/* Set MSR RI now we have r1 and r13 back. */
+	std	r2, VCPU_GPRS_TM(2)(r9)
+	li	r2, MSR_RI
+	mtmsrd	r2, 1
+
+	/* Reload TOC pointer. */
+	ld	r2, PACATOC(r13)
+
+	/* Save all but r0-r2, r9 & r13 */
+	reg = 3
 	.rept	29
 	.if (reg != 9) && (reg != 13)
 	std	reg, VCPU_GPRS_TM(reg)(r9)
@@ -103,33 +135,29 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 	ld	r4, PACATMSCRATCH(r13)
 	std	r4, VCPU_GPRS_TM(9)(r9)
 
-	/* Reload stack pointer and TOC. */
-	ld	r1, HSTATE_SCRATCH2(r13)
-	ld	r2, PACATOC(r13)
-
-	/* Set MSR RI now we have r1 and r13 back. */
-	li	r5, MSR_RI
-	mtmsrd	r5, 1
+	/* Restore host DSCR and CR values, after saving guest values */
+	mfcr	r6
+	mfspr	r7, SPRN_DSCR
+	stw	r6, VCPU_CR_TM(r9)
+	std	r7, VCPU_DSCR_TM(r9)
+	REST_GPR(6, r1)
+	REST_GPR(7, r1)
+	mtcr	r6
+	mtspr	SPRN_DSCR, r7
 
 
-	/* Save away checkpinted SPRs. */
-	std	r31, VCPU_PPR_TM(r9)
-	std	r30, VCPU_DSCR_TM(r9)
+	/* Save away checkpointed SPRs. */
+	std	r0, VCPU_PPR_TM(r9)
 	mflr	r5
-	mfcr	r6
 	mfctr	r7
 	mfspr	r8, SPRN_AMR
 	mfspr	r10, SPRN_TAR
 	mfxer	r11
 	std	r5, VCPU_LR_TM(r9)
-	stw	r6, VCPU_CR_TM(r9)
 	std	r7, VCPU_CTR_TM(r9)
 	std	r8, VCPU_AMR_TM(r9)
 	std	r10, VCPU_TAR_TM(r9)
 	std	r11, VCPU_XER_TM(r9)
 
-	/* Restore r12 as trap number. */
-	lwz	r12, VCPU_TRAP(r9)
-
 	/* Save FP/VSX. */
 	addi	r3, r9, VCPU_FPRS_TM
 	bl	store_fp_state
@@ -137,6 +165,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 	bl	store_vr_state
 	mfspr	r6, SPRN_VRSAVE
 	stw	r6, VCPU_VRSAVE_TM(r9)
+
+	/* Restore non-volatile registers if requested to */
+	beq	cr7, 1f
+	REST_NVGPRS(r1)
+	REST_GPR(10, r1)
 1:
 	/*
 	 * We need to save these SPRs after the treclaim so that the software
@@ -146,12 +179,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 	 */
 	 */
 	mfspr	r7, SPRN_TEXASR
 	mfspr	r7, SPRN_TEXASR
 	std	r7, VCPU_TEXASR(r9)
 	std	r7, VCPU_TEXASR(r9)
-11:
 	mfspr	r5, SPRN_TFHAR
 	mfspr	r5, SPRN_TFHAR
 	mfspr	r6, SPRN_TFIAR
 	mfspr	r6, SPRN_TFIAR
 	std	r5, VCPU_TFHAR(r9)
 	std	r5, VCPU_TFHAR(r9)
 	std	r6, VCPU_TFIAR(r9)
 	std	r6, VCPU_TFIAR(r9)
 
 
+	/* Restore MSR state if requested */
+	beq	cr7, 2f
+	mtmsrd	r10, 0
+2:
+	addi	r1, r1, SWITCH_FRAME_SIZE
 	ld	r0, PPC_LR_STKOFF(r1)
 	ld	r0, PPC_LR_STKOFF(r1)
 	mtlr	r0
 	mtlr	r0
 	blr
 	blr
@@ -161,49 +198,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
  * be invoked from C function by PR KVM only.
  * be invoked from C function by PR KVM only.
  */
  */
 _GLOBAL(_kvmppc_save_tm_pr)
 _GLOBAL(_kvmppc_save_tm_pr)
-	mflr	r5
-	std	r5, PPC_LR_STKOFF(r1)
-	stdu    r1, -SWITCH_FRAME_SIZE(r1)
-	SAVE_NVGPRS(r1)
-
-	/* save MSR since TM/math bits might be impacted
-	 * by __kvmppc_save_tm().
-	 */
-	mfmsr	r5
-	SAVE_GPR(5, r1)
-
-	/* also save DSCR/CR/TAR so that it can be recovered later */
-	mfspr   r6, SPRN_DSCR
-	SAVE_GPR(6, r1)
-
-	mfcr    r7
-	stw     r7, _CCR(r1)
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+	stdu    r1, -PPC_MIN_STKFRM(r1)
 
 
 	mfspr   r8, SPRN_TAR
 	mfspr   r8, SPRN_TAR
-	SAVE_GPR(8, r1)
+	std	r8, PPC_MIN_STKFRM-8(r1)
 
 
+	li	r5, 1		/* preserve non-volatile registers */
 	bl	__kvmppc_save_tm
 	bl	__kvmppc_save_tm
 
 
-	REST_GPR(8, r1)
+	ld	r8, PPC_MIN_STKFRM-8(r1)
 	mtspr   SPRN_TAR, r8
 	mtspr   SPRN_TAR, r8
 
 
-	ld      r7, _CCR(r1)
-	mtcr	r7
-
-	REST_GPR(6, r1)
-	mtspr   SPRN_DSCR, r6
-
-	/* need preserve current MSR's MSR_TS bits */
-	REST_GPR(5, r1)
-	mfmsr   r6
-	rldicl  r6, r6, 64 - MSR_TS_S_LG, 62
-	rldimi  r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
-	mtmsrd  r5
-
-	REST_NVGPRS(r1)
-	addi    r1, r1, SWITCH_FRAME_SIZE
-	ld	r5, PPC_LR_STKOFF(r1)
-	mtlr	r5
+	addi    r1, r1, PPC_MIN_STKFRM
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
 	blr
 	blr
 
 
 EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
 EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
@@ -215,15 +225,21 @@ EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
  *  - r4 is the guest MSR with desired TS bits:
  *  - r4 is the guest MSR with desired TS bits:
  * 	For HV KVM, it is VCPU_MSR
  * 	For HV KVM, it is VCPU_MSR
  * 	For PR KVM, it is provided by caller
  * 	For PR KVM, it is provided by caller
- * This potentially modifies all checkpointed registers.
- * It restores r1, r2 from the PACA.
+ * - r5 is a flag indicating whether non-volatile registers
+ *	must be preserved.
+ * If r5 == 0, this potentially modifies all checkpointed registers, but
+ * restores r1, r2 from the PACA before exit.
+ * If r5 != 0, this restores the MSR TM/FP/VEC/VSX bits to their state on entry.
  */
  */
 _GLOBAL(__kvmppc_restore_tm)
 _GLOBAL(__kvmppc_restore_tm)
 	mflr	r0
 	mflr	r0
 	std	r0, PPC_LR_STKOFF(r1)
 	std	r0, PPC_LR_STKOFF(r1)
 
 
+	cmpdi	cr7, r5, 0
+
 	/* Turn on TM/FP/VSX/VMX so we can restore them. */
 	/* Turn on TM/FP/VSX/VMX so we can restore them. */
 	mfmsr	r5
 	mfmsr	r5
+	mr	r10, r5
 	li	r6, MSR_TM >> 32
 	li	r6, MSR_TM >> 32
 	sldi	r6, r6, 32
 	sldi	r6, r6, 32
 	or	r5, r5, r6
 	or	r5, r5, r6
@@ -244,8 +260,7 @@ _GLOBAL(__kvmppc_restore_tm)
 
 
 	mr	r5, r4
 	mr	r5, r4
 	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
 	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-	beqlr		/* TM not active in guest */
-	std	r1, HSTATE_SCRATCH2(r13)
+	beq	9f		/* TM not active in guest */
 
 
 	/* Make sure the failure summary is set, otherwise we'll program check
 	/* Make sure the failure summary is set, otherwise we'll program check
 	 * when we trechkpt.  It's possible that this might have been not set
 	 * when we trechkpt.  It's possible that this might have been not set
@@ -255,6 +270,26 @@ _GLOBAL(__kvmppc_restore_tm)
 	oris	r7, r7, (TEXASR_FS)@h
 	oris	r7, r7, (TEXASR_FS)@h
 	mtspr	SPRN_TEXASR, r7
 	mtspr	SPRN_TEXASR, r7
 
 
+	/*
+	 * Make a stack frame and save non-volatile registers if requested.
+	 */
+	stdu	r1, -SWITCH_FRAME_SIZE(r1)
+	std	r1, HSTATE_SCRATCH2(r13)
+
+	mfcr	r6
+	mfspr	r7, SPRN_DSCR
+	SAVE_GPR(2, r1)
+	SAVE_GPR(6, r1)
+	SAVE_GPR(7, r1)
+
+	beq	cr7, 4f
+	SAVE_NVGPRS(r1)
+
+	/* MSR[TS] will be 1 (suspended) once we do trechkpt */
+	li	r0, 1
+	rldimi	r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+	SAVE_GPR(10, r1)	/* final MSR value */
+4:
 	/*
 	/*
 	 * We need to load up the checkpointed state for the guest.
 	 * We need to load up the checkpointed state for the guest.
 	 * We need to do this early as it will blow away any GPRs, VSRs and
 	 * We need to do this early as it will blow away any GPRs, VSRs and
@@ -291,8 +326,6 @@ _GLOBAL(__kvmppc_restore_tm)
 	ld	r29, VCPU_DSCR_TM(r3)
 	ld	r29, VCPU_DSCR_TM(r3)
 	ld	r30, VCPU_PPR_TM(r3)
 	ld	r30, VCPU_PPR_TM(r3)
 
 
-	std	r2, PACATMSCRATCH(r13) /* Save TOC */
-
 	/* Clear the MSR RI since r1, r13 are all going to be foobar. */
 	/* Clear the MSR RI since r1, r13 are all going to be foobar. */
 	li	r5, 0
 	li	r5, 0
 	mtmsrd	r5, 1
 	mtmsrd	r5, 1
@@ -318,18 +351,31 @@ _GLOBAL(__kvmppc_restore_tm)
 	/* Now let's get back the state we need. */
 	/* Now let's get back the state we need. */
 	HMT_MEDIUM
 	HMT_MEDIUM
 	GET_PACA(r13)
 	GET_PACA(r13)
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	ld	r29, HSTATE_DSCR(r13)
-	mtspr	SPRN_DSCR, r29
-#endif
 	ld	r1, HSTATE_SCRATCH2(r13)
 	ld	r1, HSTATE_SCRATCH2(r13)
-	ld	r2, PACATMSCRATCH(r13)
+	REST_GPR(7, r1)
+	mtspr	SPRN_DSCR, r7
 
 
 	/* Set the MSR RI since we have our registers back. */
 	/* Set the MSR RI since we have our registers back. */
 	li	r5, MSR_RI
 	li	r5, MSR_RI
 	mtmsrd	r5, 1
 	mtmsrd	r5, 1
+
+	/* Restore TOC pointer and CR */
+	REST_GPR(2, r1)
+	REST_GPR(6, r1)
+	mtcr	r6
+
+	/* Restore non-volatile registers if requested to. */
+	beq	cr7, 5f
+	REST_GPR(10, r1)
+	REST_NVGPRS(r1)
+
+5:	addi	r1, r1, SWITCH_FRAME_SIZE
 	ld	r0, PPC_LR_STKOFF(r1)
 	ld	r0, PPC_LR_STKOFF(r1)
 	mtlr	r0
 	mtlr	r0
+
+9:	/* Restore MSR bits if requested */
+	beqlr	cr7
+	mtmsrd	r10, 0
 	blr
 	blr
 
 
 /*
 /*
@@ -337,47 +383,23 @@ _GLOBAL(__kvmppc_restore_tm)
  * can be invoked from C function by PR KVM only.
  * can be invoked from C function by PR KVM only.
  */
  */
 _GLOBAL(_kvmppc_restore_tm_pr)
 _GLOBAL(_kvmppc_restore_tm_pr)
-	mflr	r5
-	std	r5, PPC_LR_STKOFF(r1)
-	stdu    r1, -SWITCH_FRAME_SIZE(r1)
-	SAVE_NVGPRS(r1)
-
-	/* save MSR to avoid TM/math bits change */
-	mfmsr	r5
-	SAVE_GPR(5, r1)
-
-	/* also save DSCR/CR/TAR so that it can be recovered later */
-	mfspr   r6, SPRN_DSCR
-	SAVE_GPR(6, r1)
-
-	mfcr    r7
-	stw     r7, _CCR(r1)
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+	stdu    r1, -PPC_MIN_STKFRM(r1)
 
 
+	/* save TAR so that it can be recovered later */
 	mfspr   r8, SPRN_TAR
 	mfspr   r8, SPRN_TAR
-	SAVE_GPR(8, r1)
+	std	r8, PPC_MIN_STKFRM-8(r1)
 
 
+	li	r5, 1
 	bl	__kvmppc_restore_tm
 	bl	__kvmppc_restore_tm
 
 
-	REST_GPR(8, r1)
+	ld	r8, PPC_MIN_STKFRM-8(r1)
 	mtspr   SPRN_TAR, r8
 	mtspr   SPRN_TAR, r8
 
 
-	ld      r7, _CCR(r1)
-	mtcr	r7
-
-	REST_GPR(6, r1)
-	mtspr   SPRN_DSCR, r6
-
-	/* need preserve current MSR's MSR_TS bits */
-	REST_GPR(5, r1)
-	mfmsr   r6
-	rldicl  r6, r6, 64 - MSR_TS_S_LG, 62
-	rldimi  r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
-	mtmsrd  r5
-
-	REST_NVGPRS(r1)
-	addi    r1, r1, SWITCH_FRAME_SIZE
-	ld	r5, PPC_LR_STKOFF(r1)
-	mtlr	r5
+	addi    r1, r1, PPC_MIN_STKFRM
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
 	blr
 	blr
 
 
 EXPORT_SYMBOL_GPL(_kvmppc_restore_tm_pr);
 EXPORT_SYMBOL_GPL(_kvmppc_restore_tm_pr);
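
The MSR[TS] handling in the new save/restore paths above amounts to rewriting a two-bit field in a saved MSR image around the treclaim/trechkpt. A minimal C sketch of that bit manipulation, assuming the usual Book3S-64 bit positions for MSR_TS_S_LG and MSR_TS_T_LG (restated here only for illustration, not part of the patch):

#include <stdint.h>

#define MSR_TS_S_LG	33			/* transaction suspended bit */
#define MSR_TS_T_LG	34			/* transaction active bit */
#define MSR_TS_MASK	(3ULL << MSR_TS_S_LG)	/* the two-bit TS field */

/* Return msr with MSR[TS] set to ts: 0 = none, 1 = suspended, 2 = transactional. */
static inline uint64_t msr_set_ts(uint64_t msr, uint64_t ts)
{
	return (msr & ~MSR_TS_MASK) | ((ts & 3) << MSR_TS_S_LG);
}

This mirrors what the rldimi r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG sequences do: the reclaim path forces TS to 0 (non-transactional) and the checkpoint path forces it to 1 (suspended) in the MSR value that is restored for the caller when r5 != 0.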

+ 0 - 1
arch/powerpc/kvm/trace_book3s.h

@@ -14,7 +14,6 @@
 	{0x400, "INST_STORAGE"}, \
 	{0x400, "INST_STORAGE"}, \
 	{0x480, "INST_SEGMENT"}, \
 	{0x480, "INST_SEGMENT"}, \
 	{0x500, "EXTERNAL"}, \
 	{0x500, "EXTERNAL"}, \
-	{0x501, "EXTERNAL_LEVEL"}, \
 	{0x502, "EXTERNAL_HV"}, \
 	{0x502, "EXTERNAL_HV"}, \
 	{0x600, "ALIGNMENT"}, \
 	{0x600, "ALIGNMENT"}, \
 	{0x700, "PROGRAM"}, \
 	{0x700, "PROGRAM"}, \

+ 9 - 0
arch/powerpc/mm/tlb-radix.c

@@ -830,6 +830,15 @@ void radix__flush_pwc_lpid(unsigned int lpid)
 }
 EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
 
+/*
+ * Flush partition scoped translations from LPID (=LPIDR)
+ */
+void radix__flush_tlb_lpid(unsigned int lpid)
+{
+	_tlbie_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid);
+
 /*
  * Flush partition scoped translations from LPID (=LPIDR)
  */

+ 11 - 0
arch/s390/Kconfig

@@ -783,6 +783,17 @@ config VFIO_CCW
 	  To compile this driver as a module, choose M here: the
 	  module will be called vfio_ccw.
 
+config VFIO_AP
+	def_tristate n
+	prompt "VFIO support for AP devices"
+	depends on S390_AP_IOMMU && VFIO_MDEV_DEVICE && KVM
+	help
+		This driver grants access to Adjunct Processor (AP) devices
+		via the VFIO mediated device interface.
+
+		To compile this driver as a module, choose M here: the module
+		will be called vfio_ap.
+
 endmenu
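
For reference, a .config fragment that would build the new driver as a module could look like the following (illustrative only; apart from VFIO_AP itself these are simply the dependencies named in the Kconfig entry above):

CONFIG_S390_AP_IOMMU=y
CONFIG_VFIO_MDEV=m
CONFIG_VFIO_MDEV_DEVICE=m
CONFIG_KVM=m
CONFIG_VFIO_AP=m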
 
 
 menu "Dump support"
 menu "Dump support"

+ 14 - 1
arch/s390/include/asm/kvm_host.h

@@ -44,6 +44,7 @@
 #define KVM_REQ_ICPT_OPEREXC	KVM_ARCH_REQ(2)
 #define KVM_REQ_ICPT_OPEREXC	KVM_ARCH_REQ(2)
 #define KVM_REQ_START_MIGRATION KVM_ARCH_REQ(3)
 #define KVM_REQ_START_MIGRATION KVM_ARCH_REQ(3)
 #define KVM_REQ_STOP_MIGRATION  KVM_ARCH_REQ(4)
 #define KVM_REQ_STOP_MIGRATION  KVM_ARCH_REQ(4)
+#define KVM_REQ_VSIE_RESTART	KVM_ARCH_REQ(5)
 
 
 #define SIGP_CTRL_C		0x80
 #define SIGP_CTRL_C		0x80
 #define SIGP_CTRL_SCN_MASK	0x3f
 #define SIGP_CTRL_SCN_MASK	0x3f
@@ -186,6 +187,7 @@ struct kvm_s390_sie_block {
 #define ECA_AIV		0x00200000
 #define ECA_AIV		0x00200000
 #define ECA_VX		0x00020000
 #define ECA_VX		0x00020000
 #define ECA_PROTEXCI	0x00002000
 #define ECA_PROTEXCI	0x00002000
+#define ECA_APIE	0x00000008
 #define ECA_SII		0x00000001
 #define ECA_SII		0x00000001
 	__u32	eca;			/* 0x004c */
 	__u32	eca;			/* 0x004c */
 #define ICPT_INST	0x04
 #define ICPT_INST	0x04
@@ -237,7 +239,11 @@ struct kvm_s390_sie_block {
 	psw_t	gpsw;			/* 0x0090 */
 	psw_t	gpsw;			/* 0x0090 */
 	__u64	gg14;			/* 0x00a0 */
 	__u64	gg14;			/* 0x00a0 */
 	__u64	gg15;			/* 0x00a8 */
 	__u64	gg15;			/* 0x00a8 */
-	__u8	reservedb0[20];		/* 0x00b0 */
+	__u8	reservedb0[8];		/* 0x00b0 */
+#define HPID_KVM	0x4
+#define HPID_VSIE	0x5
+	__u8	hpid;			/* 0x00b8 */
+	__u8	reservedb9[11];		/* 0x00b9 */
 	__u16	extcpuaddr;		/* 0x00c4 */
 	__u16	extcpuaddr;		/* 0x00c4 */
 	__u16	eic;			/* 0x00c6 */
 	__u16	eic;			/* 0x00c6 */
 	__u32	reservedc8;		/* 0x00c8 */
 	__u32	reservedc8;		/* 0x00c8 */
@@ -255,6 +261,8 @@ struct kvm_s390_sie_block {
 	__u8	reservede4[4];		/* 0x00e4 */
 	__u8	reservede4[4];		/* 0x00e4 */
 	__u64	tecmc;			/* 0x00e8 */
 	__u64	tecmc;			/* 0x00e8 */
 	__u8	reservedf0[12];		/* 0x00f0 */
 	__u8	reservedf0[12];		/* 0x00f0 */
+#define CRYCB_FORMAT_MASK 0x00000003
+#define CRYCB_FORMAT0 0x00000000
 #define CRYCB_FORMAT1 0x00000001
 #define CRYCB_FORMAT1 0x00000001
 #define CRYCB_FORMAT2 0x00000003
 #define CRYCB_FORMAT2 0x00000003
 	__u32	crycbd;			/* 0x00fc */
 	__u32	crycbd;			/* 0x00fc */
@@ -715,6 +723,7 @@ struct kvm_s390_crypto {
 	__u32 crycbd;
 	__u32 crycbd;
 	__u8 aes_kw;
 	__u8 aes_kw;
 	__u8 dea_kw;
 	__u8 dea_kw;
+	__u8 apie;
 };
 };
 
 
 #define APCB0_MASK_SIZE 1
 #define APCB0_MASK_SIZE 1
@@ -855,6 +864,10 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 				 struct kvm_async_pf *work);
 				 struct kvm_async_pf *work);
 
 
+void kvm_arch_crypto_clear_masks(struct kvm *kvm);
+void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm,
+			       unsigned long *aqm, unsigned long *adm);
+
 extern int sie64a(struct kvm_s390_sie_block *, u64 *);
 extern int sie64a(struct kvm_s390_sie_block *, u64 *);
 extern char sie_exit;
 extern char sie_exit;
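
The two new hooks above are the interface a mediated-device driver uses to plug an AP configuration into a guest's CRYCB. A rough sketch of a caller, assuming a hypothetical matrix structure with 256-bit masks (the names below are illustrative, not part of this patch):

#include <linux/kvm_host.h>
#include <linux/types.h>

struct ap_matrix {
	DECLARE_BITMAP(apm, 256);	/* adapter mask */
	DECLARE_BITMAP(aqm, 256);	/* usage-domain mask */
	DECLARE_BITMAP(adm, 256);	/* control-domain mask */
};

/* Hand the matrix to the guest; KVM copies it into the (shadow) CRYCB. */
static void plug_matrix(struct kvm *kvm, struct ap_matrix *m)
{
	kvm_arch_crypto_set_masks(kvm, m->apm, m->aqm, m->adm);
}

/* Withdraw all AP resources from the guest again. */
static void unplug_matrix(struct kvm *kvm)
{
	kvm_arch_crypto_clear_masks(kvm);
}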
 
 

+ 2 - 0
arch/s390/include/uapi/asm/kvm.h

@@ -160,6 +160,8 @@ struct kvm_s390_vm_cpu_subfunc {
 #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW	1
 #define KVM_S390_VM_CRYPTO_DISABLE_AES_KW	2
 #define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW	3
+#define KVM_S390_VM_CRYPTO_ENABLE_APIE		4
+#define KVM_S390_VM_CRYPTO_DISABLE_APIE		5
 
 /* kvm attributes for migration mode */
 #define KVM_S390_VM_MIGRATION_STOP	0
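
From userspace, the new attributes are toggled through the usual KVM device-attribute ioctls on the VM file descriptor. A hedged sketch (error handling omitted; vm_fd is assumed to be an open KVM VM fd):

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int enable_apie(int vm_fd)
{
	struct kvm_device_attr attr = {
		.group = KVM_S390_VM_CRYPTO,
		.attr  = KVM_S390_VM_CRYPTO_ENABLE_APIE,
	};

	/* The attribute is only exposed when AP instructions are available. */
	if (ioctl(vm_fd, KVM_HAS_DEVICE_ATTR, &attr))
		return -1;
	return ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
}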

+ 142 - 42
arch/s390/kvm/kvm-s390.c

@@ -40,6 +40,7 @@
 #include <asm/sclp.h>
 #include <asm/sclp.h>
 #include <asm/cpacf.h>
 #include <asm/cpacf.h>
 #include <asm/timex.h>
 #include <asm/timex.h>
+#include <asm/ap.h>
 #include "kvm-s390.h"
 #include "kvm-s390.h"
 #include "gaccess.h"
 #include "gaccess.h"
 
 
@@ -844,20 +845,24 @@ void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm)
 
 
 	kvm_s390_vcpu_block_all(kvm);
 	kvm_s390_vcpu_block_all(kvm);
 
 
-	kvm_for_each_vcpu(i, vcpu, kvm)
+	kvm_for_each_vcpu(i, vcpu, kvm) {
 		kvm_s390_vcpu_crypto_setup(vcpu);
 		kvm_s390_vcpu_crypto_setup(vcpu);
+		/* recreate the shadow crycb by leaving the VSIE handler */
+		kvm_s390_sync_request(KVM_REQ_VSIE_RESTART, vcpu);
+	}
 
 
 	kvm_s390_vcpu_unblock_all(kvm);
 	kvm_s390_vcpu_unblock_all(kvm);
 }
 }
 
 
 static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
 static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
 {
 {
-	if (!test_kvm_facility(kvm, 76))
-		return -EINVAL;
-
 	mutex_lock(&kvm->lock);
 	mutex_lock(&kvm->lock);
 	switch (attr->attr) {
 	switch (attr->attr) {
 	case KVM_S390_VM_CRYPTO_ENABLE_AES_KW:
 	case KVM_S390_VM_CRYPTO_ENABLE_AES_KW:
+		if (!test_kvm_facility(kvm, 76)) {
+			mutex_unlock(&kvm->lock);
+			return -EINVAL;
+		}
 		get_random_bytes(
 		get_random_bytes(
 			kvm->arch.crypto.crycb->aes_wrapping_key_mask,
 			kvm->arch.crypto.crycb->aes_wrapping_key_mask,
 			sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask));
 			sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask));
@@ -865,6 +870,10 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
 		VM_EVENT(kvm, 3, "%s", "ENABLE: AES keywrapping support");
 		VM_EVENT(kvm, 3, "%s", "ENABLE: AES keywrapping support");
 		break;
 		break;
 	case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW:
 	case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW:
+		if (!test_kvm_facility(kvm, 76)) {
+			mutex_unlock(&kvm->lock);
+			return -EINVAL;
+		}
 		get_random_bytes(
 		get_random_bytes(
 			kvm->arch.crypto.crycb->dea_wrapping_key_mask,
 			kvm->arch.crypto.crycb->dea_wrapping_key_mask,
 			sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask));
 			sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask));
@@ -872,17 +881,39 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
 		VM_EVENT(kvm, 3, "%s", "ENABLE: DEA keywrapping support");
 		VM_EVENT(kvm, 3, "%s", "ENABLE: DEA keywrapping support");
 		break;
 		break;
 	case KVM_S390_VM_CRYPTO_DISABLE_AES_KW:
 	case KVM_S390_VM_CRYPTO_DISABLE_AES_KW:
+		if (!test_kvm_facility(kvm, 76)) {
+			mutex_unlock(&kvm->lock);
+			return -EINVAL;
+		}
 		kvm->arch.crypto.aes_kw = 0;
 		kvm->arch.crypto.aes_kw = 0;
 		memset(kvm->arch.crypto.crycb->aes_wrapping_key_mask, 0,
 		memset(kvm->arch.crypto.crycb->aes_wrapping_key_mask, 0,
 			sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask));
 			sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask));
 		VM_EVENT(kvm, 3, "%s", "DISABLE: AES keywrapping support");
 		VM_EVENT(kvm, 3, "%s", "DISABLE: AES keywrapping support");
 		break;
 		break;
 	case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW:
 	case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW:
+		if (!test_kvm_facility(kvm, 76)) {
+			mutex_unlock(&kvm->lock);
+			return -EINVAL;
+		}
 		kvm->arch.crypto.dea_kw = 0;
 		kvm->arch.crypto.dea_kw = 0;
 		memset(kvm->arch.crypto.crycb->dea_wrapping_key_mask, 0,
 		memset(kvm->arch.crypto.crycb->dea_wrapping_key_mask, 0,
 			sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask));
 			sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask));
 		VM_EVENT(kvm, 3, "%s", "DISABLE: DEA keywrapping support");
 		VM_EVENT(kvm, 3, "%s", "DISABLE: DEA keywrapping support");
 		break;
 		break;
+	case KVM_S390_VM_CRYPTO_ENABLE_APIE:
+		if (!ap_instructions_available()) {
+			mutex_unlock(&kvm->lock);
+			return -EOPNOTSUPP;
+		}
+		kvm->arch.crypto.apie = 1;
+		break;
+	case KVM_S390_VM_CRYPTO_DISABLE_APIE:
+		if (!ap_instructions_available()) {
+			mutex_unlock(&kvm->lock);
+			return -EOPNOTSUPP;
+		}
+		kvm->arch.crypto.apie = 0;
+		break;
 	default:
 	default:
 		mutex_unlock(&kvm->lock);
 		mutex_unlock(&kvm->lock);
 		return -ENXIO;
 		return -ENXIO;
@@ -1491,6 +1522,10 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 		case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW:
 		case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW:
 			ret = 0;
 			ret = 0;
 			break;
 			break;
+		case KVM_S390_VM_CRYPTO_ENABLE_APIE:
+		case KVM_S390_VM_CRYPTO_DISABLE_APIE:
+			ret = ap_instructions_available() ? 0 : -ENXIO;
+			break;
 		default:
 		default:
 			ret = -ENXIO;
 			ret = -ENXIO;
 			break;
 			break;
@@ -1992,55 +2027,101 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	return r;
 	return r;
 }
 }
 
 
-static int kvm_s390_query_ap_config(u8 *config)
-{
-	u32 fcn_code = 0x04000000UL;
-	u32 cc = 0;
-
-	memset(config, 0, 128);
-	asm volatile(
-		"lgr 0,%1\n"
-		"lgr 2,%2\n"
-		".long 0xb2af0000\n"		/* PQAP(QCI) */
-		"0: ipm %0\n"
-		"srl %0,28\n"
-		"1:\n"
-		EX_TABLE(0b, 1b)
-		: "+r" (cc)
-		: "r" (fcn_code), "r" (config)
-		: "cc", "0", "2", "memory"
-	);
-
-	return cc;
-}
-
 static int kvm_s390_apxa_installed(void)
 static int kvm_s390_apxa_installed(void)
 {
 {
-	u8 config[128];
-	int cc;
+	struct ap_config_info info;
 
 
-	if (test_facility(12)) {
-		cc = kvm_s390_query_ap_config(config);
-
-		if (cc)
-			pr_err("PQAP(QCI) failed with cc=%d", cc);
-		else
-			return config[0] & 0x40;
+	if (ap_instructions_available()) {
+		if (ap_qci(&info) == 0)
+			return info.apxa;
 	}
 	}
 
 
 	return 0;
 	return 0;
 }
 }
 
 
+/*
+ * The format of the crypto control block (CRYCB) is specified in the 3 low
+ * order bits of the CRYCB designation (CRYCBD) field as follows:
+ * Format 0: Neither the message security assist extension 3 (MSAX3) nor the
+ *	     AP extended addressing (APXA) facility are installed.
+ * Format 1: The APXA facility is not installed but the MSAX3 facility is.
+ * Format 2: Both the APXA and MSAX3 facilities are installed
+ */
 static void kvm_s390_set_crycb_format(struct kvm *kvm)
 static void kvm_s390_set_crycb_format(struct kvm *kvm)
 {
 {
 	kvm->arch.crypto.crycbd = (__u32)(unsigned long) kvm->arch.crypto.crycb;
 	kvm->arch.crypto.crycbd = (__u32)(unsigned long) kvm->arch.crypto.crycb;
 
 
+	/* Clear the CRYCB format bits - i.e., set format 0 by default */
+	kvm->arch.crypto.crycbd &= ~(CRYCB_FORMAT_MASK);
+
+	/* Check whether MSAX3 is installed */
+	if (!test_kvm_facility(kvm, 76))
+		return;
+
 	if (kvm_s390_apxa_installed())
 	if (kvm_s390_apxa_installed())
 		kvm->arch.crypto.crycbd |= CRYCB_FORMAT2;
 		kvm->arch.crypto.crycbd |= CRYCB_FORMAT2;
 	else
 	else
 		kvm->arch.crypto.crycbd |= CRYCB_FORMAT1;
 		kvm->arch.crypto.crycbd |= CRYCB_FORMAT1;
 }
 }
 
 
+void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm,
+			       unsigned long *aqm, unsigned long *adm)
+{
+	struct kvm_s390_crypto_cb *crycb = kvm->arch.crypto.crycb;
+
+	mutex_lock(&kvm->lock);
+	kvm_s390_vcpu_block_all(kvm);
+
+	switch (kvm->arch.crypto.crycbd & CRYCB_FORMAT_MASK) {
+	case CRYCB_FORMAT2: /* APCB1 uses 256 bits */
+		memcpy(crycb->apcb1.apm, apm, 32);
+		VM_EVENT(kvm, 3, "SET CRYCB: apm %016lx %016lx %016lx %016lx",
+			 apm[0], apm[1], apm[2], apm[3]);
+		memcpy(crycb->apcb1.aqm, aqm, 32);
+		VM_EVENT(kvm, 3, "SET CRYCB: aqm %016lx %016lx %016lx %016lx",
+			 aqm[0], aqm[1], aqm[2], aqm[3]);
+		memcpy(crycb->apcb1.adm, adm, 32);
+		VM_EVENT(kvm, 3, "SET CRYCB: adm %016lx %016lx %016lx %016lx",
+			 adm[0], adm[1], adm[2], adm[3]);
+		break;
+	case CRYCB_FORMAT1:
+	case CRYCB_FORMAT0: /* Fall through both use APCB0 */
+		memcpy(crycb->apcb0.apm, apm, 8);
+		memcpy(crycb->apcb0.aqm, aqm, 2);
+		memcpy(crycb->apcb0.adm, adm, 2);
+		VM_EVENT(kvm, 3, "SET CRYCB: apm %016lx aqm %04x adm %04x",
+			 apm[0], *((unsigned short *)aqm),
+			 *((unsigned short *)adm));
+		break;
+	default:	/* Can not happen */
+		break;
+	}
+
+	/* recreate the shadow crycb for each vcpu */
+	kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART);
+	kvm_s390_vcpu_unblock_all(kvm);
+	mutex_unlock(&kvm->lock);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_crypto_set_masks);
+
+void kvm_arch_crypto_clear_masks(struct kvm *kvm)
+{
+	mutex_lock(&kvm->lock);
+	kvm_s390_vcpu_block_all(kvm);
+
+	memset(&kvm->arch.crypto.crycb->apcb0, 0,
+	       sizeof(kvm->arch.crypto.crycb->apcb0));
+	memset(&kvm->arch.crypto.crycb->apcb1, 0,
+	       sizeof(kvm->arch.crypto.crycb->apcb1));
+
+	VM_EVENT(kvm, 3, "%s", "CLR CRYCB:");
+	/* recreate the shadow crycb for each vcpu */
+	kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART);
+	kvm_s390_vcpu_unblock_all(kvm);
+	mutex_unlock(&kvm->lock);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_crypto_clear_masks);
+
 static u64 kvm_s390_get_initial_cpuid(void)
 static u64 kvm_s390_get_initial_cpuid(void)
 {
 {
 	struct cpuid cpuid;
 	struct cpuid cpuid;
@@ -2052,12 +2133,12 @@ static u64 kvm_s390_get_initial_cpuid(void)
 
 
 static void kvm_s390_crypto_init(struct kvm *kvm)
 static void kvm_s390_crypto_init(struct kvm *kvm)
 {
 {
-	if (!test_kvm_facility(kvm, 76))
-		return;
-
 	kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb;
 	kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb;
 	kvm_s390_set_crycb_format(kvm);
 	kvm_s390_set_crycb_format(kvm);
 
 
+	if (!test_kvm_facility(kvm, 76))
+		return;
+
 	/* Enable AES/DEA protected key functions by default */
 	/* Enable AES/DEA protected key functions by default */
 	kvm->arch.crypto.aes_kw = 1;
 	kvm->arch.crypto.aes_kw = 1;
 	kvm->arch.crypto.dea_kw = 1;
 	kvm->arch.crypto.dea_kw = 1;
@@ -2583,17 +2664,25 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 
 static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
 static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
 {
 {
-	if (!test_kvm_facility(vcpu->kvm, 76))
+	/*
+	 * If the AP instructions are not being interpreted and the MSAX3
+	 * facility is not configured for the guest, there is nothing to set up.
+	 */
+	if (!vcpu->kvm->arch.crypto.apie && !test_kvm_facility(vcpu->kvm, 76))
 		return;
 		return;
 
 
+	vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd;
 	vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA);
 	vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA);
+	vcpu->arch.sie_block->eca &= ~ECA_APIE;
+
+	if (vcpu->kvm->arch.crypto.apie)
+		vcpu->arch.sie_block->eca |= ECA_APIE;
 
 
+	/* Set up protected key support */
 	if (vcpu->kvm->arch.crypto.aes_kw)
 	if (vcpu->kvm->arch.crypto.aes_kw)
 		vcpu->arch.sie_block->ecb3 |= ECB3_AES;
 		vcpu->arch.sie_block->ecb3 |= ECB3_AES;
 	if (vcpu->kvm->arch.crypto.dea_kw)
 	if (vcpu->kvm->arch.crypto.dea_kw)
 		vcpu->arch.sie_block->ecb3 |= ECB3_DEA;
 		vcpu->arch.sie_block->ecb3 |= ECB3_DEA;
-
-	vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd;
 }
 }
 
 
 void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu)
 void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu)
@@ -2685,6 +2774,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup;
 	vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup;
 
 
+	vcpu->arch.sie_block->hpid = HPID_KVM;
+
 	kvm_s390_vcpu_crypto_setup(vcpu);
 	kvm_s390_vcpu_crypto_setup(vcpu);
 
 
 	return rc;
 	return rc;
@@ -2768,18 +2859,25 @@ static void kvm_s390_vcpu_request(struct kvm_vcpu *vcpu)
 	exit_sie(vcpu);
 	exit_sie(vcpu);
 }
 }
 
 
+bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu)
+{
+	return atomic_read(&vcpu->arch.sie_block->prog20) &
+	       (PROG_BLOCK_SIE | PROG_REQUEST);
+}
+
 static void kvm_s390_vcpu_request_handled(struct kvm_vcpu *vcpu)
 static void kvm_s390_vcpu_request_handled(struct kvm_vcpu *vcpu)
 {
 {
 	atomic_andnot(PROG_REQUEST, &vcpu->arch.sie_block->prog20);
 	atomic_andnot(PROG_REQUEST, &vcpu->arch.sie_block->prog20);
 }
 }
 
 
 /*
 /*
- * Kick a guest cpu out of SIE and wait until SIE is not running.
+ * Kick a guest cpu out of (v)SIE and wait until (v)SIE is not running.
  * If the CPU is not running (e.g. waiting as idle) the function will
  * If the CPU is not running (e.g. waiting as idle) the function will
  * return immediately. */
  * return immediately. */
 void exit_sie(struct kvm_vcpu *vcpu)
 void exit_sie(struct kvm_vcpu *vcpu)
 {
 {
 	kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT);
 	kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT);
+	kvm_s390_vsie_kick(vcpu);
 	while (vcpu->arch.sie_block->prog0c & PROG_IN_SIE)
 	while (vcpu->arch.sie_block->prog0c & PROG_IN_SIE)
 		cpu_relax();
 		cpu_relax();
 }
 }
@@ -3196,6 +3294,8 @@ retry:
 
 
 	/* nothing to do, just clear the request */
 	/* nothing to do, just clear the request */
 	kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 	kvm_clear_request(KVM_REQ_UNHALT, vcpu);
+	/* we left the vsie handler, nothing to do, just clear the request */
+	kvm_clear_request(KVM_REQ_VSIE_RESTART, vcpu);
 
 
 	return 0;
 	return 0;
 }
 }

+ 1 - 0
arch/s390/kvm/kvm-s390.h

@@ -290,6 +290,7 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu);
+bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu);
 void exit_sie(struct kvm_vcpu *vcpu);
 void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu);
 int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu);

+ 192 - 18
arch/s390/kvm/vsie.c

@@ -135,14 +135,148 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	atomic_set(&scb_s->cpuflags, newflags);
 	atomic_set(&scb_s->cpuflags, newflags);
 	return 0;
 	return 0;
 }
 }
+/* Copy to APCB FORMAT1 from APCB FORMAT0 */
+static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s,
+			unsigned long apcb_o, struct kvm_s390_apcb1 *apcb_h)
+{
+	struct kvm_s390_apcb0 tmp;
 
 
-/*
+	if (read_guest_real(vcpu, apcb_o, &tmp, sizeof(struct kvm_s390_apcb0)))
+		return -EFAULT;
+
+	apcb_s->apm[0] = apcb_h->apm[0] & tmp.apm[0];
+	apcb_s->aqm[0] = apcb_h->aqm[0] & tmp.aqm[0] & 0xffff000000000000UL;
+	apcb_s->adm[0] = apcb_h->adm[0] & tmp.adm[0] & 0xffff000000000000UL;
+
+	return 0;
+
+}
+
+/**
+ * setup_apcb00 - Copy to APCB FORMAT0 from APCB FORMAT0
+ * @vcpu: pointer to the virtual CPU
+ * @apcb_s: pointer to start of apcb in the shadow crycb
+ * @apcb_o: pointer to start of original apcb in the guest2
+ * @apcb_h: pointer to start of apcb in the guest1
+ *
+ * Returns 0 on success and -EFAULT on error reading the guest apcb
+ */
+static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
+			unsigned long apcb_o, unsigned long *apcb_h)
+{
+	if (read_guest_real(vcpu, apcb_o, apcb_s,
+			    sizeof(struct kvm_s390_apcb0)))
+		return -EFAULT;
+
+	bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb0));
+
+	return 0;
+}
+
+/**
+ * setup_apcb11 - Copy the FORMAT1 APCB from the guest to the shadow CRYCB
+ * @vcpu: pointer to the virtual CPU
+ * @apcb_s: pointer to start of apcb in the shadow crycb
+ * @apcb_o: pointer to start of original guest apcb
+ * @apcb_h: pointer to start of apcb in the host
+ *
+ * Returns 0 on success and -EFAULT on error reading the guest apcb
+ */
+static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
+			unsigned long apcb_o,
+			unsigned long *apcb_h)
+{
+	if (read_guest_real(vcpu, apcb_o, apcb_s,
+			    sizeof(struct kvm_s390_apcb1)))
+		return -EFAULT;
+
+	bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb1));
+
+	return 0;
+}
+
+/**
+ * setup_apcb - Create a shadow copy of the apcb.
+ * @vcpu: pointer to the virtual CPU
+ * @crycb_s: pointer to shadow crycb
+ * @crycb_o: pointer to original guest crycb
+ * @crycb_h: pointer to the host crycb
+ * @fmt_o: format of the original guest crycb.
+ * @fmt_h: format of the host crycb.
+ *
+ * Checks the compatibility between the guest and host crycb and calls the
+ * appropriate copy function.
+ *
+ * Returns 0 on success, or an error number if the guest and host crycb are incompatible.
+ */
+static int setup_apcb(struct kvm_vcpu *vcpu, struct kvm_s390_crypto_cb *crycb_s,
+	       const u32 crycb_o,
+	       struct kvm_s390_crypto_cb *crycb_h,
+	       int fmt_o, int fmt_h)
+{
+	struct kvm_s390_crypto_cb *crycb;
+
+	crycb = (struct kvm_s390_crypto_cb *) (unsigned long)crycb_o;
+
+	switch (fmt_o) {
+	case CRYCB_FORMAT2:
+		if ((crycb_o & PAGE_MASK) != ((crycb_o + 256) & PAGE_MASK))
+			return -EACCES;
+		if (fmt_h != CRYCB_FORMAT2)
+			return -EINVAL;
+		return setup_apcb11(vcpu, (unsigned long *)&crycb_s->apcb1,
+				    (unsigned long) &crycb->apcb1,
+				    (unsigned long *)&crycb_h->apcb1);
+	case CRYCB_FORMAT1:
+		switch (fmt_h) {
+		case CRYCB_FORMAT2:
+			return setup_apcb10(vcpu, &crycb_s->apcb1,
+					    (unsigned long) &crycb->apcb0,
+					    &crycb_h->apcb1);
+		case CRYCB_FORMAT1:
+			return setup_apcb00(vcpu,
+					    (unsigned long *) &crycb_s->apcb0,
+					    (unsigned long) &crycb->apcb0,
+					    (unsigned long *) &crycb_h->apcb0);
+		}
+		break;
+	case CRYCB_FORMAT0:
+		if ((crycb_o & PAGE_MASK) != ((crycb_o + 32) & PAGE_MASK))
+			return -EACCES;
+
+		switch (fmt_h) {
+		case CRYCB_FORMAT2:
+			return setup_apcb10(vcpu, &crycb_s->apcb1,
+					    (unsigned long) &crycb->apcb0,
+					    &crycb_h->apcb1);
+		case CRYCB_FORMAT1:
+		case CRYCB_FORMAT0:
+			return setup_apcb00(vcpu,
+					    (unsigned long *) &crycb_s->apcb0,
+					    (unsigned long) &crycb->apcb0,
+					    (unsigned long *) &crycb_h->apcb0);
+		}
+	}
+	return -EINVAL;
+}
+
+/**
+ * shadow_crycb - Create a shadow copy of the crycb block
+ * @vcpu: a pointer to the virtual CPU
+ * @vsie_page: a pointer to internal data used for the vSIE
+ *
  * Create a shadow copy of the crycb block and setup key wrapping, if
  * Create a shadow copy of the crycb block and setup key wrapping, if
  * requested for guest 3 and enabled for guest 2.
  * requested for guest 3 and enabled for guest 2.
  *
  *
- * We only accept format-1 (no AP in g2), but convert it into format-2
+ * We accept format-1 or format-2, but we convert format-1 into format-2
+ * in the shadow CRYCB.
+ * Using format-2 enables the firmware to choose the right format when
+ * scheduling the SIE.
  * There is nothing to do for format-0.
  * There is nothing to do for format-0.
  *
  *
+ * This function centralizes the issuing of set_validity_icpt() for all
+ * the subfunctions working on the crycb.
+ *
  * Returns: - 0 if shadowed or nothing to do
  * Returns: - 0 if shadowed or nothing to do
  *          - > 0 if control has to be given to guest 2
  *          - > 0 if control has to be given to guest 2
  */
  */
@@ -154,23 +288,40 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	const u32 crycb_addr = crycbd_o & 0x7ffffff8U;
 	const u32 crycb_addr = crycbd_o & 0x7ffffff8U;
 	unsigned long *b1, *b2;
 	unsigned long *b1, *b2;
 	u8 ecb3_flags;
 	u8 ecb3_flags;
+	int apie_h;
+	int key_msk = test_kvm_facility(vcpu->kvm, 76);
+	int fmt_o = crycbd_o & CRYCB_FORMAT_MASK;
+	int fmt_h = vcpu->arch.sie_block->crycbd & CRYCB_FORMAT_MASK;
+	int ret = 0;
 
 
 	scb_s->crycbd = 0;
 	scb_s->crycbd = 0;
-	if (!(crycbd_o & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1))
-		return 0;
-	/* format-1 is supported with message-security-assist extension 3 */
-	if (!test_kvm_facility(vcpu->kvm, 76))
+
+	apie_h = vcpu->arch.sie_block->eca & ECA_APIE;
+	if (!apie_h && !key_msk)
 		return 0;
 		return 0;
+
+	if (!crycb_addr)
+		return set_validity_icpt(scb_s, 0x0039U);
+
+	if (fmt_o == CRYCB_FORMAT1)
+		if ((crycb_addr & PAGE_MASK) !=
+		    ((crycb_addr + 128) & PAGE_MASK))
+			return set_validity_icpt(scb_s, 0x003CU);
+
+	if (apie_h && (scb_o->eca & ECA_APIE)) {
+		ret = setup_apcb(vcpu, &vsie_page->crycb, crycb_addr,
+				 vcpu->kvm->arch.crypto.crycb,
+				 fmt_o, fmt_h);
+		if (ret)
+			goto end;
+		scb_s->eca |= scb_o->eca & ECA_APIE;
+	}
+
 	/* we may only allow it if enabled for guest 2 */
 	/* we may only allow it if enabled for guest 2 */
 	ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
 	ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
 		     (ECB3_AES | ECB3_DEA);
 		     (ECB3_AES | ECB3_DEA);
 	if (!ecb3_flags)
 	if (!ecb3_flags)
-		return 0;
-
-	if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK))
-		return set_validity_icpt(scb_s, 0x003CU);
-	else if (!crycb_addr)
-		return set_validity_icpt(scb_s, 0x0039U);
+		goto end;
 
 
 	/* copy only the wrapping keys */
 	/* copy only the wrapping keys */
 	if (read_guest_real(vcpu, crycb_addr + 72,
 	if (read_guest_real(vcpu, crycb_addr + 72,
@@ -178,8 +329,6 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		return set_validity_icpt(scb_s, 0x0035U);
 		return set_validity_icpt(scb_s, 0x0035U);
 
 
 	scb_s->ecb3 |= ecb3_flags;
 	scb_s->ecb3 |= ecb3_flags;
-	scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 |
-			CRYCB_FORMAT2;
 
 
 	/* xor both blocks in one run */
 	/* xor both blocks in one run */
 	b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask;
 	b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask;
@@ -187,6 +336,16 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 			    vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask;
 			    vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask;
 	/* as 56%8 == 0, bitmap_xor won't overwrite any data */
 	/* as 56%8 == 0, bitmap_xor won't overwrite any data */
 	bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56);
 	bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56);
+end:
+	switch (ret) {
+	case -EINVAL:
+		return set_validity_icpt(scb_s, 0x0020U);
+	case -EFAULT:
+		return set_validity_icpt(scb_s, 0x0035U);
+	case -EACCES:
+		return set_validity_icpt(scb_s, 0x003CU);
+	}
+	scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT2;
 	return 0;
 	return 0;
 }
 }
 
 
@@ -383,6 +542,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	if (test_kvm_facility(vcpu->kvm, 156))
 	if (test_kvm_facility(vcpu->kvm, 156))
 		scb_s->ecd |= scb_o->ecd & ECD_ETOKENF;
 		scb_s->ecd |= scb_o->ecd & ECD_ETOKENF;
 
 
+	scb_s->hpid = HPID_VSIE;
+
 	prepare_ibc(vcpu, vsie_page);
 	prepare_ibc(vcpu, vsie_page);
 	rc = shadow_crycb(vcpu, vsie_page);
 	rc = shadow_crycb(vcpu, vsie_page);
 out:
 out:
@@ -830,7 +991,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
 	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
 	int guest_bp_isolation;
 	int guest_bp_isolation;
-	int rc;
+	int rc = 0;
 
 
 	handle_last_fault(vcpu, vsie_page);
 	handle_last_fault(vcpu, vsie_page);
 
 
@@ -858,7 +1019,18 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	guest_enter_irqoff();
 	guest_enter_irqoff();
 	local_irq_enable();
 	local_irq_enable();
 
 
-	rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
+	/*
+	 * Simulate a SIE entry of the VCPU (see sie64a), so VCPU blocking
+	 * and VCPU requests also hinder the vSIE from running and lead
+	 * to an immediate exit. kvm_s390_vsie_kick() has to be used to
+	 * also kick the vSIE.
+	 */
+	vcpu->arch.sie_block->prog0c |= PROG_IN_SIE;
+	barrier();
+	if (!kvm_s390_vcpu_sie_inhibited(vcpu))
+		rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
+	barrier();
+	vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE;
 
 
 	local_irq_disable();
 	local_irq_disable();
 	guest_exit_irqoff();
 	guest_exit_irqoff();
@@ -1005,7 +1177,8 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		if (rc == -EAGAIN)
 		if (rc == -EAGAIN)
 			rc = 0;
 			rc = 0;
 		if (rc || scb_s->icptcode || signal_pending(current) ||
 		if (rc || scb_s->icptcode || signal_pending(current) ||
-		    kvm_s390_vcpu_has_irq(vcpu, 0))
+		    kvm_s390_vcpu_has_irq(vcpu, 0) ||
+		    kvm_s390_vcpu_sie_inhibited(vcpu))
 			break;
 			break;
 	}
 	}
 
 
@@ -1122,7 +1295,8 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
 	if (unlikely(scb_addr & 0x1ffUL))
 	if (unlikely(scb_addr & 0x1ffUL))
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
 
-	if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0))
+	if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) ||
+	    kvm_s390_vcpu_sie_inhibited(vcpu))
 		return 0;
 		return 0;
 
 
 	vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
 	vsie_page = get_vsie_page(vcpu->kvm, scb_addr);

+ 8 - 2
arch/s390/mm/gmap.c

@@ -907,10 +907,16 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
 	pmd_t *pmdp;
 
 	BUG_ON(gmap_is_shadow(gmap));
-	spin_lock(&gmap->guest_table_lock);
 	pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
+	if (!pmdp)
+		return NULL;
 
-	if (!pmdp || pmd_none(*pmdp)) {
+	/* without huge pages, there is no need to take the table lock */
+	if (!gmap->mm->context.allow_gmap_hpage_1m)
+		return pmd_none(*pmdp) ? NULL : pmdp;
+
+	spin_lock(&gmap->guest_table_lock);
+	if (pmd_none(*pmdp)) {
 		spin_unlock(&gmap->guest_table_lock);
 		return NULL;
 	}

+ 2 - 0
arch/s390/tools/gen_facilities.c

@@ -106,6 +106,8 @@ static struct facility_def facility_defs[] = {
 
 
 		.name = "FACILITIES_KVM_CPUMODEL",
 		.name = "FACILITIES_KVM_CPUMODEL",
 		.bits = (int[]){
 		.bits = (int[]){
+			12, /* AP Query Configuration Information */
+			15, /* AP Facilities Test */
 			156, /* etoken facility */
 			156, /* etoken facility */
 			-1  /* END */
 			-1  /* END */
 		}
 		}

+ 61 - 9
arch/x86/include/asm/kvm_host.h

@@ -102,7 +102,15 @@
 #define UNMAPPED_GVA (~(gpa_t)0)
 #define UNMAPPED_GVA (~(gpa_t)0)
 
 
 /* KVM Hugepage definitions for x86 */
 /* KVM Hugepage definitions for x86 */
-#define KVM_NR_PAGE_SIZES	3
+enum {
+	PT_PAGE_TABLE_LEVEL   = 1,
+	PT_DIRECTORY_LEVEL    = 2,
+	PT_PDPE_LEVEL         = 3,
+	/* set max level to the biggest one */
+	PT_MAX_HUGEPAGE_LEVEL = PT_PDPE_LEVEL,
+};
+#define KVM_NR_PAGE_SIZES	(PT_MAX_HUGEPAGE_LEVEL - \
+				 PT_PAGE_TABLE_LEVEL + 1)
 #define KVM_HPAGE_GFN_SHIFT(x)	(((x) - 1) * 9)
 #define KVM_HPAGE_GFN_SHIFT(x)	(((x) - 1) * 9)
 #define KVM_HPAGE_SHIFT(x)	(PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
 #define KVM_HPAGE_SHIFT(x)	(PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
 #define KVM_HPAGE_SIZE(x)	(1UL << KVM_HPAGE_SHIFT(x))
 #define KVM_HPAGE_SIZE(x)	(1UL << KVM_HPAGE_SHIFT(x))
@@ -177,6 +185,7 @@ enum {
 
 
 #define DR6_BD		(1 << 13)
 #define DR6_BD		(1 << 13)
 #define DR6_BS		(1 << 14)
 #define DR6_BS		(1 << 14)
+#define DR6_BT		(1 << 15)
 #define DR6_RTM		(1 << 16)
 #define DR6_RTM		(1 << 16)
 #define DR6_FIXED_1	0xfffe0ff0
 #define DR6_FIXED_1	0xfffe0ff0
 #define DR6_INIT	0xffff0ff0
 #define DR6_INIT	0xffff0ff0
@@ -247,7 +256,7 @@ struct kvm_mmu_memory_cache {
  * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
  * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
  */
  */
 union kvm_mmu_page_role {
 union kvm_mmu_page_role {
-	unsigned word;
+	u32 word;
 	struct {
 	struct {
 		unsigned level:4;
 		unsigned level:4;
 		unsigned cr4_pae:1;
 		unsigned cr4_pae:1;
@@ -273,6 +282,34 @@ union kvm_mmu_page_role {
 	};
 	};
 };
 };
 
 
+union kvm_mmu_extended_role {
+/*
+ * This structure complements kvm_mmu_page_role caching everything needed for
+ * MMU configuration. If nothing in both these structures changed, MMU
+ * re-configuration can be skipped. @valid bit is set on first usage so we don't
+ * treat all-zero structure as valid data.
+ */
+	u32 word;
+	struct {
+		unsigned int valid:1;
+		unsigned int execonly:1;
+		unsigned int cr0_pg:1;
+		unsigned int cr4_pse:1;
+		unsigned int cr4_pke:1;
+		unsigned int cr4_smap:1;
+		unsigned int cr4_smep:1;
+		unsigned int cr4_la57:1;
+	};
+};
+
+union kvm_mmu_role {
+	u64 as_u64;
+	struct {
+		union kvm_mmu_page_role base;
+		union kvm_mmu_extended_role ext;
+	};
+};
+
 struct kvm_rmap_head {
 struct kvm_rmap_head {
 	unsigned long val;
 	unsigned long val;
 };
 };
@@ -280,18 +317,18 @@ struct kvm_rmap_head {
 struct kvm_mmu_page {
 struct kvm_mmu_page {
 	struct list_head link;
 	struct list_head link;
 	struct hlist_node hash_link;
 	struct hlist_node hash_link;
+	bool unsync;
 
 
 	/*
 	/*
 	 * The following two entries are used to key the shadow page in the
 	 * The following two entries are used to key the shadow page in the
 	 * hash table.
 	 * hash table.
 	 */
 	 */
-	gfn_t gfn;
 	union kvm_mmu_page_role role;
 	union kvm_mmu_page_role role;
+	gfn_t gfn;
 
 
 	u64 *spt;
 	u64 *spt;
 	/* hold the gfn of each spte inside spt */
 	/* hold the gfn of each spte inside spt */
 	gfn_t *gfns;
 	gfn_t *gfns;
-	bool unsync;
 	int root_count;          /* Currently serving as active root */
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	unsigned int unsync_children;
 	struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
 	struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
@@ -360,7 +397,7 @@ struct kvm_mmu {
 	void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			   u64 *spte, const void *pte);
 			   u64 *spte, const void *pte);
 	hpa_t root_hpa;
 	hpa_t root_hpa;
-	union kvm_mmu_page_role base_role;
+	union kvm_mmu_role mmu_role;
 	u8 root_level;
 	u8 root_level;
 	u8 shadow_root_level;
 	u8 shadow_root_level;
 	u8 ept_ad;
 	u8 ept_ad;
@@ -490,7 +527,7 @@ struct kvm_vcpu_hv {
 	struct kvm_hyperv_exit exit;
 	struct kvm_hyperv_exit exit;
 	struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
 	struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
 	DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
 	DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
-	cpumask_t tlb_lush;
+	cpumask_t tlb_flush;
 };
 };
 
 
 struct kvm_vcpu_arch {
 struct kvm_vcpu_arch {
@@ -534,7 +571,13 @@ struct kvm_vcpu_arch {
 	 * the paging mode of the l1 guest. This context is always used to
 	 * the paging mode of the l1 guest. This context is always used to
 	 * handle faults.
 	 * handle faults.
 	 */
 	 */
-	struct kvm_mmu mmu;
+	struct kvm_mmu *mmu;
+
+	/* Non-nested MMU for L1 */
+	struct kvm_mmu root_mmu;
+
+	/* L1 MMU when running nested */
+	struct kvm_mmu guest_mmu;
 
 
 	/*
 	/*
 	 * Paging state of an L2 guest (used for nested npt)
 	 * Paging state of an L2 guest (used for nested npt)
@@ -585,6 +628,8 @@ struct kvm_vcpu_arch {
 		bool has_error_code;
 		bool has_error_code;
 		u8 nr;
 		u8 nr;
 		u32 error_code;
 		u32 error_code;
+		unsigned long payload;
+		bool has_payload;
 		u8 nested_apf;
 		u8 nested_apf;
 	} exception;
 	} exception;
 
 
@@ -781,6 +826,9 @@ struct kvm_hv {
 	u64 hv_reenlightenment_control;
 	u64 hv_reenlightenment_control;
 	u64 hv_tsc_emulation_control;
 	u64 hv_tsc_emulation_control;
 	u64 hv_tsc_emulation_status;
 	u64 hv_tsc_emulation_status;
+
+	/* How many vCPUs have VP index != vCPU index */
+	atomic_t num_mismatched_vp_indexes;
 };
 };
 
 
 enum kvm_irqchip_mode {
 enum kvm_irqchip_mode {
@@ -871,6 +919,7 @@ struct kvm_arch {
 	bool x2apic_broadcast_quirk_disabled;
 	bool x2apic_broadcast_quirk_disabled;
 
 
 	bool guest_can_read_msr_platform_info;
 	bool guest_can_read_msr_platform_info;
+	bool exception_payload_enabled;
 };
 };
 
 
 struct kvm_vm_stat {
 struct kvm_vm_stat {
@@ -1133,6 +1182,9 @@ struct kvm_x86_ops {
 	int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
 	int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
 
 
 	int (*get_msr_feature)(struct kvm_msr_entry *entry);
 	int (*get_msr_feature)(struct kvm_msr_entry *entry);
+
+	int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu,
+				   uint16_t *vmcs_version);
 };
 };
 
 
 struct kvm_arch_async_pf {
 struct kvm_arch_async_pf {
@@ -1170,7 +1222,6 @@ void kvm_mmu_module_exit(void);
 
 
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
-void kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
@@ -1324,7 +1375,8 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free);
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+			ulong roots_to_free);
 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
 			   struct x86_exception *exception);
 			   struct x86_exception *exception);
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
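
As a quick sanity check of the hugepage macros built on the new level enum, the three levels map to the familiar x86 page sizes. A small stand-alone sketch (plain user-space arithmetic, with PAGE_SHIFT assumed to be 12; the macro bodies are restated from the hunk above only for illustration):

#include <assert.h>

#define PAGE_SHIFT		12
#define KVM_HPAGE_GFN_SHIFT(x)	(((x) - 1) * 9)
#define KVM_HPAGE_SHIFT(x)	(PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
#define KVM_HPAGE_SIZE(x)	(1UL << KVM_HPAGE_SHIFT(x))

int main(void)
{
	assert(KVM_HPAGE_SIZE(1) == 4096UL);			/* PT_PAGE_TABLE_LEVEL */
	assert(KVM_HPAGE_SIZE(2) == 2UL * 1024 * 1024);		/* PT_DIRECTORY_LEVEL  */
	assert(KVM_HPAGE_SIZE(3) == 1UL * 1024 * 1024 * 1024);	/* PT_PDPE_LEVEL       */
	return 0;
}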

+ 1 - 1
arch/x86/include/asm/virtext.h

@@ -40,7 +40,7 @@ static inline int cpu_has_vmx(void)
  */
 static inline void cpu_vmxoff(void)
 {
-	asm volatile (ASM_VMX_VMXOFF : : : "cc");
+	asm volatile ("vmxoff");
 	cr4_clear_bits(X86_CR4_VMXE);
 }
 

+ 0 - 13
arch/x86/include/asm/vmx.h

@@ -503,19 +503,6 @@ enum vmcs_field {
 
 
 #define VMX_EPT_IDENTITY_PAGETABLE_ADDR		0xfffbc000ul
 
-
-#define ASM_VMX_VMCLEAR_RAX       ".byte 0x66, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMLAUNCH          ".byte 0x0f, 0x01, 0xc2"
-#define ASM_VMX_VMRESUME          ".byte 0x0f, 0x01, 0xc3"
-#define ASM_VMX_VMPTRLD_RAX       ".byte 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMREAD_RDX_RAX    ".byte 0x0f, 0x78, 0xd0"
-#define ASM_VMX_VMWRITE_RAX_RDX   ".byte 0x0f, 0x79, 0xd0"
-#define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
-#define ASM_VMX_VMXOFF            ".byte 0x0f, 0x01, 0xc4"
-#define ASM_VMX_VMXON_RAX         ".byte 0xf3, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_INVEPT		  ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
-#define ASM_VMX_INVVPID		  ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
-
 struct vmx_msr_entry {
 	u32 index;
 	u32 reserved;

+ 6 - 2
arch/x86/include/uapi/asm/kvm.h

@@ -288,6 +288,7 @@ struct kvm_reinject_control {
 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR	0x00000002
 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR	0x00000002
 #define KVM_VCPUEVENT_VALID_SHADOW	0x00000004
 #define KVM_VCPUEVENT_VALID_SHADOW	0x00000004
 #define KVM_VCPUEVENT_VALID_SMM		0x00000008
 #define KVM_VCPUEVENT_VALID_SMM		0x00000008
+#define KVM_VCPUEVENT_VALID_PAYLOAD	0x00000010
 
 
 /* Interrupt shadow states */
 /* Interrupt shadow states */
 #define KVM_X86_SHADOW_INT_MOV_SS	0x01
 #define KVM_X86_SHADOW_INT_MOV_SS	0x01
@@ -299,7 +300,7 @@ struct kvm_vcpu_events {
 		__u8 injected;
 		__u8 injected;
 		__u8 nr;
 		__u8 nr;
 		__u8 has_error_code;
 		__u8 has_error_code;
-		__u8 pad;
+		__u8 pending;
 		__u32 error_code;
 		__u32 error_code;
 	} exception;
 	} exception;
 	struct {
 	struct {
@@ -322,7 +323,9 @@ struct kvm_vcpu_events {
 		__u8 smm_inside_nmi;
 		__u8 smm_inside_nmi;
 		__u8 latched_init;
 		__u8 latched_init;
 	} smi;
 	} smi;
-	__u32 reserved[9];
+	__u8 reserved[27];
+	__u8 exception_has_payload;
+	__u64 exception_payload;
 };
 };
 
 
 /* for KVM_GET/SET_DEBUGREGS */
 /* for KVM_GET/SET_DEBUGREGS */
@@ -381,6 +384,7 @@ struct kvm_sync_regs {
 
 
 #define KVM_STATE_NESTED_GUEST_MODE	0x00000001
 #define KVM_STATE_NESTED_GUEST_MODE	0x00000001
 #define KVM_STATE_NESTED_RUN_PENDING	0x00000002
 #define KVM_STATE_NESTED_RUN_PENDING	0x00000002
+#define KVM_STATE_NESTED_EVMCS		0x00000004
 
 
 #define KVM_STATE_NESTED_SMM_GUEST_MODE	0x00000001
 #define KVM_STATE_NESTED_SMM_GUEST_MODE	0x00000001
 #define KVM_STATE_NESTED_SMM_VMXON	0x00000002
 #define KVM_STATE_NESTED_SMM_VMXON	0x00000002
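
With the reworked kvm_vcpu_events layout, userspace that has enabled KVM_CAP_EXCEPTION_PAYLOAD can read a pending exception's payload (for example the faulting address of a not-yet-delivered #PF) instead of finding it already latched into CR2/DR6. A hedged sketch of such a reader (vcpu_fd is an assumed open vCPU fd; error handling omitted):

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

static void dump_pending_exception(int vcpu_fd)
{
	struct kvm_vcpu_events ev;

	if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &ev) < 0)
		return;
	/* The payload is only meaningful while the exception is still pending. */
	if (ev.exception.pending && ev.exception_has_payload)
		printf("vector %u payload %#llx\n",
		       ev.exception.nr,
		       (unsigned long long)ev.exception_payload);
}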

+ 212 - 68
arch/x86/kvm/hyperv.c

@@ -36,6 +36,8 @@
 
 
 #include "trace.h"
 #include "trace.h"
 
 
+#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64)
+
 static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint)
 static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint)
 {
 {
 	return atomic64_read(&synic->sint[sint]);
 	return atomic64_read(&synic->sint[sint]);
@@ -132,8 +134,10 @@ static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
 	struct kvm_vcpu *vcpu = NULL;
 	struct kvm_vcpu *vcpu = NULL;
 	int i;
 	int i;
 
 
-	if (vpidx < KVM_MAX_VCPUS)
-		vcpu = kvm_get_vcpu(kvm, vpidx);
+	if (vpidx >= KVM_MAX_VCPUS)
+		return NULL;
+
+	vcpu = kvm_get_vcpu(kvm, vpidx);
 	if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
 	if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
 		return vcpu;
 		return vcpu;
 	kvm_for_each_vcpu(i, vcpu, kvm)
 	kvm_for_each_vcpu(i, vcpu, kvm)
@@ -689,6 +693,24 @@ void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
 		stimer_cleanup(&hv_vcpu->stimer[i]);
 		stimer_cleanup(&hv_vcpu->stimer[i]);
 }
 }
 
 
+bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu)
+{
+	if (!(vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE))
+		return false;
+	return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
+}
+EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled);
+
+bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
+			    struct hv_vp_assist_page *assist_page)
+{
+	if (!kvm_hv_assist_page_enabled(vcpu))
+		return false;
+	return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+				      assist_page, sizeof(*assist_page));
+}
+EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page);
+
 static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer)
 static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer)
 {
 {
 	struct hv_message *msg = &stimer->msg;
 	struct hv_message *msg = &stimer->msg;
@@ -1040,21 +1062,41 @@ static u64 current_task_runtime_100ns(void)
 
 
 static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 {
 {
-	struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
+	struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
 
 
 	switch (msr) {
 	switch (msr) {
-	case HV_X64_MSR_VP_INDEX:
-		if (!host)
+	case HV_X64_MSR_VP_INDEX: {
+		struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+		int vcpu_idx = kvm_vcpu_get_idx(vcpu);
+		u32 new_vp_index = (u32)data;
+
+		if (!host || new_vp_index >= KVM_MAX_VCPUS)
 			return 1;
 			return 1;
-		hv->vp_index = (u32)data;
+
+		if (new_vp_index == hv_vcpu->vp_index)
+			return 0;
+
+		/*
+		 * The VP index is initialized to vcpu_index by
+		 * kvm_hv_vcpu_postcreate so they initially match.  Now the
+		 * VP index is changing, adjust num_mismatched_vp_indexes if
+		 * it now matches or no longer matches vcpu_idx.
+		 */
+		if (hv_vcpu->vp_index == vcpu_idx)
+			atomic_inc(&hv->num_mismatched_vp_indexes);
+		else if (new_vp_index == vcpu_idx)
+			atomic_dec(&hv->num_mismatched_vp_indexes);
+
+		hv_vcpu->vp_index = new_vp_index;
 		break;
 		break;
+	}
 	case HV_X64_MSR_VP_ASSIST_PAGE: {
 	case HV_X64_MSR_VP_ASSIST_PAGE: {
 		u64 gfn;
 		u64 gfn;
 		unsigned long addr;
 		unsigned long addr;
 
 
 		if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) {
 		if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) {
-			hv->hv_vapic = data;
-			if (kvm_lapic_enable_pv_eoi(vcpu, 0))
+			hv_vcpu->hv_vapic = data;
+			if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0))
 				return 1;
 				return 1;
 			break;
 			break;
 		}
 		}
@@ -1062,12 +1104,19 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 		addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
 		addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
 		if (kvm_is_error_hva(addr))
 		if (kvm_is_error_hva(addr))
 			return 1;
 			return 1;
-		if (__clear_user((void __user *)addr, PAGE_SIZE))
+
+		/*
+	 * Clear the apic_assist portion of struct hv_vp_assist_page
+	 * only; there can be valuable data in the rest which needs
+		 * to be preserved e.g. on migration.
+		 */
+		if (__clear_user((void __user *)addr, sizeof(u32)))
 			return 1;
 			return 1;
-		hv->hv_vapic = data;
+		hv_vcpu->hv_vapic = data;
 		kvm_vcpu_mark_page_dirty(vcpu, gfn);
 		kvm_vcpu_mark_page_dirty(vcpu, gfn);
 		if (kvm_lapic_enable_pv_eoi(vcpu,
 		if (kvm_lapic_enable_pv_eoi(vcpu,
-					    gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
+					    gfn_to_gpa(gfn) | KVM_MSR_ENABLED,
+					    sizeof(struct hv_vp_assist_page)))
 			return 1;
 			return 1;
 		break;
 		break;
 	}
 	}
@@ -1080,7 +1129,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 	case HV_X64_MSR_VP_RUNTIME:
 	case HV_X64_MSR_VP_RUNTIME:
 		if (!host)
 		if (!host)
 			return 1;
 			return 1;
-		hv->runtime_offset = data - current_task_runtime_100ns();
+		hv_vcpu->runtime_offset = data - current_task_runtime_100ns();
 		break;
 		break;
 	case HV_X64_MSR_SCONTROL:
 	case HV_X64_MSR_SCONTROL:
 	case HV_X64_MSR_SVERSION:
 	case HV_X64_MSR_SVERSION:
@@ -1172,11 +1221,11 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
 			  bool host)
 			  bool host)
 {
 {
 	u64 data = 0;
 	u64 data = 0;
-	struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
+	struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
 
 
 	switch (msr) {
 	switch (msr) {
 	case HV_X64_MSR_VP_INDEX:
 	case HV_X64_MSR_VP_INDEX:
-		data = hv->vp_index;
+		data = hv_vcpu->vp_index;
 		break;
 		break;
 	case HV_X64_MSR_EOI:
 	case HV_X64_MSR_EOI:
 		return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
 		return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
@@ -1185,10 +1234,10 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
 	case HV_X64_MSR_TPR:
 	case HV_X64_MSR_TPR:
 		return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
 		return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
 	case HV_X64_MSR_VP_ASSIST_PAGE:
 	case HV_X64_MSR_VP_ASSIST_PAGE:
-		data = hv->hv_vapic;
+		data = hv_vcpu->hv_vapic;
 		break;
 		break;
 	case HV_X64_MSR_VP_RUNTIME:
 	case HV_X64_MSR_VP_RUNTIME:
-		data = current_task_runtime_100ns() + hv->runtime_offset;
+		data = current_task_runtime_100ns() + hv_vcpu->runtime_offset;
 		break;
 		break;
 	case HV_X64_MSR_SCONTROL:
 	case HV_X64_MSR_SCONTROL:
 	case HV_X64_MSR_SVERSION:
 	case HV_X64_MSR_SVERSION:
@@ -1255,32 +1304,47 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
 		return kvm_hv_get_msr(vcpu, msr, pdata, host);
 		return kvm_hv_get_msr(vcpu, msr, pdata, host);
 }
 }
 
 
-static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no)
+static __always_inline unsigned long *sparse_set_to_vcpu_mask(
+	struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask,
+	u64 *vp_bitmap, unsigned long *vcpu_bitmap)
 {
 {
-	int i = 0, j;
+	struct kvm_hv *hv = &kvm->arch.hyperv;
+	struct kvm_vcpu *vcpu;
+	int i, bank, sbank = 0;
 
 
-	if (!(valid_bank_mask & BIT_ULL(bank_no)))
-		return -1;
+	memset(vp_bitmap, 0,
+	       KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap));
+	for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
+			 KVM_HV_MAX_SPARSE_VCPU_SET_BITS)
+		vp_bitmap[bank] = sparse_banks[sbank++];
 
 
-	for (j = 0; j < bank_no; j++)
-		if (valid_bank_mask & BIT_ULL(j))
-			i++;
+	if (likely(!atomic_read(&hv->num_mismatched_vp_indexes))) {
+		/* for all vcpus vp_index == vcpu_idx */
+		return (unsigned long *)vp_bitmap;
+	}
 
 
-	return i;
+	bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (test_bit(vcpu_to_hv_vcpu(vcpu)->vp_index,
+			     (unsigned long *)vp_bitmap))
+			__set_bit(i, vcpu_bitmap);
+	}
+	return vcpu_bitmap;
 }
 }
 
 
 static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
 static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
 			    u16 rep_cnt, bool ex)
 			    u16 rep_cnt, bool ex)
 {
 {
 	struct kvm *kvm = current_vcpu->kvm;
 	struct kvm *kvm = current_vcpu->kvm;
-	struct kvm_vcpu_hv *hv_current = &current_vcpu->arch.hyperv;
+	struct kvm_vcpu_hv *hv_vcpu = &current_vcpu->arch.hyperv;
 	struct hv_tlb_flush_ex flush_ex;
 	struct hv_tlb_flush_ex flush_ex;
 	struct hv_tlb_flush flush;
 	struct hv_tlb_flush flush;
-	struct kvm_vcpu *vcpu;
-	unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] = {0};
-	unsigned long valid_bank_mask = 0;
+	u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
+	DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
+	unsigned long *vcpu_mask;
+	u64 valid_bank_mask;
 	u64 sparse_banks[64];
 	u64 sparse_banks[64];
-	int sparse_banks_len, i;
+	int sparse_banks_len;
 	bool all_cpus;
 	bool all_cpus;
 
 
 	if (!ex) {
 	if (!ex) {
@@ -1290,6 +1354,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
 		trace_kvm_hv_flush_tlb(flush.processor_mask,
 		trace_kvm_hv_flush_tlb(flush.processor_mask,
 				       flush.address_space, flush.flags);
 				       flush.address_space, flush.flags);
 
 
+		valid_bank_mask = BIT_ULL(0);
 		sparse_banks[0] = flush.processor_mask;
 		sparse_banks[0] = flush.processor_mask;
 		all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS;
 		all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS;
 	} else {
 	} else {
@@ -1306,7 +1371,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
 		all_cpus = flush_ex.hv_vp_set.format !=
 		all_cpus = flush_ex.hv_vp_set.format !=
 			HV_GENERIC_SET_SPARSE_4K;
 			HV_GENERIC_SET_SPARSE_4K;
 
 
-		sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
+		sparse_banks_len =
+			bitmap_weight((unsigned long *)&valid_bank_mask, 64) *
 			sizeof(sparse_banks[0]);
 			sizeof(sparse_banks[0]);
 
 
 		if (!sparse_banks_len && !all_cpus)
 		if (!sparse_banks_len && !all_cpus)
@@ -1321,48 +1387,19 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
 			return HV_STATUS_INVALID_HYPERCALL_INPUT;
 			return HV_STATUS_INVALID_HYPERCALL_INPUT;
 	}
 	}
 
 
-	cpumask_clear(&hv_current->tlb_lush);
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
-		int bank = hv->vp_index / 64, sbank = 0;
-
-		if (!all_cpus) {
-			/* Banks >64 can't be represented */
-			if (bank >= 64)
-				continue;
-
-			/* Non-ex hypercalls can only address first 64 vCPUs */
-			if (!ex && bank)
-				continue;
-
-			if (ex) {
-				/*
-				 * Check is the bank of this vCPU is in sparse
-				 * set and get the sparse bank number.
-				 */
-				sbank = get_sparse_bank_no(valid_bank_mask,
-							   bank);
-
-				if (sbank < 0)
-					continue;
-			}
-
-			if (!(sparse_banks[sbank] & BIT_ULL(hv->vp_index % 64)))
-				continue;
-		}
+	cpumask_clear(&hv_vcpu->tlb_flush);
 
 
-		/*
-		 * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we
-		 * can't analyze it here, flush TLB regardless of the specified
-		 * address space.
-		 */
-		__set_bit(i, vcpu_bitmap);
-	}
+	vcpu_mask = all_cpus ? NULL :
+		sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
+					vp_bitmap, vcpu_bitmap);
 
 
+	/*
+	 * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
+	 * analyze it here, flush TLB regardless of the specified address space.
+	 */
 	kvm_make_vcpus_request_mask(kvm,
 	kvm_make_vcpus_request_mask(kvm,
 				    KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
 				    KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
-				    vcpu_bitmap, &hv_current->tlb_lush);
+				    vcpu_mask, &hv_vcpu->tlb_flush);
 
 
 ret_success:
 ret_success:
 	/* We always do full TLB flush, set rep_done = rep_cnt. */
 	/* We always do full TLB flush, set rep_done = rep_cnt. */
@@ -1370,6 +1407,99 @@ ret_success:
 		((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
 		((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
 }
 }
 
 
+static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
+				 unsigned long *vcpu_bitmap)
+{
+	struct kvm_lapic_irq irq = {
+		.delivery_mode = APIC_DM_FIXED,
+		.vector = vector
+	};
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
+			continue;
+
+		/* We fail only when APIC is disabled */
+		kvm_apic_set_irq(vcpu, &irq, NULL);
+	}
+}
+
+static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa,
+			   bool ex, bool fast)
+{
+	struct kvm *kvm = current_vcpu->kvm;
+	struct hv_send_ipi_ex send_ipi_ex;
+	struct hv_send_ipi send_ipi;
+	u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
+	DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
+	unsigned long *vcpu_mask;
+	unsigned long valid_bank_mask;
+	u64 sparse_banks[64];
+	int sparse_banks_len;
+	u32 vector;
+	bool all_cpus;
+
+	if (!ex) {
+		if (!fast) {
+			if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi,
+						    sizeof(send_ipi))))
+				return HV_STATUS_INVALID_HYPERCALL_INPUT;
+			sparse_banks[0] = send_ipi.cpu_mask;
+			vector = send_ipi.vector;
+		} else {
+			/* 'reserved' part of hv_send_ipi should be 0 */
+			if (unlikely(ingpa >> 32 != 0))
+				return HV_STATUS_INVALID_HYPERCALL_INPUT;
+			sparse_banks[0] = outgpa;
+			vector = (u32)ingpa;
+		}
+		all_cpus = false;
+		valid_bank_mask = BIT_ULL(0);
+
+		trace_kvm_hv_send_ipi(vector, sparse_banks[0]);
+	} else {
+		if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi_ex,
+					    sizeof(send_ipi_ex))))
+			return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+		trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector,
+					 send_ipi_ex.vp_set.format,
+					 send_ipi_ex.vp_set.valid_bank_mask);
+
+		vector = send_ipi_ex.vector;
+		valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
+		sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
+			sizeof(sparse_banks[0]);
+
+		all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
+
+		if (!sparse_banks_len)
+			goto ret_success;
+
+		if (!all_cpus &&
+		    kvm_read_guest(kvm,
+				   ingpa + offsetof(struct hv_send_ipi_ex,
+						    vp_set.bank_contents),
+				   sparse_banks,
+				   sparse_banks_len))
+			return HV_STATUS_INVALID_HYPERCALL_INPUT;
+	}
+
+	if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
+		return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+	vcpu_mask = all_cpus ? NULL :
+		sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
+					vp_bitmap, vcpu_bitmap);
+
+	kvm_send_ipi_to_many(kvm, vector, vcpu_mask);
+
+ret_success:
+	return HV_STATUS_SUCCESS;
+}
+
 bool kvm_hv_hypercall_enabled(struct kvm *kvm)
 bool kvm_hv_hypercall_enabled(struct kvm *kvm)
 {
 {
 	return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE;
 	return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE;
@@ -1539,6 +1669,20 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 		}
 		}
 		ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true);
 		ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true);
 		break;
 		break;
+	case HVCALL_SEND_IPI:
+		if (unlikely(rep)) {
+			ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+			break;
+		}
+		ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, false, fast);
+		break;
+	case HVCALL_SEND_IPI_EX:
+		if (unlikely(fast || rep)) {
+			ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+			break;
+		}
+		ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, true, false);
+		break;
 	default:
 	default:
 		ret = HV_STATUS_INVALID_HYPERCALL_CODE;
 		ret = HV_STATUS_INVALID_HYPERCALL_CODE;
 		break;
 		break;
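
The hyperv.c changes above replace the per-vCPU get_sparse_bank_no() scan with a single sparse_set_to_vcpu_mask() expansion: the guest hands the hypercall a 64-bit valid_bank_mask plus one 64-bit entry in sparse_banks[] for every set bank, and the helper unpacks that into a flat bitmap with one bit per VP index before the TLB-flush or IPI request is fanned out. A minimal user-space sketch of that expansion (the helper name, MAX_BANKS and the demo values below are invented for illustration, this is not code from the patch):

/*
 * Expand a Hyper-V sparse VP set, a 64-bit valid_bank_mask plus one
 * 64-bit entry in sparse_banks[] per set bank, into a flat bitmap with
 * one bit per VP index.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_BANKS 64			/* one u64 per bank of 64 VP indexes */

static void expand_sparse_set(uint64_t valid_bank_mask,
			      const uint64_t *sparse_banks,
			      uint64_t *vp_bitmap)
{
	int bank, sbank = 0;

	for (bank = 0; bank < MAX_BANKS; bank++) {
		vp_bitmap[bank] = 0;
		if (valid_bank_mask & (1ULL << bank))
			/* banks are consumed in ascending order */
			vp_bitmap[bank] = sparse_banks[sbank++];
	}
}

int main(void)
{
	/* banks 0 and 2 are present: VPs 1 and 5, plus VP 130 (2 * 64 + 2) */
	uint64_t sparse_banks[] = { (1ULL << 1) | (1ULL << 5), 1ULL << 2 };
	uint64_t valid_bank_mask = (1ULL << 0) | (1ULL << 2);
	uint64_t vp_bitmap[MAX_BANKS];
	int vp;

	expand_sparse_set(valid_bank_mask, sparse_banks, vp_bitmap);

	for (vp = 0; vp < MAX_BANKS * 64; vp++)
		if (vp_bitmap[vp / 64] & (1ULL << (vp % 64)))
			printf("VP %d is in the set\n", vp);	/* 1, 5, 130 */
	return 0;
}

In the patch itself, when no vCPU has a vp_index that differs from its vcpu index (num_mismatched_vp_indexes is zero) the expanded vp_bitmap doubles directly as the vcpu mask handed to kvm_make_vcpus_request_mask() or kvm_send_ipi_to_many(); otherwise one extra pass translates VP indexes into vcpu indexes.
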

+ 4 - 0
arch/x86/kvm/hyperv.h

@@ -62,6 +62,10 @@ void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
 void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu);
 void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu);
 void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
 void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
 
 
+bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
+bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
+			    struct hv_vp_assist_page *assist_page);
+
 static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu,
 static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu,
 							int timer_index)
 							int timer_index)
 {
 {

+ 38 - 7
arch/x86/kvm/lapic.c

@@ -70,6 +70,11 @@
 #define APIC_BROADCAST			0xFF
 #define APIC_BROADCAST			0xFF
 #define X2APIC_BROADCAST		0xFFFFFFFFul
 #define X2APIC_BROADCAST		0xFFFFFFFFul
 
 
+static bool lapic_timer_advance_adjust_done = false;
+#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100
+/* step-by-step approximation to mitigate fluctuation */
+#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
+
 static inline int apic_test_vector(int vec, void *bitmap)
 static inline int apic_test_vector(int vec, void *bitmap)
 {
 {
 	return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
 	return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -955,14 +960,14 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 	map = rcu_dereference(kvm->arch.apic_map);
 	map = rcu_dereference(kvm->arch.apic_map);
 
 
 	ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
 	ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
-	if (ret)
+	if (ret) {
+		*r = 0;
 		for_each_set_bit(i, &bitmap, 16) {
 		for_each_set_bit(i, &bitmap, 16) {
 			if (!dst[i])
 			if (!dst[i])
 				continue;
 				continue;
-			if (*r < 0)
-				*r = 0;
 			*r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
 			*r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
 		}
 		}
+	}
 
 
 	rcu_read_unlock();
 	rcu_read_unlock();
 	return ret;
 	return ret;
@@ -1472,7 +1477,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
 void wait_lapic_expire(struct kvm_vcpu *vcpu)
 void wait_lapic_expire(struct kvm_vcpu *vcpu)
 {
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	struct kvm_lapic *apic = vcpu->arch.apic;
-	u64 guest_tsc, tsc_deadline;
+	u64 guest_tsc, tsc_deadline, ns;
 
 
 	if (!lapic_in_kernel(vcpu))
 	if (!lapic_in_kernel(vcpu))
 		return;
 		return;
@@ -1492,6 +1497,24 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
 	if (guest_tsc < tsc_deadline)
 	if (guest_tsc < tsc_deadline)
 		__delay(min(tsc_deadline - guest_tsc,
 		__delay(min(tsc_deadline - guest_tsc,
 			nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
 			nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
+
+	if (!lapic_timer_advance_adjust_done) {
+		/* too early */
+		if (guest_tsc < tsc_deadline) {
+			ns = (tsc_deadline - guest_tsc) * 1000000ULL;
+			do_div(ns, vcpu->arch.virtual_tsc_khz);
+			lapic_timer_advance_ns -= min((unsigned int)ns,
+				lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+		} else {
+		/* too late */
+			ns = (guest_tsc - tsc_deadline) * 1000000ULL;
+			do_div(ns, vcpu->arch.virtual_tsc_khz);
+			lapic_timer_advance_ns += min((unsigned int)ns,
+				lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+		}
+		if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
+			lapic_timer_advance_adjust_done = true;
+	}
 }
 }
 
 
 static void start_sw_tscdeadline(struct kvm_lapic *apic)
 static void start_sw_tscdeadline(struct kvm_lapic *apic)
@@ -2621,17 +2644,25 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
 	return 0;
 	return 0;
 }
 }
 
 
-int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
+int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
 {
 {
 	u64 addr = data & ~KVM_MSR_ENABLED;
 	u64 addr = data & ~KVM_MSR_ENABLED;
+	struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data;
+	unsigned long new_len;
+
 	if (!IS_ALIGNED(addr, 4))
 	if (!IS_ALIGNED(addr, 4))
 		return 1;
 		return 1;
 
 
 	vcpu->arch.pv_eoi.msr_val = data;
 	vcpu->arch.pv_eoi.msr_val = data;
 	if (!pv_eoi_enabled(vcpu))
 	if (!pv_eoi_enabled(vcpu))
 		return 0;
 		return 0;
-	return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
-					 addr, sizeof(u8));
+
+	if (addr == ghc->gpa && len <= ghc->len)
+		new_len = ghc->len;
+	else
+		new_len = len;
+
+	return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
 }
 }
 
 
 void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
 void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
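
The lapic.c hunks above add a one-shot auto-tuning pass for lapic_timer_advance_ns: after each injected timer interrupt the guest TSC is compared with the programmed deadline, the advance value is nudged toward the observed latency by at most 1/8 of its current value per step (LAPIC_TIMER_ADVANCE_ADJUST_STEP), and tuning latches off once the residual error drops under 100 cycles (LAPIC_TIMER_ADVANCE_ADJUST_DONE). A toy user-space model of that loop, with an assumed fixed injection latency and TSC frequency (nothing below is kernel code), might look like:

/* Simulate the step-by-step approximation of lapic_timer_advance_ns. */
#include <stdio.h>
#include <stdlib.h>

#define ADJUST_STEP	8	/* mirrors LAPIC_TIMER_ADVANCE_ADJUST_STEP */
#define ADJUST_DONE	100	/* mirrors LAPIC_TIMER_ADVANCE_ADJUST_DONE */

int main(void)
{
	unsigned int advance_ns = 1000;		/* module parameter default */
	const long true_latency_ns = 4200;	/* assumed injection latency */
	const long tsc_khz = 2000000;		/* assumed 2 GHz guest TSC */
	int done = 0, iter;

	for (iter = 0; iter < 50 && !done; iter++) {
		/* error in cycles: positive means the interrupt was late */
		long long err_cycles = (long long)(true_latency_ns -
				       (long)advance_ns) * tsc_khz / 1000000;
		long long err_ns = llabs(err_cycles) * 1000000 / tsc_khz;
		unsigned int step = advance_ns / ADJUST_STEP;

		if (err_cycles < 0)		/* too early: back off */
			advance_ns -= (err_ns < step) ? err_ns : step;
		else				/* too late: advance more */
			advance_ns += (err_ns < step) ? err_ns : step;

		if (llabs(err_cycles) < ADJUST_DONE)
			done = 1;	/* tuning switches itself off */

		printf("iter %2d: advance_ns = %u\n", iter, advance_ns);
	}
	return 0;
}

With the values assumed here the advance climbs from the 1000 ns default to the simulated 4200 ns latency in roughly a dozen steps and then stops adjusting; capping each step at advance_ns / 8 is what keeps a single noisy measurement from swinging the value wildly, which is the "step-by-step approximation to mitigate fluctuation" the patch comment refers to.
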

+ 1 - 1
arch/x86/kvm/lapic.h

@@ -120,7 +120,7 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
 	return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
 	return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
 }
 }
 
 
-int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
+int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
 void kvm_lapic_init(void);
 void kvm_lapic_init(void);
 void kvm_lapic_exit(void);
 void kvm_lapic_exit(void);
 
 

+ 235 - 158
arch/x86/kvm/mmu.c

@@ -932,7 +932,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
 	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
 		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
 		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
 		if (!obj)
 		if (!obj)
-			return -ENOMEM;
+			return cache->nobjs >= min ? 0 : -ENOMEM;
 		cache->objects[cache->nobjs++] = obj;
 		cache->objects[cache->nobjs++] = obj;
 	}
 	}
 	return 0;
 	return 0;
@@ -960,7 +960,7 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
 	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
 	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
 		page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
 		page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
 		if (!page)
 		if (!page)
-			return -ENOMEM;
+			return cache->nobjs >= min ? 0 : -ENOMEM;
 		cache->objects[cache->nobjs++] = page;
 		cache->objects[cache->nobjs++] = page;
 	}
 	}
 	return 0;
 	return 0;
@@ -1265,24 +1265,24 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
 	mmu_free_pte_list_desc(desc);
 	mmu_free_pte_list_desc(desc);
 }
 }
 
 
-static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
+static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
 {
 {
 	struct pte_list_desc *desc;
 	struct pte_list_desc *desc;
 	struct pte_list_desc *prev_desc;
 	struct pte_list_desc *prev_desc;
 	int i;
 	int i;
 
 
 	if (!rmap_head->val) {
 	if (!rmap_head->val) {
-		printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
+		pr_err("%s: %p 0->BUG\n", __func__, spte);
 		BUG();
 		BUG();
 	} else if (!(rmap_head->val & 1)) {
 	} else if (!(rmap_head->val & 1)) {
-		rmap_printk("pte_list_remove:  %p 1->0\n", spte);
+		rmap_printk("%s:  %p 1->0\n", __func__, spte);
 		if ((u64 *)rmap_head->val != spte) {
 		if ((u64 *)rmap_head->val != spte) {
-			printk(KERN_ERR "pte_list_remove:  %p 1->BUG\n", spte);
+			pr_err("%s:  %p 1->BUG\n", __func__, spte);
 			BUG();
 			BUG();
 		}
 		}
 		rmap_head->val = 0;
 		rmap_head->val = 0;
 	} else {
 	} else {
-		rmap_printk("pte_list_remove:  %p many->many\n", spte);
+		rmap_printk("%s:  %p many->many\n", __func__, spte);
 		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
 		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
 		prev_desc = NULL;
 		prev_desc = NULL;
 		while (desc) {
 		while (desc) {
@@ -1296,11 +1296,17 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
 			prev_desc = desc;
 			prev_desc = desc;
 			desc = desc->more;
 			desc = desc->more;
 		}
 		}
-		pr_err("pte_list_remove: %p many->many\n", spte);
+		pr_err("%s: %p many->many\n", __func__, spte);
 		BUG();
 		BUG();
 	}
 	}
 }
 }
 
 
+static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
+{
+	mmu_spte_clear_track_bits(sptep);
+	__pte_list_remove(sptep, rmap_head);
+}
+
 static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
 static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
 					   struct kvm_memory_slot *slot)
 					   struct kvm_memory_slot *slot)
 {
 {
@@ -1349,7 +1355,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	sp = page_header(__pa(spte));
 	sp = page_header(__pa(spte));
 	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
 	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
 	rmap_head = gfn_to_rmap(kvm, gfn, sp);
 	rmap_head = gfn_to_rmap(kvm, gfn, sp);
-	pte_list_remove(spte, rmap_head);
+	__pte_list_remove(spte, rmap_head);
 }
 }
 
 
 /*
 /*
@@ -1685,7 +1691,7 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
 	while ((sptep = rmap_get_first(rmap_head, &iter))) {
 	while ((sptep = rmap_get_first(rmap_head, &iter))) {
 		rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
 		rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
 
 
-		drop_spte(kvm, sptep);
+		pte_list_remove(rmap_head, sptep);
 		flush = true;
 		flush = true;
 	}
 	}
 
 
@@ -1721,7 +1727,7 @@ restart:
 		need_flush = 1;
 		need_flush = 1;
 
 
 		if (pte_write(*ptep)) {
 		if (pte_write(*ptep)) {
-			drop_spte(kvm, sptep);
+			pte_list_remove(rmap_head, sptep);
 			goto restart;
 			goto restart;
 		} else {
 		} else {
 			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
 			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
@@ -1988,7 +1994,7 @@ static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 				       u64 *parent_pte)
 				       u64 *parent_pte)
 {
 {
-	pte_list_remove(parent_pte, &sp->parent_ptes);
+	__pte_list_remove(parent_pte, &sp->parent_ptes);
 }
 }
 
 
 static void drop_parent_pte(struct kvm_mmu_page *sp,
 static void drop_parent_pte(struct kvm_mmu_page *sp,
@@ -2181,7 +2187,7 @@ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			    struct list_head *invalid_list)
 			    struct list_head *invalid_list)
 {
 {
 	if (sp->role.cr4_pae != !!is_pae(vcpu)
 	if (sp->role.cr4_pae != !!is_pae(vcpu)
-	    || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
+	    || vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
 		return false;
 		return false;
 	}
 	}
@@ -2375,14 +2381,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	int collisions = 0;
 	int collisions = 0;
 	LIST_HEAD(invalid_list);
 	LIST_HEAD(invalid_list);
 
 
-	role = vcpu->arch.mmu.base_role;
+	role = vcpu->arch.mmu->mmu_role.base;
 	role.level = level;
 	role.level = level;
 	role.direct = direct;
 	role.direct = direct;
 	if (role.direct)
 	if (role.direct)
 		role.cr4_pae = 0;
 		role.cr4_pae = 0;
 	role.access = access;
 	role.access = access;
-	if (!vcpu->arch.mmu.direct_map
-	    && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+	if (!vcpu->arch.mmu->direct_map
+	    && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
 		role.quadrant = quadrant;
 		role.quadrant = quadrant;
@@ -2457,11 +2463,11 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
 {
 {
 	iterator->addr = addr;
 	iterator->addr = addr;
 	iterator->shadow_addr = root;
 	iterator->shadow_addr = root;
-	iterator->level = vcpu->arch.mmu.shadow_root_level;
+	iterator->level = vcpu->arch.mmu->shadow_root_level;
 
 
 	if (iterator->level == PT64_ROOT_4LEVEL &&
 	if (iterator->level == PT64_ROOT_4LEVEL &&
-	    vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL &&
-	    !vcpu->arch.mmu.direct_map)
+	    vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
+	    !vcpu->arch.mmu->direct_map)
 		--iterator->level;
 		--iterator->level;
 
 
 	if (iterator->level == PT32E_ROOT_LEVEL) {
 	if (iterator->level == PT32E_ROOT_LEVEL) {
@@ -2469,10 +2475,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
 		 * prev_root is currently only used for 64-bit hosts. So only
 		 * prev_root is currently only used for 64-bit hosts. So only
 		 * the active root_hpa is valid here.
 		 * the active root_hpa is valid here.
 		 */
 		 */
-		BUG_ON(root != vcpu->arch.mmu.root_hpa);
+		BUG_ON(root != vcpu->arch.mmu->root_hpa);
 
 
 		iterator->shadow_addr
 		iterator->shadow_addr
-			= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+			= vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
 		iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
 		iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
 		--iterator->level;
 		--iterator->level;
 		if (!iterator->shadow_addr)
 		if (!iterator->shadow_addr)
@@ -2483,7 +2489,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
 			     struct kvm_vcpu *vcpu, u64 addr)
 			     struct kvm_vcpu *vcpu, u64 addr)
 {
 {
-	shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa,
+	shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
 				    addr);
 				    addr);
 }
 }
 
 
@@ -3095,7 +3101,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
 	int emulate = 0;
 	int emulate = 0;
 	gfn_t pseudo_gfn;
 	gfn_t pseudo_gfn;
 
 
-	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+	if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
 		return 0;
 		return 0;
 
 
 	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
 	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
@@ -3301,7 +3307,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 	u64 spte = 0ull;
 	u64 spte = 0ull;
 	uint retry_count = 0;
 	uint retry_count = 0;
 
 
-	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+	if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
 		return false;
 		return false;
 
 
 	if (!page_fault_can_be_fast(error_code))
 	if (!page_fault_can_be_fast(error_code))
@@ -3471,11 +3477,11 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 }
 }
 
 
 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
-void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free)
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+			ulong roots_to_free)
 {
 {
 	int i;
 	int i;
 	LIST_HEAD(invalid_list);
 	LIST_HEAD(invalid_list);
-	struct kvm_mmu *mmu = &vcpu->arch.mmu;
 	bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
 	bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
 
 
 	BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
 	BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
@@ -3535,20 +3541,20 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 	struct kvm_mmu_page *sp;
 	struct kvm_mmu_page *sp;
 	unsigned i;
 	unsigned i;
 
 
-	if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) {
+	if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
 		spin_lock(&vcpu->kvm->mmu_lock);
 		spin_lock(&vcpu->kvm->mmu_lock);
 		if(make_mmu_pages_available(vcpu) < 0) {
 		if(make_mmu_pages_available(vcpu) < 0) {
 			spin_unlock(&vcpu->kvm->mmu_lock);
 			spin_unlock(&vcpu->kvm->mmu_lock);
 			return -ENOSPC;
 			return -ENOSPC;
 		}
 		}
 		sp = kvm_mmu_get_page(vcpu, 0, 0,
 		sp = kvm_mmu_get_page(vcpu, 0, 0,
-				vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
+				vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
 		++sp->root_count;
 		++sp->root_count;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 		spin_unlock(&vcpu->kvm->mmu_lock);
-		vcpu->arch.mmu.root_hpa = __pa(sp->spt);
-	} else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
+		vcpu->arch.mmu->root_hpa = __pa(sp->spt);
+	} else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
 		for (i = 0; i < 4; ++i) {
 		for (i = 0; i < 4; ++i) {
-			hpa_t root = vcpu->arch.mmu.pae_root[i];
+			hpa_t root = vcpu->arch.mmu->pae_root[i];
 
 
 			MMU_WARN_ON(VALID_PAGE(root));
 			MMU_WARN_ON(VALID_PAGE(root));
 			spin_lock(&vcpu->kvm->mmu_lock);
 			spin_lock(&vcpu->kvm->mmu_lock);
@@ -3561,9 +3567,9 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 			root = __pa(sp->spt);
 			root = __pa(sp->spt);
 			++sp->root_count;
 			++sp->root_count;
 			spin_unlock(&vcpu->kvm->mmu_lock);
 			spin_unlock(&vcpu->kvm->mmu_lock);
-			vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+			vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
 		}
 		}
-		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+		vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
 	} else
 	} else
 		BUG();
 		BUG();
 
 
@@ -3577,7 +3583,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	gfn_t root_gfn;
 	gfn_t root_gfn;
 	int i;
 	int i;
 
 
-	root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
+	root_gfn = vcpu->arch.mmu->get_cr3(vcpu) >> PAGE_SHIFT;
 
 
 	if (mmu_check_root(vcpu, root_gfn))
 	if (mmu_check_root(vcpu, root_gfn))
 		return 1;
 		return 1;
@@ -3586,8 +3592,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	 * Do we shadow a long mode page table? If so we need to
 	 * Do we shadow a long mode page table? If so we need to
 	 * write-protect the guests page table root.
 	 * write-protect the guests page table root.
 	 */
 	 */
-	if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
-		hpa_t root = vcpu->arch.mmu.root_hpa;
+	if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
+		hpa_t root = vcpu->arch.mmu->root_hpa;
 
 
 		MMU_WARN_ON(VALID_PAGE(root));
 		MMU_WARN_ON(VALID_PAGE(root));
 
 
@@ -3597,11 +3603,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 			return -ENOSPC;
 			return -ENOSPC;
 		}
 		}
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
-				vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
+				vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
 		root = __pa(sp->spt);
 		root = __pa(sp->spt);
 		++sp->root_count;
 		++sp->root_count;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 		spin_unlock(&vcpu->kvm->mmu_lock);
-		vcpu->arch.mmu.root_hpa = root;
+		vcpu->arch.mmu->root_hpa = root;
 		return 0;
 		return 0;
 	}
 	}
 
 
@@ -3611,17 +3617,17 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	 * the shadow page table may be a PAE or a long mode page table.
 	 * the shadow page table may be a PAE or a long mode page table.
 	 */
 	 */
 	pm_mask = PT_PRESENT_MASK;
 	pm_mask = PT_PRESENT_MASK;
-	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
+	if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
 		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
 
 	for (i = 0; i < 4; ++i) {
 	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->arch.mmu.pae_root[i];
+		hpa_t root = vcpu->arch.mmu->pae_root[i];
 
 
 		MMU_WARN_ON(VALID_PAGE(root));
 		MMU_WARN_ON(VALID_PAGE(root));
-		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
-			pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
+		if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
+			pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
 			if (!(pdptr & PT_PRESENT_MASK)) {
 			if (!(pdptr & PT_PRESENT_MASK)) {
-				vcpu->arch.mmu.pae_root[i] = 0;
+				vcpu->arch.mmu->pae_root[i] = 0;
 				continue;
 				continue;
 			}
 			}
 			root_gfn = pdptr >> PAGE_SHIFT;
 			root_gfn = pdptr >> PAGE_SHIFT;
@@ -3639,16 +3645,16 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		++sp->root_count;
 		++sp->root_count;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 		spin_unlock(&vcpu->kvm->mmu_lock);
 
 
-		vcpu->arch.mmu.pae_root[i] = root | pm_mask;
+		vcpu->arch.mmu->pae_root[i] = root | pm_mask;
 	}
 	}
-	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+	vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
 
 
 	/*
 	/*
 	 * If we shadow a 32 bit page table with a long mode page
 	 * If we shadow a 32 bit page table with a long mode page
 	 * table we enter this path.
 	 * table we enter this path.
 	 */
 	 */
-	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
-		if (vcpu->arch.mmu.lm_root == NULL) {
+	if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
+		if (vcpu->arch.mmu->lm_root == NULL) {
 			/*
 			/*
 			 * The additional page necessary for this is only
 			 * The additional page necessary for this is only
 			 * allocated on demand.
 			 * allocated on demand.
@@ -3660,12 +3666,12 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 			if (lm_root == NULL)
 			if (lm_root == NULL)
 				return 1;
 				return 1;
 
 
-			lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
+			lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
 
 
-			vcpu->arch.mmu.lm_root = lm_root;
+			vcpu->arch.mmu->lm_root = lm_root;
 		}
 		}
 
 
-		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
+		vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
 	}
 	}
 
 
 	return 0;
 	return 0;
@@ -3673,7 +3679,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 
 
 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 {
 {
-	if (vcpu->arch.mmu.direct_map)
+	if (vcpu->arch.mmu->direct_map)
 		return mmu_alloc_direct_roots(vcpu);
 		return mmu_alloc_direct_roots(vcpu);
 	else
 	else
 		return mmu_alloc_shadow_roots(vcpu);
 		return mmu_alloc_shadow_roots(vcpu);
@@ -3684,17 +3690,16 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 	int i;
 	int i;
 	struct kvm_mmu_page *sp;
 	struct kvm_mmu_page *sp;
 
 
-	if (vcpu->arch.mmu.direct_map)
+	if (vcpu->arch.mmu->direct_map)
 		return;
 		return;
 
 
-	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+	if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
 		return;
 		return;
 
 
 	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
 	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
 
 
-	if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
-		hpa_t root = vcpu->arch.mmu.root_hpa;
-
+	if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
+		hpa_t root = vcpu->arch.mmu->root_hpa;
 		sp = page_header(root);
 		sp = page_header(root);
 
 
 		/*
 		/*
@@ -3725,7 +3730,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 
 
 	for (i = 0; i < 4; ++i) {
 	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->arch.mmu.pae_root[i];
+		hpa_t root = vcpu->arch.mmu->pae_root[i];
 
 
 		if (root && VALID_PAGE(root)) {
 		if (root && VALID_PAGE(root)) {
 			root &= PT64_BASE_ADDR_MASK;
 			root &= PT64_BASE_ADDR_MASK;
@@ -3799,7 +3804,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 	int root, leaf;
 	int root, leaf;
 	bool reserved = false;
 	bool reserved = false;
 
 
-	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+	if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
 		goto exit;
 		goto exit;
 
 
 	walk_shadow_page_lockless_begin(vcpu);
 	walk_shadow_page_lockless_begin(vcpu);
@@ -3816,7 +3821,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 		if (!is_shadow_present_pte(spte))
 		if (!is_shadow_present_pte(spte))
 			break;
 			break;
 
 
-		reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte,
+		reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
 						    iterator.level);
 						    iterator.level);
 	}
 	}
 
 
@@ -3895,7 +3900,7 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_shadow_walk_iterator iterator;
 	u64 spte;
 	u64 spte;
 
 
-	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+	if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
 		return;
 		return;
 
 
 	walk_shadow_page_lockless_begin(vcpu);
 	walk_shadow_page_lockless_begin(vcpu);
@@ -3922,7 +3927,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 	if (r)
 	if (r)
 		return r;
 		return r;
 
 
-	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
 
 
 
 
 	return nonpaging_map(vcpu, gva & PAGE_MASK,
 	return nonpaging_map(vcpu, gva & PAGE_MASK,
@@ -3935,8 +3940,8 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
 
 
 	arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
 	arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
 	arch.gfn = gfn;
 	arch.gfn = gfn;
-	arch.direct_map = vcpu->arch.mmu.direct_map;
-	arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
+	arch.direct_map = vcpu->arch.mmu->direct_map;
+	arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
 
 
 	return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 	return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }
 }
@@ -4042,7 +4047,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	int write = error_code & PFERR_WRITE_MASK;
 	int write = error_code & PFERR_WRITE_MASK;
 	bool map_writable;
 	bool map_writable;
 
 
-	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
 
 
 	if (page_fault_handle_page_track(vcpu, error_code, gfn))
 	if (page_fault_handle_page_track(vcpu, error_code, gfn))
 		return RET_PF_EMULATE;
 		return RET_PF_EMULATE;
@@ -4118,7 +4123,7 @@ static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 {
 {
 	uint i;
 	uint i;
 	struct kvm_mmu_root_info root;
 	struct kvm_mmu_root_info root;
-	struct kvm_mmu *mmu = &vcpu->arch.mmu;
+	struct kvm_mmu *mmu = vcpu->arch.mmu;
 
 
 	root.cr3 = mmu->get_cr3(vcpu);
 	root.cr3 = mmu->get_cr3(vcpu);
 	root.hpa = mmu->root_hpa;
 	root.hpa = mmu->root_hpa;
@@ -4141,7 +4146,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 			    union kvm_mmu_page_role new_role,
 			    union kvm_mmu_page_role new_role,
 			    bool skip_tlb_flush)
 			    bool skip_tlb_flush)
 {
 {
-	struct kvm_mmu *mmu = &vcpu->arch.mmu;
+	struct kvm_mmu *mmu = vcpu->arch.mmu;
 
 
 	/*
 	/*
 	 * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
 	 * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
@@ -4192,7 +4197,8 @@ static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 			      bool skip_tlb_flush)
 			      bool skip_tlb_flush)
 {
 {
 	if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
 	if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
-		kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
+		kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
+				   KVM_MMU_ROOT_CURRENT);
 }
 }
 
 
 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
@@ -4210,7 +4216,7 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu)
 static void inject_page_fault(struct kvm_vcpu *vcpu,
 static void inject_page_fault(struct kvm_vcpu *vcpu,
 			      struct x86_exception *fault)
 			      struct x86_exception *fault)
 {
 {
-	vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+	vcpu->arch.mmu->inject_page_fault(vcpu, fault);
 }
 }
 
 
 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
@@ -4414,7 +4420,8 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
 void
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 {
 {
-	bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
+	bool uses_nx = context->nx ||
+		context->mmu_role.base.smep_andnot_wp;
 	struct rsvd_bits_validate *shadow_zero_check;
 	struct rsvd_bits_validate *shadow_zero_check;
 	int i;
 	int i;
 
 
@@ -4553,7 +4560,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
 			 * SMAP:kernel-mode data accesses from user-mode
 			 * SMAP:kernel-mode data accesses from user-mode
 			 * mappings should fault. A fault is considered
 			 * mappings should fault. A fault is considered
 			 * as a SMAP violation if all of the following
 			 * as a SMAP violation if all of the following
-			 * conditions are ture:
+			 * conditions are true:
 			 *   - X86_CR4_SMAP is set in CR4
 			 *   - X86_CR4_SMAP is set in CR4
 			 *   - A user page is accessed
 			 *   - A user page is accessed
 			 *   - The access is not a fetch
 			 *   - The access is not a fetch
@@ -4714,27 +4721,65 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu,
 	paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
 	paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
 }
 }
 
 
-static union kvm_mmu_page_role
-kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu)
+static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
+{
+	union kvm_mmu_extended_role ext = {0};
+
+	ext.cr0_pg = !!is_paging(vcpu);
+	ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+	ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
+	ext.cr4_pse = !!is_pse(vcpu);
+	ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
+	ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
+
+	ext.valid = 1;
+
+	return ext;
+}
+
+static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
+						   bool base_only)
+{
+	union kvm_mmu_role role = {0};
+
+	role.base.access = ACC_ALL;
+	role.base.nxe = !!is_nx(vcpu);
+	role.base.cr4_pae = !!is_pae(vcpu);
+	role.base.cr0_wp = is_write_protection(vcpu);
+	role.base.smm = is_smm(vcpu);
+	role.base.guest_mode = is_guest_mode(vcpu);
+
+	if (base_only)
+		return role;
+
+	role.ext = kvm_calc_mmu_role_ext(vcpu);
+
+	return role;
+}
+
+static union kvm_mmu_role
+kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
 {
 {
-	union kvm_mmu_page_role role = {0};
+	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
 
 
-	role.guest_mode = is_guest_mode(vcpu);
-	role.smm = is_smm(vcpu);
-	role.ad_disabled = (shadow_accessed_mask == 0);
-	role.level = kvm_x86_ops->get_tdp_level(vcpu);
-	role.direct = true;
-	role.access = ACC_ALL;
+	role.base.ad_disabled = (shadow_accessed_mask == 0);
+	role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
+	role.base.direct = true;
 
 
 	return role;
 	return role;
 }
 }
 
 
 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 {
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
+	struct kvm_mmu *context = vcpu->arch.mmu;
+	union kvm_mmu_role new_role =
+		kvm_calc_tdp_mmu_root_page_role(vcpu, false);
 
 
-	context->base_role.word = mmu_base_role_mask.word &
-				  kvm_calc_tdp_mmu_root_page_role(vcpu).word;
+	new_role.base.word &= mmu_base_role_mask.word;
+	if (new_role.as_u64 == context->mmu_role.as_u64)
+		return;
+
+	context->mmu_role.as_u64 = new_role.as_u64;
 	context->page_fault = tdp_page_fault;
 	context->page_fault = tdp_page_fault;
 	context->sync_page = nonpaging_sync_page;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;
 	context->invlpg = nonpaging_invlpg;
@@ -4774,36 +4819,36 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	reset_tdp_shadow_zero_bits_mask(vcpu, context);
 	reset_tdp_shadow_zero_bits_mask(vcpu, context);
 }
 }
 
 
-static union kvm_mmu_page_role
-kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu)
-{
-	union kvm_mmu_page_role role = {0};
-	bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
-	bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
-
-	role.nxe = is_nx(vcpu);
-	role.cr4_pae = !!is_pae(vcpu);
-	role.cr0_wp  = is_write_protection(vcpu);
-	role.smep_andnot_wp = smep && !is_write_protection(vcpu);
-	role.smap_andnot_wp = smap && !is_write_protection(vcpu);
-	role.guest_mode = is_guest_mode(vcpu);
-	role.smm = is_smm(vcpu);
-	role.direct = !is_paging(vcpu);
-	role.access = ACC_ALL;
+static union kvm_mmu_role
+kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+{
+	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
+
+	role.base.smep_andnot_wp = role.ext.cr4_smep &&
+		!is_write_protection(vcpu);
+	role.base.smap_andnot_wp = role.ext.cr4_smap &&
+		!is_write_protection(vcpu);
+	role.base.direct = !is_paging(vcpu);
 
 
 	if (!is_long_mode(vcpu))
 	if (!is_long_mode(vcpu))
-		role.level = PT32E_ROOT_LEVEL;
+		role.base.level = PT32E_ROOT_LEVEL;
 	else if (is_la57_mode(vcpu))
 	else if (is_la57_mode(vcpu))
-		role.level = PT64_ROOT_5LEVEL;
+		role.base.level = PT64_ROOT_5LEVEL;
 	else
 	else
-		role.level = PT64_ROOT_4LEVEL;
+		role.base.level = PT64_ROOT_4LEVEL;
 
 
 	return role;
 	return role;
 }
 }
 
 
 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
 {
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
+	struct kvm_mmu *context = vcpu->arch.mmu;
+	union kvm_mmu_role new_role =
+		kvm_calc_shadow_mmu_root_page_role(vcpu, false);
+
+	new_role.base.word &= mmu_base_role_mask.word;
+	if (new_role.as_u64 == context->mmu_role.as_u64)
+		return;
 
 
 	if (!is_paging(vcpu))
 	if (!is_paging(vcpu))
 		nonpaging_init_context(vcpu, context);
 		nonpaging_init_context(vcpu, context);
@@ -4814,22 +4859,28 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
 	else
 	else
 		paging32_init_context(vcpu, context);
 		paging32_init_context(vcpu, context);
 
 
-	context->base_role.word = mmu_base_role_mask.word &
-				  kvm_calc_shadow_mmu_root_page_role(vcpu).word;
+	context->mmu_role.as_u64 = new_role.as_u64;
 	reset_shadow_zero_bits_mask(vcpu, context);
 	reset_shadow_zero_bits_mask(vcpu, context);
 }
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
 
-static union kvm_mmu_page_role
-kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty)
+static union kvm_mmu_role
+kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
+				   bool execonly)
 {
 {
-	union kvm_mmu_page_role role = vcpu->arch.mmu.base_role;
+	union kvm_mmu_role role;
+
+	/* Base role is inherited from root_mmu */
+	role.base.word = vcpu->arch.root_mmu.mmu_role.base.word;
+	role.ext = kvm_calc_mmu_role_ext(vcpu);
+
+	role.base.level = PT64_ROOT_4LEVEL;
+	role.base.direct = false;
+	role.base.ad_disabled = !accessed_dirty;
+	role.base.guest_mode = true;
+	role.base.access = ACC_ALL;
 
 
-	role.level = PT64_ROOT_4LEVEL;
-	role.direct = false;
-	role.ad_disabled = !accessed_dirty;
-	role.guest_mode = true;
-	role.access = ACC_ALL;
+	role.ext.execonly = execonly;
 
 
 	return role;
 	return role;
 }
 }
@@ -4837,11 +4888,17 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty)
 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 			     bool accessed_dirty, gpa_t new_eptp)
 			     bool accessed_dirty, gpa_t new_eptp)
 {
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
-	union kvm_mmu_page_role root_page_role =
-		kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty);
+	struct kvm_mmu *context = vcpu->arch.mmu;
+	union kvm_mmu_role new_role =
+		kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
+						   execonly);
+
+	__kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
+
+	new_role.base.word &= mmu_base_role_mask.word;
+	if (new_role.as_u64 == context->mmu_role.as_u64)
+		return;
 
 
-	__kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false);
 	context->shadow_root_level = PT64_ROOT_4LEVEL;
 	context->shadow_root_level = PT64_ROOT_4LEVEL;
 
 
 	context->nx = true;
 	context->nx = true;
@@ -4853,7 +4910,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 	context->update_pte = ept_update_pte;
 	context->update_pte = ept_update_pte;
 	context->root_level = PT64_ROOT_4LEVEL;
 	context->root_level = PT64_ROOT_4LEVEL;
 	context->direct_map = false;
 	context->direct_map = false;
-	context->base_role.word = root_page_role.word & mmu_base_role_mask.word;
+	context->mmu_role.as_u64 = new_role.as_u64;
+
 	update_permission_bitmask(vcpu, context, true);
 	update_permission_bitmask(vcpu, context, true);
 	update_pkru_bitmask(vcpu, context, true);
 	update_pkru_bitmask(vcpu, context, true);
 	update_last_nonleaf_level(vcpu, context);
 	update_last_nonleaf_level(vcpu, context);
@@ -4864,7 +4922,7 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
 
 
 static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
 static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
+	struct kvm_mmu *context = vcpu->arch.mmu;
 
 
 	kvm_init_shadow_mmu(vcpu);
 	kvm_init_shadow_mmu(vcpu);
 	context->set_cr3           = kvm_x86_ops->set_cr3;
 	context->set_cr3           = kvm_x86_ops->set_cr3;
@@ -4875,14 +4933,20 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
 
 
 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 {
 {
+	union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
 	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
 	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
 
 
+	new_role.base.word &= mmu_base_role_mask.word;
+	if (new_role.as_u64 == g_context->mmu_role.as_u64)
+		return;
+
+	g_context->mmu_role.as_u64 = new_role.as_u64;
 	g_context->get_cr3           = get_cr3;
 	g_context->get_cr3           = get_cr3;
 	g_context->get_pdptr         = kvm_pdptr_read;
 	g_context->get_pdptr         = kvm_pdptr_read;
 	g_context->inject_page_fault = kvm_inject_page_fault;
 	g_context->inject_page_fault = kvm_inject_page_fault;
 
 
 	/*
 	/*
-	 * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using
+	 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
 	 * L1's nested page tables (e.g. EPT12). The nested translation
 	 * L1's nested page tables (e.g. EPT12). The nested translation
 	 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
 	 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
 	 * L2's page tables as the first level of translation and L1's
 	 * L2's page tables as the first level of translation and L1's
@@ -4921,10 +4985,10 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
 	if (reset_roots) {
 	if (reset_roots) {
 		uint i;
 		uint i;
 
 
-		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+		vcpu->arch.mmu->root_hpa = INVALID_PAGE;
 
 
 		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-			vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+			vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
 	}
 	}
 
 
 	if (mmu_is_nested(vcpu))
 	if (mmu_is_nested(vcpu))
@@ -4939,10 +5003,14 @@ EXPORT_SYMBOL_GPL(kvm_init_mmu);
 static union kvm_mmu_page_role
 static union kvm_mmu_page_role
 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
 {
 {
+	union kvm_mmu_role role;
+
 	if (tdp_enabled)
 	if (tdp_enabled)
-		return kvm_calc_tdp_mmu_root_page_role(vcpu);
+		role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
 	else
 	else
-		return kvm_calc_shadow_mmu_root_page_role(vcpu);
+		role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
+
+	return role.base;
 }
 }
 
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -4972,8 +5040,10 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
 
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
 {
-	kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
-	WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
+	WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
+	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+	WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
 }
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
 
@@ -4987,7 +5057,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
         }
         }
 
 
 	++vcpu->kvm->stat.mmu_pte_updated;
 	++vcpu->kvm->stat.mmu_pte_updated;
-	vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
+	vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
 }
 }
 
 
 static bool need_remote_flush(u64 old, u64 new)
 static bool need_remote_flush(u64 old, u64 new)
@@ -5164,10 +5234,12 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 
 		local_flush = true;
 		local_flush = true;
 		while (npte--) {
 		while (npte--) {
+			u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
+
 			entry = *spte;
 			entry = *spte;
 			mmu_page_zap_pte(vcpu->kvm, sp, spte);
 			mmu_page_zap_pte(vcpu->kvm, sp, spte);
 			if (gentry &&
 			if (gentry &&
-			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
+			      !((sp->role.word ^ base_role)
 			      & mmu_base_role_mask.word) && rmap_can_add(vcpu))
 			      & mmu_base_role_mask.word) && rmap_can_add(vcpu))
 				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
 				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
 			if (need_remote_flush(entry, *spte))
 			if (need_remote_flush(entry, *spte))
@@ -5185,7 +5257,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 	gpa_t gpa;
 	gpa_t gpa;
 	int r;
 	int r;
 
 
-	if (vcpu->arch.mmu.direct_map)
+	if (vcpu->arch.mmu->direct_map)
 		return 0;
 		return 0;
 
 
 	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
 	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -5221,10 +5293,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
 {
 {
 	int r, emulation_type = 0;
 	int r, emulation_type = 0;
 	enum emulation_result er;
 	enum emulation_result er;
-	bool direct = vcpu->arch.mmu.direct_map;
+	bool direct = vcpu->arch.mmu->direct_map;
 
 
 	/* With shadow page tables, fault_address contains a GVA or nGPA.  */
 	/* With shadow page tables, fault_address contains a GVA or nGPA.  */
-	if (vcpu->arch.mmu.direct_map) {
+	if (vcpu->arch.mmu->direct_map) {
 		vcpu->arch.gpa_available = true;
 		vcpu->arch.gpa_available = true;
 		vcpu->arch.gpa_val = cr2;
 		vcpu->arch.gpa_val = cr2;
 	}
 	}
@@ -5237,8 +5309,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
 	}
 	}
 
 
 	if (r == RET_PF_INVALID) {
 	if (r == RET_PF_INVALID) {
-		r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
-					      false);
+		r = vcpu->arch.mmu->page_fault(vcpu, cr2,
+					       lower_32_bits(error_code),
+					       false);
 		WARN_ON(r == RET_PF_INVALID);
 		WARN_ON(r == RET_PF_INVALID);
 	}
 	}
 
 
@@ -5254,7 +5327,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
 	 * paging in both guests. If true, we simply unprotect the page
 	 * paging in both guests. If true, we simply unprotect the page
 	 * and resume the guest.
 	 * and resume the guest.
 	 */
 	 */
-	if (vcpu->arch.mmu.direct_map &&
+	if (vcpu->arch.mmu->direct_map &&
 	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
 	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
 		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
 		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
 		return 1;
 		return 1;
@@ -5302,7 +5375,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
 
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
 {
-	struct kvm_mmu *mmu = &vcpu->arch.mmu;
+	struct kvm_mmu *mmu = vcpu->arch.mmu;
 	int i;
 	int i;
 
 
 	/* INVLPG on a * non-canonical address is a NOP according to the SDM.  */
 	/* INVLPG on a * non-canonical address is a NOP according to the SDM.  */
@@ -5333,7 +5406,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
 
 
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 {
 {
-	struct kvm_mmu *mmu = &vcpu->arch.mmu;
+	struct kvm_mmu *mmu = vcpu->arch.mmu;
 	bool tlb_flush = false;
 	bool tlb_flush = false;
 	uint i;
 	uint i;
 
 
@@ -5377,8 +5450,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
 
 
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
 {
-	free_page((unsigned long)vcpu->arch.mmu.pae_root);
-	free_page((unsigned long)vcpu->arch.mmu.lm_root);
+	free_page((unsigned long)vcpu->arch.mmu->pae_root);
+	free_page((unsigned long)vcpu->arch.mmu->lm_root);
 }
 }
 
 
 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -5398,9 +5471,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 	if (!page)
 	if (!page)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
-	vcpu->arch.mmu.pae_root = page_address(page);
+	vcpu->arch.mmu->pae_root = page_address(page);
 	for (i = 0; i < 4; ++i)
 	for (i = 0; i < 4; ++i)
-		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+		vcpu->arch.mmu->pae_root[i] = INVALID_PAGE;
 
 
 	return 0;
 	return 0;
 }
 }
@@ -5409,27 +5482,21 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
 {
 	uint i;
 	uint i;
 
 
-	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
-	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-	vcpu->arch.mmu.translate_gpa = translate_gpa;
-	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
+	vcpu->arch.mmu = &vcpu->arch.root_mmu;
+	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 
 
+	vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
+	vcpu->arch.root_mmu.translate_gpa = translate_gpa;
 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-		vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
-
-	return alloc_mmu_pages(vcpu);
-}
+		vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
 
 
-void kvm_mmu_setup(struct kvm_vcpu *vcpu)
-{
-	MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
+	vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
+	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+		vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
 
 
-	/*
-	 * kvm_mmu_setup() is called only on vCPU initialization.  
-	 * Therefore, no need to reset mmu roots as they are not yet
-	 * initialized.
-	 */
-	kvm_init_mmu(vcpu, false);
+	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
+	return alloc_mmu_pages(vcpu);
 }
 }
 
 
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
@@ -5612,7 +5679,7 @@ restart:
 		if (sp->role.direct &&
 		if (sp->role.direct &&
 			!kvm_is_reserved_pfn(pfn) &&
 			!kvm_is_reserved_pfn(pfn) &&
 			PageTransCompoundMap(pfn_to_page(pfn))) {
 			PageTransCompoundMap(pfn_to_page(pfn))) {
-			drop_spte(kvm, sptep);
+			pte_list_remove(rmap_head, sptep);
 			need_tlb_flush = 1;
 			need_tlb_flush = 1;
 			goto restart;
 			goto restart;
 		}
 		}
@@ -5869,6 +5936,16 @@ int kvm_mmu_module_init(void)
 {
 {
 	int ret = -ENOMEM;
 	int ret = -ENOMEM;
 
 
+	/*
+	 * MMU roles use union aliasing which is, generally speaking, an
+	 * undefined behavior. However, we supposedly know how compilers behave
+	 * and the current status quo is unlikely to change. Guardians below are
+	 * supposed to let us know if the assumption becomes false.
+	 */
+	BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
+	BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
+	BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
+
 	kvm_mmu_reset_all_pte_masks();
 	kvm_mmu_reset_all_pte_masks();
 
 
 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
@@ -5898,7 +5975,7 @@ out:
 }
 }
 
 
 /*
 /*
- * Caculate mmu pages needed for kvm.
+ * Calculate mmu pages needed for kvm.
  */
  */
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
 {
 {
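
The BUILD_BUG_ON() guards added to kvm_mmu_module_init() above protect the union aliasing that the rest of the mmu.c changes rely on: the 32-bit base page role and the new 32-bit extended role are overlaid with a single 64-bit word so that init_kvm_tdp_mmu(), kvm_init_shadow_mmu() and init_kvm_nested_mmu() can detect an unchanged role with one compare and return early. A self-contained sketch of the same idiom (the field names are simplified stand-ins, not the actual kvm_mmu_role layout):

/* Overlay two 32-bit role words with one u64 for a cheap comparison. */
#include <stdint.h>
#include <stdio.h>

union mmu_role {
	struct {
		struct {		/* stand-in for kvm_mmu_page_role */
			uint32_t level    : 4;
			uint32_t direct   : 1;
			uint32_t cr4_pae  : 1;
			uint32_t reserved : 26;
		} base;
		struct {		/* stand-in for kvm_mmu_extended_role */
			uint32_t cr4_smep : 1;
			uint32_t cr4_smap : 1;
			uint32_t valid    : 1;
			uint32_t reserved : 29;
		} ext;
	};
	uint64_t as_u64;
};

/* the equivalent of the BUILD_BUG_ON() guards in the patch */
_Static_assert(sizeof(union mmu_role) == sizeof(uint64_t), "role must be 64 bits");

int main(void)
{
	union mmu_role cur_role = { .as_u64 = 0 };
	union mmu_role new_role;

	cur_role.base.level = 4;
	cur_role.ext.cr4_smep = 1;
	cur_role.ext.valid = 1;

	new_role = cur_role;
	if (new_role.as_u64 == cur_role.as_u64)
		printf("role unchanged: reuse the context, skip re-init\n");

	new_role.ext.cr4_smap = 1;	/* a guest CR4 change flips one bit */
	if (new_role.as_u64 != cur_role.as_u64)
		printf("role changed: re-initialize the MMU context\n");
	return 0;
}

If a compiler ever laid the bitfields out differently, the static assertions would break the build instead of letting the as_u64 comparison silently ignore role bits, which is exactly the risk the comment above the BUILD_BUG_ONs acknowledges.
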

+ 4 - 9
arch/x86/kvm/mmu.h

@@ -43,11 +43,6 @@
 #define PT32_ROOT_LEVEL 2
 #define PT32E_ROOT_LEVEL 3

-#define PT_PDPE_LEVEL 3
-#define PT_DIRECTORY_LEVEL 2
-#define PT_PAGE_TABLE_LEVEL 1
-#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1)
-
 static inline u64 rsvd_bits(int s, int e)
 {
 	if (e < s)
@@ -80,7 +75,7 @@ static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)

 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 {
-	if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+	if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE))
 		return 0;

 	return kvm_mmu_load(vcpu);
@@ -102,9 +97,9 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)

 static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu)
 {
-	if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
-		vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa |
-					     kvm_get_active_pcid(vcpu));
+	if (VALID_PAGE(vcpu->arch.mmu->root_hpa))
+		vcpu->arch.mmu->set_cr3(vcpu, vcpu->arch.mmu->root_hpa |
+					      kvm_get_active_pcid(vcpu));
 }

 /*
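The '.' to '->' conversions in this header follow from vcpu->arch.mmu becoming a pointer that selects one of several MMU contexts (root_mmu for ordinary L1 operation, guest_mmu/nested_mmu when a nested guest is involved, as set up in the mmu.c and svm.c hunks). A toy userspace model of that arrangement, using simplified placeholder types rather than the kernel structures, is sketched below.

/* Simplified stand-in types; not the kernel's struct kvm_mmu / kvm_vcpu. */
#include <stdio.h>

struct toy_mmu {
	unsigned long root_hpa;
	const char *name;
};

struct toy_vcpu {
	struct toy_mmu root_mmu;	/* used while running L1 directly */
	struct toy_mmu guest_mmu;	/* used for a nested guest */
	struct toy_mmu *mmu;		/* active context, always a pointer */
};

static void toy_enter_nested(struct toy_vcpu *vcpu)
{
	vcpu->mmu = &vcpu->guest_mmu;	/* callers dereference with -> */
}

int main(void)
{
	struct toy_vcpu vcpu = {
		.root_mmu  = { .root_hpa = 0x1000, .name = "root_mmu"  },
		.guest_mmu = { .root_hpa = 0x2000, .name = "guest_mmu" },
	};

	vcpu.mmu = &vcpu.root_mmu;
	printf("active mmu: %s\n", vcpu.mmu->name);

	toy_enter_nested(&vcpu);
	printf("active mmu: %s\n", vcpu.mmu->name);
	return 0;
}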

+ 6 - 6
arch/x86/kvm/mmu_audit.c

@@ -59,19 +59,19 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
 	int i;
 	int i;
 	struct kvm_mmu_page *sp;
 	struct kvm_mmu_page *sp;
 
 
-	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+	if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
 		return;
 		return;
 
 
-	if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
-		hpa_t root = vcpu->arch.mmu.root_hpa;
+	if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
+		hpa_t root = vcpu->arch.mmu->root_hpa;
 
 
 		sp = page_header(root);
 		sp = page_header(root);
-		__mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level);
+		__mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level);
 		return;
 		return;
 	}
 	}
 
 
 	for (i = 0; i < 4; ++i) {
 	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->arch.mmu.pae_root[i];
+		hpa_t root = vcpu->arch.mmu->pae_root[i];
 
 
 		if (root && VALID_PAGE(root)) {
 		if (root && VALID_PAGE(root)) {
 			root &= PT64_BASE_ADDR_MASK;
 			root &= PT64_BASE_ADDR_MASK;
@@ -122,7 +122,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 	hpa =  pfn << PAGE_SHIFT;
 	hpa =  pfn << PAGE_SHIFT;
 	if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
 	if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
 		audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
 		audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
-			     "ent %llxn", vcpu->arch.mmu.root_level, pfn,
+			     "ent %llxn", vcpu->arch.mmu->root_level, pfn,
 			     hpa, *sptep);
 			     hpa, *sptep);
 }
 }
 
 

+ 8 - 7
arch/x86/kvm/paging_tmpl.h

@@ -158,14 +158,15 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
 				  struct kvm_mmu_page *sp, u64 *spte,
 				  struct kvm_mmu_page *sp, u64 *spte,
 				  u64 gpte)
 				  u64 gpte)
 {
 {
-	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+	if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
 		goto no_present;
 		goto no_present;
 
 
 	if (!FNAME(is_present_gpte)(gpte))
 	if (!FNAME(is_present_gpte)(gpte))
 		goto no_present;
 		goto no_present;
 
 
 	/* if accessed bit is not supported prefetch non accessed gpte */
 	/* if accessed bit is not supported prefetch non accessed gpte */
-	if (PT_HAVE_ACCESSED_DIRTY(&vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK))
+	if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
+	    !(gpte & PT_GUEST_ACCESSED_MASK))
 		goto no_present;
 		goto no_present;
 
 
 	return false;
 	return false;
@@ -480,7 +481,7 @@ error:
 static int FNAME(walk_addr)(struct guest_walker *walker,
 static int FNAME(walk_addr)(struct guest_walker *walker,
 			    struct kvm_vcpu *vcpu, gva_t addr, u32 access)
 			    struct kvm_vcpu *vcpu, gva_t addr, u32 access)
 {
 {
-	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
+	return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
 					access);
 					access);
 }
 }
 
 
@@ -509,7 +510,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 
 
 	gfn = gpte_to_gfn(gpte);
 	gfn = gpte_to_gfn(gpte);
 	pte_access = sp->role.access & FNAME(gpte_access)(gpte);
 	pte_access = sp->role.access & FNAME(gpte_access)(gpte);
-	FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
+	FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
 	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
 	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
 			no_dirty_log && (pte_access & ACC_WRITE_MASK));
 			no_dirty_log && (pte_access & ACC_WRITE_MASK));
 	if (is_error_pfn(pfn))
 	if (is_error_pfn(pfn))
@@ -604,7 +605,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
 
 	direct_access = gw->pte_access;
 	direct_access = gw->pte_access;
 
 
-	top_level = vcpu->arch.mmu.root_level;
+	top_level = vcpu->arch.mmu->root_level;
 	if (top_level == PT32E_ROOT_LEVEL)
 	if (top_level == PT32E_ROOT_LEVEL)
 		top_level = PT32_ROOT_LEVEL;
 		top_level = PT32_ROOT_LEVEL;
 	/*
 	/*
@@ -616,7 +617,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (FNAME(gpte_changed)(vcpu, gw, top_level))
 	if (FNAME(gpte_changed)(vcpu, gw, top_level))
 		goto out_gpte_changed;
 		goto out_gpte_changed;
 
 
-	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+	if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
 		goto out_gpte_changed;
 		goto out_gpte_changed;
 
 
 	for (shadow_walk_init(&it, vcpu, addr);
 	for (shadow_walk_init(&it, vcpu, addr);
@@ -1004,7 +1005,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		gfn = gpte_to_gfn(gpte);
 		gfn = gpte_to_gfn(gpte);
 		pte_access = sp->role.access;
 		pte_access = sp->role.access;
 		pte_access &= FNAME(gpte_access)(gpte);
 		pte_access &= FNAME(gpte_access)(gpte);
-		FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
+		FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
 
 
 		if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
 		if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
 		      &nr_present))
 		      &nr_present))

+ 35 - 29
arch/x86/kvm/svm.c

@@ -809,6 +809,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
 	    nested_svm_check_exception(svm, nr, has_error_code, error_code))
 	    nested_svm_check_exception(svm, nr, has_error_code, error_code))
 		return;
 		return;
 
 
+	kvm_deliver_exception_payload(&svm->vcpu);
+
 	if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
 	if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
 		unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
 		unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
 
 
@@ -2922,18 +2924,18 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 {
 {
 	WARN_ON(mmu_is_nested(vcpu));
 	WARN_ON(mmu_is_nested(vcpu));
 	kvm_init_shadow_mmu(vcpu);
 	kvm_init_shadow_mmu(vcpu);
-	vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
-	vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
-	vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;
-	vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
-	vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu);
-	reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
+	vcpu->arch.mmu->set_cr3           = nested_svm_set_tdp_cr3;
+	vcpu->arch.mmu->get_cr3           = nested_svm_get_tdp_cr3;
+	vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
+	vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
+	vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu);
+	reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
 	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
 	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
 }
 }
 
 
 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
 {
 {
-	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 }
 }
 
 
 static int nested_svm_check_permissions(struct vcpu_svm *svm)
 static int nested_svm_check_permissions(struct vcpu_svm *svm)
@@ -2969,16 +2971,13 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 	svm->vmcb->control.exit_info_1 = error_code;
 	svm->vmcb->control.exit_info_1 = error_code;
 
 
 	/*
 	/*
-	 * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
-	 * The fix is to add the ancillary datum (CR2 or DR6) to structs
-	 * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
-	 * written only when inject_pending_event runs (DR6 would written here
-	 * too).  This should be conditional on a new capability---if the
-	 * capability is disabled, kvm_multiple_exception would write the
-	 * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
+	 * EXITINFO2 is undefined for all exception intercepts other
+	 * than #PF.
 	 */
 	 */
 	if (svm->vcpu.arch.exception.nested_apf)
 	if (svm->vcpu.arch.exception.nested_apf)
 		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
 		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+	else if (svm->vcpu.arch.exception.has_payload)
+		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
 	else
 	else
 		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
 		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
 
 
@@ -5642,26 +5641,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 		"mov %%r13, %c[r13](%[svm]) \n\t"
 		"mov %%r13, %c[r13](%[svm]) \n\t"
 		"mov %%r14, %c[r14](%[svm]) \n\t"
 		"mov %%r14, %c[r14](%[svm]) \n\t"
 		"mov %%r15, %c[r15](%[svm]) \n\t"
 		"mov %%r15, %c[r15](%[svm]) \n\t"
-#endif
 		/*
 		/*
 		* Clear host registers marked as clobbered to prevent
 		* Clear host registers marked as clobbered to prevent
 		* speculative use.
 		* speculative use.
 		*/
 		*/
-		"xor %%" _ASM_BX ", %%" _ASM_BX " \n\t"
-		"xor %%" _ASM_CX ", %%" _ASM_CX " \n\t"
-		"xor %%" _ASM_DX ", %%" _ASM_DX " \n\t"
-		"xor %%" _ASM_SI ", %%" _ASM_SI " \n\t"
-		"xor %%" _ASM_DI ", %%" _ASM_DI " \n\t"
-#ifdef CONFIG_X86_64
-		"xor %%r8, %%r8 \n\t"
-		"xor %%r9, %%r9 \n\t"
-		"xor %%r10, %%r10 \n\t"
-		"xor %%r11, %%r11 \n\t"
-		"xor %%r12, %%r12 \n\t"
-		"xor %%r13, %%r13 \n\t"
-		"xor %%r14, %%r14 \n\t"
-		"xor %%r15, %%r15 \n\t"
+		"xor %%r8d, %%r8d \n\t"
+		"xor %%r9d, %%r9d \n\t"
+		"xor %%r10d, %%r10d \n\t"
+		"xor %%r11d, %%r11d \n\t"
+		"xor %%r12d, %%r12d \n\t"
+		"xor %%r13d, %%r13d \n\t"
+		"xor %%r14d, %%r14d \n\t"
+		"xor %%r15d, %%r15d \n\t"
 #endif
 #endif
+		"xor %%ebx, %%ebx \n\t"
+		"xor %%ecx, %%ecx \n\t"
+		"xor %%edx, %%edx \n\t"
+		"xor %%esi, %%esi \n\t"
+		"xor %%edi, %%edi \n\t"
 		"pop %%" _ASM_BP
 		"pop %%" _ASM_BP
 		:
 		:
 		: [svm]"a"(svm),
 		: [svm]"a"(svm),
@@ -7040,6 +7037,13 @@ failed:
 	return ret;
 	return ret;
 }
 }
 
 
+static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
+				   uint16_t *vmcs_version)
+{
+	/* Intel-only feature */
+	return -ENODEV;
+}
+
 static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = has_svm,
 	.cpu_has_kvm_support = has_svm,
 	.disabled_by_bios = is_disabled,
 	.disabled_by_bios = is_disabled,
@@ -7169,6 +7173,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.mem_enc_op = svm_mem_enc_op,
 	.mem_enc_op = svm_mem_enc_op,
 	.mem_enc_reg_region = svm_register_enc_region,
 	.mem_enc_reg_region = svm_register_enc_region,
 	.mem_enc_unreg_region = svm_unregister_enc_region,
 	.mem_enc_unreg_region = svm_unregister_enc_region,
+
+	.nested_enable_evmcs = nested_enable_evmcs,
 };
 };
 
 
 static int __init svm_init(void)
 static int __init svm_init(void)

+ 42 - 0
arch/x86/kvm/trace.h

@@ -1418,6 +1418,48 @@ TRACE_EVENT(kvm_hv_flush_tlb_ex,
 		  __entry->valid_bank_mask, __entry->format,
 		  __entry->address_space, __entry->flags)
 );
+
+/*
+ * Tracepoints for kvm_hv_send_ipi.
+ */
+TRACE_EVENT(kvm_hv_send_ipi,
+	TP_PROTO(u32 vector, u64 processor_mask),
+	TP_ARGS(vector, processor_mask),
+
+	TP_STRUCT__entry(
+		__field(u32, vector)
+		__field(u64, processor_mask)
+	),
+
+	TP_fast_assign(
+		__entry->vector = vector;
+		__entry->processor_mask = processor_mask;
+	),
+
+	TP_printk("vector %x processor_mask 0x%llx",
+		  __entry->vector, __entry->processor_mask)
+);
+
+TRACE_EVENT(kvm_hv_send_ipi_ex,
+	TP_PROTO(u32 vector, u64 format, u64 valid_bank_mask),
+	TP_ARGS(vector, format, valid_bank_mask),
+
+	TP_STRUCT__entry(
+		__field(u32, vector)
+		__field(u64, format)
+		__field(u64, valid_bank_mask)
+	),
+
+	TP_fast_assign(
+		__entry->vector = vector;
+		__entry->format = format;
+		__entry->valid_bank_mask = valid_bank_mask;
+	),
+
+	TP_printk("vector %x format %llx valid_bank_mask 0x%llx",
+		  __entry->vector, __entry->format,
+		  __entry->valid_bank_mask)
+);
 #endif /* _TRACE_KVM_H */

 #undef TRACE_INCLUDE_PATH
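For context, TRACE_EVENT(kvm_hv_send_ipi, ...) generates a trace_kvm_hv_send_ipi() helper that the Hyper-V IPI hypercall emulation can call. The fragment below is only a sketch of such a call site; the function name and body are invented for illustration and are not the actual handler added elsewhere in this series.

/* Illustrative call site only; not the real Hyper-V IPI handler. */
static u64 example_hv_send_ipi(struct kvm_vcpu *vcpu, u32 vector, u64 processor_mask)
{
	/* Record the vector and target mask before delivering the IPI. */
	trace_kvm_hv_send_ipi(vector, processor_mask);

	/* ...deliver the IPI to every vCPU whose bit is set in the mask... */
	return HV_STATUS_SUCCESS;
}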

+ 1614 - 749
arch/x86/kvm/vmx.c

@@ -20,6 +20,7 @@
 #include "mmu.h"
 #include "mmu.h"
 #include "cpuid.h"
 #include "cpuid.h"
 #include "lapic.h"
 #include "lapic.h"
+#include "hyperv.h"
 
 
 #include <linux/kvm_host.h>
 #include <linux/kvm_host.h>
 #include <linux/module.h>
 #include <linux/module.h>
@@ -61,7 +62,7 @@
 
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 #define __ex_clear(x, reg) \
 #define __ex_clear(x, reg) \
-	____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
+	____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg)
 
 
 MODULE_AUTHOR("Qumranet");
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 MODULE_LICENSE("GPL");
@@ -107,9 +108,12 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
  * use VMX instructions.
  * use VMX instructions.
  */
  */
-static bool __read_mostly nested = 0;
+static bool __read_mostly nested = 1;
 module_param(nested, bool, S_IRUGO);
 module_param(nested, bool, S_IRUGO);
 
 
+static bool __read_mostly nested_early_check = 0;
+module_param(nested_early_check, bool, S_IRUGO);
+
 static u64 __read_mostly host_xss;
 static u64 __read_mostly host_xss;
 
 
 static bool __read_mostly enable_pml = 1;
 static bool __read_mostly enable_pml = 1;
@@ -131,7 +135,7 @@ static bool __read_mostly enable_preemption_timer = 1;
 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
 #endif
 #endif
 
 
-#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
 #define KVM_VM_CR0_ALWAYS_ON				\
 #define KVM_VM_CR0_ALWAYS_ON				\
 	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | 	\
 	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | 	\
@@ -187,6 +191,7 @@ static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 module_param(ple_window_max, uint, 0444);
 module_param(ple_window_max, uint, 0444);
 
 
 extern const ulong vmx_return;
 extern const ulong vmx_return;
+extern const ulong vmx_early_consistency_check_return;
 
 
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
@@ -827,14 +832,28 @@ struct nested_vmx {
 	 */
 	 */
 	struct vmcs12 *cached_shadow_vmcs12;
 	struct vmcs12 *cached_shadow_vmcs12;
 	/*
 	/*
-	 * Indicates if the shadow vmcs must be updated with the
-	 * data hold by vmcs12
+	 * Indicates if the shadow vmcs or enlightened vmcs must be updated
+	 * with the data held by struct vmcs12.
 	 */
 	 */
-	bool sync_shadow_vmcs;
+	bool need_vmcs12_sync;
 	bool dirty_vmcs12;
 	bool dirty_vmcs12;
 
 
+	/*
+	 * vmcs02 has been initialized, i.e. state that is constant for
+	 * vmcs02 has been written to the backing VMCS.  Initialization
+	 * is delayed until L1 actually attempts to run a nested VM.
+	 */
+	bool vmcs02_initialized;
+
 	bool change_vmcs01_virtual_apic_mode;
 	bool change_vmcs01_virtual_apic_mode;
 
 
+	/*
+	 * Enlightened VMCS has been enabled. It does not mean that L1 has to
+	 * use it. However, VMX features available to L1 will be limited based
+	 * on what the enlightened VMCS supports.
+	 */
+	bool enlightened_vmcs_enabled;
+
 	/* L2 must run next, and mustn't decide to exit to L1. */
 	/* L2 must run next, and mustn't decide to exit to L1. */
 	bool nested_run_pending;
 	bool nested_run_pending;
 
 
@@ -870,6 +889,10 @@ struct nested_vmx {
 		/* in guest mode on SMM entry? */
 		/* in guest mode on SMM entry? */
 		bool guest_mode;
 		bool guest_mode;
 	} smm;
 	} smm;
+
+	gpa_t hv_evmcs_vmptr;
+	struct page *hv_evmcs_page;
+	struct hv_enlightened_vmcs *hv_evmcs;
 };
 };
 
 
 #define POSTED_INTR_ON  0
 #define POSTED_INTR_ON  0
@@ -1381,6 +1404,49 @@ DEFINE_STATIC_KEY_FALSE(enable_evmcs);
 
 
 #define KVM_EVMCS_VERSION 1
 #define KVM_EVMCS_VERSION 1
 
 
+/*
+ * Enlightened VMCSv1 doesn't support these:
+ *
+ *	POSTED_INTR_NV                  = 0x00000002,
+ *	GUEST_INTR_STATUS               = 0x00000810,
+ *	APIC_ACCESS_ADDR		= 0x00002014,
+ *	POSTED_INTR_DESC_ADDR           = 0x00002016,
+ *	EOI_EXIT_BITMAP0                = 0x0000201c,
+ *	EOI_EXIT_BITMAP1                = 0x0000201e,
+ *	EOI_EXIT_BITMAP2                = 0x00002020,
+ *	EOI_EXIT_BITMAP3                = 0x00002022,
+ *	GUEST_PML_INDEX			= 0x00000812,
+ *	PML_ADDRESS			= 0x0000200e,
+ *	VM_FUNCTION_CONTROL             = 0x00002018,
+ *	EPTP_LIST_ADDRESS               = 0x00002024,
+ *	VMREAD_BITMAP                   = 0x00002026,
+ *	VMWRITE_BITMAP                  = 0x00002028,
+ *
+ *	TSC_MULTIPLIER                  = 0x00002032,
+ *	PLE_GAP                         = 0x00004020,
+ *	PLE_WINDOW                      = 0x00004022,
+ *	VMX_PREEMPTION_TIMER_VALUE      = 0x0000482E,
+ *      GUEST_IA32_PERF_GLOBAL_CTRL     = 0x00002808,
+ *      HOST_IA32_PERF_GLOBAL_CTRL      = 0x00002c04,
+ *
+ * Currently unsupported in KVM:
+ *	GUEST_IA32_RTIT_CTL		= 0x00002814,
+ */
+#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \
+				    PIN_BASED_VMX_PREEMPTION_TIMER)
+#define EVMCS1_UNSUPPORTED_2NDEXEC					\
+	(SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |				\
+	 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |			\
+	 SECONDARY_EXEC_APIC_REGISTER_VIRT |				\
+	 SECONDARY_EXEC_ENABLE_PML |					\
+	 SECONDARY_EXEC_ENABLE_VMFUNC |					\
+	 SECONDARY_EXEC_SHADOW_VMCS |					\
+	 SECONDARY_EXEC_TSC_SCALING |					\
+	 SECONDARY_EXEC_PAUSE_LOOP_EXITING)
+#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
+
 #if IS_ENABLED(CONFIG_HYPERV)
 #if IS_ENABLED(CONFIG_HYPERV)
 static bool __read_mostly enlightened_vmcs = true;
 static bool __read_mostly enlightened_vmcs = true;
 module_param(enlightened_vmcs, bool, 0444);
 module_param(enlightened_vmcs, bool, 0444);
@@ -1473,69 +1539,12 @@ static void evmcs_load(u64 phys_addr)
 
 
 static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
 static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
 {
 {
-	/*
-	 * Enlightened VMCSv1 doesn't support these:
-	 *
-	 *	POSTED_INTR_NV                  = 0x00000002,
-	 *	GUEST_INTR_STATUS               = 0x00000810,
-	 *	APIC_ACCESS_ADDR		= 0x00002014,
-	 *	POSTED_INTR_DESC_ADDR           = 0x00002016,
-	 *	EOI_EXIT_BITMAP0                = 0x0000201c,
-	 *	EOI_EXIT_BITMAP1                = 0x0000201e,
-	 *	EOI_EXIT_BITMAP2                = 0x00002020,
-	 *	EOI_EXIT_BITMAP3                = 0x00002022,
-	 */
-	vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
-	vmcs_conf->cpu_based_2nd_exec_ctrl &=
-		~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
-	vmcs_conf->cpu_based_2nd_exec_ctrl &=
-		~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-	vmcs_conf->cpu_based_2nd_exec_ctrl &=
-		~SECONDARY_EXEC_APIC_REGISTER_VIRT;
-
-	/*
-	 *	GUEST_PML_INDEX			= 0x00000812,
-	 *	PML_ADDRESS			= 0x0000200e,
-	 */
-	vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
-
-	/*	VM_FUNCTION_CONTROL             = 0x00002018, */
-	vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
-
-	/*
-	 *	EPTP_LIST_ADDRESS               = 0x00002024,
-	 *	VMREAD_BITMAP                   = 0x00002026,
-	 *	VMWRITE_BITMAP                  = 0x00002028,
-	 */
-	vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
-
-	/*
-	 *	TSC_MULTIPLIER                  = 0x00002032,
-	 */
-	vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
-
-	/*
-	 *	PLE_GAP                         = 0x00004020,
-	 *	PLE_WINDOW                      = 0x00004022,
-	 */
-	vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
-
-	/*
-	 *	VMX_PREEMPTION_TIMER_VALUE      = 0x0000482E,
-	 */
-	vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+	vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
+	vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
 
 
-	/*
-	 *      GUEST_IA32_PERF_GLOBAL_CTRL     = 0x00002808,
-	 *      HOST_IA32_PERF_GLOBAL_CTRL      = 0x00002c04,
-	 */
-	vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
-	vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+	vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
+	vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
 
 
-	/*
-	 * Currently unsupported in KVM:
-	 *	GUEST_IA32_RTIT_CTL		= 0x00002814,
-	 */
 }
 }
 
 
 /* check_ept_pointer() should be under protection of ept_pointer_lock. */
 /* check_ept_pointer() should be under protection of ept_pointer_lock. */
@@ -1560,26 +1569,27 @@ static void check_ept_pointer_match(struct kvm *kvm)
 
 
 static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
 static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
 {
 {
-	int ret;
+	struct kvm_vcpu *vcpu;
+	int ret = -ENOTSUPP, i;
 
 
 	spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
 	spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
 
 
 	if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
 	if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
 		check_ept_pointer_match(kvm);
 		check_ept_pointer_match(kvm);
 
 
-	if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
-		ret = -ENOTSUPP;
-		goto out;
-	}
-
 	/*
 	/*
 	 * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the
 	 * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the
 	 * base of EPT PML4 table, strip off EPT configuration information.
 	 * base of EPT PML4 table, strip off EPT configuration information.
 	 */
 	 */
-	ret = hyperv_flush_guest_mapping(
-			to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK);
+	if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			ret |= hyperv_flush_guest_mapping(
+				to_vmx(kvm_get_vcpu(kvm, i))->ept_pointer & PAGE_MASK);
+	} else {
+		ret = hyperv_flush_guest_mapping(
+				to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK);
+	}
 
 
-out:
 	spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
 	spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
 	return ret;
 	return ret;
 }
 }
@@ -1595,6 +1605,35 @@ static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
 static inline void evmcs_touch_msr_bitmap(void) {}
 static inline void evmcs_touch_msr_bitmap(void) {}
 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 
 
+static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
+			       uint16_t *vmcs_version)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/* We don't support disabling the feature for simplicity. */
+	if (vmx->nested.enlightened_vmcs_enabled)
+		return 0;
+
+	vmx->nested.enlightened_vmcs_enabled = true;
+
+	/*
+	 * vmcs_version represents the range of supported Enlightened VMCS
+	 * versions: lower 8 bits is the minimal version, higher 8 bits is the
+	 * maximum supported version. KVM supports versions from 1 to
+	 * KVM_EVMCS_VERSION.
+	 */
+	if (vmcs_version)
+		*vmcs_version = (KVM_EVMCS_VERSION << 8) | 1;
+
+	vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
+	vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
+	vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
+	vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
+	vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC;
+
+	return 0;
+}
+
 static inline bool is_exception_n(u32 intr_info, u8 vector)
 static inline bool is_exception_n(u32 intr_info, u8 vector)
 {
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
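As the comment in nested_enable_evmcs() above spells out, the reported vmcs_version packs a range: bits 7:0 hold the minimal supported Enlightened VMCS version and bits 15:8 the maximal one. A small stand-alone decoder is sketched below; the helper name is invented for illustration.

#include <stdint.h>
#include <stdio.h>

static void decode_evmcs_version(uint16_t vmcs_version)
{
	uint8_t min_ver = vmcs_version & 0xff;	/* low byte  */
	uint8_t max_ver = vmcs_version >> 8;	/* high byte */

	printf("Enlightened VMCS versions supported: %u..%u\n", min_ver, max_ver);
}

int main(void)
{
	/* (KVM_EVMCS_VERSION << 8) | 1 with KVM_EVMCS_VERSION == 1 */
	decode_evmcs_version((1 << 8) | 1);
	return 0;
}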
@@ -1617,11 +1656,6 @@ static inline bool is_page_fault(u32 intr_info)
 	return is_exception_n(intr_info, PF_VECTOR);
 	return is_exception_n(intr_info, PF_VECTOR);
 }
 }
 
 
-static inline bool is_no_device(u32 intr_info)
-{
-	return is_exception_n(intr_info, NM_VECTOR);
-}
-
 static inline bool is_invalid_opcode(u32 intr_info)
 static inline bool is_invalid_opcode(u32 intr_info)
 {
 {
 	return is_exception_n(intr_info, UD_VECTOR);
 	return is_exception_n(intr_info, UD_VECTOR);
@@ -1632,12 +1666,6 @@ static inline bool is_gp_fault(u32 intr_info)
 	return is_exception_n(intr_info, GP_VECTOR);
 	return is_exception_n(intr_info, GP_VECTOR);
 }
 }
 
 
-static inline bool is_external_interrupt(u32 intr_info)
-{
-	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
-		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
-}
-
 static inline bool is_machine_check(u32 intr_info)
 static inline bool is_machine_check(u32 intr_info)
 {
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -2063,9 +2091,6 @@ static inline bool is_nmi(u32 intr_info)
 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 			      u32 exit_intr_info,
 			      u32 exit_intr_info,
 			      unsigned long exit_qualification);
 			      unsigned long exit_qualification);
-static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
-			struct vmcs12 *vmcs12,
-			u32 reason, unsigned long qualification);
 
 
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
 {
@@ -2077,7 +2102,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 	return -1;
 	return -1;
 }
 }
 
 
-static inline void __invvpid(int ext, u16 vpid, gva_t gva)
+static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
 {
 {
     struct {
     struct {
 	u64 vpid : 16;
 	u64 vpid : 16;
@@ -2086,22 +2111,20 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
     } operand = { vpid, 0, gva };
     } operand = { vpid, 0, gva };
     bool error;
     bool error;
 
 
-    asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na)
-		  : CC_OUT(na) (error) : "a"(&operand), "c"(ext)
-		  : "memory");
+    asm volatile (__ex("invvpid %2, %1") CC_SET(na)
+		  : CC_OUT(na) (error) : "r"(ext), "m"(operand));
     BUG_ON(error);
     BUG_ON(error);
 }
 }
 
 
-static inline void __invept(int ext, u64 eptp, gpa_t gpa)
+static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
 {
 {
 	struct {
 	struct {
 		u64 eptp, gpa;
 		u64 eptp, gpa;
 	} operand = {eptp, gpa};
 	} operand = {eptp, gpa};
 	bool error;
 	bool error;
 
 
-	asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na)
-		      : CC_OUT(na) (error) : "a" (&operand), "c" (ext)
-		      : "memory");
+	asm volatile (__ex("invept %2, %1") CC_SET(na)
+		      : CC_OUT(na) (error) : "r"(ext), "m"(operand));
 	BUG_ON(error);
 	BUG_ON(error);
 }
 }
 
 
@@ -2120,9 +2143,8 @@ static void vmcs_clear(struct vmcs *vmcs)
 	u64 phys_addr = __pa(vmcs);
 	u64 phys_addr = __pa(vmcs);
 	bool error;
 	bool error;
 
 
-	asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na)
-		      : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
-		      : "memory");
+	asm volatile (__ex("vmclear %1") CC_SET(na)
+		      : CC_OUT(na) (error) : "m"(phys_addr));
 	if (unlikely(error))
 	if (unlikely(error))
 		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
 		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
 		       vmcs, phys_addr);
 		       vmcs, phys_addr);
@@ -2145,9 +2167,8 @@ static void vmcs_load(struct vmcs *vmcs)
 	if (static_branch_unlikely(&enable_evmcs))
 	if (static_branch_unlikely(&enable_evmcs))
 		return evmcs_load(phys_addr);
 		return evmcs_load(phys_addr);
 
 
-	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na)
-		      : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
-		      : "memory");
+	asm volatile (__ex("vmptrld %1") CC_SET(na)
+		      : CC_OUT(na) (error) : "m"(phys_addr));
 	if (unlikely(error))
 	if (unlikely(error))
 		printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
 		printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
 		       vmcs, phys_addr);
 		       vmcs, phys_addr);
@@ -2323,8 +2344,8 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
 {
 {
 	unsigned long value;
 	unsigned long value;
 
 
-	asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
-		      : "=a"(value) : "d"(field) : "cc");
+	asm volatile (__ex_clear("vmread %1, %0", "%k0")
+		      : "=r"(value) : "r"(field));
 	return value;
 	return value;
 }
 }
 
 
@@ -2375,8 +2396,8 @@ static __always_inline void __vmcs_writel(unsigned long field, unsigned long val
 {
 {
 	bool error;
 	bool error;
 
 
-	asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na)
-		      : CC_OUT(na) (error) : "a"(value), "d"(field));
+	asm volatile (__ex("vmwrite %2, %1") CC_SET(na)
+		      : CC_OUT(na) (error) : "r"(field), "rm"(value));
 	if (unlikely(error))
 	if (unlikely(error))
 		vmwrite_error(field, value);
 		vmwrite_error(field, value);
 }
 }
@@ -2707,7 +2728,8 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
 		u64 guest_val, u64 host_val)
 		u64 guest_val, u64 host_val)
 {
 {
 	vmcs_write64(guest_val_vmcs, guest_val);
 	vmcs_write64(guest_val_vmcs, guest_val);
-	vmcs_write64(host_val_vmcs, host_val);
+	if (host_val_vmcs != HOST_IA32_EFER)
+		vmcs_write64(host_val_vmcs, host_val);
 	vm_entry_controls_setbit(vmx, entry);
 	vm_entry_controls_setbit(vmx, entry);
 	vm_exit_controls_setbit(vmx, exit);
 	vm_exit_controls_setbit(vmx, exit);
 }
 }
@@ -2805,8 +2827,6 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 		ignore_bits &= ~(u64)EFER_SCE;
 		ignore_bits &= ~(u64)EFER_SCE;
 #endif
 #endif
 
 
-	clear_atomic_switch_msr(vmx, MSR_EFER);
-
 	/*
 	/*
 	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
 	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
 	 * On CPUs that support "load IA32_EFER", always switch EFER
 	 * On CPUs that support "load IA32_EFER", always switch EFER
@@ -2819,8 +2839,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 		if (guest_efer != host_efer)
 		if (guest_efer != host_efer)
 			add_atomic_switch_msr(vmx, MSR_EFER,
 			add_atomic_switch_msr(vmx, MSR_EFER,
 					      guest_efer, host_efer, false);
 					      guest_efer, host_efer, false);
+		else
+			clear_atomic_switch_msr(vmx, MSR_EFER);
 		return false;
 		return false;
 	} else {
 	} else {
+		clear_atomic_switch_msr(vmx, MSR_EFER);
+
 		guest_efer &= ~ignore_bits;
 		guest_efer &= ~ignore_bits;
 		guest_efer |= host_efer & ignore_bits;
 		guest_efer |= host_efer & ignore_bits;
 
 
@@ -3272,34 +3296,30 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
 {
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	unsigned int nr = vcpu->arch.exception.nr;
 	unsigned int nr = vcpu->arch.exception.nr;
+	bool has_payload = vcpu->arch.exception.has_payload;
+	unsigned long payload = vcpu->arch.exception.payload;
 
 
 	if (nr == PF_VECTOR) {
 	if (nr == PF_VECTOR) {
 		if (vcpu->arch.exception.nested_apf) {
 		if (vcpu->arch.exception.nested_apf) {
 			*exit_qual = vcpu->arch.apf.nested_apf_token;
 			*exit_qual = vcpu->arch.apf.nested_apf_token;
 			return 1;
 			return 1;
 		}
 		}
-		/*
-		 * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
-		 * The fix is to add the ancillary datum (CR2 or DR6) to structs
-		 * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
-		 * can be written only when inject_pending_event runs.  This should be
-		 * conditional on a new capability---if the capability is disabled,
-		 * kvm_multiple_exception would write the ancillary information to
-		 * CR2 or DR6, for backwards ABI-compatibility.
-		 */
 		if (nested_vmx_is_page_fault_vmexit(vmcs12,
 		if (nested_vmx_is_page_fault_vmexit(vmcs12,
 						    vcpu->arch.exception.error_code)) {
 						    vcpu->arch.exception.error_code)) {
-			*exit_qual = vcpu->arch.cr2;
-			return 1;
-		}
-	} else {
-		if (vmcs12->exception_bitmap & (1u << nr)) {
-			if (nr == DB_VECTOR)
-				*exit_qual = vcpu->arch.dr6;
-			else
-				*exit_qual = 0;
+			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
 			return 1;
 			return 1;
 		}
 		}
+	} else if (vmcs12->exception_bitmap & (1u << nr)) {
+		if (nr == DB_VECTOR) {
+			if (!has_payload) {
+				payload = vcpu->arch.dr6;
+				payload &= ~(DR6_FIXED_1 | DR6_BT);
+				payload ^= DR6_RTM;
+			}
+			*exit_qual = payload;
+		} else
+			*exit_qual = 0;
+		return 1;
 	}
 	}
 
 
 	return 0;
 	return 0;
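The #DB branch above turns a raw DR6 value into the exit-qualification format: the always-one bits and BT are cleared, and RTM is flipped because its polarity is inverted in DR6. A stand-alone sketch of that transformation follows; the macro values are restated locally for illustration rather than included from a header.

#include <stdint.h>
#include <stdio.h>

#define DR6_FIXED_1	0xfffe0ff0ULL	/* bits that always read as 1 in DR6 */
#define DR6_BT		(1ULL << 15)
#define DR6_RTM		(1ULL << 16)

static uint64_t db_exit_qualification(uint64_t dr6)
{
	uint64_t payload = dr6;

	payload &= ~(DR6_FIXED_1 | DR6_BT);
	payload ^= DR6_RTM;		/* undo the inverted polarity */
	return payload;
}

int main(void)
{
	/* Single-step trap: DR6.BS (bit 14) set on top of the reset value. */
	uint64_t dr6 = 0xffff0ff0ULL | (1ULL << 14);

	printf("exit qualification: %#llx\n",
	       (unsigned long long)db_exit_qualification(dr6));
	return 0;
}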
@@ -3326,6 +3346,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
 	u32 error_code = vcpu->arch.exception.error_code;
 	u32 error_code = vcpu->arch.exception.error_code;
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
 
+	kvm_deliver_exception_payload(vcpu);
+
 	if (has_error_code) {
 	if (has_error_code) {
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -4397,9 +4419,7 @@ static void kvm_cpu_vmxon(u64 addr)
 	cr4_set_bits(X86_CR4_VMXE);
 	cr4_set_bits(X86_CR4_VMXE);
 	intel_pt_handle_vmx(1);
 	intel_pt_handle_vmx(1);
 
 
-	asm volatile (ASM_VMX_VMXON_RAX
-			: : "a"(&addr), "m"(addr)
-			: "memory", "cc");
+	asm volatile ("vmxon %0" : : "m"(addr));
 }
 }
 
 
 static int hardware_enable(void)
 static int hardware_enable(void)
@@ -4468,7 +4488,7 @@ static void vmclear_local_loaded_vmcss(void)
  */
  */
 static void kvm_cpu_vmxoff(void)
 static void kvm_cpu_vmxoff(void)
 {
 {
-	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
+	asm volatile (__ex("vmxoff"));
 
 
 	intel_pt_handle_vmx(0);
 	intel_pt_handle_vmx(0);
 	cr4_clear_bits(X86_CR4_VMXE);
 	cr4_clear_bits(X86_CR4_VMXE);
@@ -5112,9 +5132,10 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
 				bool invalidate_gpa)
 				bool invalidate_gpa)
 {
 {
 	if (enable_ept && (invalidate_gpa || !enable_vpid)) {
 	if (enable_ept && (invalidate_gpa || !enable_vpid)) {
-		if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
 			return;
 			return;
-		ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
+		ept_sync_context(construct_eptp(vcpu,
+						vcpu->arch.mmu->root_hpa));
 	} else {
 	} else {
 		vpid_sync_context(vpid);
 		vpid_sync_context(vpid);
 	}
 	}
@@ -5264,7 +5285,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long hw_cr0;
 	unsigned long hw_cr0;
 
 
-	hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
+	hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
 	if (enable_unrestricted_guest)
 	if (enable_unrestricted_guest)
 		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
 		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
 	else {
 	else {
@@ -6339,6 +6360,9 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 		rdmsr(MSR_IA32_CR_PAT, low32, high32);
 		rdmsr(MSR_IA32_CR_PAT, low32, high32);
 		vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
 		vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
 	}
 	}
+
+	if (cpu_has_load_ia32_efer)
+		vmcs_write64(HOST_IA32_EFER, host_efer);
 }
 }
 
 
 static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
 static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@ -6666,7 +6690,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
 		vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
 
 
 	if (enable_pml) {
 	if (enable_pml) {
-		ASSERT(vmx->pml_pg);
 		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
 		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
 		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 	}
 	}
@@ -8067,35 +8090,39 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
 
 
 /*
 /*
  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
- * set the success or error code of an emulated VMX instruction, as specified
- * by Vol 2B, VMX Instruction Reference, "Conventions".
+ * set the success or error code of an emulated VMX instruction (as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
+ * instruction.
  */
  */
-static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
 {
 {
 	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
 	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
 			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
 			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
 			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
 			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+	return kvm_skip_emulated_instruction(vcpu);
 }
 }
 
 
-static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
 {
 {
 	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
 	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
 			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
 			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
 			    X86_EFLAGS_SF | X86_EFLAGS_OF))
 			    X86_EFLAGS_SF | X86_EFLAGS_OF))
 			| X86_EFLAGS_CF);
 			| X86_EFLAGS_CF);
+	return kvm_skip_emulated_instruction(vcpu);
 }
 }
 
 
-static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
-					u32 vm_instruction_error)
+static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
+				u32 vm_instruction_error)
 {
 {
-	if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
-		/*
-		 * failValid writes the error number to the current VMCS, which
-		 * can't be done there isn't a current VMCS.
-		 */
-		nested_vmx_failInvalid(vcpu);
-		return;
-	}
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/*
+	 * failValid writes the error number to the current VMCS, which
+	 * can't be done if there isn't a current VMCS.
+	 */
+	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
+		return nested_vmx_failInvalid(vcpu);
+
 	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
 	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
 			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
 			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
 			    X86_EFLAGS_SF | X86_EFLAGS_OF))
 			    X86_EFLAGS_SF | X86_EFLAGS_OF))
@@ -8105,6 +8132,7 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
 	 * We don't need to force a shadow sync because
 	 * We don't need to force a shadow sync because
 	 * VM_INSTRUCTION_ERROR is not shadowed
 	 * VM_INSTRUCTION_ERROR is not shadowed
 	 */
 	 */
+	return kvm_skip_emulated_instruction(vcpu);
 }
 }
 
 
 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
@@ -8292,6 +8320,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 
 
 	vmx->nested.vpid02 = allocate_vpid();
 	vmx->nested.vpid02 = allocate_vpid();
 
 
+	vmx->nested.vmcs02_initialized = false;
 	vmx->nested.vmxon = true;
 	vmx->nested.vmxon = true;
 	return 0;
 	return 0;
 
 
@@ -8345,10 +8374,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		return 1;
 		return 1;
 	}
 	}
 
 
-	if (vmx->nested.vmxon) {
-		nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
-		return kvm_skip_emulated_instruction(vcpu);
-	}
+	if (vmx->nested.vmxon)
+		return nested_vmx_failValid(vcpu,
+			VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
 
 
 	if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
 	if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
 			!= VMXON_NEEDED_FEATURES) {
 			!= VMXON_NEEDED_FEATURES) {
@@ -8367,21 +8395,17 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
 	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
 	 * which replaces physical address width with 32
 	 * which replaces physical address width with 32
 	 */
 	 */
-	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
-		nested_vmx_failInvalid(vcpu);
-		return kvm_skip_emulated_instruction(vcpu);
-	}
+	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+		return nested_vmx_failInvalid(vcpu);
 
 
 	page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
 	page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-	if (is_error_page(page)) {
-		nested_vmx_failInvalid(vcpu);
-		return kvm_skip_emulated_instruction(vcpu);
-	}
+	if (is_error_page(page))
+		return nested_vmx_failInvalid(vcpu);
+
 	if (*(u32 *)kmap(page) != VMCS12_REVISION) {
 	if (*(u32 *)kmap(page) != VMCS12_REVISION) {
 		kunmap(page);
 		kunmap(page);
 		kvm_release_page_clean(page);
 		kvm_release_page_clean(page);
-		nested_vmx_failInvalid(vcpu);
-		return kvm_skip_emulated_instruction(vcpu);
+		return nested_vmx_failInvalid(vcpu);
 	}
 	}
 	kunmap(page);
 	kunmap(page);
 	kvm_release_page_clean(page);
 	kvm_release_page_clean(page);
@@ -8391,8 +8415,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 	if (ret)
 	if (ret)
 		return ret;
 		return ret;
 
 
-	nested_vmx_succeed(vcpu);
-	return kvm_skip_emulated_instruction(vcpu);
+	return nested_vmx_succeed(vcpu);
 }
 }
 
 
 /*
 /*
@@ -8423,8 +8446,24 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
 	vmcs_write64(VMCS_LINK_POINTER, -1ull);
 	vmcs_write64(VMCS_LINK_POINTER, -1ull);
 }
 }
 
 
-static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
+static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.hv_evmcs)
+		return;
+
+	kunmap(vmx->nested.hv_evmcs_page);
+	kvm_release_page_dirty(vmx->nested.hv_evmcs_page);
+	vmx->nested.hv_evmcs_vmptr = -1ull;
+	vmx->nested.hv_evmcs_page = NULL;
+	vmx->nested.hv_evmcs = NULL;
+}
+
+static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
 {
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
 	if (vmx->nested.current_vmptr == -1ull)
 	if (vmx->nested.current_vmptr == -1ull)
 		return;
 		return;
 
 
@@ -8432,16 +8471,18 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
 		/* copy to memory all shadowed fields in case
 		/* copy to memory all shadowed fields in case
 		   they were modified */
 		   they were modified */
 		copy_shadow_to_vmcs12(vmx);
 		copy_shadow_to_vmcs12(vmx);
-		vmx->nested.sync_shadow_vmcs = false;
+		vmx->nested.need_vmcs12_sync = false;
 		vmx_disable_shadow_vmcs(vmx);
 		vmx_disable_shadow_vmcs(vmx);
 	}
 	}
 	vmx->nested.posted_intr_nv = -1;
 	vmx->nested.posted_intr_nv = -1;
 
 
 	/* Flush VMCS12 to guest memory */
 	/* Flush VMCS12 to guest memory */
-	kvm_vcpu_write_guest_page(&vmx->vcpu,
+	kvm_vcpu_write_guest_page(vcpu,
 				  vmx->nested.current_vmptr >> PAGE_SHIFT,
 				  vmx->nested.current_vmptr >> PAGE_SHIFT,
 				  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
 				  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
 
 
+	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+
 	vmx->nested.current_vmptr = -1ull;
 	vmx->nested.current_vmptr = -1ull;
 }
 }
 
 
@@ -8449,8 +8490,10 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
  * just stops using VMX.
  * just stops using VMX.
  */
  */
-static void free_nested(struct vcpu_vmx *vmx)
+static void free_nested(struct kvm_vcpu *vcpu)
 {
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
 	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
 	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
 		return;
 		return;
 
 
@@ -8483,6 +8526,10 @@ static void free_nested(struct vcpu_vmx *vmx)
 		vmx->nested.pi_desc = NULL;
 		vmx->nested.pi_desc = NULL;
 	}
 	}
 
 
+	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+
+	nested_release_evmcs(vcpu);
+
 	free_loaded_vmcs(&vmx->nested.vmcs02);
 	free_loaded_vmcs(&vmx->nested.vmcs02);
 }
 }
 
 
@@ -8491,9 +8538,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
 {
 {
 	if (!nested_vmx_check_permission(vcpu))
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 		return 1;
-	free_nested(to_vmx(vcpu));
-	nested_vmx_succeed(vcpu);
-	return kvm_skip_emulated_instruction(vcpu);
+	free_nested(vcpu);
+	return nested_vmx_succeed(vcpu);
 }
 }
 
 
 /* Emulate the VMCLEAR instruction */
 /* Emulate the VMCLEAR instruction */
@@ -8509,25 +8555,28 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	if (nested_vmx_get_vmptr(vcpu, &vmptr))
 	if (nested_vmx_get_vmptr(vcpu, &vmptr))
 		return 1;
 		return 1;
 
 
-	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
-		nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
-		return kvm_skip_emulated_instruction(vcpu);
-	}
+	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+		return nested_vmx_failValid(vcpu,
+			VMXERR_VMCLEAR_INVALID_ADDRESS);
 
 
-	if (vmptr == vmx->nested.vmxon_ptr) {
-		nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
-		return kvm_skip_emulated_instruction(vcpu);
-	}
+	if (vmptr == vmx->nested.vmxon_ptr)
+		return nested_vmx_failValid(vcpu,
+			VMXERR_VMCLEAR_VMXON_POINTER);
 
 
-	if (vmptr == vmx->nested.current_vmptr)
-		nested_release_vmcs12(vmx);
+	if (vmx->nested.hv_evmcs_page) {
+		if (vmptr == vmx->nested.hv_evmcs_vmptr)
+			nested_release_evmcs(vcpu);
+	} else {
+		if (vmptr == vmx->nested.current_vmptr)
+			nested_release_vmcs12(vcpu);
 
 
-	kvm_vcpu_write_guest(vcpu,
-			vmptr + offsetof(struct vmcs12, launch_state),
-			&zero, sizeof(zero));
+		kvm_vcpu_write_guest(vcpu,
+				     vmptr + offsetof(struct vmcs12,
+						      launch_state),
+				     &zero, sizeof(zero));
+	}
 
 
-	nested_vmx_succeed(vcpu);
-	return kvm_skip_emulated_instruction(vcpu);
+	return nested_vmx_succeed(vcpu);
 }
 }
 
 
 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
@@ -8610,6 +8659,395 @@ static inline int vmcs12_write_any(struct vmcs12 *vmcs12,
 
 
 }
 }
 
 
+static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
+{
+	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
+	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+
+	vmcs12->hdr.revision_id = evmcs->revision_id;
+
+	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
+	vmcs12->tpr_threshold = evmcs->tpr_threshold;
+	vmcs12->guest_rip = evmcs->guest_rip;
+
+	if (unlikely(!(evmcs->hv_clean_fields &
+		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
+		vmcs12->guest_rsp = evmcs->guest_rsp;
+		vmcs12->guest_rflags = evmcs->guest_rflags;
+		vmcs12->guest_interruptibility_info =
+			evmcs->guest_interruptibility_info;
+	}
+
+	if (unlikely(!(evmcs->hv_clean_fields &
+		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
+		vmcs12->cpu_based_vm_exec_control =
+			evmcs->cpu_based_vm_exec_control;
+	}
+
+	if (unlikely(!(evmcs->hv_clean_fields &
+		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
+		vmcs12->exception_bitmap = evmcs->exception_bitmap;
+	}
+
+	if (unlikely(!(evmcs->hv_clean_fields &
+		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
+		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
+	}
+
+	if (unlikely(!(evmcs->hv_clean_fields &
+		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
+		vmcs12->vm_entry_intr_info_field =
+			evmcs->vm_entry_intr_info_field;
+		vmcs12->vm_entry_exception_error_code =
+			evmcs->vm_entry_exception_error_code;
+		vmcs12->vm_entry_instruction_len =
+			evmcs->vm_entry_instruction_len;
+	}
+
+	if (unlikely(!(evmcs->hv_clean_fields &
+		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
+		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
+		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
+		vmcs12->host_cr0 = evmcs->host_cr0;
+		vmcs12->host_cr3 = evmcs->host_cr3;
+		vmcs12->host_cr4 = evmcs->host_cr4;
+		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
+		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
+		vmcs12->host_rip = evmcs->host_rip;
+		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
+		vmcs12->host_es_selector = evmcs->host_es_selector;
+		vmcs12->host_cs_selector = evmcs->host_cs_selector;
+		vmcs12->host_ss_selector = evmcs->host_ss_selector;
+		vmcs12->host_ds_selector = evmcs->host_ds_selector;
+		vmcs12->host_fs_selector = evmcs->host_fs_selector;
+		vmcs12->host_gs_selector = evmcs->host_gs_selector;
+		vmcs12->host_tr_selector = evmcs->host_tr_selector;
+	}
+
+	if (unlikely(!(evmcs->hv_clean_fields &
+		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
+		vmcs12->pin_based_vm_exec_control =
+			evmcs->pin_based_vm_exec_control;
+		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
+		vmcs12->secondary_vm_exec_control =
+			evmcs->secondary_vm_exec_control;
+	}
+
+	if (unlikely(!(evmcs->hv_clean_fields &
+		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
+		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
+		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
+	}
+
+	if (unlikely(!(evmcs->hv_clean_fields &
+		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
+		vmcs12->msr_bitmap = evmcs->msr_bitmap;
+	}
+
+	if (unlikely(!(evmcs->hv_clean_fields &
+		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
+		vmcs12->guest_es_base = evmcs->guest_es_base;
+		vmcs12->guest_cs_base = evmcs->guest_cs_base;
+		vmcs12->guest_ss_base = evmcs->guest_ss_base;
+		vmcs12->guest_ds_base = evmcs->guest_ds_base;
+		vmcs12->guest_fs_base = evmcs->guest_fs_base;
+		vmcs12->guest_gs_base = evmcs->guest_gs_base;
+		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
+		vmcs12->guest_tr_base = evmcs->guest_tr_base;
+		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
+		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
+		vmcs12->guest_es_limit = evmcs->guest_es_limit;
+		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
+		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
+		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
+		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
+		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
+		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
+		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
+		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
+		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
+		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
+		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
+		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
+		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
+		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
+		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
+		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
+		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
+		vmcs12->guest_es_selector = evmcs->guest_es_selector;
+		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
+		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
+		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
+		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
+		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
+		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
+		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
+	}
+
+	if (unlikely(!(evmcs->hv_clean_fields &
+		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
+		vmcs12->tsc_offset = evmcs->tsc_offset;
+		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
+		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
+	}
+
+	if (unlikely(!(evmcs->hv_clean_fields &
+		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
+		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
+		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
+		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
+		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
+		vmcs12->guest_cr0 = evmcs->guest_cr0;
+		vmcs12->guest_cr3 = evmcs->guest_cr3;
+		vmcs12->guest_cr4 = evmcs->guest_cr4;
+		vmcs12->guest_dr7 = evmcs->guest_dr7;
+	}
+
+	if (unlikely(!(evmcs->hv_clean_fields &
+		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
+		vmcs12->host_fs_base = evmcs->host_fs_base;
+		vmcs12->host_gs_base = evmcs->host_gs_base;
+		vmcs12->host_tr_base = evmcs->host_tr_base;
+		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
+		vmcs12->host_idtr_base = evmcs->host_idtr_base;
+		vmcs12->host_rsp = evmcs->host_rsp;
+	}
+
+	if (unlikely(!(evmcs->hv_clean_fields &
+		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
+		vmcs12->ept_pointer = evmcs->ept_pointer;
+		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
+	}
+
+	if (unlikely(!(evmcs->hv_clean_fields &
+		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
+		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
+		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
+		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
+		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
+		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
+		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
+		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
+		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
+		vmcs12->guest_pending_dbg_exceptions =
+			evmcs->guest_pending_dbg_exceptions;
+		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
+		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
+		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
+		vmcs12->guest_activity_state = evmcs->guest_activity_state;
+		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
+	}
+
+	/*
+	 * Not used?
+	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
+	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
+	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
+	 * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
+	 * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
+	 * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
+	 * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
+	 * vmcs12->page_fault_error_code_mask =
+	 *		evmcs->page_fault_error_code_mask;
+	 * vmcs12->page_fault_error_code_match =
+	 *		evmcs->page_fault_error_code_match;
+	 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
+	 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
+	 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
+	 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
+	 */
+
+	/*
+	 * Read only fields:
+	 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
+	 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
+	 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
+	 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
+	 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
+	 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
+	 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
+	 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
+	 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
+	 * vmcs12->exit_qualification = evmcs->exit_qualification;
+	 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
+	 *
+	 * Not present in struct vmcs12:
+	 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
+	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
+	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
+	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
+	 */
+
+	return 0;
+}
+
+static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
+{
+	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
+	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+
+	/*
+	 * Should not be changed by KVM:
+	 *
+	 * evmcs->host_es_selector = vmcs12->host_es_selector;
+	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
+	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
+	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
+	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
+	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
+	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
+	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
+	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
+	 * evmcs->host_cr0 = vmcs12->host_cr0;
+	 * evmcs->host_cr3 = vmcs12->host_cr3;
+	 * evmcs->host_cr4 = vmcs12->host_cr4;
+	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
+	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
+	 * evmcs->host_rip = vmcs12->host_rip;
+	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
+	 * evmcs->host_fs_base = vmcs12->host_fs_base;
+	 * evmcs->host_gs_base = vmcs12->host_gs_base;
+	 * evmcs->host_tr_base = vmcs12->host_tr_base;
+	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
+	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
+	 * evmcs->host_rsp = vmcs12->host_rsp;
+	 * sync_vmcs12() doesn't read these:
+	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
+	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
+	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
+	 * evmcs->ept_pointer = vmcs12->ept_pointer;
+	 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
+	 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
+	 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
+	 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
+	 * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
+	 * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
+	 * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
+	 * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
+	 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
+	 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
+	 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
+	 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
+	 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
+	 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
+	 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
+	 * evmcs->page_fault_error_code_mask =
+	 *		vmcs12->page_fault_error_code_mask;
+	 * evmcs->page_fault_error_code_match =
+	 *		vmcs12->page_fault_error_code_match;
+	 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
+	 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
+	 * evmcs->tsc_offset = vmcs12->tsc_offset;
+	 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
+	 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
+	 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
+	 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
+	 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
+	 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
+	 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
+	 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
+	 *
+	 * Not present in struct vmcs12:
+	 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
+	 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
+	 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
+	 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
+	 */
+
+	evmcs->guest_es_selector = vmcs12->guest_es_selector;
+	evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
+	evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
+	evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
+	evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
+	evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
+	evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
+	evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
+
+	evmcs->guest_es_limit = vmcs12->guest_es_limit;
+	evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
+	evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
+	evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
+	evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
+	evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
+	evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
+	evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
+	evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
+	evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
+
+	evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
+	evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
+	evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
+	evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
+	evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
+	evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
+	evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
+	evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
+
+	evmcs->guest_es_base = vmcs12->guest_es_base;
+	evmcs->guest_cs_base = vmcs12->guest_cs_base;
+	evmcs->guest_ss_base = vmcs12->guest_ss_base;
+	evmcs->guest_ds_base = vmcs12->guest_ds_base;
+	evmcs->guest_fs_base = vmcs12->guest_fs_base;
+	evmcs->guest_gs_base = vmcs12->guest_gs_base;
+	evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
+	evmcs->guest_tr_base = vmcs12->guest_tr_base;
+	evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
+	evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
+
+	evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
+	evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
+
+	evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
+	evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
+	evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
+	evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
+
+	evmcs->guest_pending_dbg_exceptions =
+		vmcs12->guest_pending_dbg_exceptions;
+	evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
+	evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
+
+	evmcs->guest_activity_state = vmcs12->guest_activity_state;
+	evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
+
+	evmcs->guest_cr0 = vmcs12->guest_cr0;
+	evmcs->guest_cr3 = vmcs12->guest_cr3;
+	evmcs->guest_cr4 = vmcs12->guest_cr4;
+	evmcs->guest_dr7 = vmcs12->guest_dr7;
+
+	evmcs->guest_physical_address = vmcs12->guest_physical_address;
+
+	evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
+	evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
+	evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
+	evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
+	evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
+	evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
+	evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
+	evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
+
+	evmcs->exit_qualification = vmcs12->exit_qualification;
+
+	evmcs->guest_linear_address = vmcs12->guest_linear_address;
+	evmcs->guest_rsp = vmcs12->guest_rsp;
+	evmcs->guest_rflags = vmcs12->guest_rflags;
+
+	evmcs->guest_interruptibility_info =
+		vmcs12->guest_interruptibility_info;
+	evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
+	evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
+	evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
+	evmcs->vm_entry_exception_error_code =
+		vmcs12->vm_entry_exception_error_code;
+	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
+
+	evmcs->guest_rip = vmcs12->guest_rip;
+
+	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
+
+	return 0;
+}
+
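The two copy helpers above move state between the Hyper-V enlightened VMCS and KVM's cached vmcs12; the enlightened-to-vmcs12 direction is gated by the guest-maintained hv_clean_fields mask, so a field group is only re-read when the guest marked it dirty. A minimal, self-contained sketch of that gating pattern (the struct layout and group bit below are illustrative, not the kernel's definitions):

#include <stdint.h>

#define SK_CLEAN_GUEST_GRP1	(1u << 0)	/* illustrative group bit, not the real encoding */

struct sk_evmcs  { uint32_t clean_fields; uint64_t guest_cr3; };
struct sk_vmcs12 { uint64_t guest_cr3; };

/* Re-read a field group only when its clean bit is NOT set, i.e. the guest dirtied it. */
static void sk_sync_grp1(struct sk_vmcs12 *dst, const struct sk_evmcs *src)
{
	if (!(src->clean_fields & SK_CLEAN_GUEST_GRP1))
		dst->guest_cr3 = src->guest_cr3;
}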
 /*
  * Copy the writable VMCS shadow fields back to the VMCS12, in case
  * they have been modified by the L1 guest. Note that the "read-only"
@@ -8683,20 +9121,6 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
 	vmcs_load(vmx->loaded_vmcs->vmcs);
 }
 
-/*
- * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
- * used before) all generate the same failure when it is missing.
- */
-static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	if (vmx->nested.current_vmptr == -1ull) {
-		nested_vmx_failInvalid(vcpu);
-		return 0;
-	}
-	return 1;
-}
-
 static int handle_vmread(struct kvm_vcpu *vcpu)
 {
 	unsigned long field;
@@ -8709,8 +9133,8 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
-	if (!nested_vmx_check_vmcs12(vcpu))
-		return kvm_skip_emulated_instruction(vcpu);
+	if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
+		return nested_vmx_failInvalid(vcpu);
 
 	if (!is_guest_mode(vcpu))
 		vmcs12 = get_vmcs12(vcpu);
@@ -8719,20 +9143,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 		 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
 		 * to shadowed-field sets the ALU flags for VMfailInvalid.
 		 */
-		if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
-			nested_vmx_failInvalid(vcpu);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
+		if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
+			return nested_vmx_failInvalid(vcpu);
 		vmcs12 = get_shadow_vmcs12(vcpu);
 	}
 
 	/* Decode instruction info and find the field to read */
 	field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
 	/* Read the field, zero-extended to a u64 field_value */
-	if (vmcs12_read_any(vmcs12, field, &field_value) < 0) {
-		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
-		return kvm_skip_emulated_instruction(vcpu);
-	}
+	if (vmcs12_read_any(vmcs12, field, &field_value) < 0)
+		return nested_vmx_failValid(vcpu,
+			VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
 	/*
 	 * Now copy part of this value to register or memory, as requested.
 	 * Note that the number of bits actually copied is 32 or 64 depending
@@ -8750,8 +9172,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 					    (is_long_mode(vcpu) ? 8 : 4), NULL);
 	}
 
-	nested_vmx_succeed(vcpu);
-	return kvm_skip_emulated_instruction(vcpu);
+	return nested_vmx_succeed(vcpu);
 }
 
 
@@ -8776,8 +9197,8 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
-	if (!nested_vmx_check_vmcs12(vcpu))
-		return kvm_skip_emulated_instruction(vcpu);
+	if (vmx->nested.current_vmptr == -1ull)
+		return nested_vmx_failInvalid(vcpu);
 
 	if (vmx_instruction_info & (1u << 10))
 		field_value = kvm_register_readl(vcpu,
@@ -8800,11 +9221,9 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 	 * VMCS," then the "read-only" fields are actually read/write.
 	 */
 	if (vmcs_field_readonly(field) &&
-	    !nested_cpu_has_vmwrite_any_field(vcpu)) {
-		nested_vmx_failValid(vcpu,
+	    !nested_cpu_has_vmwrite_any_field(vcpu))
+		return nested_vmx_failValid(vcpu,
 			VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
-		return kvm_skip_emulated_instruction(vcpu);
-	}
 
 	if (!is_guest_mode(vcpu))
 		vmcs12 = get_vmcs12(vcpu);
@@ -8813,18 +9232,14 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 		 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
 		 * to shadowed-field sets the ALU flags for VMfailInvalid.
 		 */
-		if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
-			nested_vmx_failInvalid(vcpu);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
+		if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
+			return nested_vmx_failInvalid(vcpu);
 		vmcs12 = get_shadow_vmcs12(vcpu);
-
 	}
 
-	if (vmcs12_write_any(vmcs12, field, field_value) < 0) {
-		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
-		return kvm_skip_emulated_instruction(vcpu);
-	}
+	if (vmcs12_write_any(vmcs12, field, field_value) < 0)
+		return nested_vmx_failValid(vcpu,
+			VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 
 	/*
 	 * Do not track vmcs12 dirty-state if in guest-mode
@@ -8846,8 +9261,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 		}
 	}
 
-	nested_vmx_succeed(vcpu);
-	return kvm_skip_emulated_instruction(vcpu);
+	return nested_vmx_succeed(vcpu);
 }
 
 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
@@ -8858,7 +9272,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
 			      SECONDARY_EXEC_SHADOW_VMCS);
 		vmcs_write64(VMCS_LINK_POINTER,
 			     __pa(vmx->vmcs01.shadow_vmcs));
-		vmx->nested.sync_shadow_vmcs = true;
+		vmx->nested.need_vmcs12_sync = true;
 	}
 	vmx->nested.dirty_vmcs12 = true;
 }
@@ -8875,36 +9289,37 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 	if (nested_vmx_get_vmptr(vcpu, &vmptr))
 		return 1;
 
-	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
-		nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
-		return kvm_skip_emulated_instruction(vcpu);
-	}
+	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+		return nested_vmx_failValid(vcpu,
+			VMXERR_VMPTRLD_INVALID_ADDRESS);
 
-	if (vmptr == vmx->nested.vmxon_ptr) {
-		nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
-		return kvm_skip_emulated_instruction(vcpu);
-	}
+	if (vmptr == vmx->nested.vmxon_ptr)
+		return nested_vmx_failValid(vcpu,
+			VMXERR_VMPTRLD_VMXON_POINTER);
+
+	/* Forbid normal VMPTRLD if Enlightened version was used */
+	if (vmx->nested.hv_evmcs)
+		return 1;
 
 	if (vmx->nested.current_vmptr != vmptr) {
 		struct vmcs12 *new_vmcs12;
 		struct page *page;
 		page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-		if (is_error_page(page)) {
-			nested_vmx_failInvalid(vcpu);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
+		if (is_error_page(page))
+			return nested_vmx_failInvalid(vcpu);
+
 		new_vmcs12 = kmap(page);
 		if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
 		    (new_vmcs12->hdr.shadow_vmcs &&
 		     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
 			kunmap(page);
 			kvm_release_page_clean(page);
-			nested_vmx_failValid(vcpu,
+			return nested_vmx_failValid(vcpu,
 				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
-			return kvm_skip_emulated_instruction(vcpu);
 		}
 
-		nested_release_vmcs12(vmx);
+		nested_release_vmcs12(vcpu);
+
 		/*
 		 * Load VMCS12 from guest memory since it is not already
 		 * cached.
@@ -8916,8 +9331,71 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 		set_current_vmptr(vmx, vmptr);
 	}
 
-	nested_vmx_succeed(vcpu);
-	return kvm_skip_emulated_instruction(vcpu);
+	return nested_vmx_succeed(vcpu);
+}
+
+/*
+ * This is an equivalent of the nested hypervisor executing the vmptrld
+ * instruction.
+ */
+static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
+						 bool from_launch)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct hv_vp_assist_page assist_page;
+
+	if (likely(!vmx->nested.enlightened_vmcs_enabled))
+		return 1;
+
+	if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
+		return 1;
+
+	if (unlikely(!assist_page.enlighten_vmentry))
+		return 1;
+
+	if (unlikely(assist_page.current_nested_vmcs !=
+		     vmx->nested.hv_evmcs_vmptr)) {
+
+		if (!vmx->nested.hv_evmcs)
+			vmx->nested.current_vmptr = -1ull;
+
+		nested_release_evmcs(vcpu);
+
+		vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page(
+			vcpu, assist_page.current_nested_vmcs);
+
+		if (unlikely(is_error_page(vmx->nested.hv_evmcs_page)))
+			return 0;
+
+		vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
+
+		if (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION) {
+			nested_release_evmcs(vcpu);
+			return 0;
+		}
+
+		vmx->nested.dirty_vmcs12 = true;
+		/*
+		 * As we keep L2 state for one guest only 'hv_clean_fields' mask
+		 * can't be used when we switch between them. Reset it here for
+		 * simplicity.
+		 */
+		vmx->nested.hv_evmcs->hv_clean_fields &=
+			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+		vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs;
+
+		/*
+		 * Unlike normal vmcs12, enlightened vmcs12 is not fully
+		 * reloaded from guest's memory (read only fields, fields not
+		 * present in struct hv_enlightened_vmcs, ...). Make sure there
+		 * are no leftovers.
+		 */
+		if (from_launch)
+			memset(vmx->nested.cached_vmcs12, 0,
+			       sizeof(*vmx->nested.cached_vmcs12));
+
+	}
+	return 1;
 }
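A point worth noting about the handler above: the enlightened VMCS is only remapped when the assist page names a different guest-physical address than the one currently mapped, and on such a switch the whole hv_clean_fields mask is discarded so every field is treated as dirty again. A rough standalone sketch of that invalidation rule, using hypothetical names:

#include <stdbool.h>
#include <stdint.h>

struct sk_evmcs_cache {
	uint64_t gpa;		/* guest-physical address of the mapped page, -1 when unmapped */
	uint32_t clean_fields;	/* set bits mean "unchanged since the last sync" */
};

/* Returns true when the pointer changed and a full re-sync of vmcs12 is required. */
static bool sk_evmcs_ptr_update(struct sk_evmcs_cache *c, uint64_t new_gpa)
{
	if (c->gpa == new_gpa)
		return false;		/* same page: existing clean-field hints stay valid */
	c->gpa = new_gpa;
	c->clean_fields = 0;		/* after a switch nothing may be assumed clean */
	return true;
}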
 
 /* Emulate the VMPTRST instruction */
@@ -8932,6 +9410,9 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
+	if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
+		return 1;
+
 	if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
 		return 1;
 	/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
@@ -8940,8 +9421,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
 		kvm_inject_page_fault(vcpu, &e);
 		return 1;
 	}
-	nested_vmx_succeed(vcpu);
-	return kvm_skip_emulated_instruction(vcpu);
+	return nested_vmx_succeed(vcpu);
 }
 
 /* Emulate the INVEPT instruction */
@@ -8971,11 +9451,9 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 
 	types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
 
-	if (type >= 32 || !(types & (1 << type))) {
-		nested_vmx_failValid(vcpu,
+	if (type >= 32 || !(types & (1 << type)))
+		return nested_vmx_failValid(vcpu,
 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-		return kvm_skip_emulated_instruction(vcpu);
-	}
 
 	/* According to the Intel VMX instruction reference, the memory
 	 * operand is read even if it isn't needed (e.g., for type==global)
@@ -8997,14 +9475,20 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 	case VMX_EPT_EXTENT_CONTEXT:
 		kvm_mmu_sync_roots(vcpu);
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-		nested_vmx_succeed(vcpu);
 		break;
 	default:
 		BUG_ON(1);
 		break;
 	}
 
-	return kvm_skip_emulated_instruction(vcpu);
+	return nested_vmx_succeed(vcpu);
+}
+
+static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
 }
 
 static int handle_invvpid(struct kvm_vcpu *vcpu)
@@ -9018,6 +9502,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
 		u64 vpid;
 		u64 gla;
 	} operand;
+	u16 vpid02;
 
 	if (!(vmx->nested.msrs.secondary_ctls_high &
 	      SECONDARY_EXEC_ENABLE_VPID) ||
@@ -9035,11 +9520,9 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
 	types = (vmx->nested.msrs.vpid_caps &
 			VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
 
-	if (type >= 32 || !(types & (1 << type))) {
-		nested_vmx_failValid(vcpu,
+	if (type >= 32 || !(types & (1 << type)))
+		return nested_vmx_failValid(vcpu,
 			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-		return kvm_skip_emulated_instruction(vcpu);
-	}
 
 	/* according to the intel vmx instruction reference, the memory
 	 * operand is read even if it isn't needed (e.g., for type==global)
@@ -9051,47 +9534,39 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
 		kvm_inject_page_fault(vcpu, &e);
 		return 1;
 	}
-	if (operand.vpid >> 16) {
-		nested_vmx_failValid(vcpu,
+	if (operand.vpid >> 16)
+		return nested_vmx_failValid(vcpu,
 			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-		return kvm_skip_emulated_instruction(vcpu);
-	}
 
+	vpid02 = nested_get_vpid02(vcpu);
 	switch (type) {
 	case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
 		if (!operand.vpid ||
-		    is_noncanonical_address(operand.gla, vcpu)) {
-			nested_vmx_failValid(vcpu,
+		    is_noncanonical_address(operand.gla, vcpu))
+			return nested_vmx_failValid(vcpu,
 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-		if (cpu_has_vmx_invvpid_individual_addr() &&
-		    vmx->nested.vpid02) {
+		if (cpu_has_vmx_invvpid_individual_addr()) {
 			__invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
-				vmx->nested.vpid02, operand.gla);
+				vpid02, operand.gla);
 		} else
-			__vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+			__vmx_flush_tlb(vcpu, vpid02, false);
 		break;
 	case VMX_VPID_EXTENT_SINGLE_CONTEXT:
 	case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
-		if (!operand.vpid) {
-			nested_vmx_failValid(vcpu,
+		if (!operand.vpid)
+			return nested_vmx_failValid(vcpu,
 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-		__vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+		__vmx_flush_tlb(vcpu, vpid02, false);
 		break;
 	case VMX_VPID_EXTENT_ALL_CONTEXT:
-		__vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+		__vmx_flush_tlb(vcpu, vpid02, false);
 		break;
 	default:
 		WARN_ON_ONCE(1);
 		return kvm_skip_emulated_instruction(vcpu);
 	}
 
-	nested_vmx_succeed(vcpu);
-
-	return kvm_skip_emulated_instruction(vcpu);
+	return nested_vmx_succeed(vcpu);
 }
 
 static int handle_invpcid(struct kvm_vcpu *vcpu)
@@ -9162,11 +9637,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
 		}
 
 		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-			if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3)
+			if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
 			    == operand.pcid)
 				roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
 
-		kvm_mmu_free_roots(vcpu, roots_to_free);
+		kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
 		/*
 		 * If neither the current cr3 nor any of the prev_roots use the
 		 * given PCID, then nothing needs to be done here because a
@@ -9293,7 +9768,7 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
 
 		kvm_mmu_unload(vcpu);
 		mmu->ept_ad = accessed_dirty;
-		mmu->base_role.ad_disabled = !accessed_dirty;
+		mmu->mmu_role.base.ad_disabled = !accessed_dirty;
 		vmcs12->ept_pointer = address;
 		/*
 		 * TODO: Check what's the correct approach in case
@@ -9652,9 +10127,6 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 			return false;
 		else if (is_page_fault(intr_info))
 			return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
-		else if (is_no_device(intr_info) &&
-			 !(vmcs12->guest_cr0 & X86_CR0_TS))
-			return false;
 		else if (is_debug(intr_info) &&
 			 vcpu->guest_debug &
 			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
@@ -10676,9 +11148,25 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		vmcs_write32(PLE_WINDOW, vmx->ple_window);
 	}
 
-	if (vmx->nested.sync_shadow_vmcs) {
-		copy_vmcs12_to_shadow(vmx);
-		vmx->nested.sync_shadow_vmcs = false;
+	if (vmx->nested.need_vmcs12_sync) {
+		/*
+		 * hv_evmcs may end up being not mapped after migration (when
+		 * L2 was running), map it here to make sure vmcs12 changes are
+		 * properly reflected.
+		 */
+		if (vmx->nested.enlightened_vmcs_enabled &&
+		    !vmx->nested.hv_evmcs)
+			nested_vmx_handle_enlightened_vmptrld(vcpu, false);
+
+		if (vmx->nested.hv_evmcs) {
+			copy_vmcs12_to_enlightened(vmx);
+			/* All fields are clean */
+			vmx->nested.hv_evmcs->hv_clean_fields |=
+				HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+		} else {
+			copy_vmcs12_to_shadow(vmx);
+		}
+		vmx->nested.need_vmcs12_sync = false;
 	}
 
 	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
@@ -10745,7 +11233,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		"mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
 		"jmp 1f \n\t"
 		"2: \n\t"
-		__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
+		__ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
 		"1: \n\t"
 		/* Reload cr2 if changed */
 		"mov %c[cr2](%0), %%" _ASM_AX " \n\t"
@@ -10777,9 +11265,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
 		/* Enter guest mode */
 		"jne 1f \n\t"
-		__ex(ASM_VMX_VMLAUNCH) "\n\t"
+		__ex("vmlaunch") "\n\t"
 		"jmp 2f \n\t"
-		"1: " __ex(ASM_VMX_VMRESUME) "\n\t"
+		"1: " __ex("vmresume") "\n\t"
 		"2: "
 		/* Save guest registers, load host registers, keep flags */
 		"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
@@ -10801,6 +11289,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		"mov %%r13, %c[r13](%0) \n\t"
 		"mov %%r14, %c[r14](%0) \n\t"
 		"mov %%r15, %c[r15](%0) \n\t"
+		/*
+		* Clear host registers marked as clobbered to prevent
+		* speculative use.
+		*/
 		"xor %%r8d,  %%r8d \n\t"
 		"xor %%r9d,  %%r9d \n\t"
 		"xor %%r10d, %%r10d \n\t"
@@ -10958,6 +11450,10 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 	vmx->loaded_vmcs = vmcs;
 	vmx_vcpu_load(vcpu, cpu);
 	put_cpu();
+
+	vm_entry_controls_reset_shadow(vmx);
+	vm_exit_controls_reset_shadow(vmx);
+	vmx_segment_cache_clear(vmx);
 }
 
 /*
@@ -10966,12 +11462,10 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
  */
 static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-       vcpu_load(vcpu);
-       vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-       free_nested(vmx);
-       vcpu_put(vcpu);
+	vcpu_load(vcpu);
+	vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
+	free_nested(vcpu);
+	vcpu_put(vcpu);
 }
 
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
@@ -11334,28 +11828,28 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
 	return get_vmcs12(vcpu)->ept_pointer;
 }
 
-static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 {
 	WARN_ON(mmu_is_nested(vcpu));
-	if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
-		return 1;
 
+	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
 	kvm_init_shadow_ept_mmu(vcpu,
 			to_vmx(vcpu)->nested.msrs.ept_caps &
 			VMX_EPT_EXECUTE_ONLY_BIT,
 			nested_ept_ad_enabled(vcpu),
 			nested_ept_get_cr3(vcpu));
-	vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
-	vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
-	vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+	vcpu->arch.mmu->set_cr3           = vmx_set_cr3;
+	vcpu->arch.mmu->get_cr3           = nested_ept_get_cr3;
+	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
+	vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;
 
 	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
-	return 0;
 }
 
 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+	vcpu->arch.mmu = &vcpu->arch.root_mmu;
+	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 }
 
 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
@@ -11716,7 +12210,7 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
 	    !nested_exit_intr_ack_set(vcpu) ||
 	    (vmcs12->posted_intr_nv & 0xff00) ||
 	    (vmcs12->posted_intr_desc_addr & 0x3f) ||
-	    (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr))))
+	    (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))
 		return -EINVAL;
 
 	/* tpr shadow is needed by all apicv features. */
@@ -11772,15 +12266,12 @@ static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
 					 struct vmcs12 *vmcs12)
 {
-	u64 address = vmcs12->pml_address;
-	int maxphyaddr = cpuid_maxphyaddr(vcpu);
+	if (!nested_cpu_has_pml(vmcs12))
+		return 0;
 
-	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
-		if (!nested_cpu_has_ept(vmcs12) ||
-		    !IS_ALIGNED(address, 4096)  ||
-		    address >> maxphyaddr)
-			return -EINVAL;
-	}
+	if (!nested_cpu_has_ept(vmcs12) ||
+	    !page_address_valid(vcpu, vmcs12->pml_address))
+		return -EINVAL;
 
 	return 0;
 }
@@ -11960,112 +12451,87 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
 	return 0;
 }
 
-static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
+/*
+ * Returns if KVM is able to config CPU to tag TLB entries
+ * populated by L2 differently than TLB entries populated
+ * by L1.
+ *
+ * If L1 uses EPT, then TLB entries are tagged with different EPTP.
+ *
+ * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
+ * with different VPID (L1 entries are tagged with vmx->vpid
+ * while L2 entries are tagged with vmx->nested.vpid02).
+ */
+static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
-	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
-	vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
-	vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
-	vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
-	vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
-	vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
-	vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
-	vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
-	vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
-	vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
-	vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
-	vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
-	vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
-	vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
-	vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
-	vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
-	vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
-	vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
-	vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
-	vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
-	vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
-	vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
-	vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
-	vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
-	vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
-	vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
-	vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
-	vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
-	vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
-	vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
-	vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
-
-	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
-	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
-		vmcs12->guest_pending_dbg_exceptions);
-	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
-	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+	return nested_cpu_has_ept(vmcs12) ||
+	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
+}
 
-	if (nested_cpu_has_xsaves(vmcs12))
-		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
-	vmcs_write64(VMCS_LINK_POINTER, -1ull);
+static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+{
+	if (vmx->nested.nested_run_pending &&
+	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
+		return vmcs12->guest_ia32_efer;
+	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+		return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
+	else
+		return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
+}
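The EFER value computed by nested_vmx_calc_efer() above drives two bits in the ENTRY CONTROLS block further down: IA32E mode follows EFER.LMA, and the EFER-load control is only kept when guest and host EFER differ. A hedged, standalone restatement of that selection (bit positions shown for illustration only):

#include <stdint.h>

#define SK_EFER_LMA		(1ull << 10)	/* EFER.LMA */
#define SK_ENTRY_IA32E_MODE	(1u << 9)	/* VM_ENTRY_IA32E_MODE */
#define SK_ENTRY_LOAD_EFER	(1u << 15)	/* VM_ENTRY_LOAD_IA32_EFER */

/* Derive the EFER-related VM-entry control bits from the computed guest EFER. */
static uint32_t sk_entry_ctls_for_efer(uint32_t base, uint64_t guest_efer, uint64_t host_efer)
{
	uint32_t ctls = base & ~(SK_ENTRY_IA32E_MODE | SK_ENTRY_LOAD_EFER);

	if (guest_efer & SK_EFER_LMA)
		ctls |= SK_ENTRY_IA32E_MODE;	/* entering a 64-bit L2 */
	if (guest_efer != host_efer)
		ctls |= SK_ENTRY_LOAD_EFER;	/* reload EFER only when it differs */
	return ctls;
}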
 
-	if (cpu_has_vmx_posted_intr())
-		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
+static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
+{
+	/*
+	 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
+	 * according to L0's settings (vmcs12 is irrelevant here).  Host
+	 * fields that come from L0 and are not constant, e.g. HOST_CR3,
+	 * will be set as needed prior to VMLAUNCH/VMRESUME.
+	 */
+	if (vmx->nested.vmcs02_initialized)
+		return;
+	vmx->nested.vmcs02_initialized = true;
 
 	/*
-	 * Whether page-faults are trapped is determined by a combination of
-	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
-	 * If enable_ept, L0 doesn't care about page faults and we should
-	 * set all of these to L1's desires. However, if !enable_ept, L0 does
-	 * care about (at least some) page faults, and because it is not easy
-	 * (if at all possible?) to merge L0 and L1's desires, we simply ask
-	 * to exit on each and every L2 page fault. This is done by setting
-	 * MASK=MATCH=0 and (see below) EB.PF=1.
-	 * Note that below we don't need special code to set EB.PF beyond the
-	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
-	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
-	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+	 * We don't care what the EPTP value is we just need to guarantee
+	 * it's valid so we don't get a false positive when doing early
+	 * consistency checks.
 	 */
-	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
-		enable_ept ? vmcs12->page_fault_error_code_mask : 0);
-	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
-		enable_ept ? vmcs12->page_fault_error_code_match : 0);
+	if (enable_ept && nested_early_check)
+		vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
 
 	/* All VMFUNCs are currently emulated through L0 vmexits.  */
 	if (cpu_has_vmx_vmfunc())
 		vmcs_write64(VM_FUNCTION_CONTROL, 0);
 
-	if (cpu_has_vmx_apicv()) {
-		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
-		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
-		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
-		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
-	}
+	if (cpu_has_vmx_posted_intr())
+		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
 
-	/*
-	 * Set host-state according to L0's settings (vmcs12 is irrelevant here)
-	 * Some constant fields are set here by vmx_set_constant_host_state().
-	 * Other fields are different per CPU, and will be set later when
-	 * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest()
-	 * is called.
-	 */
-	vmx_set_constant_host_state(vmx);
+	if (cpu_has_vmx_msr_bitmap())
+		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
+
+	if (enable_pml)
+		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
 
 	/*
-	 * Set the MSR load/store lists to match L0's settings.
+	 * Set the MSR load/store lists to match L0's settings.  Only the
+	 * addresses are constant (for vmcs02), the counts can change based
+	 * on L2's behavior, e.g. switching to/from long mode.
 	 */
 	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
-	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
 	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
-	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
 	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
 
-	set_cr4_guest_host_mask(vmx);
+	vmx_set_constant_host_state(vmx);
+}
 
-	if (kvm_mpx_supported()) {
-		if (vmx->nested.nested_run_pending &&
-			(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
-			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
-		else
-			vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
-	}
+static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx,
+				      struct vmcs12 *vmcs12)
+{
+	prepare_vmcs02_constant_state(vmx);
+
+	vmcs_write64(VMCS_LINK_POINTER, -1ull);
 
 	if (enable_vpid) {
 		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
@@ -12073,78 +12539,30 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		else
 			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 	}
-
-	/*
-	 * L1 may access the L2's PDPTR, so save them to construct vmcs12
-	 */
-	if (enable_ept) {
-		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
-		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
-		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
-		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
-	}
-
-	if (cpu_has_vmx_msr_bitmap())
-		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
 }
 
-/*
- * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
- * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
- * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
- * guest in a way that will both be appropriate to L1's requests, and our
- * needs. In addition to modifying the active vmcs (which is vmcs02), this
- * function also has additional necessary side-effects, like setting various
- * vcpu->arch fields.
- * Returns 0 on success, 1 on failure. Invalid state exit qualification code
- * is assigned to entry_failure_code on failure.
- */
-static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-			  u32 *entry_failure_code)
+static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 {
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 exec_control, vmcs12_exec_ctrl;
+	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
 
-	if (vmx->nested.dirty_vmcs12) {
-		prepare_vmcs02_full(vcpu, vmcs12);
-		vmx->nested.dirty_vmcs12 = false;
-	}
+	if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
+		prepare_vmcs02_early_full(vmx, vmcs12);
 
 	/*
-	 * First, the fields that are shadowed.  This must be kept in sync
-	 * with vmx_shadow_fields.h.
+	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
+	 * entry, but only if the current (host) sp changed from the value
+	 * we wrote last (vmx->host_rsp).  This cache is no longer relevant
+	 * if we switch vmcs, and rather than hold a separate cache per vmcs,
+	 * here we just force the write to happen on entry.  host_rsp will
+	 * also be written unconditionally by nested_vmx_check_vmentry_hw()
+	 * if we are doing early consistency checks via hardware.
 	 */
+	vmx->host_rsp = 0;
 
-	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
-	vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
-	vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
-	vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
-	vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
-
-	if (vmx->nested.nested_run_pending &&
-	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
-		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
-		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
-	} else {
-		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
-		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
-	}
-	if (vmx->nested.nested_run_pending) {
-		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-			     vmcs12->vm_entry_intr_info_field);
-		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
-			     vmcs12->vm_entry_exception_error_code);
-		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
-			     vmcs12->vm_entry_instruction_len);
-		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-			     vmcs12->guest_interruptibility_info);
-		vmx->loaded_vmcs->nmi_known_unmasked =
-			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
-	} else {
-		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
-	}
-	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
-
+	/*
+	 * PIN CONTROLS
+	 */
 	exec_control = vmcs12->pin_based_vm_exec_control;
 
 	/* Preemption timer setting is computed directly in vmx_vcpu_run.  */
@@ -12159,13 +12577,43 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	} else {
 		exec_control &= ~PIN_BASED_POSTED_INTR;
 	}
-
 	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
 
-	vmx->nested.preemption_timer_expired = false;
-	if (nested_cpu_has_preemption_timer(vmcs12))
-		vmx_start_preemption_timer(vcpu);
+	/*
+	 * EXEC CONTROLS
+	 */
+	exec_control = vmx_exec_control(vmx); /* L0's desires */
+	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+	exec_control &= ~CPU_BASED_TPR_SHADOW;
+	exec_control |= vmcs12->cpu_based_vm_exec_control;
+
+	/*
+	 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
+	 * nested_get_vmcs12_pages can't fix it up, the illegal value
+	 * will result in a VM entry failure.
+	 */
+	if (exec_control & CPU_BASED_TPR_SHADOW) {
+		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
+		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+	} else {
+#ifdef CONFIG_X86_64
+		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+				CPU_BASED_CR8_STORE_EXITING;
+#endif
+	}
+
+	/*
+	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
+	 * for I/O port accesses.
+	 */
+	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
 
+	/*
+	 * SECONDARY EXEC CONTROLS
+	 */
 	if (cpu_has_secondary_exec_ctrls()) {
 		exec_control = vmx->secondary_exec_control;
 
@@ -12206,43 +12654,214 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	}
 
 	/*
-	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
-	 * entry, but only if the current (host) sp changed from the value
-	 * we wrote last (vmx->host_rsp). This cache is no longer relevant
-	 * if we switch vmcs, and rather than hold a separate cache per vmcs,
-	 * here we just force the write to happen on entry.
+	 * ENTRY CONTROLS
+	 *
+	 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
+	 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
+	 * on the related bits (if supported by the CPU) in the hope that
+	 * we can avoid VMWrites during vmx_set_efer().
+	 */
+	exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) &
+			~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
+	if (cpu_has_load_ia32_efer) {
+		if (guest_efer & EFER_LMA)
+			exec_control |= VM_ENTRY_IA32E_MODE;
+		if (guest_efer != host_efer)
+			exec_control |= VM_ENTRY_LOAD_IA32_EFER;
+	}
+	vm_entry_controls_init(vmx, exec_control);
+
+	/*
+	 * EXIT CONTROLS
+	 *
+	 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
+	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
+	 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
 	 */
-	vmx->host_rsp = 0;
+	exec_control = vmcs_config.vmexit_ctrl;
+	if (cpu_has_load_ia32_efer && guest_efer != host_efer)
+		exec_control |= VM_EXIT_LOAD_IA32_EFER;
+	vm_exit_controls_init(vmx, exec_control);
 
-	exec_control = vmx_exec_control(vmx); /* L0's desires */
-	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
-	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
-	exec_control &= ~CPU_BASED_TPR_SHADOW;
-	exec_control |= vmcs12->cpu_based_vm_exec_control;
+	/*
+	 * Conceptually we want to copy the PML address and index from
+	 * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
+	 * since we always flush the log on each vmexit and never change
+	 * the PML address (once set), this happens to be equivalent to
+	 * simply resetting the index in vmcs02.
+	 */
+	if (enable_pml)
+		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 
 	/*
-	 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
-	 * nested_get_vmcs12_pages can't fix it up, the illegal value
-	 * will result in a VM entry failure.
+	 * Interrupt/Exception Fields
 	 */
-	if (exec_control & CPU_BASED_TPR_SHADOW) {
-		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
-		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+	if (vmx->nested.nested_run_pending) {
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+			     vmcs12->vm_entry_intr_info_field);
+		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+			     vmcs12->vm_entry_exception_error_code);
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+			     vmcs12->vm_entry_instruction_len);
+		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+			     vmcs12->guest_interruptibility_info);
+		vmx->loaded_vmcs->nmi_known_unmasked =
+			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
 	} else {
 	} else {
-#ifdef CONFIG_X86_64
-		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
-				CPU_BASED_CR8_STORE_EXITING;
-#endif
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+	}
+}
+
+static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+{
+	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
+
+	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
+			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
+		vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
+		vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
+		vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
+		vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
+		vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
+		vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
+		vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
+		vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
+		vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
+		vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
+		vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
+		vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
+		vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
+		vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
+		vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
+		vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
+		vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
+		vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
+		vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
+		vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
+		vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
+		vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
+		vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
+		vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
+		vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
+		vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
+		vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
+		vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
+		vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
+		vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
+		vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
+		vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
+		vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
+		vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+	}
+
+	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
+			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
+		vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
+		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+			    vmcs12->guest_pending_dbg_exceptions);
+		vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
+		vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+
+		/*
+		 * L1 may access the L2's PDPTR, so save them to construct
+		 * vmcs12
+		 */
+		if (enable_ept) {
+			vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+			vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+			vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+			vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+		}
+	}
+
+	if (nested_cpu_has_xsaves(vmcs12))
+		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
+
+	/*
+	 * Whether page-faults are trapped is determined by a combination of
+	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
+	 * If enable_ept, L0 doesn't care about page faults and we should
+	 * set all of these to L1's desires. However, if !enable_ept, L0 does
+	 * care about (at least some) page faults, and because it is not easy
+	 * (if at all possible?) to merge L0 and L1's desires, we simply ask
+	 * to exit on each and every L2 page fault. This is done by setting
+	 * MASK=MATCH=0 and (see below) EB.PF=1.
+	 * Note that below we don't need special code to set EB.PF beyond the
+	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
+	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
+	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+	 */
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+		enable_ept ? vmcs12->page_fault_error_code_mask : 0);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+		enable_ept ? vmcs12->page_fault_error_code_match : 0);
+
+	if (cpu_has_vmx_apicv()) {
+		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
+		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
+		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
+		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
+	}
+
+	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+
+	set_cr4_guest_host_mask(vmx);
+
+	if (kvm_mpx_supported()) {
+		if (vmx->nested.nested_run_pending &&
+			(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+		else
+			vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+	}
+}
+
+/*
+ * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
+ * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
+ * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
+ * guest in a way that will both be appropriate to L1's requests, and our
+ * needs. In addition to modifying the active vmcs (which is vmcs02), this
+ * function also has additional necessary side-effects, like setting various
+ * vcpu->arch fields.
+ * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
+ */
+static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+			  u32 *entry_failure_code)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
+
+	if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) {
+		prepare_vmcs02_full(vmx, vmcs12);
+		vmx->nested.dirty_vmcs12 = false;
 	}
 
 	/*
-	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
-	 * for I/O port accesses.
+	 * First, the fields that are shadowed.  This must be kept in sync
+	 * with vmx_shadow_fields.h.
 	 */
-	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
-	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
+			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
+		vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+		vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
+	}
 
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+	if (vmx->nested.nested_run_pending &&
+	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+	} else {
+		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
+	}
+	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
+
+	vmx->nested.preemption_timer_expired = false;
+	if (nested_cpu_has_preemption_timer(vmcs12))
+		vmx_start_preemption_timer(vcpu);
 
 	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
 	 * bitwise-or of what L1 wants to trap for L2, and what we want to
@@ -12252,20 +12871,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 
-	/* L2->L1 exit controls are emulated - the hardware exit is to L0 so
-	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
-	 * bits are further modified by vmx_set_efer() below.
-	 */
-	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
-
-	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
-	 * emulated by vmx_set_efer(), below.
-	 */
-	vm_entry_controls_init(vmx, 
-		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
-			~VM_ENTRY_IA32E_MODE) |
-		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
-
 	if (vmx->nested.nested_run_pending &&
 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
 		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
@@ -12288,37 +12893,29 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		 * influence the global bitmap (for vpid01 and vpid02 allocation)
 		 * even if we spawn a lot of nested vCPUs.
 		 */
-		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
+		if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
 			if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
 				vmx->nested.last_vpid = vmcs12->virtual_processor_id;
-				__vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+				__vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
 			}
 		} else {
-			vmx_flush_tlb(vcpu, true);
+			/*
+			 * If L1 uses EPT, then L0 needs to execute INVEPT on
+			 * EPTP02 instead of EPTP01. Therefore, delay TLB
+			 * flush until vmcs02->eptp is fully updated by
+			 * KVM_REQ_LOAD_CR3. Note that this assumes
+			 * KVM_REQ_TLB_FLUSH is evaluated after
+			 * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
+			 */
+			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 		}
 	}
 
-	if (enable_pml) {
-		/*
-		 * Conceptually we want to copy the PML address and index from
-		 * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
-		 * since we always flush the log on each vmexit, this happens
-		 * to be equivalent to simply resetting the fields in vmcs02.
-		 */
-		ASSERT(vmx->pml_pg);
-		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
-		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
-	}
-
-	if (nested_cpu_has_ept(vmcs12)) {
-		if (nested_ept_init_mmu_context(vcpu)) {
-			*entry_failure_code = ENTRY_FAIL_DEFAULT;
-			return 1;
-		}
-	} else if (nested_cpu_has2(vmcs12,
-				   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+	if (nested_cpu_has_ept(vmcs12))
+		nested_ept_init_mmu_context(vcpu);
+	else if (nested_cpu_has2(vmcs12,
+				 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
 		vmx_flush_tlb(vcpu, true);
-	}
 
 	/*
 	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
@@ -12334,14 +12931,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
 	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
 
-	if (vmx->nested.nested_run_pending &&
-	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
-		vcpu->arch.efer = vmcs12->guest_ia32_efer;
-	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
-		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
-	else
-		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
-	/* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
+	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
+	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
 	vmx_set_efer(vcpu, vcpu->arch.efer);
 
 	/*
@@ -12383,6 +12974,7 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
 static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	bool ia32e;
 
 	if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
 	    vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
@@ -12456,6 +13048,21 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	    !nested_cr3_valid(vcpu, vmcs12->host_cr3))
 		return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
 
+	/*
+	 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
+	 * IA32_EFER MSR must be 0 in the field for that register. In addition,
+	 * the values of the LMA and LME bits in the field must each be that of
+	 * the host address-space size VM-exit control.
+	 */
+	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
+		ia32e = (vmcs12->vm_exit_controls &
+			 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
+		if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
+		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
+		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
+			return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
+	}
+
 	/*
 	 * From the Intel SDM, volume 3:
 	 * Fields relevant to VM-entry event injection must be set properly.
@@ -12512,6 +13119,10 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		}
 	}
 
+	if (nested_cpu_has_ept(vmcs12) &&
+	    !valid_ept_address(vcpu, vmcs12->ept_pointer))
+		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
 	return 0;
 }
 
@@ -12532,94 +13143,192 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
 	if (is_error_page(page))
 		return -EINVAL;
 
-	r = 0;
-	shadow = kmap(page);
-	if (shadow->hdr.revision_id != VMCS12_REVISION ||
-	    shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
-		r = -EINVAL;
-	kunmap(page);
-	kvm_release_page_clean(page);
-	return r;
-}
+	r = 0;
+	shadow = kmap(page);
+	if (shadow->hdr.revision_id != VMCS12_REVISION ||
+	    shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
+		r = -EINVAL;
+	kunmap(page);
+	kvm_release_page_clean(page);
+	return r;
+}
+
+static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+				  u32 *exit_qual)
+{
+	bool ia32e;
+
+	*exit_qual = ENTRY_FAIL_DEFAULT;
+
+	if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
+	    !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
+		return 1;
+
+	if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
+		*exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
+		return 1;
+	}
+
+	/*
+	 * If the load IA32_EFER VM-entry control is 1, the following checks
+	 * are performed on the field for the IA32_EFER MSR:
+	 * - Bits reserved in the IA32_EFER MSR must be 0.
+	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
+	 *   the IA-32e mode guest VM-exit control. It must also be identical
+	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
+	 *   CR0.PG) is 1.
+	 */
+	if (to_vmx(vcpu)->nested.nested_run_pending &&
+	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
+		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
+		if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
+		    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
+		    ((vmcs12->guest_cr0 & X86_CR0_PG) &&
+		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
+			return 1;
+	}
+
+	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
+		(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
+		(vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
+			return 1;
+
+	return 0;
+}
+
+static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long cr3, cr4;
+
+	if (!nested_early_check)
+		return 0;
+
+	if (vmx->msr_autoload.host.nr)
+		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+	if (vmx->msr_autoload.guest.nr)
+		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+
+	preempt_disable();
+
+	vmx_prepare_switch_to_guest(vcpu);
+
+	/*
+	 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
+	 * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
+	 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
+	 * there is no need to preserve other bits or save/restore the field.
+	 */
+	vmcs_writel(GUEST_RFLAGS, 0);
+
+	vmcs_writel(HOST_RIP, vmx_early_consistency_check_return);
+
+	cr3 = __get_current_cr3_fast();
+	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+		vmcs_writel(HOST_CR3, cr3);
+		vmx->loaded_vmcs->host_state.cr3 = cr3;
+	}
+
+	cr4 = cr4_read_shadow();
+	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
+		vmcs_writel(HOST_CR4, cr4);
+		vmx->loaded_vmcs->host_state.cr4 = cr4;
+	}
+
+	vmx->__launched = vmx->loaded_vmcs->launched;
+
+	asm(
+		/* Set HOST_RSP */
+		__ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
+		"mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t"
+
+		/* Check if vmlaunch or vmresume is needed */
+		"cmpl $0, %c[launched](%0)\n\t"
+		"je 1f\n\t"
+		__ex("vmresume") "\n\t"
+		"jmp 2f\n\t"
+		"1: " __ex("vmlaunch") "\n\t"
+		"jmp 2f\n\t"
+		"2: "
+
+		/* Set vmx->fail accordingly */
+		"setbe %c[fail](%0)\n\t"
+
+		".pushsection .rodata\n\t"
+		".global vmx_early_consistency_check_return\n\t"
+		"vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t"
+		".popsection"
+	      :
+	      : "c"(vmx), "d"((unsigned long)HOST_RSP),
+		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
+		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
+		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp))
+	      : "rax", "cc", "memory"
+	);
 
-static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-				  u32 *exit_qual)
-{
-	bool ia32e;
+	vmcs_writel(HOST_RIP, vmx_return);
 
-	*exit_qual = ENTRY_FAIL_DEFAULT;
+	preempt_enable();
 
-	if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
-	    !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
-		return 1;
+	if (vmx->msr_autoload.host.nr)
+		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+	if (vmx->msr_autoload.guest.nr)
+		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
 
-	if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
-		*exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
+	if (vmx->fail) {
+		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
+			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+		vmx->fail = 0;
 		return 1;
 	}
 
 
 	/*
-	 * If the load IA32_EFER VM-entry control is 1, the following checks
-	 * are performed on the field for the IA32_EFER MSR:
-	 * - Bits reserved in the IA32_EFER MSR must be 0.
-	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
-	 *   the IA-32e mode guest VM-exit control. It must also be identical
-	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
-	 *   CR0.PG) is 1.
+	 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
 	 */
 	 */
-	if (to_vmx(vcpu)->nested.nested_run_pending &&
-	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
-		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
-		if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
-		    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
-		    ((vmcs12->guest_cr0 & X86_CR0_PG) &&
-		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
-			return 1;
-	}
+	local_irq_enable();
+	if (hw_breakpoint_active())
+		set_debugreg(__this_cpu_read(cpu_dr7), 7);
 
 
 	/*
-	 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
-	 * IA32_EFER MSR must be 0 in the field for that register. In addition,
-	 * the values of the LMA and LME bits in the field must each be that of
-	 * the host address-space size VM-exit control.
+	 * A non-failing VMEntry means we somehow entered guest mode with
+	 * an illegal RIP, and that's just the tip of the iceberg.  There
+	 * is no telling what memory has been modified or what state has
+	 * been exposed to unknown code.  Hitting this all but guarantees
+	 * a (very critical) hardware issue.
 	 */
 	 */
-	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
-		ia32e = (vmcs12->vm_exit_controls &
-			 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
-		if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
-		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
-		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
-			return 1;
-	}
-
-	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
-		(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
-		(vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
-			return 1;
+	WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
+		VMX_EXIT_REASONS_FAILED_VMENTRY));
 
 	return 0;
 }
+STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
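
One note on the inline asm above: "setbe %c[fail]" records the architectural VMX failure convention, where a failed VMLAUNCH/VMRESUME reports VMfailInvalid via CF and VMfailValid via ZF. A hedged sketch of that decoding, separate from the patch (the constants are the standard RFLAGS bits):

#include <stdbool.h>

#define X86_EFLAGS_CF 0x0001UL	/* VMfailInvalid */
#define X86_EFLAGS_ZF 0x0040UL	/* VMfailValid, error code in VM_INSTRUCTION_ERROR */

/* "setbe" is true when CF or ZF is set, i.e. on either flavor of VMfail. */
static bool vmx_instruction_failed(unsigned long rflags)
{
	return rflags & (X86_EFLAGS_CF | X86_EFLAGS_ZF);
}
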
+
+static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+				   struct vmcs12 *vmcs12);
 
 /*
- * If exit_qual is NULL, this is being called from state restore (either RSM
+ * If from_vmentry is false, this is being called from state restore (either RSM
  * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
++ *
++ * Returns:
++ *   0 - success, i.e. proceed with actual VMEnter
++ *   1 - consistency check VMExit
++ *  -1 - consistency check VMFail
  */
  */
-static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
+static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+					  bool from_vmentry)
 {
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	bool from_vmentry = !!exit_qual;
-	u32 dummy_exit_qual;
 	bool evaluate_pending_interrupts;
 	bool evaluate_pending_interrupts;
-	int r = 0;
+	u32 exit_reason = EXIT_REASON_INVALID_STATE;
+	u32 exit_qual;
 
 
 	evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
 	evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
 		(CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
 		(CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
 	if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
 	if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
 		evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
 		evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
 
 
-	enter_guest_mode(vcpu);
-
 	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
 	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
 		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
 		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
 	if (kvm_mpx_supported() &&
 	if (kvm_mpx_supported() &&
@@ -12627,24 +13336,35 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
 		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 
 
 	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
 	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
-	vmx_segment_cache_clear(vmx);
 
 
+	prepare_vmcs02_early(vmx, vmcs12);
+
+	if (from_vmentry) {
+		nested_get_vmcs12_pages(vcpu);
+
+		if (nested_vmx_check_vmentry_hw(vcpu)) {
+			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+			return -1;
+		}
+
+		if (check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
+			goto vmentry_fail_vmexit;
+	}
+
+	enter_guest_mode(vcpu);
 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
 		vcpu->arch.tsc_offset += vmcs12->tsc_offset;
 		vcpu->arch.tsc_offset += vmcs12->tsc_offset;
 
 
-	r = EXIT_REASON_INVALID_STATE;
-	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? exit_qual : &dummy_exit_qual))
-		goto fail;
+	if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
+		goto vmentry_fail_vmexit_guest_mode;
 
 
 	if (from_vmentry) {
 	if (from_vmentry) {
-		nested_get_vmcs12_pages(vcpu);
-
-		r = EXIT_REASON_MSR_LOAD_FAIL;
-		*exit_qual = nested_vmx_load_msr(vcpu,
-	     					 vmcs12->vm_entry_msr_load_addr,
-					      	 vmcs12->vm_entry_msr_load_count);
-		if (*exit_qual)
-			goto fail;
+		exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
+		exit_qual = nested_vmx_load_msr(vcpu,
+						vmcs12->vm_entry_msr_load_addr,
+						vmcs12->vm_entry_msr_load_count);
+		if (exit_qual)
+			goto vmentry_fail_vmexit_guest_mode;
 	} else {
 	} else {
 		/*
 		/*
 		 * The MMU is not initialized to point at the right entities yet and
 		 * The MMU is not initialized to point at the right entities yet and
@@ -12681,12 +13401,28 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
 	 */
 	 */
 	return 0;
 	return 0;
 
 
-fail:
+	/*
+	 * A failed consistency check that leads to a VMExit during L1's
+	 * VMEnter to L2 is a variation of a normal VMexit, as explained in
+	 * 26.7 "VM-entry failures during or after loading guest state".
+	 */
+vmentry_fail_vmexit_guest_mode:
 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
 		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
 		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
 	leave_guest_mode(vcpu);
 	leave_guest_mode(vcpu);
+
+vmentry_fail_vmexit:
 	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
 	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-	return r;
+
+	if (!from_vmentry)
+		return 1;
+
+	load_vmcs12_host_state(vcpu, vmcs12);
+	vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
+	vmcs12->exit_qualification = exit_qual;
+	if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
+		vmx->nested.need_vmcs12_sync = true;
+	return 1;
 }
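
For orientation, the vm_exit_reason written on the vmentry_fail_vmexit path above carries the basic exit reason in its low bits and VMX_EXIT_REASONS_FAILED_VMENTRY in bit 31; a sketch of how L1 would pick such a value apart (bit layout per the SDM, helper name illustrative):

#include <stdbool.h>
#include <stdint.h>

#define VMX_EXIT_REASONS_FAILED_VMENTRY_SKETCH 0x80000000u	/* bit 31 */

static void decode_exit_reason(uint32_t exit_reason, uint16_t *basic, bool *entry_failed)
{
	*basic = exit_reason & 0xffff;	/* e.g. invalid guest state or MSR-load failure */
	*entry_failed = exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY_SKETCH;
}
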
 
 /*
@@ -12698,14 +13434,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	struct vmcs12 *vmcs12;
 	struct vmcs12 *vmcs12;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
 	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
-	u32 exit_qual;
 	int ret;
 	int ret;
 
 
 	if (!nested_vmx_check_permission(vcpu))
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 		return 1;
 
 
-	if (!nested_vmx_check_vmcs12(vcpu))
-		goto out;
+	if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true))
+		return 1;
+
+	if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
+		return nested_vmx_failInvalid(vcpu);
 
 
 	vmcs12 = get_vmcs12(vcpu);
 
@@ -12715,13 +13453,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	 * rather than RFLAGS.ZF, and no error number is stored to the
 	 * VM-instruction error field.
 	 */
-	if (vmcs12->hdr.shadow_vmcs) {
-		nested_vmx_failInvalid(vcpu);
-		goto out;
-	}
+	if (vmcs12->hdr.shadow_vmcs)
+		return nested_vmx_failInvalid(vcpu);
 
 
-	if (enable_shadow_vmcs)
+	if (vmx->nested.hv_evmcs) {
+		copy_enlightened_to_vmcs12(vmx);
+		/* Enlightened VMCS doesn't have launch state */
+		vmcs12->launch_state = !launch;
+	} else if (enable_shadow_vmcs) {
 		copy_shadow_to_vmcs12(vmx);
 		copy_shadow_to_vmcs12(vmx);
+	}
 
 
 	/*
 	 * The nested entry process starts with enforcing various prerequisites
@@ -12733,59 +13474,37 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	 * for misconfigurations which will anyway be caught by the processor
 	 * for misconfigurations which will anyway be caught by the processor
 	 * when using the merged vmcs02.
 	 * when using the merged vmcs02.
 	 */
 	 */
-	if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) {
-		nested_vmx_failValid(vcpu,
-				     VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
-		goto out;
-	}
+	if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
+		return nested_vmx_failValid(vcpu,
+			VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
 
 
-	if (vmcs12->launch_state == launch) {
-		nested_vmx_failValid(vcpu,
+	if (vmcs12->launch_state == launch)
+		return nested_vmx_failValid(vcpu,
 			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
 			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
 			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
 			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
-		goto out;
-	}
 
 
 	ret = check_vmentry_prereqs(vcpu, vmcs12);
 	ret = check_vmentry_prereqs(vcpu, vmcs12);
-	if (ret) {
-		nested_vmx_failValid(vcpu, ret);
-		goto out;
-	}
-
-	/*
-	 * After this point, the trap flag no longer triggers a singlestep trap
-	 * on the vm entry instructions; don't call kvm_skip_emulated_instruction.
-	 * This is not 100% correct; for performance reasons, we delegate most
-	 * of the checks on host state to the processor.  If those fail,
-	 * the singlestep trap is missed.
-	 */
-	skip_emulated_instruction(vcpu);
-
-	ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual);
-	if (ret) {
-		nested_vmx_entry_failure(vcpu, vmcs12,
-					 EXIT_REASON_INVALID_STATE, exit_qual);
-		return 1;
-	}
+	if (ret)
+		return nested_vmx_failValid(vcpu, ret);
 
 
 	/*
 	/*
 	 * We're finally done with prerequisite checking, and can start with
 	 * We're finally done with prerequisite checking, and can start with
 	 * the nested entry.
 	 * the nested entry.
 	 */
 	 */
-
 	vmx->nested.nested_run_pending = 1;
 	vmx->nested.nested_run_pending = 1;
-	ret = enter_vmx_non_root_mode(vcpu, &exit_qual);
-	if (ret) {
-		nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual);
-		vmx->nested.nested_run_pending = 0;
+	ret = nested_vmx_enter_non_root_mode(vcpu, true);
+	vmx->nested.nested_run_pending = !ret;
+	if (ret > 0)
 		return 1;
 		return 1;
-	}
+	else if (ret)
+		return nested_vmx_failValid(vcpu,
+			VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 
 
 	/* Hide L1D cache contents from the nested guest.  */
 	/* Hide L1D cache contents from the nested guest.  */
 	vmx->vcpu.arch.l1tf_flush_l1d = true;
 	vmx->vcpu.arch.l1tf_flush_l1d = true;
 
 
 	/*
 	/*
-	 * Must happen outside of enter_vmx_non_root_mode() as it will
+	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
 	 * also be used as part of restoring nVMX state for
 	 * also be used as part of restoring nVMX state for
 	 * snapshot restore (migration).
 	 * snapshot restore (migration).
 	 *
 	 *
@@ -12806,9 +13525,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		return kvm_vcpu_halt(vcpu);
 		return kvm_vcpu_halt(vcpu);
 	}
 	}
 	return 1;
 	return 1;
-
-out:
-	return kvm_skip_emulated_instruction(vcpu);
 }
 }
 
 
 /*
 /*
@@ -13122,24 +13838,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	kvm_clear_interrupt_queue(vcpu);
 	kvm_clear_interrupt_queue(vcpu);
 }
 }
 
 
-static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu,
-			struct vmcs12 *vmcs12)
-{
-	u32 entry_failure_code;
-
-	nested_ept_uninit_mmu_context(vcpu);
-
-	/*
-	 * Only PDPTE load can fail as the value of cr3 was checked on entry and
-	 * couldn't have changed.
-	 */
-	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
-		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
-
-	if (!enable_ept)
-		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-}
-
 /*
 /*
  * A part of what we need to when the nested L2 guest exits and we want to
  * A part of what we need to when the nested L2 guest exits and we want to
  * run its L1 parent, is to reset L1's guest state to the host state specified
  * run its L1 parent, is to reset L1's guest state to the host state specified
@@ -13153,6 +13851,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 				   struct vmcs12 *vmcs12)
 				   struct vmcs12 *vmcs12)
 {
 {
 	struct kvm_segment seg;
 	struct kvm_segment seg;
+	u32 entry_failure_code;
 
 
 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
 		vcpu->arch.efer = vmcs12->host_ia32_efer;
 		vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -13165,6 +13864,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
 	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
 	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
 	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
 	vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
 	vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
+	vmx_set_interrupt_shadow(vcpu, 0);
+
 	/*
 	/*
 	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
 	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
 	 * actually changed, because vmx_set_cr0 refers to efer set above.
 	 * actually changed, because vmx_set_cr0 refers to efer set above.
@@ -13179,23 +13880,35 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
 	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
 	vmx_set_cr4(vcpu, vmcs12->host_cr4);
 	vmx_set_cr4(vcpu, vmcs12->host_cr4);
 
 
-	load_vmcs12_mmu_host_state(vcpu, vmcs12);
+	nested_ept_uninit_mmu_context(vcpu);
+
+	/*
+	 * Only PDPTE load can fail as the value of cr3 was checked on entry and
+	 * couldn't have changed.
+	 */
+	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
+		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
+
+	if (!enable_ept)
+		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
 
 
 	/*
 	/*
-	 * If vmcs01 don't use VPID, CPU flushes TLB on every
+	 * If vmcs01 doesn't use VPID, CPU flushes TLB on every
 	 * VMEntry/VMExit. Thus, no need to flush TLB.
 	 * VMEntry/VMExit. Thus, no need to flush TLB.
 	 *
 	 *
-	 * If vmcs12 uses VPID, TLB entries populated by L2 are
-	 * tagged with vmx->nested.vpid02 while L1 entries are tagged
-	 * with vmx->vpid. Thus, no need to flush TLB.
+	 * If vmcs12 doesn't use VPID, L1 expects TLB to be
+	 * flushed on every VMEntry/VMExit.
 	 *
 	 *
-	 * Therefore, flush TLB only in case vmcs01 uses VPID and
-	 * vmcs12 don't use VPID as in this case L1 & L2 TLB entries
-	 * are both tagged with vmx->vpid.
+	 * Otherwise, we can preserve TLB entries as long as we are
+	 * able to tag L1 TLB entries differently than L2 TLB entries.
+	 *
+	 * If vmcs12 uses EPT, we need to execute this flush on EPTP01
+	 * and therefore we request the TLB flush to happen only after VMCS EPTP
+	 * has been set by KVM_REQ_LOAD_CR3.
 	 */
 	 */
 	if (enable_vpid &&
 	if (enable_vpid &&
-	    !(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02)) {
-		vmx_flush_tlb(vcpu, true);
+	    (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
+		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 	}
 	}
 
 
 	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
 	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
@@ -13275,6 +13988,140 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
 		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
 }
 }
 
 
+static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
+{
+	struct shared_msr_entry *efer_msr;
+	unsigned int i;
+
+	if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
+		return vmcs_read64(GUEST_IA32_EFER);
+
+	if (cpu_has_load_ia32_efer)
+		return host_efer;
+
+	for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
+		if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
+			return vmx->msr_autoload.guest.val[i].value;
+	}
+
+	efer_msr = find_msr_entry(vmx, MSR_EFER);
+	if (efer_msr)
+		return efer_msr->data;
+
+	return host_efer;
+}
+
+static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmx_msr_entry g, h;
+	struct msr_data msr;
+	gpa_t gpa;
+	u32 i, j;
+
+	vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
+
+	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+		/*
+		 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
+		 * as vmcs01.GUEST_DR7 contains a userspace defined value
+		 * and vcpu->arch.dr7 is not squirreled away before the
+		 * nested VMENTER (not worth adding a variable in nested_vmx).
+		 */
+		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+			kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+		else
+			WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
+	}
+
+	/*
+	 * Note that calling vmx_set_{efer,cr0,cr4} is important as they
+	 * handle a variety of side effects to KVM's software model.
+	 */
+	vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
+
+	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+	vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
+
+	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+	vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
+
+	nested_ept_uninit_mmu_context(vcpu);
+	vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+	/*
+	 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
+	 * from vmcs01 (if necessary).  The PDPTRs are not loaded on
+	 * VMFail, like everything else we just need to ensure our
+	 * software model is up-to-date.
+	 */
+	ept_save_pdptrs(vcpu);
+
+	kvm_mmu_reset_context(vcpu);
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmx_update_msr_bitmap(vcpu);
+
+	/*
+	 * This nasty bit of open coding is a compromise between blindly
+	 * loading L1's MSRs using the exit load lists (incorrect emulation
+	 * of VMFail), leaving the nested VM's MSRs in the software model
+	 * (incorrect behavior) and snapshotting the modified MSRs (too
+	 * expensive since the lists are unbound by hardware).  For each
+	 * MSR that was (prematurely) loaded from the nested VMEntry load
+	 * list, reload it from the exit load list if it exists and differs
+	 * from the guest value.  The intent is to stuff host state as
+	 * silently as possible, not to fully process the exit load list.
+	 */
+	msr.host_initiated = false;
+	for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
+		gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
+		if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
+			pr_debug_ratelimited(
+				"%s read MSR index failed (%u, 0x%08llx)\n",
+				__func__, i, gpa);
+			goto vmabort;
+		}
+
+		for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
+			gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
+			if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
+				pr_debug_ratelimited(
+					"%s read MSR failed (%u, 0x%08llx)\n",
+					__func__, j, gpa);
+				goto vmabort;
+			}
+			if (h.index != g.index)
+				continue;
+			if (h.value == g.value)
+				break;
+
+			if (nested_vmx_load_msr_check(vcpu, &h)) {
+				pr_debug_ratelimited(
+					"%s check failed (%u, 0x%x, 0x%x)\n",
+					__func__, j, h.index, h.reserved);
+				goto vmabort;
+			}
+
+			msr.index = h.index;
+			msr.data = h.value;
+			if (kvm_set_msr(vcpu, &msr)) {
+				pr_debug_ratelimited(
+					"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
+					__func__, j, h.index, h.value);
+				goto vmabort;
+			}
+		}
+	}
+
+	return;
+
+vmabort:
+	nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+}
+
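
The open-coded loop above indexes the VM-entry and VM-exit MSR lists directly in guest memory; each slot is a 16-byte record. A sketch of the layout being walked (mirrors the kernel's vmx_msr_entry, shown here only for orientation):

#include <stdint.h>

struct vmx_msr_entry_sketch {
	uint32_t index;		/* MSR number, e.g. 0xC0000080 for IA32_EFER */
	uint32_t reserved;	/* must be zero */
	uint64_t value;		/* value to load, or slot the MSR is stored into */
};

/* Guest-physical address of the i-th list entry, as computed in the loop above. */
static uint64_t msr_entry_gpa(uint64_t list_base, uint32_t i)
{
	return list_base + i * sizeof(struct vmx_msr_entry_sketch);
}
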
 /*
  * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
  * and modify vmcs12 to make it see what it would expect to see there if
@@ -13290,14 +14137,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	/* trying to cancel vmlaunch/vmresume is a bug */
 	/* trying to cancel vmlaunch/vmresume is a bug */
 	WARN_ON_ONCE(vmx->nested.nested_run_pending);
 	WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
 
-	/*
-	 * The only expected VM-instruction error is "VM entry with
-	 * invalid control field(s)." Anything else indicates a
-	 * problem with L0.
-	 */
-	WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) !=
-				   VMXERR_ENTRY_INVALID_CONTROL_FIELD));
-
 	leave_guest_mode(vcpu);
 	leave_guest_mode(vcpu);
 
 
 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
@@ -13324,12 +14163,19 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 		if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
 		if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
 					 vmcs12->vm_exit_msr_store_count))
 					 vmcs12->vm_exit_msr_store_count))
 			nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
 			nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+	} else {
+		/*
+		 * The only expected VM-instruction error is "VM entry with
+		 * invalid control field(s)." Anything else indicates a
+		 * problem with L0.  And we should never get here with a
+		 * VMFail of any type if early consistency checks are enabled.
+		 */
+		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
+			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+		WARN_ON_ONCE(nested_early_check);
 	}
 	}
 
 
 	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
 	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-	vm_entry_controls_reset_shadow(vmx);
-	vm_exit_controls_reset_shadow(vmx);
-	vmx_segment_cache_clear(vmx);
 
 
 	/* Update any VMCS fields that might have changed while L2 ran */
 	/* Update any VMCS fields that might have changed while L2 ran */
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
@@ -13373,8 +14219,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	 */
 	 */
 	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 
 
-	if (enable_shadow_vmcs && exit_reason != -1)
-		vmx->nested.sync_shadow_vmcs = true;
+	if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
+		vmx->nested.need_vmcs12_sync = true;
 
 
 	/* in case we halted in L2 */
 	/* in case we halted in L2 */
 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -13409,24 +14255,24 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
 
 		return;
 		return;
 	}
 	}
-	
+
 	/*
 	/*
 	 * After an early L2 VM-entry failure, we're now back
 	 * After an early L2 VM-entry failure, we're now back
 	 * in L1 which thinks it just finished a VMLAUNCH or
 	 * in L1 which thinks it just finished a VMLAUNCH or
 	 * VMRESUME instruction, so we need to set the failure
 	 * VMRESUME instruction, so we need to set the failure
 	 * flag and the VM-instruction error field of the VMCS
 	 * flag and the VM-instruction error field of the VMCS
-	 * accordingly.
+	 * accordingly, and skip the emulated instruction.
 	 */
 	 */
-	nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-
-	load_vmcs12_mmu_host_state(vcpu, vmcs12);
+	(void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 
 
 	/*
 	/*
-	 * The emulated instruction was already skipped in
-	 * nested_vmx_run, but the updated RIP was never
-	 * written back to the vmcs01.
+	 * Restore L1's host state to KVM's software model.  We're here
+	 * because a consistency check was caught by hardware, which
+	 * means some amount of guest state has been propagated to KVM's
+	 * model and needs to be unwound to the host's state.
 	 */
 	 */
-	skip_emulated_instruction(vcpu);
+	nested_vmx_restore_host_state(vcpu);
+
 	vmx->fail = 0;
 	vmx->fail = 0;
 }
 }
 
 
@@ -13439,26 +14285,7 @@ static void vmx_leave_nested(struct kvm_vcpu *vcpu)
 		to_vmx(vcpu)->nested.nested_run_pending = 0;
 		to_vmx(vcpu)->nested.nested_run_pending = 0;
 		nested_vmx_vmexit(vcpu, -1, 0, 0);
 		nested_vmx_vmexit(vcpu, -1, 0, 0);
 	}
 	}
-	free_nested(to_vmx(vcpu));
-}
-
-/*
- * L1's failure to enter L2 is a subset of a normal exit, as explained in
- * 23.7 "VM-entry failures during or after loading guest state" (this also
- * lists the acceptable exit-reason and exit-qualification parameters).
- * It should only be called before L2 actually succeeded to run, and when
- * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss).
- */
-static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
-			struct vmcs12 *vmcs12,
-			u32 reason, unsigned long qualification)
-{
-	load_vmcs12_host_state(vcpu, vmcs12);
-	vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
-	vmcs12->exit_qualification = qualification;
-	nested_vmx_succeed(vcpu);
-	if (enable_shadow_vmcs)
-		to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
+	free_nested(vcpu);
 }
 }
 
 
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
@@ -13884,7 +14711,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
 
 
 	if (vmx->nested.smm.guest_mode) {
 	if (vmx->nested.smm.guest_mode) {
 		vcpu->arch.hflags &= ~HF_SMM_MASK;
 		vcpu->arch.hflags &= ~HF_SMM_MASK;
-		ret = enter_vmx_non_root_mode(vcpu, NULL);
+		ret = nested_vmx_enter_non_root_mode(vcpu, false);
 		vcpu->arch.hflags |= HF_SMM_MASK;
 		vcpu->arch.hflags |= HF_SMM_MASK;
 		if (ret)
 		if (ret)
 			return ret;
 			return ret;
@@ -13899,6 +14726,20 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
 	return 0;
 	return 0;
 }
 }
 
 
+static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/*
+	 * In case we do two consecutive get/set_nested_state()s while L2 was
+	 * running hv_evmcs may end up not being mapped (we map it from
+	 * nested_vmx_run()/vmx_vcpu_run()). Check is_guest_mode() as we always
+	 * have vmcs12 if it is true.
+	 */
+	return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull ||
+		vmx->nested.hv_evmcs;
+}
+
 static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
 static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
 				struct kvm_nested_state __user *user_kvm_nested_state,
 				struct kvm_nested_state __user *user_kvm_nested_state,
 				u32 user_data_size)
 				u32 user_data_size)
@@ -13918,12 +14759,16 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
 
 
 	vmx = to_vmx(vcpu);
 	vmx = to_vmx(vcpu);
 	vmcs12 = get_vmcs12(vcpu);
 	vmcs12 = get_vmcs12(vcpu);
+
+	if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled)
+		kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
+
 	if (nested_vmx_allowed(vcpu) &&
 	if (nested_vmx_allowed(vcpu) &&
 	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
 	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
 		kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
 		kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
 		kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
 		kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
 
 
-		if (vmx->nested.current_vmptr != -1ull) {
+		if (vmx_has_valid_vmcs12(vcpu)) {
 			kvm_state.size += VMCS12_SIZE;
 			kvm_state.size += VMCS12_SIZE;
 
 
 			if (is_guest_mode(vcpu) &&
 			if (is_guest_mode(vcpu) &&
@@ -13952,20 +14797,24 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
 	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
 	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
 		return -EFAULT;
 		return -EFAULT;
 
 
-	if (vmx->nested.current_vmptr == -1ull)
+	if (!vmx_has_valid_vmcs12(vcpu))
 		goto out;
 		goto out;
 
 
 	/*
 	/*
 	 * When running L2, the authoritative vmcs12 state is in the
 	 * When running L2, the authoritative vmcs12 state is in the
 	 * vmcs02. When running L1, the authoritative vmcs12 state is
 	 * vmcs02. When running L1, the authoritative vmcs12 state is
-	 * in the shadow vmcs linked to vmcs01, unless
-	 * sync_shadow_vmcs is set, in which case, the authoritative
+	 * in the shadow or enlightened vmcs linked to vmcs01, unless
+	 * need_vmcs12_sync is set, in which case, the authoritative
 	 * vmcs12 state is in the vmcs12 already.
 	 * vmcs12 state is in the vmcs12 already.
 	 */
 	 */
-	if (is_guest_mode(vcpu))
+	if (is_guest_mode(vcpu)) {
 		sync_vmcs12(vcpu, vmcs12);
 		sync_vmcs12(vcpu, vmcs12);
-	else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
-		copy_shadow_to_vmcs12(vmx);
+	} else if (!vmx->nested.need_vmcs12_sync) {
+		if (vmx->nested.hv_evmcs)
+			copy_enlightened_to_vmcs12(vmx);
+		else if (enable_shadow_vmcs)
+			copy_shadow_to_vmcs12(vmx);
+	}
 
 
 	if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12)))
 	if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12)))
 		return -EFAULT;
 		return -EFAULT;
@@ -13993,6 +14842,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
 	if (kvm_state->format != 0)
 	if (kvm_state->format != 0)
 		return -EINVAL;
 		return -EINVAL;
 
 
+	if (kvm_state->flags & KVM_STATE_NESTED_EVMCS)
+		nested_enable_evmcs(vcpu, NULL);
+
 	if (!nested_vmx_allowed(vcpu))
 	if (!nested_vmx_allowed(vcpu))
 		return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
 		return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
 
 
@@ -14010,13 +14862,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
 	if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
 	if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
 		return -EINVAL;
 		return -EINVAL;
 
 
-	if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
-		return -EINVAL;
-
-	if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
-	    !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
-		return -EINVAL;
-
 	if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
 	if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
 	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
 	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
 		return -EINVAL;
 		return -EINVAL;
@@ -14046,7 +14891,25 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
 	if (ret)
 	if (ret)
 		return ret;
 		return ret;
 
 
-	set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
+	/* Empty 'VMXON' state is permitted */
+	if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
+		return 0;
+
+	if (kvm_state->vmx.vmcs_pa != -1ull) {
+		if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
+		    !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
+			return -EINVAL;
+
+		set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
+	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
+		/*
+		 * Sync eVMCS upon entry as we may not have
+		 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
+		 */
+		vmx->nested.need_vmcs12_sync = true;
+	} else {
+		return -EINVAL;
+	}
 
 
 	if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
 	if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
 		vmx->nested.smm.vmxon = true;
 		vmx->nested.smm.vmxon = true;
@@ -14090,7 +14953,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
 		return -EINVAL;
 		return -EINVAL;
 
 
 	vmx->nested.dirty_vmcs12 = true;
 	vmx->nested.dirty_vmcs12 = true;
-	ret = enter_vmx_non_root_mode(vcpu, NULL);
+	ret = nested_vmx_enter_non_root_mode(vcpu, false);
 	if (ret)
 	if (ret)
 		return -EINVAL;
 		return -EINVAL;
 
 
@@ -14242,6 +15105,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.pre_enter_smm = vmx_pre_enter_smm,
 	.pre_enter_smm = vmx_pre_enter_smm,
 	.pre_leave_smm = vmx_pre_leave_smm,
 	.pre_leave_smm = vmx_pre_leave_smm,
 	.enable_smi_window = enable_smi_window,
 	.enable_smi_window = enable_smi_window,
+
+	.nested_enable_evmcs = nested_enable_evmcs,
 };
 };
 
 
 static void vmx_cleanup_l1d_flush(void)
 static void vmx_cleanup_l1d_flush(void)

+ 1 - 4
arch/x86/kvm/vmx_shadow_fields.h

@@ -28,7 +28,6 @@
  */
  */
 
 
 /* 16-bits */
 /* 16-bits */
-SHADOW_FIELD_RW(GUEST_CS_SELECTOR)
 SHADOW_FIELD_RW(GUEST_INTR_STATUS)
 SHADOW_FIELD_RW(GUEST_INTR_STATUS)
 SHADOW_FIELD_RW(GUEST_PML_INDEX)
 SHADOW_FIELD_RW(GUEST_PML_INDEX)
 SHADOW_FIELD_RW(HOST_FS_SELECTOR)
 SHADOW_FIELD_RW(HOST_FS_SELECTOR)
@@ -47,8 +46,8 @@ SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE)
 SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD)
 SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD)
 SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN)
 SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN)
 SHADOW_FIELD_RW(TPR_THRESHOLD)
 SHADOW_FIELD_RW(TPR_THRESHOLD)
-SHADOW_FIELD_RW(GUEST_CS_LIMIT)
 SHADOW_FIELD_RW(GUEST_CS_AR_BYTES)
 SHADOW_FIELD_RW(GUEST_CS_AR_BYTES)
+SHADOW_FIELD_RW(GUEST_SS_AR_BYTES)
 SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO)
 SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO)
 SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE)
 SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE)
 
 
@@ -61,8 +60,6 @@ SHADOW_FIELD_RW(GUEST_CR0)
 SHADOW_FIELD_RW(GUEST_CR3)
 SHADOW_FIELD_RW(GUEST_CR3)
 SHADOW_FIELD_RW(GUEST_CR4)
 SHADOW_FIELD_RW(GUEST_CR4)
 SHADOW_FIELD_RW(GUEST_RFLAGS)
 SHADOW_FIELD_RW(GUEST_RFLAGS)
-SHADOW_FIELD_RW(GUEST_CS_BASE)
-SHADOW_FIELD_RW(GUEST_ES_BASE)
 SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK)
 SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK)
 SHADOW_FIELD_RW(CR0_READ_SHADOW)
 SHADOW_FIELD_RW(CR0_READ_SHADOW)
 SHADOW_FIELD_RW(CR4_READ_SHADOW)
 SHADOW_FIELD_RW(CR4_READ_SHADOW)

+ 194 - 50
arch/x86/kvm/x86.c

@@ -136,7 +136,7 @@ static u32 __read_mostly tsc_tolerance_ppm = 250;
 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 
 /* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int __read_mostly lapic_timer_advance_ns = 0;
+unsigned int __read_mostly lapic_timer_advance_ns = 1000;
 module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
 EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
 
@@ -400,9 +400,51 @@ static int exception_type(int vector)
 	return EXCPT_FAULT;
 }
 
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
+{
+	unsigned nr = vcpu->arch.exception.nr;
+	bool has_payload = vcpu->arch.exception.has_payload;
+	unsigned long payload = vcpu->arch.exception.payload;
+
+	if (!has_payload)
+		return;
+
+	switch (nr) {
+	case DB_VECTOR:
+		/*
+		 * "Certain debug exceptions may clear bit 0-3.  The
+		 * remaining contents of the DR6 register are never
+		 * cleared by the processor".
+		 */
+		vcpu->arch.dr6 &= ~DR_TRAP_BITS;
+		/*
+		 * DR6.RTM is set by all #DB exceptions that don't clear it.
+		 */
+		vcpu->arch.dr6 |= DR6_RTM;
+		vcpu->arch.dr6 |= payload;
+		/*
+		 * Bit 16 should be set in the payload whenever the #DB
+		 * exception should clear DR6.RTM. This makes the payload
+		 * compatible with the pending debug exceptions under VMX.
+		 * Though not currently documented in the SDM, this also
+		 * makes the payload compatible with the exit qualification
+		 * for #DB exceptions under VMX.
+		 */
+		vcpu->arch.dr6 ^= payload & DR6_RTM;
+		break;
+	case PF_VECTOR:
+		vcpu->arch.cr2 = payload;
+		break;
+	}
+
+	vcpu->arch.exception.has_payload = false;
+	vcpu->arch.exception.payload = 0;
+}
+EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
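
Payload delivery is deliberately opt-in: userspace must enable the per-VM capability before KVM_GET/SET_VCPU_EVENTS exposes the new pending/payload fields. A minimal sketch of that step, assuming an already-created VM fd (error handling omitted):

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int enable_exception_payload(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_EXCEPTION_PAYLOAD,
	};

	/* Per-VM capability, so the ioctl goes to the VM fd rather than a vCPU fd. */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
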
+
 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 		unsigned nr, bool has_error, u32 error_code,
-		bool reinject)
+	        bool has_payload, unsigned long payload, bool reinject)
 {
 	u32 prev_nr;
 	int class1, class2;
@@ -424,6 +466,14 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 			 */
 			WARN_ON_ONCE(vcpu->arch.exception.pending);
 			vcpu->arch.exception.injected = true;
+			if (WARN_ON_ONCE(has_payload)) {
+				/*
+				 * A reinjected event has already
+				 * delivered its payload.
+				 */
+				has_payload = false;
+				payload = 0;
+			}
 		} else {
 			vcpu->arch.exception.pending = true;
 			vcpu->arch.exception.injected = false;
@@ -431,6 +481,22 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 		vcpu->arch.exception.has_error_code = has_error;
 		vcpu->arch.exception.nr = nr;
 		vcpu->arch.exception.error_code = error_code;
+		vcpu->arch.exception.has_payload = has_payload;
+		vcpu->arch.exception.payload = payload;
+		/*
+		 * In guest mode, payload delivery should be deferred,
+		 * so that the L1 hypervisor can intercept #PF before
+		 * CR2 is modified (or intercept #DB before DR6 is
+		 * modified under nVMX).  However, for ABI
+		 * compatibility with KVM_GET_VCPU_EVENTS and
+		 * KVM_SET_VCPU_EVENTS, we can't delay payload
+		 * delivery unless userspace has enabled this
+		 * functionality via the per-VM capability,
+		 * KVM_CAP_EXCEPTION_PAYLOAD.
+		 */
+		if (!vcpu->kvm->arch.exception_payload_enabled ||
+		    !is_guest_mode(vcpu))
+			kvm_deliver_exception_payload(vcpu);
 		return;
 	}
 
@@ -455,6 +521,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 		vcpu->arch.exception.has_error_code = true;
 		vcpu->arch.exception.nr = DF_VECTOR;
 		vcpu->arch.exception.error_code = 0;
+		vcpu->arch.exception.has_payload = false;
+		vcpu->arch.exception.payload = 0;
 	} else
 	} else
 		/* replace previous exception with a new one in a hope
 		/* replace previous exception with a new one in a hope
 		   that instruction re-execution will regenerate lost
 		   that instruction re-execution will regenerate lost
@@ -464,16 +532,29 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 {
 {
-	kvm_multiple_exception(vcpu, nr, false, 0, false);
+	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
 }
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception);
 EXPORT_SYMBOL_GPL(kvm_queue_exception);
 
 
 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 {
 {
-	kvm_multiple_exception(vcpu, nr, false, 0, true);
+	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
 }
 }
 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 
 
+static void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
+				  unsigned long payload)
+{
+	kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
+}
+
+static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
+				    u32 error_code, unsigned long payload)
+{
+	kvm_multiple_exception(vcpu, nr, true, error_code,
+			       true, payload, false);
+}
+
 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 {
 {
 	if (err)
 	if (err)
@@ -490,11 +571,13 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 	++vcpu->stat.pf_guest;
 	++vcpu->stat.pf_guest;
 	vcpu->arch.exception.nested_apf =
 	vcpu->arch.exception.nested_apf =
 		is_guest_mode(vcpu) && fault->async_page_fault;
 		is_guest_mode(vcpu) && fault->async_page_fault;
-	if (vcpu->arch.exception.nested_apf)
+	if (vcpu->arch.exception.nested_apf) {
 		vcpu->arch.apf.nested_apf_token = fault->address;
 		vcpu->arch.apf.nested_apf_token = fault->address;
-	else
-		vcpu->arch.cr2 = fault->address;
-	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
+		kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
+	} else {
+		kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
+					fault->address);
+	}
 }
 }
 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 
 
@@ -503,7 +586,7 @@ static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fau
 	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
 	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
 		vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
 		vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
 	else
 	else
-		vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+		vcpu->arch.mmu->inject_page_fault(vcpu, fault);
 
 
 	return fault->nested_page_fault;
 	return fault->nested_page_fault;
 }
 }
@@ -517,13 +600,13 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 
 
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 {
 {
-	kvm_multiple_exception(vcpu, nr, true, error_code, false);
+	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
 }
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 
 
 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 {
 {
-	kvm_multiple_exception(vcpu, nr, true, error_code, true);
+	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
 }
 }
 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
 
 
@@ -602,7 +685,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 		if ((pdpte[i] & PT_PRESENT_MASK) &&
 		if ((pdpte[i] & PT_PRESENT_MASK) &&
 		    (pdpte[i] &
 		    (pdpte[i] &
-		     vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
+		     vcpu->arch.mmu->guest_rsvd_check.rsvd_bits_mask[0][2])) {
 			ret = 0;
 			ret = 0;
 			goto out;
 			goto out;
 		}
 		}
@@ -2477,7 +2560,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
 
 		break;
 		break;
 	case MSR_KVM_PV_EOI_EN:
 	case MSR_KVM_PV_EOI_EN:
-		if (kvm_lapic_enable_pv_eoi(vcpu, data))
+		if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
 			return 1;
 			return 1;
 		break;
 		break;
 
 
@@ -2912,6 +2995,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_HYPERV_VP_INDEX:
 	case KVM_CAP_HYPERV_VP_INDEX:
 	case KVM_CAP_HYPERV_EVENTFD:
 	case KVM_CAP_HYPERV_EVENTFD:
 	case KVM_CAP_HYPERV_TLBFLUSH:
 	case KVM_CAP_HYPERV_TLBFLUSH:
+	case KVM_CAP_HYPERV_SEND_IPI:
+	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
 	case KVM_CAP_PCI_SEGMENT:
 	case KVM_CAP_PCI_SEGMENT:
 	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -2930,6 +3015,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_IMMEDIATE_EXIT:
 	case KVM_CAP_IMMEDIATE_EXIT:
 	case KVM_CAP_GET_MSR_FEATURES:
 	case KVM_CAP_GET_MSR_FEATURES:
 	case KVM_CAP_MSR_PLATFORM_INFO:
 	case KVM_CAP_MSR_PLATFORM_INFO:
+	case KVM_CAP_EXCEPTION_PAYLOAD:
 		r = 1;
 		r = 1;
 		break;
 		break;
 	case KVM_CAP_SYNC_REGS:
 	case KVM_CAP_SYNC_REGS:
@@ -3362,19 +3448,33 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 					       struct kvm_vcpu_events *events)
 {
 	process_nmi(vcpu);
+
 	/*
-	 * FIXME: pass injected and pending separately.  This is only
-	 * needed for nested virtualization, whose state cannot be
-	 * migrated yet.  For now we can combine them.
+	 * The API doesn't provide the instruction length for software
+	 * exceptions, so don't report them. As long as the guest RIP
+	 * isn't advanced, we should expect to encounter the exception
+	 * again.
 	 */
-	events->exception.injected =
-		(vcpu->arch.exception.pending ||
-		 vcpu->arch.exception.injected) &&
-		!kvm_exception_is_soft(vcpu->arch.exception.nr);
+	if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
+		events->exception.injected = 0;
+		events->exception.pending = 0;
+	} else {
+		events->exception.injected = vcpu->arch.exception.injected;
+		events->exception.pending = vcpu->arch.exception.pending;
+		/*
+		 * For ABI compatibility, deliberately conflate
+		 * pending and injected exceptions when
+		 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
+		 */
+		if (!vcpu->kvm->arch.exception_payload_enabled)
+			events->exception.injected |=
+				vcpu->arch.exception.pending;
+	}
 	events->exception.nr = vcpu->arch.exception.nr;
 	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
-	events->exception.pad = 0;
 	events->exception.error_code = vcpu->arch.exception.error_code;
+	events->exception_has_payload = vcpu->arch.exception.has_payload;
+	events->exception_payload = vcpu->arch.exception.payload;
 
 	events->interrupt.injected =
 		vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
@@ -3398,6 +3498,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
 			 | KVM_VCPUEVENT_VALID_SHADOW
 			 | KVM_VCPUEVENT_VALID_SMM);
+	if (vcpu->kvm->arch.exception_payload_enabled)
+		events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
+
 	memset(&events->reserved, 0, sizeof(events->reserved));
 }
 
@@ -3409,12 +3512,24 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
 			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR
 			      | KVM_VCPUEVENT_VALID_SHADOW
-			      | KVM_VCPUEVENT_VALID_SMM))
+			      | KVM_VCPUEVENT_VALID_SMM
+			      | KVM_VCPUEVENT_VALID_PAYLOAD))
 		return -EINVAL;
 
-	if (events->exception.injected &&
-	    (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
-	     is_guest_mode(vcpu)))
+	if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
+		if (!vcpu->kvm->arch.exception_payload_enabled)
+			return -EINVAL;
+		if (events->exception.pending)
+			events->exception.injected = 0;
+		else
+			events->exception_has_payload = 0;
+	} else {
+		events->exception.pending = 0;
+		events->exception_has_payload = 0;
+	}
+
+	if ((events->exception.injected || events->exception.pending) &&
+	    (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
 		return -EINVAL;
 
 	/* INITs are latched while in SMM */
@@ -3424,11 +3539,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 		return -EINVAL;
 
 	process_nmi(vcpu);
-	vcpu->arch.exception.injected = false;
-	vcpu->arch.exception.pending = events->exception.injected;
+	vcpu->arch.exception.injected = events->exception.injected;
+	vcpu->arch.exception.pending = events->exception.pending;
 	vcpu->arch.exception.nr = events->exception.nr;
 	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
 	vcpu->arch.exception.error_code = events->exception.error_code;
+	vcpu->arch.exception.has_payload = events->exception_has_payload;
+	vcpu->arch.exception.payload = events->exception_payload;
 
 	vcpu->arch.interrupt.injected = events->interrupt.injected;
 	vcpu->arch.interrupt.nr = events->interrupt.nr;
@@ -3694,6 +3811,10 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 				     struct kvm_enable_cap *cap)
 {
+	int r;
+	uint16_t vmcs_version;
+	void __user *user_ptr;
+
 	if (cap->flags)
 		return -EINVAL;
 
@@ -3706,6 +3827,16 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 			return -EINVAL;
 		return kvm_hv_activate_synic(vcpu, cap->cap ==
 					     KVM_CAP_HYPERV_SYNIC2);
+	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
+		r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version);
+		if (!r) {
+			user_ptr = (void __user *)(uintptr_t)cap->args[0];
+			if (copy_to_user(user_ptr, &vmcs_version,
+					 sizeof(vmcs_version)))
+				r = -EFAULT;
+		}
+		return r;
+
 	default:
 		return -EINVAL;
 	}
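
The hunk above is the userspace-visible side of the Enlightened VMCS enablement: the per-vCPU KVM_ENABLE_CAP handler reports the supported eVMCS version back through the pointer passed in cap->args[0]. A minimal sketch of how a VMM might consume this, assuming vcpu_fd comes from the usual KVM_CREATE_VM/KVM_CREATE_VCPU sequence (the helper below is illustrative, not part of the patch):

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: enable Enlightened VMCS on one vCPU and print the
 * eVMCS version the kernel writes back through args[0].
 */
static int enable_evmcs(int vcpu_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS };
	uint16_t vmcs_version = 0;

	cap.args[0] = (unsigned long)&vmcs_version;
	if (ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap))
		return -1;

	printf("supported eVMCS version: 0x%x\n", vmcs_version);
	return 0;
}
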
@@ -4047,11 +4178,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 			break;
 
 		if (kvm_state.flags &
-		    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
+		    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
+		      | KVM_STATE_NESTED_EVMCS))
 			break;
 
 		/* nested_run_pending implies guest_mode.  */
-		if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
+		if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
+		    && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
 			break;
 
 		r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
@@ -4363,6 +4496,10 @@ split_irqchip_unlock:
 		kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
 		r = 0;
 		break;
+	case KVM_CAP_EXCEPTION_PAYLOAD:
+		kvm->arch.exception_payload_enabled = cap->args[0];
+		r = 0;
+		break;
 	default:
 		r = -EINVAL;
 		break;
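
KVM_CAP_EXCEPTION_PAYLOAD is a VM-wide opt-in enabled through the hunk above; once set, KVM_GET/SET_VCPU_EVENTS report pending and injected exceptions separately and carry the exception payload (the would-be CR2 for #PF, the DR6 bits for #DB). A hedged userspace sketch, assuming vm_fd and vcpu_fd already exist and error handling is trimmed:

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: enable the capability on the VM, then inspect a vCPU's
 * exception state. With the cap enabled, KVM_VCPUEVENT_VALID_PAYLOAD is set
 * in events.flags and exception_payload/exception_has_payload are valid.
 */
static void dump_exception_payload(int vm_fd, int vcpu_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_EXCEPTION_PAYLOAD,
		.args[0] = 1,
	};
	struct kvm_vcpu_events events;

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
		return;
	if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events))
		return;

	if ((events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) &&
	    events.exception_has_payload)
		printf("pending=%u injected=%u vector=%u payload=0x%llx\n",
		       events.exception.pending, events.exception.injected,
		       events.exception.nr,
		       (unsigned long long)events.exception_payload);
}
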
@@ -4803,7 +4940,7 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
 
 	/* NPT walks are always user-walks */
 	access |= PFERR_USER_MASK;
-	t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception);
+	t_gpa  = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
 
 	return t_gpa;
 }
@@ -5889,7 +6026,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
 	if (WARN_ON_ONCE(is_guest_mode(vcpu)))
 		return false;
 
-	if (!vcpu->arch.mmu.direct_map) {
+	if (!vcpu->arch.mmu->direct_map) {
 		/*
 		 * Write permission should be allowed since only
 		 * write access need to be emulated.
@@ -5922,7 +6059,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
 	kvm_release_pfn_clean(pfn);
 
 	/* The instructions are well-emulated on direct mmu. */
-	if (vcpu->arch.mmu.direct_map) {
+	if (vcpu->arch.mmu->direct_map) {
 		unsigned int indirect_shadow_pages;
 
 		spin_lock(&vcpu->kvm->mmu_lock);
@@ -5989,7 +6126,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
 	vcpu->arch.last_retry_eip = ctxt->eip;
 	vcpu->arch.last_retry_addr = cr2;
 
-	if (!vcpu->arch.mmu.direct_map)
+	if (!vcpu->arch.mmu->direct_map)
 		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
 
 	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
@@ -6049,14 +6186,7 @@ static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
 		kvm_run->exit_reason = KVM_EXIT_DEBUG;
 		*r = EMULATE_USER_EXIT;
 	} else {
-		/*
-		 * "Certain debug exceptions may clear bit 0-3.  The
-		 * remaining contents of the DR6 register are never
-		 * cleared by the processor".
-		 */
-		vcpu->arch.dr6 &= ~15;
-		vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
-		kvm_queue_exception(vcpu, DB_VECTOR);
+		kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
 	}
 }
 
@@ -6995,10 +7125,22 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 			__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
 					     X86_EFLAGS_RF);
 
-		if (vcpu->arch.exception.nr == DB_VECTOR &&
-		    (vcpu->arch.dr7 & DR7_GD)) {
-			vcpu->arch.dr7 &= ~DR7_GD;
-			kvm_update_dr7(vcpu);
+		if (vcpu->arch.exception.nr == DB_VECTOR) {
+			/*
+			 * This code assumes that nSVM doesn't use
+			 * check_nested_events(). If it does, the
+			 * DR6/DR7 changes should happen before L1
+			 * gets a #VMEXIT for an intercepted #DB in
+			 * L2.  (Under VMX, on the other hand, the
+			 * DR6/DR7 changes should not happen in the
+			 * event of a VM-exit to L1 for an intercepted
+			 * #DB in L2.)
+			 */
+			kvm_deliver_exception_payload(vcpu);
+			if (vcpu->arch.dr7 & DR7_GD) {
+				vcpu->arch.dr7 &= ~DR7_GD;
+				kvm_update_dr7(vcpu);
+			}
 		}
 
 		kvm_x86_ops->queue_exception(vcpu);
@@ -8478,7 +8620,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	kvm_vcpu_mtrr_init(vcpu);
 	vcpu_load(vcpu);
 	kvm_vcpu_reset(vcpu, false);
-	kvm_mmu_setup(vcpu);
+	kvm_init_mmu(vcpu, false);
 	vcpu_put(vcpu);
 	return 0;
 }
@@ -9327,7 +9469,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 {
 	int r;
 
-	if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
+	if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
 	      work->wakeup_all)
 		return;
 
@@ -9335,11 +9477,11 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 	if (unlikely(r))
 		return;
 
-	if (!vcpu->arch.mmu.direct_map &&
-	      work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
+	if (!vcpu->arch.mmu->direct_map &&
+	      work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
 		return;
 
-	vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
+	vcpu->arch.mmu->page_fault(vcpu, work->gva, 0, true);
 }
 
 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
@@ -9463,6 +9605,8 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 			vcpu->arch.exception.nr = 0;
 			vcpu->arch.exception.has_error_code = false;
 			vcpu->arch.exception.error_code = 0;
+			vcpu->arch.exception.has_payload = false;
+			vcpu->arch.exception.payload = 0;
 		} else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
 			fault.vector = PF_VECTOR;
 			fault.error_code_valid = true;

+ 2 - 0
arch/x86/kvm/x86.h

@@ -266,6 +266,8 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu,
 
 int handle_ud(struct kvm_vcpu *vcpu);
 
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu);
+
 void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
 u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
 bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);

+ 8 - 0
drivers/iommu/Kconfig

@@ -372,6 +372,14 @@ config S390_CCW_IOMMU
 	  Enables bits of IOMMU API required by VFIO. The iommu_ops
 	  is not implemented as it is not necessary for VFIO.
 
+config S390_AP_IOMMU
+	bool "S390 AP IOMMU Support"
+	depends on S390 && ZCRYPT
+	select IOMMU_API
+	help
+	  Enables bits of IOMMU API required by VFIO. The iommu_ops
+	  is not implemented as it is not necessary for VFIO.
+
 config MTK_IOMMU
 	bool "MTK IOMMU Support"
 	depends on ARM || ARM64

+ 4 - 0
drivers/s390/crypto/Makefile

@@ -15,3 +15,7 @@ obj-$(CONFIG_ZCRYPT) += zcrypt_cex2c.o zcrypt_cex2a.o zcrypt_cex4.o
 # pkey kernel module
 pkey-objs := pkey_api.o
 obj-$(CONFIG_PKEY) += pkey.o
+
+# adjunct processor matrix
+vfio_ap-objs := vfio_ap_drv.o vfio_ap_ops.o
+obj-$(CONFIG_VFIO_AP) += vfio_ap.o

+ 157 - 0
drivers/s390/crypto/vfio_ap_drv.c

@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * VFIO based AP device driver
+ *
+ * Copyright IBM Corp. 2018
+ *
+ * Author(s): Tony Krowiak <akrowiak@linux.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include "vfio_ap_private.h"
+
+#define VFIO_AP_ROOT_NAME "vfio_ap"
+#define VFIO_AP_DEV_TYPE_NAME "ap_matrix"
+#define VFIO_AP_DEV_NAME "matrix"
+
+MODULE_AUTHOR("IBM Corporation");
+MODULE_DESCRIPTION("VFIO AP device driver, Copyright IBM Corp. 2018");
+MODULE_LICENSE("GPL v2");
+
+static struct ap_driver vfio_ap_drv;
+
+static struct device_type vfio_ap_dev_type = {
+	.name = VFIO_AP_DEV_TYPE_NAME,
+};
+
+struct ap_matrix_dev *matrix_dev;
+
+/* Only type 10 adapters (CEX4 and later) are supported
+ * by the AP matrix device driver
+ */
+static struct ap_device_id ap_queue_ids[] = {
+	{ .dev_type = AP_DEVICE_TYPE_CEX4,
+	  .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE },
+	{ .dev_type = AP_DEVICE_TYPE_CEX5,
+	  .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE },
+	{ .dev_type = AP_DEVICE_TYPE_CEX6,
+	  .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE },
+	{ /* end of sibling */ },
+};
+
+MODULE_DEVICE_TABLE(vfio_ap, ap_queue_ids);
+
+static int vfio_ap_queue_dev_probe(struct ap_device *apdev)
+{
+	return 0;
+}
+
+static void vfio_ap_queue_dev_remove(struct ap_device *apdev)
+{
+	/* Nothing to do yet */
+}
+
+static void vfio_ap_matrix_dev_release(struct device *dev)
+{
+	struct ap_matrix_dev *matrix_dev = dev_get_drvdata(dev);
+
+	kfree(matrix_dev);
+}
+
+static int vfio_ap_matrix_dev_create(void)
+{
+	int ret;
+	struct device *root_device;
+
+	root_device = root_device_register(VFIO_AP_ROOT_NAME);
+	if (IS_ERR(root_device))
+		return PTR_ERR(root_device);
+
+	matrix_dev = kzalloc(sizeof(*matrix_dev), GFP_KERNEL);
+	if (!matrix_dev) {
+		ret = -ENOMEM;
+		goto matrix_alloc_err;
+	}
+
+	/* Fill in config info via PQAP(QCI), if available */
+	if (test_facility(12)) {
+		ret = ap_qci(&matrix_dev->info);
+		if (ret)
+			goto matrix_alloc_err;
+	}
+
+	mutex_init(&matrix_dev->lock);
+	INIT_LIST_HEAD(&matrix_dev->mdev_list);
+
+	matrix_dev->device.type = &vfio_ap_dev_type;
+	dev_set_name(&matrix_dev->device, "%s", VFIO_AP_DEV_NAME);
+	matrix_dev->device.parent = root_device;
+	matrix_dev->device.release = vfio_ap_matrix_dev_release;
+	matrix_dev->device.driver = &vfio_ap_drv.driver;
+
+	ret = device_register(&matrix_dev->device);
+	if (ret)
+		goto matrix_reg_err;
+
+	return 0;
+
+matrix_reg_err:
+	put_device(&matrix_dev->device);
+matrix_alloc_err:
+	root_device_unregister(root_device);
+
+	return ret;
+}
+
+static void vfio_ap_matrix_dev_destroy(void)
+{
+	device_unregister(&matrix_dev->device);
+	root_device_unregister(matrix_dev->device.parent);
+}
+
+static int __init vfio_ap_init(void)
+{
+	int ret;
+
+	/* If there are no AP instructions, there is nothing to pass through. */
+	if (!ap_instructions_available())
+		return -ENODEV;
+
+	ret = vfio_ap_matrix_dev_create();
+	if (ret)
+		return ret;
+
+	memset(&vfio_ap_drv, 0, sizeof(vfio_ap_drv));
+	vfio_ap_drv.probe = vfio_ap_queue_dev_probe;
+	vfio_ap_drv.remove = vfio_ap_queue_dev_remove;
+	vfio_ap_drv.ids = ap_queue_ids;
+
+	ret = ap_driver_register(&vfio_ap_drv, THIS_MODULE, VFIO_AP_DRV_NAME);
+	if (ret) {
+		vfio_ap_matrix_dev_destroy();
+		return ret;
+	}
+
+	ret = vfio_ap_mdev_register();
+	if (ret) {
+		ap_driver_unregister(&vfio_ap_drv);
+		vfio_ap_matrix_dev_destroy();
+
+		return ret;
+	}
+
+	return 0;
+}
+
+static void __exit vfio_ap_exit(void)
+{
+	vfio_ap_mdev_unregister();
+	ap_driver_unregister(&vfio_ap_drv);
+	vfio_ap_matrix_dev_destroy();
+}
+
+module_init(vfio_ap_init);
+module_exit(vfio_ap_exit);

+ 939 - 0
drivers/s390/crypto/vfio_ap_ops.c

@@ -0,0 +1,939 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Adjunct processor matrix VFIO device driver callbacks.
+ *
+ * Copyright IBM Corp. 2018
+ *
+ * Author(s): Tony Krowiak <akrowiak@linux.ibm.com>
+ *	      Halil Pasic <pasic@linux.ibm.com>
+ *	      Pierre Morel <pmorel@linux.ibm.com>
+ */
+#include <linux/string.h>
+#include <linux/vfio.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <linux/bitops.h>
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <asm/kvm.h>
+#include <asm/zcrypt.h>
+
+#include "vfio_ap_private.h"
+
+#define VFIO_AP_MDEV_TYPE_HWVIRT "passthrough"
+#define VFIO_AP_MDEV_NAME_HWVIRT "VFIO AP Passthrough Device"
+
+static void vfio_ap_matrix_init(struct ap_config_info *info,
+				struct ap_matrix *matrix)
+{
+	matrix->apm_max = info->apxa ? info->Na : 63;
+	matrix->aqm_max = info->apxa ? info->Nd : 15;
+	matrix->adm_max = info->apxa ? info->Nd : 15;
+}
+
+static int vfio_ap_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
+{
+	struct ap_matrix_mdev *matrix_mdev;
+
+	if ((atomic_dec_if_positive(&matrix_dev->available_instances) < 0))
+		return -EPERM;
+
+	matrix_mdev = kzalloc(sizeof(*matrix_mdev), GFP_KERNEL);
+	if (!matrix_mdev) {
+		atomic_inc(&matrix_dev->available_instances);
+		return -ENOMEM;
+	}
+
+	vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->matrix);
+	mdev_set_drvdata(mdev, matrix_mdev);
+	mutex_lock(&matrix_dev->lock);
+	list_add(&matrix_mdev->node, &matrix_dev->mdev_list);
+	mutex_unlock(&matrix_dev->lock);
+
+	return 0;
+}
+
+static int vfio_ap_mdev_remove(struct mdev_device *mdev)
+{
+	struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+
+	if (matrix_mdev->kvm)
+		return -EBUSY;
+
+	mutex_lock(&matrix_dev->lock);
+	list_del(&matrix_mdev->node);
+	mutex_unlock(&matrix_dev->lock);
+
+	kfree(matrix_mdev);
+	mdev_set_drvdata(mdev, NULL);
+	atomic_inc(&matrix_dev->available_instances);
+
+	return 0;
+}
+
+static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+	return sprintf(buf, "%s\n", VFIO_AP_MDEV_NAME_HWVIRT);
+}
+
+static MDEV_TYPE_ATTR_RO(name);
+
+static ssize_t available_instances_show(struct kobject *kobj,
+					struct device *dev, char *buf)
+{
+	return sprintf(buf, "%d\n",
+		       atomic_read(&matrix_dev->available_instances));
+}
+
+static MDEV_TYPE_ATTR_RO(available_instances);
+
+static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
+			       char *buf)
+{
+	return sprintf(buf, "%s\n", VFIO_DEVICE_API_AP_STRING);
+}
+
+static MDEV_TYPE_ATTR_RO(device_api);
+
+static struct attribute *vfio_ap_mdev_type_attrs[] = {
+	&mdev_type_attr_name.attr,
+	&mdev_type_attr_device_api.attr,
+	&mdev_type_attr_available_instances.attr,
+	NULL,
+};
+
+static struct attribute_group vfio_ap_mdev_hwvirt_type_group = {
+	.name = VFIO_AP_MDEV_TYPE_HWVIRT,
+	.attrs = vfio_ap_mdev_type_attrs,
+};
+
+static struct attribute_group *vfio_ap_mdev_type_groups[] = {
+	&vfio_ap_mdev_hwvirt_type_group,
+	NULL,
+};
+
+struct vfio_ap_queue_reserved {
+	unsigned long *apid;
+	unsigned long *apqi;
+	bool reserved;
+};
+
+/**
+ * vfio_ap_has_queue
+ *
+ * @dev: an AP queue device
+ * @data: a struct vfio_ap_queue_reserved reference
+ *
+ * Flags whether the AP queue device (@dev) has a queue ID containing the APQN,
+ * apid or apqi specified in @data:
+ *
+ * - If @data contains both an apid and apqi value, then @data will be flagged
+ *   as reserved if the APID and APQI fields for the AP queue device matches
+ *
+ * - If @data contains only an apid value, @data will be flagged as
+ *   reserved if the APID field in the AP queue device matches
+ *
+ * - If @data contains only an apqi value, @data will be flagged as
+ *   reserved if the APQI field in the AP queue device matches
+ *
+ * Returns 0 to indicate the input to function succeeded. Returns -EINVAL if
+ * @data does not contain either an apid or apqi.
+ */
+static int vfio_ap_has_queue(struct device *dev, void *data)
+{
+	struct vfio_ap_queue_reserved *qres = data;
+	struct ap_queue *ap_queue = to_ap_queue(dev);
+	ap_qid_t qid;
+	unsigned long id;
+
+	if (qres->apid && qres->apqi) {
+		qid = AP_MKQID(*qres->apid, *qres->apqi);
+		if (qid == ap_queue->qid)
+			qres->reserved = true;
+	} else if (qres->apid && !qres->apqi) {
+		id = AP_QID_CARD(ap_queue->qid);
+		if (id == *qres->apid)
+			qres->reserved = true;
+	} else if (!qres->apid && qres->apqi) {
+		id = AP_QID_QUEUE(ap_queue->qid);
+		if (id == *qres->apqi)
+			qres->reserved = true;
+	} else {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * vfio_ap_verify_queue_reserved
+ *
+ * @matrix_dev: a mediated matrix device
+ * @apid: an AP adapter ID
+ * @apqi: an AP queue index
+ *
+ * Verifies that the AP queue with @apid/@apqi is reserved by the VFIO AP device
+ * driver according to the following rules:
+ *
+ * - If both @apid and @apqi are not NULL, then there must be an AP queue
+ *   device bound to the vfio_ap driver with the APQN identified by @apid and
+ *   @apqi
+ *
+ * - If only @apid is not NULL, then there must be an AP queue device bound
+ *   to the vfio_ap driver with an APQN containing @apid
+ *
+ * - If only @apqi is not NULL, then there must be an AP queue device bound
+ *   to the vfio_ap driver with an APQN containing @apqi
+ *
+ * Returns 0 if the AP queue is reserved; otherwise, returns -EADDRNOTAVAIL.
+ */
+static int vfio_ap_verify_queue_reserved(unsigned long *apid,
+					 unsigned long *apqi)
+{
+	int ret;
+	struct vfio_ap_queue_reserved qres;
+
+	qres.apid = apid;
+	qres.apqi = apqi;
+	qres.reserved = false;
+
+	ret = driver_for_each_device(matrix_dev->device.driver, NULL, &qres,
+				     vfio_ap_has_queue);
+	if (ret)
+		return ret;
+
+	if (qres.reserved)
+		return 0;
+
+	return -EADDRNOTAVAIL;
+}
+
+static int
+vfio_ap_mdev_verify_queues_reserved_for_apid(struct ap_matrix_mdev *matrix_mdev,
+					     unsigned long apid)
+{
+	int ret;
+	unsigned long apqi;
+	unsigned long nbits = matrix_mdev->matrix.aqm_max + 1;
+
+	if (find_first_bit_inv(matrix_mdev->matrix.aqm, nbits) >= nbits)
+		return vfio_ap_verify_queue_reserved(&apid, NULL);
+
+	for_each_set_bit_inv(apqi, matrix_mdev->matrix.aqm, nbits) {
+		ret = vfio_ap_verify_queue_reserved(&apid, &apqi);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * vfio_ap_mdev_verify_no_sharing
+ *
+ * Verifies that the APQNs derived from the cross product of the AP adapter IDs
+ * and AP queue indexes comprising the AP matrix are not configured for another
+ * mediated device. AP queue sharing is not allowed.
+ *
+ * @matrix_mdev: the mediated matrix device
+ *
+ * Returns 0 if the APQNs are not shared, otherwise; returns -EADDRINUSE.
+ */
+static int vfio_ap_mdev_verify_no_sharing(struct ap_matrix_mdev *matrix_mdev)
+{
+	struct ap_matrix_mdev *lstdev;
+	DECLARE_BITMAP(apm, AP_DEVICES);
+	DECLARE_BITMAP(aqm, AP_DOMAINS);
+
+	list_for_each_entry(lstdev, &matrix_dev->mdev_list, node) {
+		if (matrix_mdev == lstdev)
+			continue;
+
+		memset(apm, 0, sizeof(apm));
+		memset(aqm, 0, sizeof(aqm));
+
+		/*
+		 * We work on full longs, as we can only exclude the leftover
+		 * bits in non-inverse order. The leftover is all zeros.
+		 */
+		if (!bitmap_and(apm, matrix_mdev->matrix.apm,
+				lstdev->matrix.apm, AP_DEVICES))
+			continue;
+
+		if (!bitmap_and(aqm, matrix_mdev->matrix.aqm,
+				lstdev->matrix.aqm, AP_DOMAINS))
+			continue;
+
+		return -EADDRINUSE;
+	}
+
+	return 0;
+}
+
+/**
+ * assign_adapter_store
+ *
+ * @dev:	the matrix device
+ * @attr:	the mediated matrix device's assign_adapter attribute
+ * @buf:	a buffer containing the AP adapter number (APID) to
+ *		be assigned
+ * @count:	the number of bytes in @buf
+ *
+ * Parses the APID from @buf and sets the corresponding bit in the mediated
+ * matrix device's APM.
+ *
+ * Returns the number of bytes processed if the APID is valid; otherwise,
+ * returns one of the following errors:
+ *
+ *	1. -EINVAL
+ *	   The APID is not a valid number
+ *
+ *	2. -ENODEV
+ *	   The APID exceeds the maximum value configured for the system
+ *
+ *	3. -EADDRNOTAVAIL
+ *	   An APQN derived from the cross product of the APID being assigned
+ *	   and the APQIs previously assigned is not bound to the vfio_ap device
+ *	   driver; or, if no APQIs have yet been assigned, the APID is not
+ *	   contained in an APQN bound to the vfio_ap device driver.
+ *
+ *	4. -EADDRINUSE
+ *	   An APQN derived from the cross product of the APID being assigned
+ *	   and the APQIs previously assigned is being used by another mediated
+ *	   matrix device
+ */
+static ssize_t assign_adapter_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	int ret;
+	unsigned long apid;
+	struct mdev_device *mdev = mdev_from_dev(dev);
+	struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+
+	/* If the guest is running, disallow assignment of adapter */
+	if (matrix_mdev->kvm)
+		return -EBUSY;
+
+	ret = kstrtoul(buf, 0, &apid);
+	if (ret)
+		return ret;
+
+	if (apid > matrix_mdev->matrix.apm_max)
+		return -ENODEV;
+
+	/*
+	 * Set the bit in the AP mask (APM) corresponding to the AP adapter
+	 * number (APID). The bits in the mask, from most significant to least
+	 * significant bit, correspond to APIDs 0-255.
+	 */
+	mutex_lock(&matrix_dev->lock);
+
+	ret = vfio_ap_mdev_verify_queues_reserved_for_apid(matrix_mdev, apid);
+	if (ret)
+		goto done;
+
+	set_bit_inv(apid, matrix_mdev->matrix.apm);
+
+	ret = vfio_ap_mdev_verify_no_sharing(matrix_mdev);
+	if (ret)
+		goto share_err;
+
+	ret = count;
+	goto done;
+
+share_err:
+	clear_bit_inv(apid, matrix_mdev->matrix.apm);
+done:
+	mutex_unlock(&matrix_dev->lock);
+
+	return ret;
+}
+static DEVICE_ATTR_WO(assign_adapter);
+
+/**
+ * unassign_adapter_store
+ *
+ * @dev:	the matrix device
+ * @attr:	the mediated matrix device's unassign_adapter attribute
+ * @buf:	a buffer containing the adapter number (APID) to be unassigned
+ * @count:	the number of bytes in @buf
+ *
+ * Parses the APID from @buf and clears the corresponding bit in the mediated
+ * matrix device's APM.
+ *
+ * Returns the number of bytes processed if the APID is valid; otherwise,
+ * returns one of the following errors:
+ *	-EINVAL if the APID is not a number
+ *	-ENODEV if the APID exceeds the maximum value configured for the
+ *		system
+ */
+static ssize_t unassign_adapter_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	int ret;
+	unsigned long apid;
+	struct mdev_device *mdev = mdev_from_dev(dev);
+	struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+
+	/* If the guest is running, disallow un-assignment of adapter */
+	if (matrix_mdev->kvm)
+		return -EBUSY;
+
+	ret = kstrtoul(buf, 0, &apid);
+	if (ret)
+		return ret;
+
+	if (apid > matrix_mdev->matrix.apm_max)
+		return -ENODEV;
+
+	mutex_lock(&matrix_dev->lock);
+	clear_bit_inv((unsigned long)apid, matrix_mdev->matrix.apm);
+	mutex_unlock(&matrix_dev->lock);
+
+	return count;
+}
+static DEVICE_ATTR_WO(unassign_adapter);
+
+static int
+vfio_ap_mdev_verify_queues_reserved_for_apqi(struct ap_matrix_mdev *matrix_mdev,
+					     unsigned long apqi)
+{
+	int ret;
+	unsigned long apid;
+	unsigned long nbits = matrix_mdev->matrix.apm_max + 1;
+
+	if (find_first_bit_inv(matrix_mdev->matrix.apm, nbits) >= nbits)
+		return vfio_ap_verify_queue_reserved(NULL, &apqi);
+
+	for_each_set_bit_inv(apid, matrix_mdev->matrix.apm, nbits) {
+		ret = vfio_ap_verify_queue_reserved(&apid, &apqi);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * assign_domain_store
+ *
+ * @dev:	the matrix device
+ * @attr:	the mediated matrix device's assign_domain attribute
+ * @buf:	a buffer containing the AP queue index (APQI) of the domain to
+ *		be assigned
+ * @count:	the number of bytes in @buf
+ *
+ * Parses the APQI from @buf and sets the corresponding bit in the mediated
+ * matrix device's AQM.
+ *
+ * Returns the number of bytes processed if the APQI is valid; otherwise returns
+ * one of the following errors:
+ *
+ *	1. -EINVAL
+ *	   The APQI is not a valid number
+ *
+ *	2. -ENODEV
+ *	   The APQI exceeds the maximum value configured for the system
+ *
+ *	3. -EADDRNOTAVAIL
+ *	   An APQN derived from the cross product of the APQI being assigned
+ *	   and the APIDs previously assigned is not bound to the vfio_ap device
+ *	   driver; or, if no APIDs have yet been assigned, the APQI is not
+ *	   contained in an APQN bound to the vfio_ap device driver.
+ *
+ *	4. -EADDRINUSE
+ *	   An APQN derived from the cross product of the APQI being assigned
+ *	   and the APIDs previously assigned is being used by another mediated
+ *	   matrix device
+ */
+static ssize_t assign_domain_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t count)
+{
+	int ret;
+	unsigned long apqi;
+	struct mdev_device *mdev = mdev_from_dev(dev);
+	struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+	unsigned long max_apqi = matrix_mdev->matrix.aqm_max;
+
+	/* If the guest is running, disallow assignment of domain */
+	if (matrix_mdev->kvm)
+		return -EBUSY;
+
+	ret = kstrtoul(buf, 0, &apqi);
+	if (ret)
+		return ret;
+	if (apqi > max_apqi)
+		return -ENODEV;
+
+	mutex_lock(&matrix_dev->lock);
+
+	ret = vfio_ap_mdev_verify_queues_reserved_for_apqi(matrix_mdev, apqi);
+	if (ret)
+		goto done;
+
+	set_bit_inv(apqi, matrix_mdev->matrix.aqm);
+
+	ret = vfio_ap_mdev_verify_no_sharing(matrix_mdev);
+	if (ret)
+		goto share_err;
+
+	ret = count;
+	goto done;
+
+share_err:
+	clear_bit_inv(apqi, matrix_mdev->matrix.aqm);
+done:
+	mutex_unlock(&matrix_dev->lock);
+
+	return ret;
+}
+static DEVICE_ATTR_WO(assign_domain);
+
+
+/**
+ * unassign_domain_store
+ *
+ * @dev:	the matrix device
+ * @attr:	the mediated matrix device's unassign_domain attribute
+ * @buf:	a buffer containing the AP queue index (APQI) of the domain to
+ *		be unassigned
+ * @count:	the number of bytes in @buf
+ *
+ * Parses the APQI from @buf and clears the corresponding bit in the
+ * mediated matrix device's AQM.
+ *
+ * Returns the number of bytes processed if the APQI is valid; otherwise,
+ * returns one of the following errors:
+ *	-EINVAL if the APQI is not a number
+ *	-ENODEV if the APQI exceeds the maximum value configured for the system
+ */
+static ssize_t unassign_domain_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	int ret;
+	unsigned long apqi;
+	struct mdev_device *mdev = mdev_from_dev(dev);
+	struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+
+	/* If the guest is running, disallow un-assignment of domain */
+	if (matrix_mdev->kvm)
+		return -EBUSY;
+
+	ret = kstrtoul(buf, 0, &apqi);
+	if (ret)
+		return ret;
+
+	if (apqi > matrix_mdev->matrix.aqm_max)
+		return -ENODEV;
+
+	mutex_lock(&matrix_dev->lock);
+	clear_bit_inv((unsigned long)apqi, matrix_mdev->matrix.aqm);
+	mutex_unlock(&matrix_dev->lock);
+
+	return count;
+}
+static DEVICE_ATTR_WO(unassign_domain);
+
+/**
+ * assign_control_domain_store
+ *
+ * @dev:	the matrix device
+ * @attr:	the mediated matrix device's assign_control_domain attribute
+ * @buf:	a buffer containing the domain ID to be assigned
+ * @count:	the number of bytes in @buf
+ *
+ * Parses the domain ID from @buf and sets the corresponding bit in the mediated
+ * matrix device's ADM.
+ *
+ * Returns the number of bytes processed if the domain ID is valid; otherwise,
+ * returns one of the following errors:
+ *	-EINVAL if the ID is not a number
+ *	-ENODEV if the ID exceeds the maximum value configured for the system
+ */
+static ssize_t assign_control_domain_store(struct device *dev,
+					   struct device_attribute *attr,
+					   const char *buf, size_t count)
+{
+	int ret;
+	unsigned long id;
+	struct mdev_device *mdev = mdev_from_dev(dev);
+	struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+
+	/* If the guest is running, disallow assignment of control domain */
+	if (matrix_mdev->kvm)
+		return -EBUSY;
+
+	ret = kstrtoul(buf, 0, &id);
+	if (ret)
+		return ret;
+
+	if (id > matrix_mdev->matrix.adm_max)
+		return -ENODEV;
+
+	/* Set the bit in the ADM (bitmask) corresponding to the AP control
+	 * domain number (id). The bits in the mask, from most significant to
+	 * least significant, correspond to IDs 0 up to the one less than the
+	 * number of control domains that can be assigned.
+	 */
+	mutex_lock(&matrix_dev->lock);
+	set_bit_inv(id, matrix_mdev->matrix.adm);
+	mutex_unlock(&matrix_dev->lock);
+
+	return count;
+}
+static DEVICE_ATTR_WO(assign_control_domain);
+
+/**
+ * unassign_control_domain_store
+ *
+ * @dev:	the matrix device
+ * @attr:	the mediated matrix device's unassign_control_domain attribute
+ * @buf:	a buffer containing the domain ID to be unassigned
+ * @count:	the number of bytes in @buf
+ *
+ * Parses the domain ID from @buf and clears the corresponding bit in the
+ * mediated matrix device's ADM.
+ *
+ * Returns the number of bytes processed if the domain ID is valid; otherwise,
+ * returns one of the following errors:
+ *	-EINVAL if the ID is not a number
+ *	-ENODEV if the ID exceeds the maximum value configured for the system
+ */
+static ssize_t unassign_control_domain_store(struct device *dev,
+					     struct device_attribute *attr,
+					     const char *buf, size_t count)
+{
+	int ret;
+	unsigned long domid;
+	struct mdev_device *mdev = mdev_from_dev(dev);
+	struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+	unsigned long max_domid =  matrix_mdev->matrix.adm_max;
+
+	/* If the guest is running, disallow un-assignment of control domain */
+	if (matrix_mdev->kvm)
+		return -EBUSY;
+
+	ret = kstrtoul(buf, 0, &domid);
+	if (ret)
+		return ret;
+	if (domid > max_domid)
+		return -ENODEV;
+
+	mutex_lock(&matrix_dev->lock);
+	clear_bit_inv(domid, matrix_mdev->matrix.adm);
+	mutex_unlock(&matrix_dev->lock);
+
+	return count;
+}
+static DEVICE_ATTR_WO(unassign_control_domain);
+
+static ssize_t control_domains_show(struct device *dev,
+				    struct device_attribute *dev_attr,
+				    char *buf)
+{
+	unsigned long id;
+	int nchars = 0;
+	int n;
+	char *bufpos = buf;
+	struct mdev_device *mdev = mdev_from_dev(dev);
+	struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+	unsigned long max_domid = matrix_mdev->matrix.adm_max;
+
+	mutex_lock(&matrix_dev->lock);
+	for_each_set_bit_inv(id, matrix_mdev->matrix.adm, max_domid + 1) {
+		n = sprintf(bufpos, "%04lx\n", id);
+		bufpos += n;
+		nchars += n;
+	}
+	mutex_unlock(&matrix_dev->lock);
+
+	return nchars;
+}
+static DEVICE_ATTR_RO(control_domains);
+
+static ssize_t matrix_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct mdev_device *mdev = mdev_from_dev(dev);
+	struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+	char *bufpos = buf;
+	unsigned long apid;
+	unsigned long apqi;
+	unsigned long apid1;
+	unsigned long apqi1;
+	unsigned long napm_bits = matrix_mdev->matrix.apm_max + 1;
+	unsigned long naqm_bits = matrix_mdev->matrix.aqm_max + 1;
+	int nchars = 0;
+	int n;
+
+	apid1 = find_first_bit_inv(matrix_mdev->matrix.apm, napm_bits);
+	apqi1 = find_first_bit_inv(matrix_mdev->matrix.aqm, naqm_bits);
+
+	mutex_lock(&matrix_dev->lock);
+
+	if ((apid1 < napm_bits) && (apqi1 < naqm_bits)) {
+		for_each_set_bit_inv(apid, matrix_mdev->matrix.apm, napm_bits) {
+			for_each_set_bit_inv(apqi, matrix_mdev->matrix.aqm,
+					     naqm_bits) {
+				n = sprintf(bufpos, "%02lx.%04lx\n", apid,
+					    apqi);
+				bufpos += n;
+				nchars += n;
+			}
+		}
+	} else if (apid1 < napm_bits) {
+		for_each_set_bit_inv(apid, matrix_mdev->matrix.apm, napm_bits) {
+			n = sprintf(bufpos, "%02lx.\n", apid);
+			bufpos += n;
+			nchars += n;
+		}
+	} else if (apqi1 < naqm_bits) {
+		for_each_set_bit_inv(apqi, matrix_mdev->matrix.aqm, naqm_bits) {
+			n = sprintf(bufpos, ".%04lx\n", apqi);
+			bufpos += n;
+			nchars += n;
+		}
+	}
+
+	mutex_unlock(&matrix_dev->lock);
+
+	return nchars;
+}
+static DEVICE_ATTR_RO(matrix);
+
+static struct attribute *vfio_ap_mdev_attrs[] = {
+	&dev_attr_assign_adapter.attr,
+	&dev_attr_unassign_adapter.attr,
+	&dev_attr_assign_domain.attr,
+	&dev_attr_unassign_domain.attr,
+	&dev_attr_assign_control_domain.attr,
+	&dev_attr_unassign_control_domain.attr,
+	&dev_attr_control_domains.attr,
+	&dev_attr_matrix.attr,
+	NULL,
+};
+
+static struct attribute_group vfio_ap_mdev_attr_group = {
+	.attrs = vfio_ap_mdev_attrs
+};
+
+static const struct attribute_group *vfio_ap_mdev_attr_groups[] = {
+	&vfio_ap_mdev_attr_group,
+	NULL
+};
+
+/**
+ * vfio_ap_mdev_set_kvm
+ *
+ * @matrix_mdev: a mediated matrix device
+ * @kvm: reference to KVM instance
+ *
+ * Verifies no other mediated matrix device has @kvm and sets a reference to
+ * it in @matrix_mdev->kvm.
+ *
+ * Return 0 if no other mediated matrix device has a reference to @kvm;
+ * otherwise, returns an -EPERM.
+ */
+static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev,
+				struct kvm *kvm)
+{
+	struct ap_matrix_mdev *m;
+
+	mutex_lock(&matrix_dev->lock);
+
+	list_for_each_entry(m, &matrix_dev->mdev_list, node) {
+		if ((m != matrix_mdev) && (m->kvm == kvm)) {
+			mutex_unlock(&matrix_dev->lock);
+			return -EPERM;
+		}
+	}
+
+	matrix_mdev->kvm = kvm;
+	mutex_unlock(&matrix_dev->lock);
+
+	return 0;
+}
+
+static int vfio_ap_mdev_group_notifier(struct notifier_block *nb,
+				       unsigned long action, void *data)
+{
+	int ret;
+	struct ap_matrix_mdev *matrix_mdev;
+
+	if (action != VFIO_GROUP_NOTIFY_SET_KVM)
+		return NOTIFY_OK;
+
+	matrix_mdev = container_of(nb, struct ap_matrix_mdev, group_notifier);
+
+	if (!data) {
+		matrix_mdev->kvm = NULL;
+		return NOTIFY_OK;
+	}
+
+	ret = vfio_ap_mdev_set_kvm(matrix_mdev, data);
+	if (ret)
+		return NOTIFY_DONE;
+
+	/* If there is no CRYCB pointer, then we can't copy the masks */
+	if (!matrix_mdev->kvm->arch.crypto.crycbd)
+		return NOTIFY_DONE;
+
+	kvm_arch_crypto_set_masks(matrix_mdev->kvm, matrix_mdev->matrix.apm,
+				  matrix_mdev->matrix.aqm,
+				  matrix_mdev->matrix.adm);
+
+	return NOTIFY_OK;
+}
+
+static int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi,
+				    unsigned int retry)
+{
+	struct ap_queue_status status;
+
+	do {
+		status = ap_zapq(AP_MKQID(apid, apqi));
+		switch (status.response_code) {
+		case AP_RESPONSE_NORMAL:
+			return 0;
+		case AP_RESPONSE_RESET_IN_PROGRESS:
+		case AP_RESPONSE_BUSY:
+			msleep(20);
+			break;
+		default:
+			/* things are really broken, give up */
+			return -EIO;
+		}
+	} while (retry--);
+
+	return -EBUSY;
+}
+
+static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev)
+{
+	int ret;
+	int rc = 0;
+	unsigned long apid, apqi;
+	struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+
+	for_each_set_bit_inv(apid, matrix_mdev->matrix.apm,
+			     matrix_mdev->matrix.apm_max + 1) {
+		for_each_set_bit_inv(apqi, matrix_mdev->matrix.aqm,
+				     matrix_mdev->matrix.aqm_max + 1) {
+			ret = vfio_ap_mdev_reset_queue(apid, apqi, 1);
+			/*
+			 * Regardless whether a queue turns out to be busy, or
+			 * is not operational, we need to continue resetting
+			 * the remaining queues.
+			 */
+			if (ret)
+				rc = ret;
+		}
+	}
+
+	return rc;
+}
+
+static int vfio_ap_mdev_open(struct mdev_device *mdev)
+{
+	struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+	unsigned long events;
+	int ret;
+
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	matrix_mdev->group_notifier.notifier_call = vfio_ap_mdev_group_notifier;
+	events = VFIO_GROUP_NOTIFY_SET_KVM;
+
+	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
+				     &events, &matrix_mdev->group_notifier);
+	if (ret) {
+		module_put(THIS_MODULE);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void vfio_ap_mdev_release(struct mdev_device *mdev)
+{
+	struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+
+	if (matrix_mdev->kvm)
+		kvm_arch_crypto_clear_masks(matrix_mdev->kvm);
+
+	vfio_ap_mdev_reset_queues(mdev);
+	vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
+				 &matrix_mdev->group_notifier);
+	matrix_mdev->kvm = NULL;
+	module_put(THIS_MODULE);
+}
+
+static int vfio_ap_mdev_get_device_info(unsigned long arg)
+{
+	unsigned long minsz;
+	struct vfio_device_info info;
+
+	minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+	if (copy_from_user(&info, (void __user *)arg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	info.flags = VFIO_DEVICE_FLAGS_AP | VFIO_DEVICE_FLAGS_RESET;
+	info.num_regions = 0;
+	info.num_irqs = 0;
+
+	return copy_to_user((void __user *)arg, &info, minsz);
+}
+
+static ssize_t vfio_ap_mdev_ioctl(struct mdev_device *mdev,
+				    unsigned int cmd, unsigned long arg)
+{
+	int ret;
+
+	switch (cmd) {
+	case VFIO_DEVICE_GET_INFO:
+		ret = vfio_ap_mdev_get_device_info(arg);
+		break;
+	case VFIO_DEVICE_RESET:
+		ret = vfio_ap_mdev_reset_queues(mdev);
+		break;
+	default:
+		ret = -EOPNOTSUPP;
+		break;
+	}
+
+	return ret;
+}
+
+static const struct mdev_parent_ops vfio_ap_matrix_ops = {
+	.owner			= THIS_MODULE,
+	.supported_type_groups	= vfio_ap_mdev_type_groups,
+	.mdev_attr_groups	= vfio_ap_mdev_attr_groups,
+	.create			= vfio_ap_mdev_create,
+	.remove			= vfio_ap_mdev_remove,
+	.open			= vfio_ap_mdev_open,
+	.release		= vfio_ap_mdev_release,
+	.ioctl			= vfio_ap_mdev_ioctl,
+};
+
+int vfio_ap_mdev_register(void)
+{
+	atomic_set(&matrix_dev->available_instances, MAX_ZDEV_ENTRIES_EXT);
+
+	return mdev_register_device(&matrix_dev->device, &vfio_ap_matrix_ops);
+}
+
+void vfio_ap_mdev_unregister(void)
+{
+	mdev_unregister_device(&matrix_dev->device);
+}
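
The sysfs attributes registered above form the administrative interface: a mediated matrix device is created under the driver's mdev_supported_types directory, and adapters, usage domains and control domains are then assigned by writing their numbers to the assign_* attributes. A rough sketch of that flow, with an invented UUID and values (the authoritative walkthrough is Documentation/s390/vfio-ap.txt):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Illustrative only; the mdev UUID below is invented and the mdev itself is
 * assumed to have been created via the vfio_ap-passthrough "create" attribute.
 */
#define MDEV "/sys/devices/vfio_ap/matrix/11111111-2222-3333-4444-555555555555"

static int write_attr(const char *attr, const char *val)
{
	char path[256];
	int fd;
	ssize_t n;

	snprintf(path, sizeof(path), MDEV "/%s", attr);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* Give the guest APQN (5,6): adapter 0x05 crossed with domain 0x06. */
	write_attr("assign_adapter", "0x05");
	write_attr("assign_domain", "0x06");
	return 0;
}

The store functions parse the value with kstrtoul(buf, 0, ...), so either decimal or 0x-prefixed hex is accepted, and assignment is rejected with -EBUSY once the mediated device is in use by a running guest.
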

+ 88 - 0
drivers/s390/crypto/vfio_ap_private.h

@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Private data and functions for adjunct processor VFIO matrix driver.
+ *
+ * Author(s): Tony Krowiak <akrowiak@linux.ibm.com>
+ *	      Halil Pasic <pasic@linux.ibm.com>
+ *
+ * Copyright IBM Corp. 2018
+ */
+
+#ifndef _VFIO_AP_PRIVATE_H_
+#define _VFIO_AP_PRIVATE_H_
+
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/mdev.h>
+#include <linux/delay.h>
+#include <linux/mutex.h>
+
+#include "ap_bus.h"
+
+#define VFIO_AP_MODULE_NAME "vfio_ap"
+#define VFIO_AP_DRV_NAME "vfio_ap"
+
+/**
+ * ap_matrix_dev - the AP matrix device structure
+ * @device:	generic device structure associated with the AP matrix device
+ * @available_instances: number of mediated matrix devices that can be created
+ * @info:	the struct containing the output from the PQAP(QCI) instruction
+ * mdev_list:	the list of mediated matrix devices created
+ * lock:	mutex for locking the AP matrix device. This lock will be
+ *		taken every time we fiddle with state managed by the vfio_ap
+ *		driver, be it using @mdev_list or writing the state of a
+ *		single ap_matrix_mdev device. It's quite coarse but we don't
+ *		expect much contention.
+ */
+struct ap_matrix_dev {
+	struct device device;
+	atomic_t available_instances;
+	struct ap_config_info info;
+	struct list_head mdev_list;
+	struct mutex lock;
+};
+
+extern struct ap_matrix_dev *matrix_dev;
+
+/**
+ * The AP matrix is comprised of three bit masks identifying the adapters,
+ * queues (domains) and control domains that belong to an AP matrix. The bits in
+ * each mask, from least significant to most significant bit, correspond to IDs
+ * 0 to 255. When a bit is set, the corresponding ID belongs to the matrix.
+ *
+ * @apm_max: max adapter number in @apm
+ * @apm identifies the AP adapters in the matrix
+ * @aqm_max: max domain number in @aqm
+ * @aqm identifies the AP queues (domains) in the matrix
+ * @adm_max: max domain number in @adm
+ * @adm identifies the AP control domains in the matrix
+ */
+struct ap_matrix {
+	unsigned long apm_max;
+	DECLARE_BITMAP(apm, 256);
+	unsigned long aqm_max;
+	DECLARE_BITMAP(aqm, 256);
+	unsigned long adm_max;
+	DECLARE_BITMAP(adm, 256);
+};
+
+/**
+ * struct ap_matrix_mdev - the mediated matrix device structure
+ * @list:	allows the ap_matrix_mdev struct to be added to a list
+ * @matrix:	the adapters, usage domains and control domains assigned to the
+ *		mediated matrix device.
+ * @group_notifier: notifier block used for specifying callback function for
+ *		    handling the VFIO_GROUP_NOTIFY_SET_KVM event
+ * @kvm:	the struct holding guest's state
+ */
+struct ap_matrix_mdev {
+	struct list_head node;
+	struct ap_matrix matrix;
+	struct notifier_block group_notifier;
+	struct kvm *kvm;
+};
+
+extern int vfio_ap_mdev_register(void);
+extern void vfio_ap_mdev_unregister(void);
+
+#endif /* _VFIO_AP_PRIVATE_H_ */

Some files were not shown because too many files changed in this diff