/*
 * Machine specific setup for xen
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>

#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/numa.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include "xen-ops.h"
#include "vdso.h"
#include "p2m.h"

/* These are code, but not functions.  Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
#ifdef CONFIG_X86_64
extern asmlinkage void nmi(void);
#endif
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);

/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;

/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;

/* Buffer used to remap identity mapped pages */
unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata;

/*
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 *
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO		(10)

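/*
 * Record a region of extra memory (RAM the hypervisor may give us beyond
 * the initial allocation): add it to xen_extra_mem[], reserve it in
 * memblock so it is not used before it is populated, and invalidate the
 * corresponding p2m entries.
 */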
static void __init xen_add_extra_mem(u64 start, u64 size)
{
	unsigned long pfn;
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		/* Add new region. */
		if (xen_extra_mem[i].size == 0) {
			xen_extra_mem[i].start = start;
			xen_extra_mem[i].size = size;
			break;
		}
		/* Append to existing region. */
		if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
			xen_extra_mem[i].size += size;
			break;
		}
	}
	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
		printk(KERN_WARNING "Warning: not enough extra memory regions\n");

	memblock_reserve(start, size);

	xen_max_p2m_pfn = PFN_DOWN(start + size);
	for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
		unsigned long mfn = pfn_to_mfn(pfn);

		if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
			continue;
		WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
			  pfn, mfn);

		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
	}
}

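/*
 * Release or populate a range of pfns one frame at a time via
 * XENMEM_decrease_reservation / XENMEM_populate_physmap, keeping the p2m
 * in sync.  Returns the number of pages actually released or added.
 */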
static unsigned long __init xen_do_chunk(unsigned long start,
					 unsigned long end, bool release)
{
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};
	unsigned long len = 0;
	unsigned long pfn;
	int ret;

	for (pfn = start; pfn < end; pfn++) {
		unsigned long frame;
		unsigned long mfn = pfn_to_mfn(pfn);

		if (release) {
			/* Make sure pfn exists to start with */
			if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
				continue;
			frame = mfn;
		} else {
			if (mfn != INVALID_P2M_ENTRY)
				continue;
			frame = pfn;
		}
		set_xen_guest_handle(reservation.extent_start, &frame);
		reservation.nr_extents = 1;

		ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap,
					   &reservation);
		WARN(ret != 1, "Failed to %s pfn %lx err=%d\n",
		     release ? "release" : "populate", pfn, ret);

		if (ret == 1) {
			if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) {
				if (release)
					break;
				set_xen_guest_handle(reservation.extent_start, &frame);
				reservation.nr_extents = 1;
				ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
							   &reservation);
				break;
			}
			len++;
		} else
			break;
	}
	if (len)
		printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
		       release ? "Freeing" : "Populating",
		       start, end, len,
		       release ? "freed" : "added");

	return len;
}

/*
 * Finds the next RAM pfn available in the E820 map after min_pfn.
 * This function updates min_pfn with the pfn found and returns
 * the size of that range or zero if not found.
 */
static unsigned long __init xen_find_pfn_range(
	const struct e820entry *list, size_t map_size,
	unsigned long *min_pfn)
{
	const struct e820entry *entry;
	unsigned int i;
	unsigned long done = 0;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		unsigned long s_pfn;
		unsigned long e_pfn;

		if (entry->type != E820_RAM)
			continue;

		e_pfn = PFN_DOWN(entry->addr + entry->size);

		/* We only care about E820 after this */
		if (e_pfn < *min_pfn)
			continue;

		s_pfn = PFN_UP(entry->addr);

		/* If min_pfn falls within the E820 entry, we want to start
		 * at the min_pfn PFN.
		 */
		if (s_pfn <= *min_pfn) {
			done = e_pfn - *min_pfn;
		} else {
			done = e_pfn - s_pfn;
			*min_pfn = s_pfn;
		}
		break;
	}

	return done;
}

/*
 * This releases a chunk of memory and then does the identity map. It's used
 * as a fallback if the remapping fails.
 */
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
	unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity,
	unsigned long *released)
{
	WARN_ON(start_pfn > end_pfn);

	/* Need to release pages first */
	*released += xen_do_chunk(start_pfn, min(end_pfn, nr_pages), true);
	*identity += set_phys_range_identity(start_pfn, end_pfn);
}

/*
 * Helper function to update both the p2m and m2p tables.
 */
static unsigned long __init xen_update_mem_tables(unsigned long pfn,
						  unsigned long mfn)
{
	struct mmu_update update = {
		.ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
		.val = pfn
	};

	/* Update p2m */
	if (!early_set_phys_to_machine(pfn, mfn)) {
		WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
		     pfn, mfn);
		return false;
	}

	/* Update m2p */
	if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
		WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
		     mfn, pfn);
		return false;
	}

	return true;
}

/*
 * This function updates the p2m and m2p tables with an identity map from
 * start_pfn to start_pfn+size and remaps the underlying RAM of the original
 * allocation at remap_pfn. It must do so carefully in P2M_PER_PAGE sized blocks
 * to not exhaust the reserved brk space. Doing it in properly aligned blocks
 * ensures we only allocate the minimum required leaf pages in the p2m table. It
 * copies the existing mfns from the p2m table under the 1:1 map, overwrites
 * them with the identity map and then updates the p2m and m2p tables with the
 * remapped memory.
 */
static unsigned long __init xen_do_set_identity_and_remap_chunk(
	unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
{
	unsigned long ident_pfn_iter, remap_pfn_iter;
	unsigned long ident_start_pfn_align, remap_start_pfn_align;
	unsigned long ident_end_pfn_align, remap_end_pfn_align;
	unsigned long ident_boundary_pfn, remap_boundary_pfn;
	unsigned long ident_cnt = 0;
	unsigned long remap_cnt = 0;
	unsigned long left = size;
	unsigned long mod;
	int i;

	WARN_ON(size == 0);

	BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));

	/*
	 * Determine the proper alignment to remap memory in P2M_PER_PAGE sized
	 * blocks. We need to keep track of both the existing pfn mapping and
	 * the new pfn remapping.
	 */
	mod = start_pfn % P2M_PER_PAGE;
	ident_start_pfn_align =
		mod ? (start_pfn - mod + P2M_PER_PAGE) : start_pfn;
	mod = remap_pfn % P2M_PER_PAGE;
	remap_start_pfn_align =
		mod ? (remap_pfn - mod + P2M_PER_PAGE) : remap_pfn;
	mod = (start_pfn + size) % P2M_PER_PAGE;
	ident_end_pfn_align = start_pfn + size - mod;
	mod = (remap_pfn + size) % P2M_PER_PAGE;
	remap_end_pfn_align = remap_pfn + size - mod;

	/* Iterate over each p2m leaf node in each range */
	for (ident_pfn_iter = ident_start_pfn_align, remap_pfn_iter = remap_start_pfn_align;
	     ident_pfn_iter < ident_end_pfn_align && remap_pfn_iter < remap_end_pfn_align;
	     ident_pfn_iter += P2M_PER_PAGE, remap_pfn_iter += P2M_PER_PAGE) {
		/* Check we aren't past the end */
		BUG_ON(ident_pfn_iter + P2M_PER_PAGE > start_pfn + size);
		BUG_ON(remap_pfn_iter + P2M_PER_PAGE > remap_pfn + size);

		/* Save p2m mappings */
		for (i = 0; i < P2M_PER_PAGE; i++)
			xen_remap_buf[i] = pfn_to_mfn(ident_pfn_iter + i);

		/* Set identity map which will free a p2m leaf */
		ident_cnt += set_phys_range_identity(ident_pfn_iter,
			ident_pfn_iter + P2M_PER_PAGE);

#ifdef DEBUG
		/* Helps verify a p2m leaf has been freed */
		for (i = 0; i < P2M_PER_PAGE; i++) {
			unsigned int pfn = ident_pfn_iter + i;
			BUG_ON(pfn_to_mfn(pfn) != pfn);
		}
#endif
		/* Now remap memory */
		for (i = 0; i < P2M_PER_PAGE; i++) {
			unsigned long mfn = xen_remap_buf[i];

			/* This will use the p2m leaf freed above */
			if (!xen_update_mem_tables(remap_pfn_iter + i, mfn)) {
				WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
					remap_pfn_iter + i, mfn);
				return 0;
			}

			remap_cnt++;
		}

		left -= P2M_PER_PAGE;
	}

	/* Max boundary space possible */
	BUG_ON(left > (P2M_PER_PAGE - 1) * 2);

	/* Now handle the boundary conditions */
	ident_boundary_pfn = start_pfn;
	remap_boundary_pfn = remap_pfn;
	for (i = 0; i < left; i++) {
		unsigned long mfn;

		/* These two checks move from the start to end boundaries */
		if (ident_boundary_pfn == ident_start_pfn_align)
			ident_boundary_pfn = ident_pfn_iter;
		if (remap_boundary_pfn == remap_start_pfn_align)
			remap_boundary_pfn = remap_pfn_iter;

		/* Check we aren't past the end */
		BUG_ON(ident_boundary_pfn >= start_pfn + size);
		BUG_ON(remap_boundary_pfn >= remap_pfn + size);

		mfn = pfn_to_mfn(ident_boundary_pfn);

		if (!xen_update_mem_tables(remap_boundary_pfn, mfn)) {
			WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
				remap_pfn_iter + i, mfn);
			return 0;
		}
		remap_cnt++;

		ident_boundary_pfn++;
		remap_boundary_pfn++;
	}

	/* Finish up the identity map */
	if (ident_start_pfn_align >= ident_end_pfn_align) {
		/*
		 * In this case we have an identity range which does not span an
		 * aligned block so everything needs to be identity mapped here.
		 * If we didn't check this we might remap too many pages since
		 * the align boundaries are not meaningful in this case.
		 */
		ident_cnt += set_phys_range_identity(start_pfn,
			start_pfn + size);
	} else {
		/* Remapped above so check each end of the chunk */
		if (start_pfn < ident_start_pfn_align)
			ident_cnt += set_phys_range_identity(start_pfn,
				ident_start_pfn_align);
		if (start_pfn + size > ident_pfn_iter)
			ident_cnt += set_phys_range_identity(ident_pfn_iter,
				start_pfn + size);
	}

	BUG_ON(ident_cnt != size);
	BUG_ON(remap_cnt != size);

	return size;
}

/*
 * This function takes a contiguous pfn range that needs to be identity mapped
 * and:
 *
 * 1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
 * 2) Calls the do_ function to actually do the mapping/remapping work.
 *
 * The goal is to not allocate additional memory but to remap the existing
 * pages. In the case of an error the underlying memory is simply released back
 * to Xen and not remapped.
 */
static unsigned long __init xen_set_identity_and_remap_chunk(
	const struct e820entry *list, size_t map_size, unsigned long start_pfn,
	unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
	unsigned long *identity, unsigned long *remapped,
	unsigned long *released)
{
	unsigned long pfn;
	unsigned long i = 0;
	unsigned long n = end_pfn - start_pfn;

	while (i < n) {
		unsigned long cur_pfn = start_pfn + i;
		unsigned long left = n - i;
		unsigned long size = left;
		unsigned long remap_range_size;

		/* Do not remap pages beyond the current allocation */
		if (cur_pfn >= nr_pages) {
			/* Identity map remaining pages */
			*identity += set_phys_range_identity(cur_pfn,
				cur_pfn + size);
			break;
		}
		if (cur_pfn + size > nr_pages)
			size = nr_pages - cur_pfn;

		remap_range_size = xen_find_pfn_range(list, map_size,
						      &remap_pfn);
		if (!remap_range_size) {
			pr_warning("Unable to find available pfn range, not remapping identity pages\n");
			xen_set_identity_and_release_chunk(cur_pfn,
				cur_pfn + left, nr_pages, identity, released);
			break;
		}
		/* Adjust size to fit in current e820 RAM region */
		if (size > remap_range_size)
			size = remap_range_size;

		if (!xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn)) {
			WARN(1, "Failed to remap 1:1 memory cur_pfn=%ld size=%ld remap_pfn=%ld\n",
				cur_pfn, size, remap_pfn);
			xen_set_identity_and_release_chunk(cur_pfn,
				cur_pfn + left, nr_pages, identity, released);
			break;
		}

		/* Update variables to reflect new mappings. */
		i += size;
		remap_pfn += size;
		*identity += size;
		*remapped += size;
	}

	/*
	 * If the PFNs are currently mapped, the VA mapping also needs
	 * to be updated to be 1:1.
	 */
	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
		(void)HYPERVISOR_update_va_mapping(
			(unsigned long)__va(pfn << PAGE_SHIFT),
			mfn_pte(pfn, PAGE_KERNEL_IO), 0);

	return remap_pfn;
}

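/*
 * Walk the E820 map, set the 1:1 map for the non-RAM regions and gaps, and
 * remap the RAM originally backing them.  Returns the last pfn used for
 * remapping and the number of pages released via *released.
 */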
static unsigned long __init xen_set_identity_and_remap(
	const struct e820entry *list, size_t map_size, unsigned long nr_pages,
	unsigned long *released)
{
	phys_addr_t start = 0;
	unsigned long identity = 0;
	unsigned long remapped = 0;
	unsigned long last_pfn = nr_pages;
	const struct e820entry *entry;
	unsigned long num_released = 0;
	int i;

	/*
	 * Combine non-RAM regions and gaps until a RAM region (or the
	 * end of the map) is reached, then set the 1:1 map and
	 * remap the memory in those non-RAM regions.
	 *
	 * The combined non-RAM regions are rounded to a whole number
	 * of pages so any partial pages are accessible via the 1:1
	 * mapping. This is needed for some BIOSes that put (for
	 * example) the DMI tables in a reserved region that begins on
	 * a non-page boundary.
	 */
	for (i = 0, entry = list; i < map_size; i++, entry++) {
		phys_addr_t end = entry->addr + entry->size;
		if (entry->type == E820_RAM || i == map_size - 1) {
			unsigned long start_pfn = PFN_DOWN(start);
			unsigned long end_pfn = PFN_UP(end);

			if (entry->type == E820_RAM)
				end_pfn = PFN_UP(entry->addr);

			if (start_pfn < end_pfn)
				last_pfn = xen_set_identity_and_remap_chunk(
						list, map_size, start_pfn,
						end_pfn, nr_pages, last_pfn,
						&identity, &remapped,
						&num_released);
			start = end;
		}
	}

	*released = num_released;

	pr_info("Set %ld page(s) to 1-1 mapping\n", identity);
	pr_info("Remapped %ld page(s), last_pfn=%ld\n", remapped,
		last_pfn);
	pr_info("Released %ld page(s)\n", num_released);

	return last_pfn;
}

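/*
 * Return the maximum number of pages this domain may use, capped at
 * MAX_DOMAIN_PAGES.  For the initial domain this is taken from the
 * hypervisor's maximum reservation (see the comment in the body).
 */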
static unsigned long __init xen_get_max_pages(void)
{
	unsigned long max_pages = MAX_DOMAIN_PAGES;
	domid_t domid = DOMID_SELF;
	int ret;

	/*
	 * For the initial domain we use the maximum reservation as
	 * the maximum page.
	 *
	 * For guest domains the current maximum reservation reflects
	 * the current maximum rather than the static maximum. In this
	 * case the e820 map provided to us will cover the static
	 * maximum region.
	 */
	if (xen_initial_domain()) {
		ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
		if (ret > 0)
			max_pages = ret;
	}

	return min(max_pages, MAX_DOMAIN_PAGES);
}

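/* Add a region to the kernel e820 map, trimming RAM regions to whole pages. */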
static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
{
	u64 end = start + size;

	/* Align RAM regions to page boundaries. */
	if (type == E820_RAM) {
		start = PAGE_ALIGN(start);
		end &= ~((u64)PAGE_SIZE - 1);
	}

	e820_add_region(start, end - start, type);
}

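/*
 * Treat UNUSABLE regions in the machine memory map as RAM; see the comment
 * in xen_memory_setup() for why.
 */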
void xen_ignore_unusable(struct e820entry *list, size_t map_size)
{
	struct e820entry *entry;
	unsigned int i;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		if (entry->type == E820_UNUSABLE)
			entry->type = E820_RAM;
	}
}

/**
 * machine_specific_memory_setup - Hook for machine specific memory setup.
 **/
char * __init xen_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;

	unsigned long max_pfn = xen_start_info->nr_pages;
	unsigned long long mem_end;
	int rc;
	struct xen_memory_map memmap;
	unsigned long max_pages;
	unsigned long last_pfn = 0;
	unsigned long extra_pages = 0;
	int i;
	int op;

	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
	mem_end = PFN_PHYS(max_pfn);

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		BUG_ON(xen_initial_domain());
		memmap.nr_entries = 1;
		map[0].addr = 0ULL;
		map[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		map[0].size += 8ULL << 20;
		map[0].type = E820_RAM;
		rc = 0;
	}
	BUG_ON(rc);

	/*
	 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
	 * regions, so if we're using the machine memory map leave the
	 * region as RAM as it is in the pseudo-physical map.
	 *
	 * UNUSABLE regions in domUs are not handled and will need
	 * a patch in the future.
	 */
	if (xen_initial_domain())
		xen_ignore_unusable(map, memmap.nr_entries);

	/* Make sure the Xen-supplied memory map is well-ordered. */
	sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);

	max_pages = xen_get_max_pages();
	if (max_pages > max_pfn)
		extra_pages += max_pages - max_pfn;

	/*
	 * Set identity map on non-RAM pages and remap the underlying RAM.
	 */
	last_pfn = xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
					      &xen_released_pages);

	extra_pages += xen_released_pages;

	if (last_pfn > max_pfn) {
		max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
		mem_end = PFN_PHYS(max_pfn);
	}

	/*
	 * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
	 * factor of the base size.  On non-highmem systems, the base
	 * size is the full initial memory allocation; on highmem it
	 * is limited to the max size of lowmem, so that it doesn't
	 * get completely filled.
	 *
	 * In principle there could be a problem in lowmem systems if
	 * the initial memory is also very large with respect to
	 * lowmem, but we won't try to deal with that here.
	 */
	extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			  extra_pages);

	i = 0;
	while (i < memmap.nr_entries) {
		u64 addr = map[i].addr;
		u64 size = map[i].size;
		u32 type = map[i].type;

		if (type == E820_RAM) {
			if (addr < mem_end) {
				size = min(size, mem_end - addr);
			} else if (extra_pages) {
				size = min(size, (u64)extra_pages * PAGE_SIZE);
				extra_pages -= size / PAGE_SIZE;
				xen_add_extra_mem(addr, size);
			} else
				type = E820_UNUSABLE;
		}

		xen_align_and_add_e820_region(addr, size, type);

		map[i].addr += size;
		map[i].size -= size;
		if (map[i].size == 0)
			i++;
	}

	/*
	 * Set the rest as identity mapped, in case PCI BARs are
	 * located here.
	 *
	 * PFNs above MAX_P2M_PFN are considered identity mapped as
	 * well.
	 */
	set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);

	/*
	 * In domU, the ISA region is normal, usable memory, but we
	 * reserve ISA memory anyway because too many things poke
	 * about in there.
	 */
	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
			E820_RESERVED);

	/*
	 * Reserve Xen bits:
	 *  - mfn_list
	 *  - xen_start_info
	 * See comment above "struct start_info" in <xen/interface/xen.h>
	 * We tried to make the memblock_reserve more selective so
	 * that it would be clear what region is reserved. Sadly we ran
	 * into the problem wherein on a 64-bit hypervisor with a 32-bit
	 * initial domain, the pt_base has the cr3 value which is not
	 * necessarily where the pagetable starts! As Jan put it: "
	 * Actually, the adjustment turns out to be correct: The page
	 * tables for a 32-on-64 dom0 get allocated in the order "first L1",
	 * "first L2", "first L3", so the offset to the page table base is
	 * indeed 2. When reading xen/include/public/xen.h's comment
	 * very strictly, this is not a violation (since there nothing is said
	 * that the first thing in the page table space is pointed to by
	 * pt_base; I admit that this seems to be implied though, namely
	 * do I think that it is implied that the page table space is the
	 * range [pt_base, pt_base + nr_pt_frames), whereas that
	 * range here indeed is [pt_base - 2, pt_base - 2 + nr_pt_frames),
	 * which - without a priori knowledge - the kernel would have
	 * difficulty to figure out)." - so let's just fall back to the
	 * easy way and reserve the whole region.
	 */
	memblock_reserve(__pa(xen_start_info->mfn_list),
			 xen_start_info->pt_base - xen_start_info->mfn_list);

	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

	return "Xen";
}

/*
 * Machine specific memory setup for auto-translated guests.
 */
char * __init xen_auto_xlated_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;

	struct xen_memory_map memmap;
	int i;
	int rc;

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
	if (rc < 0)
		panic("No memory map (%d)\n", rc);

	sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);

	for (i = 0; i < memmap.nr_entries; i++)
		e820_add_region(map[i].addr, map[i].size, map[i].type);

	memblock_reserve(__pa(xen_start_info->mfn_list),
			 xen_start_info->pt_base - xen_start_info->mfn_list);

	return "Xen";
}

/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
	/*
	 * This could be called before selected_vdso32 is initialized, so
	 * just fiddle with both possible images.  vdso_image_32_syscall
	 * can't be selected, since it only exists on 64-bit systems.
	 */
	u32 *mask;
	mask = vdso_image_32_int80.data +
		vdso_image_32_int80.sym_VDSO32_NOTE_MASK;
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
	mask = vdso_image_32_sysenter.data +
		vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK;
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}

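/* Register a callback (event, failsafe, sysenter, ...) with the hypervisor. */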
static int register_callback(unsigned type, const void *func)
{
	struct callback_register callback = {
		.type = type,
		.address = XEN_CALLBACK(__KERNEL_CS, func),
		.flags = CALLBACKF_mask_events,
	};

	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}

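/*
 * Route the sysenter (or SYSENTER32) fast system call path through Xen;
 * if the callback cannot be registered, hide the CPU feature instead.
 */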
void xen_enable_sysenter(void)
{
	int ret;
	unsigned sysenter_feature;

#ifdef CONFIG_X86_32
	sysenter_feature = X86_FEATURE_SEP;
#else
	sysenter_feature = X86_FEATURE_SYSENTER32;
#endif

	if (!boot_cpu_has(sysenter_feature))
		return;

	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
	if (ret != 0)
		setup_clear_cpu_cap(sysenter_feature);
}

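/*
 * Register the 64-bit (and, where available, 32-bit compat) syscall
 * callbacks with the hypervisor.  No-op on 32-bit kernels.
 */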
void xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
	int ret;

	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
	if (ret != 0) {
		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
		/* Pretty fatal; 64-bit userspace has no other
		   mechanism for syscalls. */
	}

	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
		ret = register_callback(CALLBACKTYPE_syscall32,
					xen_syscall32_target);
		if (ret != 0)
			setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
	}
#endif /* CONFIG_X86_64 */
}

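/* PV-MMU specific setup: enable vm_assists and register the event and failsafe callbacks. */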
void __init xen_pvmmu_arch_setup(void)
{
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

	HYPERVISOR_vm_assist(VMASST_CMD_enable,
			     VMASST_TYPE_pae_extended_cr3);

	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
		BUG();

	xen_enable_sysenter();
	xen_enable_syscall();
}

/* This function is not called for HVM domains */
void __init xen_arch_setup(void)
{
	xen_panic_handler_init();
	if (!xen_feature(XENFEAT_auto_translated_physmap))
		xen_pvmmu_arch_setup();

#ifdef CONFIG_ACPI
	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
		disable_acpi();
	}
#endif

	memcpy(boot_command_line, xen_start_info->cmd_line,
	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

	/* Set up idle, making sure it calls safe_halt() pvop */
	disable_cpuidle();
	disable_cpufreq();
	WARN_ON(xen_set_default_idle());
	fiddle_vdso();
#ifdef CONFIG_NUMA
	numa_off = 1;
#endif
}