setup.c

/*
 * Machine specific setup for xen
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>

#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/numa.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>

#include "mmu.h"
#include "xen-ops.h"
#include "vdso.h"

/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
#ifdef CONFIG_X86_64
extern asmlinkage void nmi(void);
#endif
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);

/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;

/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;

/*
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 *
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO		(10)

static void __init xen_add_extra_mem(u64 start, u64 size)
{
	unsigned long pfn;
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		/* Add new region. */
		if (xen_extra_mem[i].size == 0) {
			xen_extra_mem[i].start = start;
			xen_extra_mem[i].size = size;
			break;
		}
		/* Append to existing region. */
		if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
			xen_extra_mem[i].size += size;
			break;
		}
	}
	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
		printk(KERN_WARNING "Warning: not enough extra memory regions\n");

	memblock_reserve(start, size);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return;

	xen_max_p2m_pfn = PFN_DOWN(start + size);
	for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
		unsigned long mfn = pfn_to_mfn(pfn);

		if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
			continue;
		WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
			pfn, mfn);

		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
	}
}
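
/*
 * Release to Xen, or populate from Xen, the pfns in [start, end), one
 * extent at a time, keeping the P2M in sync for every frame that the
 * hypercall actually handled.  Returns the number of pages
 * released/populated.
 */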
static unsigned long __init xen_do_chunk(unsigned long start,
					 unsigned long end, bool release)
{
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};
	unsigned long len = 0;
	int xlated_phys = xen_feature(XENFEAT_auto_translated_physmap);
	unsigned long pfn;
	int ret;

	for (pfn = start; pfn < end; pfn++) {
		unsigned long frame;
		unsigned long mfn = pfn_to_mfn(pfn);

		if (release) {
			/* Make sure pfn exists to start with */
			if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
				continue;
			frame = mfn;
		} else {
			if (!xlated_phys && mfn != INVALID_P2M_ENTRY)
				continue;
			frame = pfn;
		}
		set_xen_guest_handle(reservation.extent_start, &frame);
		reservation.nr_extents = 1;

		ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap,
					   &reservation);
		WARN(ret != 1, "Failed to %s pfn %lx err=%d\n",
		     release ? "release" : "populate", pfn, ret);

		if (ret == 1) {
			if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) {
				if (release)
					break;
				set_xen_guest_handle(reservation.extent_start, &frame);
				reservation.nr_extents = 1;
				ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
							   &reservation);
				break;
			}
			len++;
		} else
			break;
	}
	if (len)
		printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
		       release ? "Freeing" : "Populating",
		       start, end, len,
		       release ? "freed" : "added");
	return len;
}

static unsigned long __init xen_release_chunk(unsigned long start,
					      unsigned long end)
{
	/*
	 * Xen already ballooned out the E820 non RAM regions for us
	 * and set them up properly in EPT.
	 */
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return end - start;

	return xen_do_chunk(start, end, true);
}
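
/*
 * Repopulate previously released pages into the E820 RAM regions that
 * lie above the initial nr_pages allocation, consuming at most
 * credits_left pages.  *last_pfn is updated to the end of the last
 * range populated; the return value is the number of pages added.
 */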
static unsigned long __init xen_populate_chunk(
	const struct e820entry *list, size_t map_size,
	unsigned long max_pfn, unsigned long *last_pfn,
	unsigned long credits_left)
{
	const struct e820entry *entry;
	unsigned int i;
	unsigned long done = 0;
	unsigned long dest_pfn;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		unsigned long s_pfn;
		unsigned long e_pfn;
		unsigned long pfns;
		long capacity;

		if (credits_left <= 0)
			break;

		if (entry->type != E820_RAM)
			continue;

		e_pfn = PFN_DOWN(entry->addr + entry->size);

		/* We only care about E820 after the xen_start_info->nr_pages */
		if (e_pfn <= max_pfn)
			continue;

		s_pfn = PFN_UP(entry->addr);
		/* If the E820 falls within the nr_pages, we want to start
		 * at the nr_pages PFN.
		 * If that would mean going past the E820 entry, skip it
		 */
		if (s_pfn <= max_pfn) {
			capacity = e_pfn - max_pfn;
			dest_pfn = max_pfn;
		} else {
			capacity = e_pfn - s_pfn;
			dest_pfn = s_pfn;
		}

		if (credits_left < capacity)
			capacity = credits_left;

		pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
		done += pfns;
		*last_pfn = (dest_pfn + pfns);
		if (pfns < capacity)
			break;
		credits_left -= pfns;
	}
	return done;
}

static void __init xen_set_identity_and_release_chunk(
	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
	unsigned long *released, unsigned long *identity)
{
	unsigned long pfn;

	/*
	 * If the PFNs are currently mapped, clear the mappings
	 * (except for the ISA region which must be 1:1 mapped) to
	 * release the refcounts (in Xen) on the original frames.
	 */

	/*
	 * PVH E820 matches the hypervisor's P2M which means we need to
	 * account for the proper values of *released and *identity.
	 */
	for (pfn = start_pfn; !xen_feature(XENFEAT_auto_translated_physmap) &&
	     pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
		pte_t pte = __pte_ma(0);

		if (pfn < PFN_UP(ISA_END_ADDRESS))
			pte = mfn_pte(pfn, PAGE_KERNEL_IO);

		(void)HYPERVISOR_update_va_mapping(
			(unsigned long)__va(pfn << PAGE_SHIFT), pte, 0);
	}

	if (start_pfn < nr_pages)
		*released += xen_release_chunk(
			start_pfn, min(end_pfn, nr_pages));

	*identity += set_phys_range_identity(start_pfn, end_pfn);
}

static unsigned long __init xen_set_identity_and_release(
	const struct e820entry *list, size_t map_size, unsigned long nr_pages)
{
	phys_addr_t start = 0;
	unsigned long released = 0;
	unsigned long identity = 0;
	const struct e820entry *entry;
	int i;

	/*
	 * Combine non-RAM regions and gaps until a RAM region (or the
	 * end of the map) is reached, then set the 1:1 map and
	 * release the pages (if available) in those non-RAM regions.
	 *
	 * The combined non-RAM regions are rounded to a whole number
	 * of pages so any partial pages are accessible via the 1:1
	 * mapping.  This is needed for some BIOSes that put (for
	 * example) the DMI tables in a reserved region that begins on
	 * a non-page boundary.
	 */
	for (i = 0, entry = list; i < map_size; i++, entry++) {
		phys_addr_t end = entry->addr + entry->size;
		if (entry->type == E820_RAM || i == map_size - 1) {
			unsigned long start_pfn = PFN_DOWN(start);
			unsigned long end_pfn = PFN_UP(end);

			if (entry->type == E820_RAM)
				end_pfn = PFN_UP(entry->addr);

			if (start_pfn < end_pfn)
				xen_set_identity_and_release_chunk(
					start_pfn, end_pfn, nr_pages,
					&released, &identity);

			start = end;
		}
	}

	if (released)
		printk(KERN_INFO "Released %lu pages of unused memory\n", released);
	if (identity)
		printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);

	return released;
}

static unsigned long __init xen_get_max_pages(void)
{
	unsigned long max_pages = MAX_DOMAIN_PAGES;
	domid_t domid = DOMID_SELF;
	int ret;

	/*
	 * For the initial domain we use the maximum reservation as
	 * the maximum page.
	 *
	 * For guest domains the current maximum reservation reflects
	 * the current maximum rather than the static maximum. In this
	 * case the e820 map provided to us will cover the static
	 * maximum region.
	 */
	if (xen_initial_domain()) {
		ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
		if (ret > 0)
			max_pages = ret;
	}

	return min(max_pages, MAX_DOMAIN_PAGES);
}
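
/*
 * Add a region to the kernel e820 map, trimming RAM regions to whole
 * pages so that partially covered pages are not reported as usable.
 */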
static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
{
	u64 end = start + size;

	/* Align RAM regions to page boundaries. */
	if (type == E820_RAM) {
		start = PAGE_ALIGN(start);
		end &= ~((u64)PAGE_SIZE - 1);
	}

	e820_add_region(start, end - start, type);
}
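
/*
 * Treat E820_UNUSABLE entries from the machine memory map as RAM; see
 * the comment in xen_memory_setup() for why the initial domain needs this.
 */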
void xen_ignore_unusable(struct e820entry *list, size_t map_size)
{
	struct e820entry *entry;
	unsigned int i;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		if (entry->type == E820_UNUSABLE)
			entry->type = E820_RAM;
	}
}

/**
 * machine_specific_memory_setup - Hook for machine specific memory setup.
 **/
char * __init xen_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;

	unsigned long max_pfn = xen_start_info->nr_pages;
	unsigned long long mem_end;
	int rc;
	struct xen_memory_map memmap;
	unsigned long max_pages;
	unsigned long last_pfn = 0;
	unsigned long extra_pages = 0;
	unsigned long populated;
	int i;
	int op;

	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
	mem_end = PFN_PHYS(max_pfn);

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		BUG_ON(xen_initial_domain());
		memmap.nr_entries = 1;
		map[0].addr = 0ULL;
		map[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		map[0].size += 8ULL << 20;
		map[0].type = E820_RAM;
		rc = 0;
	}
	BUG_ON(rc);

	/*
	 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
	 * regions, so if we're using the machine memory map leave the
	 * region as RAM as it is in the pseudo-physical map.
	 *
	 * UNUSABLE regions in domUs are not handled and will need
	 * a patch in the future.
	 */
	if (xen_initial_domain())
		xen_ignore_unusable(map, memmap.nr_entries);

	/* Make sure the Xen-supplied memory map is well-ordered. */
	sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);

	max_pages = xen_get_max_pages();
	if (max_pages > max_pfn)
		extra_pages += max_pages - max_pfn;

	/*
	 * Set P2M for all non-RAM pages and E820 gaps to be identity
	 * type PFNs.  Any RAM pages that would be made inaccessible by
	 * this are first released.
	 */
	xen_released_pages = xen_set_identity_and_release(
		map, memmap.nr_entries, max_pfn);

	/*
	 * Populate back the non-RAM pages and E820 gaps that had been
	 * released.
	 */
	populated = xen_populate_chunk(map, memmap.nr_entries,
			max_pfn, &last_pfn, xen_released_pages);

	xen_released_pages -= populated;
	extra_pages += xen_released_pages;

	if (last_pfn > max_pfn) {
		max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
		mem_end = PFN_PHYS(max_pfn);
	}
	/*
	 * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
	 * factor of the base size.  On non-highmem systems, the base
	 * size is the full initial memory allocation; on highmem it
	 * is limited to the max size of lowmem, so that it doesn't
	 * get completely filled.
	 *
	 * In principle there could be a problem in lowmem systems if
	 * the initial memory is also very large with respect to
	 * lowmem, but we won't try to deal with that here.
	 */
	extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			  extra_pages);

	i = 0;
	while (i < memmap.nr_entries) {
		u64 addr = map[i].addr;
		u64 size = map[i].size;
		u32 type = map[i].type;

		if (type == E820_RAM) {
			if (addr < mem_end) {
				size = min(size, mem_end - addr);
			} else if (extra_pages) {
				size = min(size, (u64)extra_pages * PAGE_SIZE);
				extra_pages -= size / PAGE_SIZE;
				xen_add_extra_mem(addr, size);
			} else
				type = E820_UNUSABLE;
		}

		xen_align_and_add_e820_region(addr, size, type);

		map[i].addr += size;
		map[i].size -= size;
		if (map[i].size == 0)
			i++;
	}

	/*
	 * In domU, the ISA region is normal, usable memory, but we
	 * reserve ISA memory anyway because too many things poke
	 * about in there.
	 */
	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
			E820_RESERVED);

	/*
	 * Reserve Xen bits:
	 *  - mfn_list
	 *  - xen_start_info
	 * See comment above "struct start_info" in <xen/interface/xen.h>
	 * We tried to make the memblock_reserve more selective so
	 * that it would be clear what region is reserved. Sadly we ran
	 * into the problem wherein on a 64-bit hypervisor with a 32-bit
	 * initial domain, the pt_base has the cr3 value which is not
	 * necessarily where the pagetable starts! As Jan put it: "
	 * Actually, the adjustment turns out to be correct: The page
	 * tables for a 32-on-64 dom0 get allocated in the order "first L1",
	 * "first L2", "first L3", so the offset to the page table base is
	 * indeed 2. When reading xen/include/public/xen.h's comment
	 * very strictly, this is not a violation (since there nothing is said
	 * that the first thing in the page table space is pointed to by
	 * pt_base; I admit that this seems to be implied though, namely
	 * do I think that it is implied that the page table space is the
	 * range [pt_base, pt_base + nr_pt_frames), whereas that
	 * range here indeed is [pt_base - 2, pt_base - 2 + nr_pt_frames),
	 * which - without a priori knowledge - the kernel would have
	 * difficulty to figure out)." - so let's just fall back to the
	 * easy way and reserve the whole region.
	 */
	memblock_reserve(__pa(xen_start_info->mfn_list),
			 xen_start_info->pt_base - xen_start_info->mfn_list);

	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

	return "Xen";
}

/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
	u32 *mask;
	mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
	mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}
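
/*
 * Register a callback of the given type with the hypervisor, pointing
 * it at func in the kernel and masking events while it runs.
 */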
static int register_callback(unsigned type, const void *func)
{
	struct callback_register callback = {
		.type = type,
		.address = XEN_CALLBACK(__KERNEL_CS, func),
		.flags = CALLBACKF_mask_events,
	};

	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}
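
/*
 * Register the sysenter entry point with Xen if the CPU supports it;
 * if registration fails, clear the feature bit so the fast path is
 * not used.
 */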
void xen_enable_sysenter(void)
{
	int ret;
	unsigned sysenter_feature;

#ifdef CONFIG_X86_32
	sysenter_feature = X86_FEATURE_SEP;
#else
	sysenter_feature = X86_FEATURE_SYSENTER32;
#endif

	if (!boot_cpu_has(sysenter_feature))
		return;

	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
	if (ret != 0)
		setup_clear_cpu_cap(sysenter_feature);
}
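
/*
 * Register the 64-bit (and, if available, 32-bit compat) syscall entry
 * points with Xen.  Only the compat path can be disabled gracefully;
 * a failure to register the 64-bit callback is reported but not
 * recoverable.
 */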
void xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
	int ret;

	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
	if (ret != 0) {
		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
		/* Pretty fatal; 64-bit userspace has no other
		   mechanism for syscalls. */
	}

	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
		ret = register_callback(CALLBACKTYPE_syscall32,
					xen_syscall32_target);
		if (ret != 0)
			setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
	}
#endif /* CONFIG_X86_64 */
}
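
/* Register the NMI entry point with Xen (64-bit only). */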
void xen_enable_nmi(void)
{
#ifdef CONFIG_X86_64
	if (register_callback(CALLBACKTYPE_nmi, (char *)nmi))
		BUG();
#endif
}
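
/*
 * Paravirtual MMU setup: enable the vm_assist features we rely on and
 * register the event, failsafe and fast system call callbacks.  Only
 * called for guests without auto-translated physmap.
 */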
void __init xen_pvmmu_arch_setup(void)
{
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

	HYPERVISOR_vm_assist(VMASST_CMD_enable,
			     VMASST_TYPE_pae_extended_cr3);

	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
		BUG();

	xen_enable_sysenter();
	xen_enable_syscall();
	xen_enable_nmi();
}

/* This function is not called for HVM domains */
void __init xen_arch_setup(void)
{
	xen_panic_handler_init();
	if (!xen_feature(XENFEAT_auto_translated_physmap))
		xen_pvmmu_arch_setup();

#ifdef CONFIG_ACPI
	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
		disable_acpi();
	}
#endif

	memcpy(boot_command_line, xen_start_info->cmd_line,
	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

	/* Set up idle, making sure it calls safe_halt() pvop */
	disable_cpuidle();
	disable_cpufreq();
	WARN_ON(xen_set_default_idle());
	fiddle_vdso();
#ifdef CONFIG_NUMA
	numa_off = 1;
#endif
}