/* arch/x86/mm/dump_pagetables.c */
  1. /*
  2. * Debug helper to dump the current kernel pagetables of the system
  3. * so that we can see what the various memory ranges are set to.
  4. *
  5. * (C) Copyright 2008 Intel Corporation
  6. *
  7. * Author: Arjan van de Ven <arjan@linux.intel.com>
  8. *
  9. * This program is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU General Public License
  11. * as published by the Free Software Foundation; version 2
  12. * of the License.
  13. */
  14. #include <linux/debugfs.h>
  15. #include <linux/mm.h>
  16. #include <linux/init.h>
  17. #include <linux/sched.h>
  18. #include <linux/seq_file.h>
  19. #include <asm/kasan.h>
  20. #include <asm/pgtable.h>
/*
 * The dumper groups pagetable entries of the same type into one, and for
 * that it needs to keep some state when walking, and flush this state
 * when a "break" in the continuity is found.
 */
struct pg_state {
	int level;			/* page-table level of the current run; 0 = no run started */
	pgprot_t current_prot;		/* protection bits shared by the run */
	unsigned long start_address;	/* first address of the run */
	unsigned long current_address;	/* address being examined right now */
	const struct addr_marker *marker; /* address-space region we are inside */
	unsigned long lines;		/* lines printed for the current marker */
	bool to_dmesg;			/* print via printk instead of the seq_file */
	bool check_wx;			/* also scan for insecure W+X mappings */
	unsigned long wx_pages;		/* count of W+X pages found so far */
};
/* A named boundary in the dumped address space. */
struct addr_marker {
	unsigned long start_address;	/* first address of the region */
	const char *name;		/* printed as "---[ name ]---" */
	unsigned long max_lines;	/* 0 = unlimited; else truncate output after this many lines */
};
  42. /* indices for address_markers; keep sync'd w/ address_markers below */
  43. enum address_markers_idx {
  44. USER_SPACE_NR = 0,
  45. #ifdef CONFIG_X86_64
  46. KERNEL_SPACE_NR,
  47. LOW_KERNEL_NR,
  48. VMALLOC_START_NR,
  49. VMEMMAP_START_NR,
  50. #ifdef CONFIG_KASAN
  51. KASAN_SHADOW_START_NR,
  52. KASAN_SHADOW_END_NR,
  53. #endif
  54. # ifdef CONFIG_X86_ESPFIX64
  55. ESPFIX_START_NR,
  56. # endif
  57. HIGH_KERNEL_NR,
  58. MODULES_VADDR_NR,
  59. MODULES_END_NR,
  60. #else
  61. KERNEL_SPACE_NR,
  62. VMALLOC_START_NR,
  63. VMALLOC_END_NR,
  64. # ifdef CONFIG_HIGHMEM
  65. PKMAP_BASE_NR,
  66. # endif
  67. FIXADDR_START_NR,
  68. #endif
  69. };
/* Address space markers hints */
/*
 * Must stay in the same order as enum address_markers_idx.  The zero
 * start_address placeholders are not compile-time constants and are
 * filled in at boot by pt_dump_init().
 */
static struct addr_marker address_markers[] = {
	{ 0, "User Space" },
#ifdef CONFIG_X86_64
	{ 0x8000000000000000UL, "Kernel Space" },
	{ 0/* PAGE_OFFSET */,   "Low Kernel Mapping" },
	{ 0/* VMALLOC_START */, "vmalloc() Area" },
	{ 0/* VMEMMAP_START */, "Vmemmap" },
#ifdef CONFIG_KASAN
	{ KASAN_SHADOW_START,	"KASAN shadow" },
	{ KASAN_SHADOW_END,	"KASAN shadow end" },
#endif
# ifdef CONFIG_X86_ESPFIX64
	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },	/* max_lines = 16: area is highly repetitive */
# endif
# ifdef CONFIG_EFI
	{ EFI_VA_END,		"EFI Runtime Services" },
# endif
	{ __START_KERNEL_map,   "High Kernel Mapping" },
	{ MODULES_VADDR,        "Modules" },
	{ MODULES_END,          "End Modules" },
#else
	{ PAGE_OFFSET,          "Kernel Mapping" },
	{ 0/* VMALLOC_START */, "vmalloc() Area" },
	{ 0/*VMALLOC_END*/,     "vmalloc() End" },
# ifdef CONFIG_HIGHMEM
	{ 0/*PKMAP_BASE*/,      "Persistent kmap() Area" },
# endif
	{ 0/*FIXADDR_START*/,   "Fixmap Area" },
#endif
	{ -1, NULL }		/* End of list */
};
/* Multipliers for offsets within the PTEs */
/* Amount of virtual address space covered by one entry at each level. */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT)
/*
 * Route output either to dmesg (when dumping a caller-supplied pgd) or
 * to the seq_file.  m may be NULL on the checkwx path, hence the if (m)
 * guard before seq_printf().
 */
#define pt_dump_seq_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg)						\
		printk(KERN_INFO fmt, ##args);			\
	else							\
		if (m)						\
			seq_printf(m, fmt, ##args);		\
})

/* Same as pt_dump_seq_printf() but continues the current output line. */
#define pt_dump_cont_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg)						\
		printk(KERN_CONT fmt, ##args);			\
	else							\
		if (m)						\
			seq_printf(m, fmt, ##args);		\
})
/*
 * Print a readable form of a pgprot_t to the seq_file
 */
static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
{
	pgprotval_t pr = pgprot_val(prot);
	/* Indexed by the level argument: 0=cr3 .. 4=pte. */
	static const char * const level_name[] =
		{ "cr3", "pgd", "pud", "pmd", "pte" };

	if (!pgprot_val(prot)) {
		/* Not present */
		pt_dump_cont_printf(m, dmsg, " ");
	} else {
		/* One fixed-width column per attribute bit. */
		if (pr & _PAGE_USER)
			pt_dump_cont_printf(m, dmsg, "USR ");
		else
			pt_dump_cont_printf(m, dmsg, " ");
		if (pr & _PAGE_RW)
			pt_dump_cont_printf(m, dmsg, "RW ");
		else
			pt_dump_cont_printf(m, dmsg, "ro ");
		if (pr & _PAGE_PWT)
			pt_dump_cont_printf(m, dmsg, "PWT ");
		else
			pt_dump_cont_printf(m, dmsg, " ");
		if (pr & _PAGE_PCD)
			pt_dump_cont_printf(m, dmsg, "PCD ");
		else
			pt_dump_cont_printf(m, dmsg, " ");

		/* Bit 7 has a different meaning on level 3 vs 4 */
		if (level <= 3 && pr & _PAGE_PSE)
			pt_dump_cont_printf(m, dmsg, "PSE ");
		else
			pt_dump_cont_printf(m, dmsg, " ");

		/* 4K ptes use _PAGE_PAT; large pages use _PAGE_PAT_LARGE. */
		if ((level == 4 && pr & _PAGE_PAT) ||
		    ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
			pt_dump_cont_printf(m, dmsg, "PAT ");
		else
			pt_dump_cont_printf(m, dmsg, " ");
		if (pr & _PAGE_GLOBAL)
			pt_dump_cont_printf(m, dmsg, "GLB ");
		else
			pt_dump_cont_printf(m, dmsg, " ");
		if (pr & _PAGE_NX)
			pt_dump_cont_printf(m, dmsg, "NX ");
		else
			pt_dump_cont_printf(m, dmsg, "x ");
	}
	pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
}
  173. /*
  174. * On 64 bits, sign-extend the 48 bit address to 64 bit
  175. */
  176. static unsigned long normalize_addr(unsigned long u)
  177. {
  178. #ifdef CONFIG_X86_64
  179. return (signed long)(u << 16) >> 16;
  180. #else
  181. return u;
  182. #endif
  183. }
/*
 * This function gets called on a break in a continuous series
 * of PTE entries; the next one is different so we need to
 * print what we collected so far.
 */
static void note_page(struct seq_file *m, struct pg_state *st,
		      pgprot_t new_prot, int level)
{
	pgprotval_t prot, cur;
	static const char units[] = "BKMGTPE";	/* size units, each 1024x the previous */

	/*
	 * If we have a "break" in the series, we need to flush the state that
	 * we have now. "break" is either changing perms, levels or
	 * address space marker.
	 */
	prot = pgprot_val(new_prot);
	cur = pgprot_val(st->current_prot);

	if (!st->level) {
		/* First entry: just seed the state, nothing to flush yet. */
		st->current_prot = new_prot;
		st->level = level;
		st->marker = address_markers;
		st->lines = 0;
		pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
				   st->marker->name);
	} else if (prot != cur || level != st->level ||
		   st->current_address >= st->marker[1].start_address) {
		const char *unit = units;
		unsigned long delta;
		int width = sizeof(unsigned long) * 2;	/* hex digits in an address */
		pgprotval_t pr = pgprot_val(st->current_prot);

		/* Writable and executable at once is an insecure mapping. */
		if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) {
			WARN_ONCE(1,
				  "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
				  (void *)st->start_address,
				  (void *)st->start_address);
			st->wx_pages += (st->current_address -
					 st->start_address) / PAGE_SIZE;
		}

		/*
		 * Now print the actual finished series
		 */
		if (!st->marker->max_lines ||
		    st->lines < st->marker->max_lines) {
			pt_dump_seq_printf(m, st->to_dmesg,
					   "0x%0*lx-0x%0*lx ",
					   width, st->start_address,
					   width, st->current_address);

			/* Scale the range size down to the largest whole unit. */
			delta = st->current_address - st->start_address;
			while (!(delta & 1023) && unit[1]) {
				delta >>= 10;
				unit++;
			}
			pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
					    delta, *unit);
			printk_prot(m, st->current_prot, st->level,
				    st->to_dmesg);
		}
		st->lines++;

		/*
		 * We print markers for special areas of address space,
		 * such as the start of vmalloc space etc.
		 * This helps in the interpretation.
		 */
		if (st->current_address >= st->marker[1].start_address) {
			/* Report how many lines were suppressed by max_lines. */
			if (st->marker->max_lines &&
			    st->lines > st->marker->max_lines) {
				unsigned long nskip =
					st->lines - st->marker->max_lines;
				pt_dump_seq_printf(m, st->to_dmesg,
						   "... %lu entr%s skipped ... \n",
						   nskip,
						   nskip == 1 ? "y" : "ies");
			}
			st->marker++;
			st->lines = 0;
			pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
					   st->marker->name);
		}

		/* Start accumulating the next run. */
		st->start_address = st->current_address;
		st->current_prot = new_prot;
		st->level = level;
	}
}
  268. static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P)
  269. {
  270. int i;
  271. pte_t *start;
  272. pgprotval_t prot;
  273. start = (pte_t *)pmd_page_vaddr(addr);
  274. for (i = 0; i < PTRS_PER_PTE; i++) {
  275. prot = pte_flags(*start);
  276. st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
  277. note_page(m, st, __pgprot(prot), 4);
  278. start++;
  279. }
  280. }
  281. #if PTRS_PER_PMD > 1
  282. static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
  283. {
  284. int i;
  285. pmd_t *start;
  286. pgprotval_t prot;
  287. start = (pmd_t *)pud_page_vaddr(addr);
  288. for (i = 0; i < PTRS_PER_PMD; i++) {
  289. st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
  290. if (!pmd_none(*start)) {
  291. if (pmd_large(*start) || !pmd_present(*start)) {
  292. prot = pmd_flags(*start);
  293. note_page(m, st, __pgprot(prot), 3);
  294. } else {
  295. walk_pte_level(m, st, *start,
  296. P + i * PMD_LEVEL_MULT);
  297. }
  298. } else
  299. note_page(m, st, __pgprot(0), 3);
  300. start++;
  301. }
  302. }
  303. #else
  304. #define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
  305. #define pud_large(a) pmd_large(__pmd(pud_val(a)))
  306. #define pud_none(a) pmd_none(__pmd(pud_val(a)))
  307. #endif
  308. #if PTRS_PER_PUD > 1
  309. /*
  310. * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y
  311. * KASAN fills page tables with the same values. Since there is no
  312. * point in checking page table more than once we just skip repeated
  313. * entries. This saves us dozens of seconds during boot.
  314. */
  315. static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
  316. {
  317. return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
  318. }
  319. static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
  320. {
  321. int i;
  322. pud_t *start;
  323. pgprotval_t prot;
  324. pud_t *prev_pud = NULL;
  325. start = (pud_t *)p4d_page_vaddr(addr);
  326. for (i = 0; i < PTRS_PER_PUD; i++) {
  327. st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
  328. if (!pud_none(*start) &&
  329. !pud_already_checked(prev_pud, start, st->check_wx)) {
  330. if (pud_large(*start) || !pud_present(*start)) {
  331. prot = pud_flags(*start);
  332. note_page(m, st, __pgprot(prot), 2);
  333. } else {
  334. walk_pmd_level(m, st, *start,
  335. P + i * PUD_LEVEL_MULT);
  336. }
  337. } else
  338. note_page(m, st, __pgprot(0), 2);
  339. prev_pud = start;
  340. start++;
  341. }
  342. }
  343. #else
  344. #define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p)
  345. #define p4d_large(a) pud_large(__pud(p4d_val(a)))
  346. #define p4d_none(a) pud_none(__pud(p4d_val(a)))
  347. #endif
  348. #if PTRS_PER_P4D > 1
  349. static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
  350. {
  351. int i;
  352. p4d_t *start;
  353. pgprotval_t prot;
  354. start = (p4d_t *)pgd_page_vaddr(addr);
  355. for (i = 0; i < PTRS_PER_P4D; i++) {
  356. st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
  357. if (!p4d_none(*start)) {
  358. if (p4d_large(*start) || !p4d_present(*start)) {
  359. prot = p4d_flags(*start);
  360. note_page(m, st, __pgprot(prot), 2);
  361. } else {
  362. walk_pud_level(m, st, *start,
  363. P + i * P4D_LEVEL_MULT);
  364. }
  365. } else
  366. note_page(m, st, __pgprot(0), 2);
  367. start++;
  368. }
  369. }
  370. #else
  371. #define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
  372. #define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
  373. #define pgd_none(a) p4d_none(__p4d(pgd_val(a)))
  374. #endif
  375. static inline bool is_hypervisor_range(int idx)
  376. {
  377. #ifdef CONFIG_X86_64
  378. /*
  379. * ffff800000000000 - ffff87ffffffffff is reserved for
  380. * the hypervisor.
  381. */
  382. return (idx >= pgd_index(__PAGE_OFFSET) - 16) &&
  383. (idx < pgd_index(__PAGE_OFFSET));
  384. #else
  385. return false;
  386. #endif
  387. }
/*
 * Walk every PGD entry of either the kernel page tables or a
 * caller-supplied pgd, feeding the entries through note_page() and
 * optionally checking for W+X mappings.
 */
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
				       bool checkwx)
{
#ifdef CONFIG_X86_64
	pgd_t *start = (pgd_t *) &init_level4_pgt;
#else
	pgd_t *start = swapper_pg_dir;
#endif
	pgprotval_t prot;
	int i;
	struct pg_state st = {};

	/* A caller-supplied pgd is dumped to dmesg instead of the seq_file. */
	if (pgd) {
		start = pgd;
		st.to_dmesg = true;
	}

	st.check_wx = checkwx;
	if (checkwx)
		st.wx_pages = 0;

	for (i = 0; i < PTRS_PER_PGD; i++) {
		st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
		if (!pgd_none(*start) && !is_hypervisor_range(i)) {
			if (pgd_large(*start) || !pgd_present(*start)) {
				prot = pgd_flags(*start);
				note_page(m, &st, __pgprot(prot), 1);
			} else {
				walk_p4d_level(m, &st, *start,
					       i * PGD_LEVEL_MULT);
			}
		} else
			note_page(m, &st, __pgprot(0), 1);

		/* The full walk is long; yield the CPU between PGD slots. */
		cond_resched();
		start++;
	}

	/* Flush out the last page */
	st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
	note_page(m, &st, __pgprot(0), 0);
	if (!checkwx)
		return;
	if (st.wx_pages)
		pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
			st.wx_pages);
	else
		pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
}
/* Public entry point: dump the given pgd (or kernel tables if NULL). */
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
	ptdump_walk_pgd_level_core(m, pgd, false);
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
/* Walk the kernel page tables solely to verify no page is both W and X. */
void ptdump_walk_pgd_level_checkwx(void)
{
	ptdump_walk_pgd_level_core(NULL, NULL, true);
}
/*
 * Boot-time initialization: fill in the address_markers[] entries whose
 * start addresses are not compile-time constants (the 0 placeholders).
 */
static int __init pt_dump_init(void)
{
	/*
	 * Various markers are not compile-time constants, so assign them
	 * here.
	 */
#ifdef CONFIG_X86_64
	address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
#endif
#ifdef CONFIG_X86_32
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
# ifdef CONFIG_HIGHMEM
	address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
#endif
	return 0;
}
__initcall(pt_dump_init);