/*
 * kexec.c - kexec system call
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>

#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/system.h>
#include <asm/semaphore.h>

/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
        .name  = "Crash kernel",
        .start = 0,
        .end   = 0,
        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
};

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses.  On processors
 * where you can disable the MMU this is trivial, and easy.  For
 * others it is still a simple, predictable page table to set up.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the
 * new kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM, can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */
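/*
 * Illustrative sketch (not compiled): the descriptor list handed to the
 * assembly stub is built from kimage_entry_t words.  Each word is a
 * physical address with low flag bits (IND_DESTINATION, IND_SOURCE,
 * IND_INDIRECTION, IND_DONE from <linux/kexec.h>) saying how the stub
 * should interpret it, roughly:
 *
 *      dest_phys | IND_DESTINATION    -- start copying to dest_phys
 *      src_phys  | IND_SOURCE         -- copy this page, advance dest
 *      next_phys | IND_INDIRECTION    -- continue reading entries here
 *      IND_DONE                       -- end of list, jump to the image
 *
 * kimage_add_entry() below appends words in exactly this format.
 */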
/*
 * KIMAGE_NO_DEST is an impossible destination address, used for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)

static int kimage_is_destination_range(
        struct kimage *image, unsigned long start, unsigned long end);
static struct page *kimage_alloc_page(struct kimage *image,
        unsigned int gfp_mask, unsigned long dest);

static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
        unsigned long nr_segments, struct kexec_segment __user *segments)
{
        size_t segment_bytes;
        struct kimage *image;
        unsigned long i;
        int result;

        /* Allocate a controlling structure */
        result = -ENOMEM;
        image = kmalloc(sizeof(*image), GFP_KERNEL);
        if (!image)
                goto out;

        memset(image, 0, sizeof(*image));
        image->head = 0;
        image->entry = &image->head;
        image->last_entry = &image->head;
        image->control_page = ~0; /* By default this does not apply */
        image->start = entry;
        image->type = KEXEC_TYPE_DEFAULT;

        /* Initialize the list of control pages */
        INIT_LIST_HEAD(&image->control_pages);

        /* Initialize the list of destination pages */
        INIT_LIST_HEAD(&image->dest_pages);

        /* Initialize the list of unuseable pages */
        INIT_LIST_HEAD(&image->unuseable_pages);

        /* Read in the segments */
        image->nr_segments = nr_segments;
        segment_bytes = nr_segments * sizeof(*segments);
        result = copy_from_user(image->segment, segments, segment_bytes);
        if (result)
                goto out;

        /*
         * Verify we have good destination addresses.  The caller is
         * responsible for making certain we don't attempt to load
         * the new image into invalid or reserved areas of RAM.  This
         * just verifies it is an address we can use.
         *
         * Since the kernel does everything in page size chunks ensure
         * the destination addresses are page aligned.  Too many
         * special cases crop up when we don't do this.  The most
         * insidious is getting overlapping destination addresses
         * simply because addresses are changed to page size
         * granularity.
         */
        result = -EADDRNOTAVAIL;
        for (i = 0; i < nr_segments; i++) {
                unsigned long mstart, mend;
                mstart = image->segment[i].mem;
                mend = mstart + image->segment[i].memsz;
                if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
                        goto out;
                if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
                        goto out;
        }

        /* Verify our destination addresses do not overlap.
         * If we allowed overlapping destination addresses
         * through, very weird things can happen with no
         * easy explanation as one segment stomps on another.
         */
        result = -EINVAL;
        for (i = 0; i < nr_segments; i++) {
                unsigned long mstart, mend;
                unsigned long j;
                mstart = image->segment[i].mem;
                mend = mstart + image->segment[i].memsz;
                for (j = 0; j < i; j++) {
                        unsigned long pstart, pend;
                        pstart = image->segment[j].mem;
                        pend = pstart + image->segment[j].memsz;
                        /* Do the segments overlap? */
                        if ((mend > pstart) && (mstart < pend))
                                goto out;
                }
        }

        /* Ensure our buffer sizes are strictly less than
         * our memory sizes.  This should always be the case,
         * and it is easier to check up front than to be surprised
         * later on.
         */
        result = -EINVAL;
        for (i = 0; i < nr_segments; i++) {
                if (image->segment[i].bufsz > image->segment[i].memsz)
                        goto out;
        }

        result = 0;
out:
        if (result == 0)
                *rimage = image;
        else
                kfree(image);

        return result;
}

static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
        unsigned long nr_segments, struct kexec_segment __user *segments)
{
        int result;
        struct kimage *image;

        /* Allocate and initialize a controlling structure */
        image = NULL;
        result = do_kimage_alloc(&image, entry, nr_segments, segments);
        if (result)
                goto out;

        /*
         * Find a location for the control code buffer, and add it
         * to the vector of segments so that its pages will also be
         * counted as destination pages.
         */
        result = -ENOMEM;
        image->control_code_page = kimage_alloc_control_pages(image,
                get_order(KEXEC_CONTROL_CODE_SIZE));
        if (!image->control_code_page) {
                printk(KERN_ERR "Could not allocate control_code_buffer\n");
                goto out;
        }

        result = 0;
out:
        if (result == 0)
                *rimage = image;
        else
                kfree(image);

        return result;
}

static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
        unsigned long nr_segments, struct kexec_segment __user *segments)
{
        int result;
        struct kimage *image;
        unsigned long i;

        image = NULL;
        /* Verify we have a valid entry point */
        if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
                result = -EADDRNOTAVAIL;
                goto out;
        }

        /* Allocate and initialize a controlling structure */
        result = do_kimage_alloc(&image, entry, nr_segments, segments);
        if (result)
                goto out;

        /* Enable the special crash kernel control page
         * allocation policy.
         */
        image->control_page = crashk_res.start;
        image->type = KEXEC_TYPE_CRASH;

        /*
         * Verify we have good destination addresses.  Normally
         * the caller is responsible for making certain we don't
         * attempt to load the new image into invalid or reserved
         * areas of RAM.  But crash kernels are preloaded into a
         * reserved area of RAM.  We must ensure the addresses
         * are in the reserved area, otherwise preloading the
         * kernel could corrupt things.
         */
        result = -EADDRNOTAVAIL;
        for (i = 0; i < nr_segments; i++) {
                unsigned long mstart, mend;
                mstart = image->segment[i].mem;
                mend = mstart + image->segment[i].memsz - 1;
                /* Ensure we are within the crash kernel limits */
                if ((mstart < crashk_res.start) || (mend > crashk_res.end))
                        goto out;
        }

        /*
         * Find a location for the control code buffer, and add
         * it to the vector of segments so that its pages will also be
         * counted as destination pages.
         */
        result = -ENOMEM;
        image->control_code_page = kimage_alloc_control_pages(image,
                get_order(KEXEC_CONTROL_CODE_SIZE));
        if (!image->control_code_page) {
                printk(KERN_ERR "Could not allocate control_code_buffer\n");
                goto out;
        }

        result = 0;
out:
        if (result == 0)
                *rimage = image;
        else
                kfree(image);

        return result;
}

static int kimage_is_destination_range(
        struct kimage *image, unsigned long start, unsigned long end)
{
        unsigned long i;

        for (i = 0; i < image->nr_segments; i++) {
                unsigned long mstart, mend;
                mstart = image->segment[i].mem;
                mend = mstart + image->segment[i].memsz;
                if ((end > mstart) && (start < mend))
                        return 1;
        }
        return 0;
}

static struct page *kimage_alloc_pages(unsigned int gfp_mask,
        unsigned int order)
{
        struct page *pages;

        pages = alloc_pages(gfp_mask, order);
        if (pages) {
                unsigned int count, i;
                pages->mapping = NULL;
                pages->private = order;
                count = 1 << order;
                for (i = 0; i < count; i++)
                        SetPageReserved(pages + i);
        }
        return pages;
}

static void kimage_free_pages(struct page *page)
{
        unsigned int order, count, i;

        order = page->private;
        count = 1 << order;
        for (i = 0; i < count; i++)
                ClearPageReserved(page + i);
        __free_pages(page, order);
}

static void kimage_free_page_list(struct list_head *list)
{
        struct list_head *pos, *next;

        list_for_each_safe(pos, next, list) {
                struct page *page;
                page = list_entry(pos, struct page, lru);
                list_del(&page->lru);
                kimage_free_pages(page);
        }
}

static struct page *kimage_alloc_normal_control_pages(
        struct kimage *image, unsigned int order)
{
        /* Control pages are special, they are the intermediaries
         * that are needed while we copy the rest of the pages
         * to their final resting place.  As such they must
         * not conflict with either the destination addresses
         * or memory the kernel is already using.
         *
         * The only case where we really need more than one of
         * these is for architectures where we cannot disable
         * the MMU and must instead generate an identity mapped
         * page table for all of the memory.
         *
         * At worst this runs in O(N) of the image size.
         */
        struct list_head extra_pages;
        struct page *pages;
        unsigned int count;

        count = 1 << order;
        INIT_LIST_HEAD(&extra_pages);

        /* Loop while I can allocate a page and the page allocated
         * is a destination page.
         */
        do {
                unsigned long pfn, epfn, addr, eaddr;
                pages = kimage_alloc_pages(GFP_KERNEL, order);
                if (!pages)
                        break;
                pfn = page_to_pfn(pages);
                epfn = pfn + count;
                addr = pfn << PAGE_SHIFT;
                eaddr = epfn << PAGE_SHIFT;
                if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
                    kimage_is_destination_range(image, addr, eaddr)) {
                        list_add(&pages->lru, &extra_pages);
                        pages = NULL;
                }
        } while (!pages);

        if (pages) {
                /* Remember the allocated page... */
                list_add(&pages->lru, &image->control_pages);

                /* Because the page is already in its destination
                 * location we will never allocate another page at
                 * that address.  Therefore kimage_alloc_pages
                 * will not return it (again) and we don't need
                 * to give it an entry in image->segment[].
                 */
        }

        /* Deal with the destination pages I have inadvertently allocated.
         *
         * Ideally I would convert multi-page allocations into single
         * page allocations, and add everything to image->dest_pages.
         *
         * For now it is simpler to just free the pages.
         */
        kimage_free_page_list(&extra_pages);

        return pages;
}

static struct page *kimage_alloc_crash_control_pages(
        struct kimage *image, unsigned int order)
{
        /* Control pages are special, they are the intermediaries
         * that are needed while we copy the rest of the pages
         * to their final resting place.  As such they must
         * not conflict with either the destination addresses
         * or memory the kernel is already using.
         *
         * Control pages are also the only pages we must allocate
         * when loading a crash kernel.  All of the other pages
         * are specified by the segments and we just memcpy
         * into them directly.
         *
         * The only case where we really need more than one of
         * these is for architectures where we cannot disable
         * the MMU and must instead generate an identity mapped
         * page table for all of the memory.
         *
         * Given the low demand this implements a very simple
         * allocator that finds the first hole of the appropriate
         * size in the reserved memory region, and allocates all
         * of the memory up to and including the hole.
         */
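        /*
         * Rough illustration (hypothetical numbers): with order = 1 the
         * hole size is two pages (0x2000 bytes with 4K pages).  If
         * image->control_page is 0x01001000, rounding up gives
         * hole_start = 0x01002000 and hole_end = 0x01003fff.  If that
         * range overlaps a segment ending at 0x01004fff, the hole is
         * advanced to start at 0x01006000 and the scan repeats until a
         * free hole is found or crashk_res.end is passed.
         */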
        unsigned long hole_start, hole_end, size;
        struct page *pages;

        pages = NULL;
        size = (1 << order) << PAGE_SHIFT;
        hole_start = (image->control_page + (size - 1)) & ~(size - 1);
        hole_end = hole_start + size - 1;
        while (hole_end <= crashk_res.end) {
                unsigned long i;

                if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
                        break;
                if (hole_end > crashk_res.end)
                        break;
                /* See if I overlap any of the segments */
                for (i = 0; i < image->nr_segments; i++) {
                        unsigned long mstart, mend;
                        mstart = image->segment[i].mem;
                        mend = mstart + image->segment[i].memsz - 1;
                        if ((hole_end >= mstart) && (hole_start <= mend)) {
                                /* Advance the hole to the end of the segment */
                                hole_start = (mend + (size - 1)) & ~(size - 1);
                                hole_end = hole_start + size - 1;
                                break;
                        }
                }
                /* If I don't overlap any segments I have found my hole! */
                if (i == image->nr_segments) {
                        pages = pfn_to_page(hole_start >> PAGE_SHIFT);
                        break;
                }
        }
        if (pages)
                image->control_page = hole_end;

        return pages;
}

struct page *kimage_alloc_control_pages(
        struct kimage *image, unsigned int order)
{
        struct page *pages = NULL;

        switch (image->type) {
        case KEXEC_TYPE_DEFAULT:
                pages = kimage_alloc_normal_control_pages(image, order);
                break;
        case KEXEC_TYPE_CRASH:
                pages = kimage_alloc_crash_control_pages(image, order);
                break;
        }
        return pages;
}
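/*
 * kimage_add_entry() appends one word to the descriptor list sketched
 * near the top of this file.  When the current indirection page reaches
 * its last slot, it allocates a fresh page, links it in with an
 * IND_INDIRECTION entry, and continues writing there, always leaving a
 * zero terminator after the last entry written.
 */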
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
        if (*image->entry != 0)
                image->entry++;

        if (image->entry == image->last_entry) {
                kimage_entry_t *ind_page;
                struct page *page;

                page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
                if (!page)
                        return -ENOMEM;

                ind_page = page_address(page);
                *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
                image->entry = ind_page;
                image->last_entry =
                        ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
        }
        *image->entry = entry;
        image->entry++;
        *image->entry = 0;

        return 0;
}

static int kimage_set_destination(
        struct kimage *image, unsigned long destination)
{
        int result;

        destination &= PAGE_MASK;
        result = kimage_add_entry(image, destination | IND_DESTINATION);
        if (result == 0)
                image->destination = destination;

        return result;
}

static int kimage_add_page(struct kimage *image, unsigned long page)
{
        int result;

        page &= PAGE_MASK;
        result = kimage_add_entry(image, page | IND_SOURCE);
        if (result == 0)
                image->destination += PAGE_SIZE;

        return result;
}

static void kimage_free_extra_pages(struct kimage *image)
{
        /* Walk through and free any extra destination pages I may have */
        kimage_free_page_list(&image->dest_pages);

        /* Walk through and free any unuseable pages I have cached */
        kimage_free_page_list(&image->unuseable_pages);
}

static int kimage_terminate(struct kimage *image)
{
        if (*image->entry != 0)
                image->entry++;

        *image->entry = IND_DONE;

        return 0;
}
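/*
 * for_each_kimage_entry() walks the whole descriptor list, transparently
 * following IND_INDIRECTION links into the next indirection page and
 * stopping at the IND_DONE terminator (or at a zero entry on a
 * partially built list).
 */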
#define for_each_kimage_entry(image, ptr, entry) \
        for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
                ptr = (entry & IND_INDIRECTION) ? \
                        phys_to_virt(entry & PAGE_MASK) : ptr + 1)

static void kimage_free_entry(kimage_entry_t entry)
{
        struct page *page;

        page = pfn_to_page(entry >> PAGE_SHIFT);
        kimage_free_pages(page);
}

static void kimage_free(struct kimage *image)
{
        kimage_entry_t *ptr, entry;
        kimage_entry_t ind = 0;

        if (!image)
                return;

        kimage_free_extra_pages(image);
        for_each_kimage_entry(image, ptr, entry) {
                if (entry & IND_INDIRECTION) {
                        /* Free the previous indirection page */
                        if (ind & IND_INDIRECTION)
                                kimage_free_entry(ind);
                        /* Save this indirection page until we are
                         * done with it.
                         */
                        ind = entry;
                } else if (entry & IND_SOURCE)
                        kimage_free_entry(entry);
        }
        /* Free the final indirection page */
        if (ind & IND_INDIRECTION)
                kimage_free_entry(ind);

        /* Handle any machine specific cleanup */
        machine_kexec_cleanup(image);

        /* Free the kexec control pages... */
        kimage_free_page_list(&image->control_pages);
        kfree(image);
}
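/*
 * kimage_dst_used() scans the descriptor list and returns a pointer to
 * the IND_SOURCE entry whose destination is @page, or NULL if no source
 * page has been assigned to that destination yet.
 */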
static kimage_entry_t *kimage_dst_used(struct kimage *image,
        unsigned long page)
{
        kimage_entry_t *ptr, entry;
        unsigned long destination = 0;

        for_each_kimage_entry(image, ptr, entry) {
                if (entry & IND_DESTINATION)
                        destination = entry & PAGE_MASK;
                else if (entry & IND_SOURCE) {
                        if (page == destination)
                                return ptr;
                        destination += PAGE_SIZE;
                }
        }
        return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
        unsigned int gfp_mask, unsigned long destination)
{
        /*
         * Here we implement safeguards to ensure that a source page
         * is not copied to its destination page before the data on
         * the destination page is no longer useful.
         *
         * To do this we maintain the invariant that a source page is
         * either its own destination page, or it is not a
         * destination page at all.
         *
         * That is slightly stronger than required, but the proof
         * that no problems will occur is trivial, and the
         * implementation is simple to verify.
         *
         * When allocating all pages normally this algorithm will run
         * in O(N) time, but in the worst case it will run in O(N^2)
         * time.  If the runtime is a problem the data structures can
         * be fixed.
         */
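        /*
         * Concretely (addresses hypothetical): suppose we want the
         * source page for destination 0x9000, the allocator hands back
         * the page at 0x5000, and an earlier source page was already
         * assigned to destination 0x5000.  The loop below copies that
         * earlier page into the newly allocated page, repoints its
         * IND_SOURCE entry at the new page, and returns the displaced
         * page, which by the invariant above is not a destination page,
         * for use as the source of 0x9000.
         */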
        struct page *page;
        unsigned long addr;

        /*
         * Walk through the list of destination pages, and see if I
         * have a match.
         */
        list_for_each_entry(page, &image->dest_pages, lru) {
                addr = page_to_pfn(page) << PAGE_SHIFT;
                if (addr == destination) {
                        list_del(&page->lru);
                        return page;
                }
        }
        page = NULL;
        while (1) {
                kimage_entry_t *old;

                /* Allocate a page, if we run out of memory give up */
                page = kimage_alloc_pages(gfp_mask, 0);
                if (!page)
                        return NULL;
                /* If the page cannot be used file it away */
                if (page_to_pfn(page) >
                                (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
                        list_add(&page->lru, &image->unuseable_pages);
                        continue;
                }
                addr = page_to_pfn(page) << PAGE_SHIFT;

                /* If it is the destination page we want, use it */
                if (addr == destination)
                        break;

                /* If the page is not a destination page use it */
                if (!kimage_is_destination_range(image, addr,
                                addr + PAGE_SIZE))
                        break;

                /*
                 * I know that the page is someone's destination page.
                 * See if there is already a source page for this
                 * destination page.  And if so swap the source pages.
                 */
                old = kimage_dst_used(image, addr);
                if (old) {
                        /* If so move it */
                        unsigned long old_addr;
                        struct page *old_page;

                        old_addr = *old & PAGE_MASK;
                        old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
                        copy_highpage(page, old_page);
                        *old = addr | (*old & ~PAGE_MASK);

                        /* The old page I have found cannot be a
                         * destination page, so return it.
                         */
                        addr = old_addr;
                        page = old_page;
                        break;
                } else {
                        /* Place the page on the destination list; I
                         * will use it later.
                         */
                        list_add(&page->lru, &image->dest_pages);
                }
        }
        return page;
}

static int kimage_load_normal_segment(struct kimage *image,
        struct kexec_segment *segment)
{
        unsigned long maddr;
        unsigned long ubytes, mbytes;
        int result;
        unsigned char *buf;

        result = 0;
        buf = segment->buf;
        ubytes = segment->bufsz;
        mbytes = segment->memsz;
        maddr = segment->mem;

        result = kimage_set_destination(image, maddr);
        if (result < 0)
                goto out;

        while (mbytes) {
                struct page *page;
                char *ptr;
                size_t uchunk, mchunk;

                page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
                if (!page) {
                        result = -ENOMEM;
                        goto out;
                }
                result = kimage_add_page(image,
                        page_to_pfn(page) << PAGE_SHIFT);
                if (result < 0)
                        goto out;

                ptr = kmap(page);
                /* Start with a clear page */
                memset(ptr, 0, PAGE_SIZE);
                ptr += maddr & ~PAGE_MASK;
                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
                if (mchunk > mbytes)
                        mchunk = mbytes;

                uchunk = mchunk;
                if (uchunk > ubytes)
                        uchunk = ubytes;

                result = copy_from_user(ptr, buf, uchunk);
                kunmap(page);
                if (result) {
                        result = (result < 0) ? result : -EIO;
                        goto out;
                }
                ubytes -= uchunk;
                maddr += mchunk;
                buf += mchunk;
                mbytes -= mchunk;
        }
out:
        return result;
}

static int kimage_load_crash_segment(struct kimage *image,
        struct kexec_segment *segment)
{
        /* For crash dump kernels we simply copy the data from
         * user space to its destination.
         * We do things a page at a time for the sake of kmap.
         */
        unsigned long maddr;
        unsigned long ubytes, mbytes;
        int result;
        unsigned char *buf;

        result = 0;
        buf = segment->buf;
        ubytes = segment->bufsz;
        mbytes = segment->memsz;
        maddr = segment->mem;
        while (mbytes) {
                struct page *page;
                char *ptr;
                size_t uchunk, mchunk;

                page = pfn_to_page(maddr >> PAGE_SHIFT);
                if (!page) {
                        result = -ENOMEM;
                        goto out;
                }
                ptr = kmap(page);
                ptr += maddr & ~PAGE_MASK;
                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
                if (mchunk > mbytes)
                        mchunk = mbytes;

                uchunk = mchunk;
                if (uchunk > ubytes) {
                        uchunk = ubytes;
                        /* Zero the trailing part of the page */
                        memset(ptr + uchunk, 0, mchunk - uchunk);
                }
                result = copy_from_user(ptr, buf, uchunk);
                kunmap(page);
                if (result) {
                        result = (result < 0) ? result : -EIO;
                        goto out;
                }
                ubytes -= uchunk;
                maddr += mchunk;
                buf += mchunk;
                mbytes -= mchunk;
        }
out:
        return result;
}

static int kimage_load_segment(struct kimage *image,
        struct kexec_segment *segment)
{
        int result = -ENOMEM;

        switch (image->type) {
        case KEXEC_TYPE_DEFAULT:
                result = kimage_load_normal_segment(image, segment);
                break;
        case KEXEC_TYPE_CRASH:
                result = kimage_load_crash_segment(image, segment);
                break;
        }
        return result;
}

/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down, preventing ongoing DMAs and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that includes the syscall number,
 *   copies the image to its final destination, and jumps into
 *   the image at entry.
 *
 * kexec does not sync or unmount filesystems, so if you need
 * that to happen you need to do it yourself.
 */
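/*
 * For reference, a rough user-space sketch of driving this interface
 * (not part of this file; kexec-tools is the real consumer, and the
 * addresses, payload, and header availability here are assumptions):
 *
 *      #include <linux/kexec.h>
 *      #include <sys/syscall.h>
 *      #include <unistd.h>
 *
 *      static char payload[4096];      // placeholder image contents
 *
 *      int load_example(void)
 *      {
 *              struct kexec_segment seg = {
 *                      .buf   = payload,
 *                      .bufsz = sizeof(payload),
 *                      .mem   = 0x100000,   // page-aligned physical dest
 *                      .memsz = 4096,
 *              };
 *              // entry must point into a loaded segment
 *              return syscall(__NR_kexec_load, 0x100000, 1, &seg,
 *                             KEXEC_ARCH_DEFAULT);
 *      }
 *
 * A later reboot(LINUX_REBOOT_CMD_KEXEC) (or a panic, for a crash
 * image loaded with KEXEC_ON_CRASH) actually switches kernels.
 */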
struct kimage *kexec_image = NULL;
static struct kimage *kexec_crash_image = NULL;
/*
 * A home grown binary mutex.
 * Nothing can wait so this mutex is safe to use
 * in interrupt context :)
 */
static int kexec_lock = 0;

asmlinkage long sys_kexec_load(unsigned long entry,
        unsigned long nr_segments, struct kexec_segment __user *segments,
        unsigned long flags)
{
        struct kimage **dest_image, *image;
        int locked;
        int result;

        /* We only trust the superuser with rebooting the system. */
        if (!capable(CAP_SYS_BOOT))
                return -EPERM;

        /*
         * Verify we have a legal set of flags.
         * This leaves us room for future extensions.
         */
        if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
                return -EINVAL;

        /* Verify we are on the appropriate architecture */
        if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
            ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
                return -EINVAL;

        /* Put an artificial cap on the number
         * of segments passed to kexec_load.
         */
        if (nr_segments > KEXEC_SEGMENT_MAX)
                return -EINVAL;

        image = NULL;
        result = 0;

        /* Because we write directly to the reserved memory
         * region when loading crash kernels we need a mutex here to
         * prevent multiple crash kernels from attempting to load
         * simultaneously, and to prevent a crash kernel from loading
         * over the top of an in-use crash kernel.
         *
         * KISS: always take the mutex.
         */
        locked = xchg(&kexec_lock, 1);
        if (locked)
                return -EBUSY;

        dest_image = &kexec_image;
        if (flags & KEXEC_ON_CRASH)
                dest_image = &kexec_crash_image;

        if (nr_segments > 0) {
                unsigned long i;

                /* Loading another kernel to reboot into */
                if ((flags & KEXEC_ON_CRASH) == 0)
                        result = kimage_normal_alloc(&image, entry,
                                nr_segments, segments);
                /* Loading another kernel to switch to if this one crashes */
                else if (flags & KEXEC_ON_CRASH) {
                        /* Free any current crash dump kernel before
                         * we corrupt it.
                         */
                        kimage_free(xchg(&kexec_crash_image, NULL));
                        result = kimage_crash_alloc(&image, entry,
                                nr_segments, segments);
                }
                if (result)
                        goto out;

                result = machine_kexec_prepare(image);
                if (result)
                        goto out;

                for (i = 0; i < nr_segments; i++) {
                        result = kimage_load_segment(image,
                                &image->segment[i]);
                        if (result)
                                goto out;
                }
                result = kimage_terminate(image);
                if (result)
                        goto out;
        }
        /* Install the new kernel, and uninstall the old */
        image = xchg(dest_image, image);

out:
        xchg(&kexec_lock, 0); /* Release the mutex */
        kimage_free(image);

        return result;
}

#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_kexec_load(unsigned long entry,
        unsigned long nr_segments, struct compat_kexec_segment __user *segments,
        unsigned long flags)
{
        struct compat_kexec_segment in;
        struct kexec_segment out, __user *ksegments;
        unsigned long i, result;

        /* Don't allow clients that don't understand the native
         * architecture to do anything.
         */
        if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
                return -EINVAL;

        if (nr_segments > KEXEC_SEGMENT_MAX)
                return -EINVAL;

        ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
        for (i = 0; i < nr_segments; i++) {
                result = copy_from_user(&in, &segments[i], sizeof(in));
                if (result)
                        return -EFAULT;

                out.buf = compat_ptr(in.buf);
                out.bufsz = in.bufsz;
                out.mem = in.mem;
                out.memsz = in.memsz;

                result = copy_to_user(&ksegments[i], &out, sizeof(out));
                if (result)
                        return -EFAULT;
        }
        return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif

void crash_kexec(void)
{
        struct kimage *image;
        int locked;

        /* Take the kexec_lock here to prevent sys_kexec_load
         * running on one cpu from replacing the crash kernel
         * we are using after a panic on a different cpu.
         *
         * If the crash kernel was not located in a fixed area
         * of memory the xchg(&kexec_crash_image) would be
         * sufficient.  But since I reuse the memory...
         */
        locked = xchg(&kexec_lock, 1);
        if (!locked) {
                image = xchg(&kexec_crash_image, NULL);
                if (image) {
                        machine_crash_shutdown();
                        machine_kexec(image);
                }
                xchg(&kexec_lock, 0);
        }
}