/*
 * kexec.c - kexec system call
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>

#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/system.h>
#include <asm/semaphore.h>

/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
        .name  = "Crash kernel",
        .start = 0,
        .end   = 0,
        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
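
/*
 * kexec_should_crash - decide whether a dying task should trigger the
 * crash kernel.  Returns true for oopses in interrupt context, for the
 * idle task (pid 0) and init (pid 1), and whenever panic_on_oops is set.
 */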
int kexec_should_crash(struct task_struct *p)
{
        if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
                return 1;
        return 0;
}
/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses.  On processors
 * where you can disable the MMU this is trivial, and easy.  For
 * others it is still a simple predictable page table to setup.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the
 * new kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */

/*
 * KIMAGE_NO_DEST is an impossible destination address, used for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)

static int kimage_is_destination_range(
        struct kimage *image, unsigned long start, unsigned long end);
static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
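
/*
 * do_kimage_alloc - allocate a struct kimage, copy in the segment list
 * from user space, and sanity check the requested destination ranges:
 * page alignment, the destination memory limit, no overlapping
 * segments, and bufsz <= memsz for every segment.
 */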
static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
        unsigned long nr_segments, struct kexec_segment __user *segments)
{
        size_t segment_bytes;
        struct kimage *image;
        unsigned long i;
        int result;

        /* Allocate a controlling structure */
        result = -ENOMEM;
        image = kmalloc(sizeof(*image), GFP_KERNEL);
        if (!image) {
                goto out;
        }
        memset(image, 0, sizeof(*image));
        image->head = 0;
        image->entry = &image->head;
        image->last_entry = &image->head;
        image->control_page = ~0; /* By default this does not apply */
        image->start = entry;
        image->type = KEXEC_TYPE_DEFAULT;

        /* Initialize the list of control pages */
        INIT_LIST_HEAD(&image->control_pages);

        /* Initialize the list of destination pages */
        INIT_LIST_HEAD(&image->dest_pages);

        /* Initialize the list of unuseable pages */
        INIT_LIST_HEAD(&image->unuseable_pages);

        /* Read in the segments */
        image->nr_segments = nr_segments;
        segment_bytes = nr_segments * sizeof(*segments);
        result = copy_from_user(image->segment, segments, segment_bytes);
        if (result) {
                result = -EFAULT;
                goto out;
        }

        /*
         * Verify we have good destination addresses.  The caller is
         * responsible for making certain we don't attempt to load
         * the new image into invalid or reserved areas of RAM.  This
         * just verifies it is an address we can use.
         *
         * Since the kernel does everything in page size chunks ensure
         * the destination addresses are page aligned.  Too many
         * special cases crop up when we don't do this.  The most
         * insidious is getting overlapping destination addresses
         * simply because addresses are changed to page size
         * granularity.
         */
        result = -EADDRNOTAVAIL;
        for (i = 0; i < nr_segments; i++) {
                unsigned long mstart, mend;
                mstart = image->segment[i].mem;
                mend = mstart + image->segment[i].memsz;
                if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
                        goto out;
                if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
                        goto out;
        }

        /* Verify our destination addresses do not overlap.
         * If we allowed overlapping destination addresses
         * through, very weird things can happen with no
         * easy explanation as one segment stops on another.
         */
        result = -EINVAL;
        for (i = 0; i < nr_segments; i++) {
                unsigned long mstart, mend;
                unsigned long j;
                mstart = image->segment[i].mem;
                mend = mstart + image->segment[i].memsz;
                for (j = 0; j < i; j++) {
                        unsigned long pstart, pend;
                        pstart = image->segment[j].mem;
                        pend = pstart + image->segment[j].memsz;
                        /* Do the segments overlap ? */
                        if ((mend > pstart) && (mstart < pend))
                                goto out;
                }
        }

        /* Ensure our buffer sizes do not exceed our memory
         * sizes.  This should always be the case, and it is
         * easier to check up front than to be surprised
         * later on.
         */
        result = -EINVAL;
        for (i = 0; i < nr_segments; i++) {
                if (image->segment[i].bufsz > image->segment[i].memsz)
                        goto out;
        }

        result = 0;
out:
        if (result == 0) {
                *rimage = image;
        } else {
                kfree(image);
        }
        return result;
}
static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
        unsigned long nr_segments, struct kexec_segment __user *segments)
{
        int result;
        struct kimage *image;

        /* Allocate and initialize a controlling structure */
        image = NULL;
        result = do_kimage_alloc(&image, entry, nr_segments, segments);
        if (result) {
                goto out;
        }
        *rimage = image;

        /*
         * Find a location for the control code buffer, and add it to
         * the vector of segments so that its pages will also be
         * counted as destination pages.
         */
        result = -ENOMEM;
        image->control_code_page = kimage_alloc_control_pages(image,
                get_order(KEXEC_CONTROL_CODE_SIZE));
        if (!image->control_code_page) {
                printk(KERN_ERR "Could not allocate control_code_buffer\n");
                goto out;
        }
        result = 0;
out:
        if (result == 0) {
                *rimage = image;
        } else {
                kfree(image);
        }
        return result;
}

static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
        unsigned long nr_segments, struct kexec_segment *segments)
{
        int result;
        struct kimage *image;
        unsigned long i;

        image = NULL;
        /* Verify we have a valid entry point */
        if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
                result = -EADDRNOTAVAIL;
                goto out;
        }

        /* Allocate and initialize a controlling structure */
        result = do_kimage_alloc(&image, entry, nr_segments, segments);
        if (result) {
                goto out;
        }

        /* Enable the special crash kernel control page
         * allocation policy.
         */
        image->control_page = crashk_res.start;
        image->type = KEXEC_TYPE_CRASH;

        /*
         * Verify we have good destination addresses.  Normally
         * the caller is responsible for making certain we don't
         * attempt to load the new image into invalid or reserved
         * areas of RAM.  But crash kernels are preloaded into a
         * reserved area of RAM.  We must ensure the addresses
         * are in the reserved area, otherwise preloading the
         * kernel could corrupt things.
         */
        result = -EADDRNOTAVAIL;
        for (i = 0; i < nr_segments; i++) {
                unsigned long mstart, mend;
                mstart = image->segment[i].mem;
                mend = mstart + image->segment[i].memsz - 1;
                /* Ensure we are within the crash kernel limits */
                if ((mstart < crashk_res.start) || (mend > crashk_res.end))
                        goto out;
        }

        /*
         * Find a location for the control code buffer, and add it to
         * the vector of segments so that its pages will also be
         * counted as destination pages.
         */
        result = -ENOMEM;
        image->control_code_page = kimage_alloc_control_pages(image,
                get_order(KEXEC_CONTROL_CODE_SIZE));
        if (!image->control_code_page) {
                printk(KERN_ERR "Could not allocate control_code_buffer\n");
                goto out;
        }
        result = 0;
out:
        if (result == 0) {
                *rimage = image;
        } else {
                kfree(image);
        }
        return result;
}
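
/*
 * kimage_is_destination_range - return 1 if [start, end) intersects any
 * of the destination ranges described by image->segment[].
 */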
static int kimage_is_destination_range(
        struct kimage *image, unsigned long start, unsigned long end)
{
        unsigned long i;

        for (i = 0; i < image->nr_segments; i++) {
                unsigned long mstart, mend;
                mstart = image->segment[i].mem;
                mend = mstart + image->segment[i].memsz;
                if ((end > mstart) && (start < mend)) {
                        return 1;
                }
        }
        return 0;
}
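
/*
 * kimage_alloc_pages - allocate 2^order pages, mark them reserved and
 * stash the order in page->private so kimage_free_pages() can undo both.
 */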
static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
{
        struct page *pages;

        pages = alloc_pages(gfp_mask, order);
        if (pages) {
                unsigned int count, i;
                pages->mapping = NULL;
                pages->private = order;
                count = 1 << order;
                for (i = 0; i < count; i++) {
                        SetPageReserved(pages + i);
                }
        }
        return pages;
}

static void kimage_free_pages(struct page *page)
{
        unsigned int order, count, i;

        order = page->private;
        count = 1 << order;
        for (i = 0; i < count; i++) {
                ClearPageReserved(page + i);
        }
        __free_pages(page, order);
}
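
/* Free every page queued on the given list (dest_pages, unuseable_pages,
 * control_pages, ...), unlinking each from its lru list head first.
 */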
static void kimage_free_page_list(struct list_head *list)
{
        struct list_head *pos, *next;

        list_for_each_safe(pos, next, list) {
                struct page *page;

                page = list_entry(pos, struct page, lru);
                list_del(&page->lru);
                kimage_free_pages(page);
        }
}

static struct page *kimage_alloc_normal_control_pages(
        struct kimage *image, unsigned int order)
{
        /* Control pages are special, they are the intermediaries
         * that are needed while we copy the rest of the pages
         * to their final resting place.  As such they must
         * not conflict with either the destination addresses
         * or memory the kernel is already using.
         *
         * The only case where we really need more than one of
         * these is for architectures where we cannot disable
         * the MMU and must instead generate an identity mapped
         * page table for all of the memory.
         *
         * At worst this runs in O(N) of the image size.
         */
        struct list_head extra_pages;
        struct page *pages;
        unsigned int count;

        count = 1 << order;
        INIT_LIST_HEAD(&extra_pages);

        /* Loop while I can allocate a page and the page allocated
         * is a destination page.
         */
        do {
                unsigned long pfn, epfn, addr, eaddr;

                pages = kimage_alloc_pages(GFP_KERNEL, order);
                if (!pages)
                        break;
                pfn = page_to_pfn(pages);
                epfn = pfn + count;
                addr = pfn << PAGE_SHIFT;
                eaddr = epfn << PAGE_SHIFT;
                if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
                        kimage_is_destination_range(image, addr, eaddr)) {
                        list_add(&pages->lru, &extra_pages);
                        pages = NULL;
                }
        } while (!pages);

        if (pages) {
                /* Remember the allocated page... */
                list_add(&pages->lru, &image->control_pages);

                /* Because the page is already in its destination
                 * location we will never allocate another page at
                 * that address.  Therefore kimage_alloc_pages
                 * will not return it (again) and we don't need
                 * to give it an entry in image->segment[].
                 */
        }

        /* Deal with the destination pages I have inadvertently allocated.
         *
         * Ideally I would convert multi-page allocations into single
         * page allocations, and add everything to image->dest_pages.
         *
         * For now it is simpler to just free the pages.
         */
        kimage_free_page_list(&extra_pages);

        return pages;
}

static struct page *kimage_alloc_crash_control_pages(
        struct kimage *image, unsigned int order)
{
        /* Control pages are special, they are the intermediaries
         * that are needed while we copy the rest of the pages
         * to their final resting place.  As such they must
         * not conflict with either the destination addresses
         * or memory the kernel is already using.
         *
         * Control pages are also the only pages we must allocate
         * when loading a crash kernel.  All of the other pages
         * are specified by the segments and we just memcpy
         * into them directly.
         *
         * The only case where we really need more than one of
         * these is for architectures where we cannot disable
         * the MMU and must instead generate an identity mapped
         * page table for all of the memory.
         *
         * Given the low demand this implements a very simple
         * allocator that finds the first hole of the appropriate
         * size in the reserved memory region, and allocates all
         * of the memory up to and including the hole.
         */
        unsigned long hole_start, hole_end, size;
        struct page *pages;

        pages = NULL;
        size = (1 << order) << PAGE_SHIFT;
        hole_start = (image->control_page + (size - 1)) & ~(size - 1);
        hole_end = hole_start + size - 1;
        while (hole_end <= crashk_res.end) {
                unsigned long i;

                if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) {
                        break;
                }
                if (hole_end > crashk_res.end) {
                        break;
                }
                /* See if I overlap any of the segments */
                for (i = 0; i < image->nr_segments; i++) {
                        unsigned long mstart, mend;
                        mstart = image->segment[i].mem;
                        mend = mstart + image->segment[i].memsz - 1;
                        if ((hole_end >= mstart) && (hole_start <= mend)) {
                                /* Advance the hole to the end of the segment */
                                hole_start = (mend + (size - 1)) & ~(size - 1);
                                hole_end = hole_start + size - 1;
                                break;
                        }
                }
                /* If I don't overlap any segments I have found my hole! */
                if (i == image->nr_segments) {
                        pages = pfn_to_page(hole_start >> PAGE_SHIFT);
                        break;
                }
        }
        if (pages) {
                image->control_page = hole_end;
        }
        return pages;
}

struct page *kimage_alloc_control_pages(
        struct kimage *image, unsigned int order)
{
        struct page *pages = NULL;

        switch (image->type) {
        case KEXEC_TYPE_DEFAULT:
                pages = kimage_alloc_normal_control_pages(image, order);
                break;
        case KEXEC_TYPE_CRASH:
                pages = kimage_alloc_crash_control_pages(image, order);
                break;
        }
        return pages;
}
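
/*
 * The entry list consumed by the relocation stub is a flat array of
 * kimage_entry_t values, grown one indirection page at a time:
 *
 *   destination | IND_DESTINATION   set the current destination address
 *   source      | IND_SOURCE        copy one source page, advance dest
 *   next        | IND_INDIRECTION   continue in another indirection page
 *   IND_DONE                        end of list
 *
 * kimage_add_entry() appends one entry, chaining in a fresh indirection
 * page when the current one is full.
 */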
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
        if (*image->entry != 0) {
                image->entry++;
        }
        if (image->entry == image->last_entry) {
                kimage_entry_t *ind_page;
                struct page *page;

                page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
                if (!page) {
                        return -ENOMEM;
                }
                ind_page = page_address(page);
                *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
                image->entry = ind_page;
                image->last_entry =
                        ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
        }
        *image->entry = entry;
        image->entry++;
        *image->entry = 0;
        return 0;
}
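
/* Record a new destination address; subsequent IND_SOURCE entries are
 * copied to consecutive pages starting here.
 */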
static int kimage_set_destination(
        struct kimage *image, unsigned long destination)
{
        int result;

        destination &= PAGE_MASK;
        result = kimage_add_entry(image, destination | IND_DESTINATION);
        if (result == 0) {
                image->destination = destination;
        }
        return result;
}

static int kimage_add_page(struct kimage *image, unsigned long page)
{
        int result;

        page &= PAGE_MASK;
        result = kimage_add_entry(image, page | IND_SOURCE);
        if (result == 0) {
                image->destination += PAGE_SIZE;
        }
        return result;
}

static void kimage_free_extra_pages(struct kimage *image)
{
        /* Walk through and free any extra destination pages I may have */
        kimage_free_page_list(&image->dest_pages);

        /* Walk through and free any unuseable pages I have cached */
        kimage_free_page_list(&image->unuseable_pages);
}
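
/* Mark the end of the entry list with IND_DONE for the relocation stub. */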
static int kimage_terminate(struct kimage *image)
{
        if (*image->entry != 0) {
                image->entry++;
        }
        *image->entry = IND_DONE;
        return 0;
}
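
/*
 * Iterate over every entry in the kimage list, transparently following
 * IND_INDIRECTION links and stopping at IND_DONE.
 */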
#define for_each_kimage_entry(image, ptr, entry) \
        for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
                ptr = (entry & IND_INDIRECTION) ? \
                        phys_to_virt((entry & PAGE_MASK)) : ptr + 1)

static void kimage_free_entry(kimage_entry_t entry)
{
        struct page *page;

        page = pfn_to_page(entry >> PAGE_SHIFT);
        kimage_free_pages(page);
}
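
/*
 * kimage_free - release everything the image owns: cached destination
 * and unuseable pages, every source page named in the entry list, the
 * indirection pages themselves, the control pages, and finally the
 * struct kimage.
 */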
static void kimage_free(struct kimage *image)
{
        kimage_entry_t *ptr, entry;
        kimage_entry_t ind = 0;

        if (!image)
                return;
        kimage_free_extra_pages(image);
        for_each_kimage_entry(image, ptr, entry) {
                if (entry & IND_INDIRECTION) {
                        /* Free the previous indirection page */
                        if (ind & IND_INDIRECTION) {
                                kimage_free_entry(ind);
                        }
                        /* Save this indirection page until we are
                         * done with it.
                         */
                        ind = entry;
                } else if (entry & IND_SOURCE) {
                        kimage_free_entry(entry);
                }
        }
        /* Free the final indirection page */
        if (ind & IND_INDIRECTION) {
                kimage_free_entry(ind);
        }

        /* Handle any machine specific cleanup */
        machine_kexec_cleanup(image);

        /* Free the kexec control pages... */
        kimage_free_page_list(&image->control_pages);
        kfree(image);
}
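
/*
 * kimage_dst_used - if some already-queued source page is destined for
 * @page, return a pointer to its entry so the caller can retarget it;
 * otherwise return NULL.
 */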
static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
{
        kimage_entry_t *ptr, entry;
        unsigned long destination = 0;

        for_each_kimage_entry(image, ptr, entry) {
                if (entry & IND_DESTINATION) {
                        destination = entry & PAGE_MASK;
                } else if (entry & IND_SOURCE) {
                        if (page == destination) {
                                return ptr;
                        }
                        destination += PAGE_SIZE;
                }
        }
        return NULL;
}
static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
{
        /*
         * Here we implement safeguards to ensure that a source page
         * is not copied to its destination page before the data on
         * the destination page is no longer useful.
         *
         * To do this we maintain the invariant that a source page is
         * either its own destination page, or it is not a
         * destination page at all.
         *
         * That is slightly stronger than required, but it makes the
         * proof that no problems will occur trivial, and the
         * implementation simple to verify.
         *
         * When allocating all pages normally this algorithm will run
         * in O(N) time, but in the worst case it will run in O(N^2)
         * time.  If the runtime is a problem the data structures can
         * be fixed.
         */
        struct page *page;
        unsigned long addr;

        /*
         * Walk through the list of destination pages, and see if I
         * have a match.
         */
        list_for_each_entry(page, &image->dest_pages, lru) {
                addr = page_to_pfn(page) << PAGE_SHIFT;
                if (addr == destination) {
                        list_del(&page->lru);
                        return page;
                }
        }
        page = NULL;
        while (1) {
                kimage_entry_t *old;

                /* Allocate a page, if we run out of memory give up */
                page = kimage_alloc_pages(gfp_mask, 0);
                if (!page) {
                        return NULL;
                }
                /* If the page cannot be used, file it away */
                if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
                        list_add(&page->lru, &image->unuseable_pages);
                        continue;
                }
                addr = page_to_pfn(page) << PAGE_SHIFT;

                /* If it is the destination page we want, use it */
                if (addr == destination)
                        break;

                /* If the page is not a destination page, use it */
                if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
                        break;

                /*
                 * I know that the page is someone's destination page.
                 * See if there is already a source page for this
                 * destination page.  And if so swap the source pages.
                 */
                old = kimage_dst_used(image, addr);
                if (old) {
                        /* If so move it */
                        unsigned long old_addr;
                        struct page *old_page;

                        old_addr = *old & PAGE_MASK;
                        old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
                        copy_highpage(page, old_page);
                        *old = addr | (*old & ~PAGE_MASK);

                        /* The old page I have found cannot be a
                         * destination page, so return it.
                         */
                        addr = old_addr;
                        page = old_page;
                        break;
                } else {
                        /* Place the page on the destination list; I
                         * will use it later.
                         */
                        list_add(&page->lru, &image->dest_pages);
                }
        }
        return page;
}
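
/*
 * kimage_load_normal_segment - copy one segment from user space into
 * freshly allocated source pages and queue them on the entry list.
 * bufsz bytes come from the user buffer; the remainder of memsz is
 * zero-filled.
 */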
static int kimage_load_normal_segment(struct kimage *image,
        struct kexec_segment *segment)
{
        unsigned long maddr;
        unsigned long ubytes, mbytes;
        int result;
        unsigned char *buf;

        result = 0;
        buf = segment->buf;
        ubytes = segment->bufsz;
        mbytes = segment->memsz;
        maddr = segment->mem;

        result = kimage_set_destination(image, maddr);
        if (result < 0) {
                goto out;
        }
        while (mbytes) {
                struct page *page;
                char *ptr;
                size_t uchunk, mchunk;

                page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
                if (!page) {
                        result = -ENOMEM;
                        goto out;
                }
                result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
                if (result < 0) {
                        goto out;
                }
                ptr = kmap(page);
                /* Start with a clear page */
                memset(ptr, 0, PAGE_SIZE);
                ptr += maddr & ~PAGE_MASK;
                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
                if (mchunk > mbytes) {
                        mchunk = mbytes;
                }
                uchunk = mchunk;
                if (uchunk > ubytes) {
                        uchunk = ubytes;
                }
                result = copy_from_user(ptr, buf, uchunk);
                kunmap(page);
                if (result) {
                        result = (result < 0) ? result : -EIO;
                        goto out;
                }
                ubytes -= uchunk;
                maddr += mchunk;
                buf += mchunk;
                mbytes -= mchunk;
        }
out:
        return result;
}
static int kimage_load_crash_segment(struct kimage *image,
        struct kexec_segment *segment)
{
        /* For crash dump kernels we simply copy the data from
         * user space to its destination.
         * We do things a page at a time for the sake of kmap.
         */
        unsigned long maddr;
        unsigned long ubytes, mbytes;
        int result;
        unsigned char *buf;

        result = 0;
        buf = segment->buf;
        ubytes = segment->bufsz;
        mbytes = segment->memsz;
        maddr = segment->mem;
        while (mbytes) {
                struct page *page;
                char *ptr;
                size_t uchunk, mchunk;

                page = pfn_to_page(maddr >> PAGE_SHIFT);
                if (!page) {
                        result = -ENOMEM;
                        goto out;
                }
                ptr = kmap(page);
                ptr += maddr & ~PAGE_MASK;
                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
                if (mchunk > mbytes) {
                        mchunk = mbytes;
                }
                uchunk = mchunk;
                if (uchunk > ubytes) {
                        uchunk = ubytes;
                        /* Zero the trailing part of the page */
                        memset(ptr + uchunk, 0, mchunk - uchunk);
                }
                result = copy_from_user(ptr, buf, uchunk);
                kunmap(page);
                if (result) {
                        result = (result < 0) ? result : -EIO;
                        goto out;
                }
                ubytes -= uchunk;
                maddr += mchunk;
                buf += mchunk;
                mbytes -= mchunk;
        }
out:
        return result;
}
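
/* Dispatch to the normal or crash loader based on the image type. */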
static int kimage_load_segment(struct kimage *image,
        struct kexec_segment *segment)
{
        int result = -ENOMEM;

        switch (image->type) {
        case KEXEC_TYPE_DEFAULT:
                result = kimage_load_normal_segment(image, segment);
                break;
        case KEXEC_TYPE_CRASH:
                result = kimage_load_crash_segment(image, segment);
                break;
        }
        return result;
}
/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down, preventing ongoing DMAs and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that includes the syscall number and
 *   then copies the image to its final destination and jumps
 *   into the image at entry.
 *
 * kexec does not sync or unmount filesystems, so if you need
 * that to happen you need to do that yourself.
 */
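
/*
 * Illustration only, not used by this file: a minimal sketch of how a
 * user-space loader might invoke this system call, assuming a raw
 * syscall(2) wrapper and the __NR_kexec_load number from the
 * architecture's unistd.h.  Real loaders (e.g. kexec-tools) build the
 * segment list from the kernel image and boot parameters.
 *
 *      struct kexec_segment seg = {
 *              .buf   = image_buf,          (image bytes in user memory)
 *              .bufsz = image_len,
 *              .mem   = 0x100000,           (page-aligned destination)
 *              .memsz = image_len_rounded,  (page-aligned size >= bufsz)
 *      };
 *      if (syscall(__NR_kexec_load, entry, 1, &seg, KEXEC_ARCH_DEFAULT))
 *              perror("kexec_load");
 */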
struct kimage *kexec_image = NULL;
static struct kimage *kexec_crash_image = NULL;
/*
 * A home-grown binary mutex.
 * Nothing can wait so this mutex is safe to use
 * in interrupt context :)
 */
static int kexec_lock = 0;

asmlinkage long sys_kexec_load(unsigned long entry,
        unsigned long nr_segments, struct kexec_segment __user *segments,
        unsigned long flags)
{
        struct kimage **dest_image, *image;
        int locked;
        int result;

        /* We only trust the superuser with rebooting the system. */
        if (!capable(CAP_SYS_BOOT))
                return -EPERM;

        /*
         * Verify we have a legal set of flags.
         * This leaves us room for future extensions.
         */
        if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
                return -EINVAL;

        /* Verify we are on the appropriate architecture */
        if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
                ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) {
                return -EINVAL;
        }

        /* Put an artificial cap on the number
         * of segments passed to kexec_load.
         */
        if (nr_segments > KEXEC_SEGMENT_MAX)
                return -EINVAL;

        image = NULL;
        result = 0;

        /* Because we write directly to the reserved memory
         * region when loading crash kernels we need a mutex here to
         * prevent multiple crash kernels from attempting to load
         * simultaneously, and to prevent a crash kernel from loading
         * over the top of an in-use crash kernel.
         *
         * KISS: always take the mutex.
         */
        locked = xchg(&kexec_lock, 1);
        if (locked) {
                return -EBUSY;
        }
        dest_image = &kexec_image;
        if (flags & KEXEC_ON_CRASH) {
                dest_image = &kexec_crash_image;
        }
        if (nr_segments > 0) {
                unsigned long i;

                /* Loading another kernel to reboot into */
                if ((flags & KEXEC_ON_CRASH) == 0) {
                        result = kimage_normal_alloc(&image, entry, nr_segments, segments);
                }
                /* Loading another kernel to switch to if this one crashes */
                else if (flags & KEXEC_ON_CRASH) {
                        /* Free any current crash dump kernel before
                         * we corrupt it.
                         */
                        kimage_free(xchg(&kexec_crash_image, NULL));
                        result = kimage_crash_alloc(&image, entry, nr_segments, segments);
                }
                if (result) {
                        goto out;
                }
                result = machine_kexec_prepare(image);
                if (result) {
                        goto out;
                }
                for (i = 0; i < nr_segments; i++) {
                        result = kimage_load_segment(image, &image->segment[i]);
                        if (result) {
                                goto out;
                        }
                }
                result = kimage_terminate(image);
                if (result) {
                        goto out;
                }
        }
        /* Install the new kernel, and uninstall the old */
        image = xchg(dest_image, image);
out:
        xchg(&kexec_lock, 0); /* Release the mutex */
        kimage_free(image);
        return result;
}
#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_kexec_load(unsigned long entry,
        unsigned long nr_segments, struct compat_kexec_segment __user *segments,
        unsigned long flags)
{
        struct compat_kexec_segment in;
        struct kexec_segment out, __user *ksegments;
        unsigned long i, result;

        /* Don't allow clients that don't understand the native
         * architecture to do anything.
         */
        if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) {
                return -EINVAL;
        }
        if (nr_segments > KEXEC_SEGMENT_MAX) {
                return -EINVAL;
        }
        ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
        for (i = 0; i < nr_segments; i++) {
                result = copy_from_user(&in, &segments[i], sizeof(in));
                if (result) {
                        return -EFAULT;
                }
                out.buf = compat_ptr(in.buf);
                out.bufsz = in.bufsz;
                out.mem = in.mem;
                out.memsz = in.memsz;
                result = copy_to_user(&ksegments[i], &out, sizeof(out));
                if (result) {
                        return -EFAULT;
                }
        }
        return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif
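
/*
 * crash_kexec - called on the panic/oops path (see kexec_should_crash)
 * to shut the machine down and jump into a previously loaded crash
 * kernel, if one is present.
 */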
void crash_kexec(struct pt_regs *regs)
{
        struct kimage *image;
        int locked;

        /* Take the kexec_lock here to prevent sys_kexec_load
         * running on one cpu from replacing the crash kernel
         * we are using after a panic on a different cpu.
         *
         * If the crash kernel was not located in a fixed area
         * of memory the xchg(&kexec_crash_image) would be
         * sufficient.  But since I reuse the memory...
         */
        locked = xchg(&kexec_lock, 1);
        if (!locked) {
                image = xchg(&kexec_crash_image, NULL);
                if (image) {
                        machine_crash_shutdown(regs);
                        machine_kexec(image);
                }
                xchg(&kexec_lock, 0);
        }
}