/*
 * linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *	Released under GPL v2.
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/config.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/quotaops.h>
#include <linux/acct.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include "pnode.h"

extern int __init init_rootfs(void);

#define CL_EXPIRE	0x01

#ifdef CONFIG_SYSFS
extern int __init sysfs_init(void);
#else
static inline int sysfs_init(void)
{
        return 0;
}
#endif
/* spinlock for vfsmount related operations, in place of dcache_lock */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
static int event;

static struct list_head *mount_hashtable;
static int hash_mask __read_mostly, hash_bits __read_mostly;
static kmem_cache_t *mnt_cache;
static struct rw_semaphore namespace_sem;

static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
        unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
        tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
        tmp = tmp + (tmp >> hash_bits);
        return tmp & hash_mask;
}
struct vfsmount *alloc_vfsmnt(const char *name)
{
        struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
        if (mnt) {
                memset(mnt, 0, sizeof(struct vfsmount));
                atomic_set(&mnt->mnt_count, 1);
                INIT_LIST_HEAD(&mnt->mnt_hash);
                INIT_LIST_HEAD(&mnt->mnt_child);
                INIT_LIST_HEAD(&mnt->mnt_mounts);
                INIT_LIST_HEAD(&mnt->mnt_list);
                INIT_LIST_HEAD(&mnt->mnt_expire);
                INIT_LIST_HEAD(&mnt->mnt_share);
                if (name) {
                        int size = strlen(name) + 1;
                        char *newname = kmalloc(size, GFP_KERNEL);
                        if (newname) {
                                memcpy(newname, name, size);
                                mnt->mnt_devname = newname;
                        }
                }
        }
        return mnt;
}

void free_vfsmnt(struct vfsmount *mnt)
{
        kfree(mnt->mnt_devname);
        kmem_cache_free(mnt_cache, mnt);
}

/*
 * Now, lookup_mnt increments the ref count before returning
 * the vfsmount struct.
 */
struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
        struct list_head *head = mount_hashtable + hash(mnt, dentry);
        struct list_head *tmp = head;
        struct vfsmount *p, *found = NULL;

        spin_lock(&vfsmount_lock);
        for (;;) {
                tmp = tmp->next;
                p = NULL;
                if (tmp == head)
                        break;
                p = list_entry(tmp, struct vfsmount, mnt_hash);
                if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
                        found = mntget(p);
                        break;
                }
        }
        spin_unlock(&vfsmount_lock);
        return found;
}
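
/*
 * Illustrative sketch (added; not part of the original file): because
 * lookup_mnt() returns its result with an extra reference, a caller
 * crossing a mountpoint must balance that with mntput() once done.
 * A minimal, assumed usage pattern:
 */
#if 0	/* example only */
static void example_cross_mountpoint(struct vfsmount *mnt,
                                     struct dentry *dentry)
{
        struct vfsmount *child = lookup_mnt(mnt, dentry);
        if (child) {
                /* ... use child->mnt_root, child->mnt_sb, ... */
                mntput(child);  /* drop the reference lookup_mnt() took */
        }
}
#endif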
static inline int check_mnt(struct vfsmount *mnt)
{
        return mnt->mnt_namespace == current->namespace;
}

static void touch_namespace(struct namespace *ns)
{
        if (ns) {
                ns->event = ++event;
                wake_up_interruptible(&ns->poll);
        }
}

static void __touch_namespace(struct namespace *ns)
{
        if (ns && ns->event != event) {
                ns->event = event;
                wake_up_interruptible(&ns->poll);
        }
}

static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
{
        old_nd->dentry = mnt->mnt_mountpoint;
        old_nd->mnt = mnt->mnt_parent;
        mnt->mnt_parent = mnt;
        mnt->mnt_mountpoint = mnt->mnt_root;
        list_del_init(&mnt->mnt_child);
        list_del_init(&mnt->mnt_hash);
        old_nd->dentry->d_mounted--;
}

static void attach_mnt(struct vfsmount *mnt, struct nameidata *nd)
{
        mnt->mnt_parent = mntget(nd->mnt);
        mnt->mnt_mountpoint = dget(nd->dentry);
        list_add(&mnt->mnt_hash, mount_hashtable + hash(nd->mnt, nd->dentry));
        list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts);
        nd->dentry->d_mounted++;
}

static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
{
        struct list_head *next = p->mnt_mounts.next;
        if (next == &p->mnt_mounts) {
                while (1) {
                        if (p == root)
                                return NULL;
                        next = p->mnt_child.next;
                        if (next != &p->mnt_parent->mnt_mounts)
                                break;
                        p = p->mnt_parent;
                }
        }
        return list_entry(next, struct vfsmount, mnt_child);
}

static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
                                  int flag)
{
        struct super_block *sb = old->mnt_sb;
        struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);

        if (mnt) {
                mnt->mnt_flags = old->mnt_flags;
                atomic_inc(&sb->s_active);
                mnt->mnt_sb = sb;
                mnt->mnt_root = dget(root);
                mnt->mnt_mountpoint = mnt->mnt_root;
                mnt->mnt_parent = mnt;
                mnt->mnt_namespace = current->namespace;

                /* stick the duplicate mount on the same expiry list
                 * as the original if that was on one */
                if (flag & CL_EXPIRE) {
                        spin_lock(&vfsmount_lock);
                        if (!list_empty(&old->mnt_expire))
                                list_add(&mnt->mnt_expire, &old->mnt_expire);
                        spin_unlock(&vfsmount_lock);
                }
        }
        return mnt;
}

static inline void __mntput(struct vfsmount *mnt)
{
        struct super_block *sb = mnt->mnt_sb;
        dput(mnt->mnt_root);
        free_vfsmnt(mnt);
        deactivate_super(sb);
}

void mntput_no_expire(struct vfsmount *mnt)
{
repeat:
        if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) {
                if (likely(!mnt->mnt_pinned)) {
                        spin_unlock(&vfsmount_lock);
                        __mntput(mnt);
                        return;
                }
                atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
                mnt->mnt_pinned = 0;
                spin_unlock(&vfsmount_lock);
                acct_auto_close_mnt(mnt);
                security_sb_umount_close(mnt);
                goto repeat;
        }
}
EXPORT_SYMBOL(mntput_no_expire);

void mnt_pin(struct vfsmount *mnt)
{
        spin_lock(&vfsmount_lock);
        mnt->mnt_pinned++;
        spin_unlock(&vfsmount_lock);
}
EXPORT_SYMBOL(mnt_pin);

void mnt_unpin(struct vfsmount *mnt)
{
        spin_lock(&vfsmount_lock);
        if (mnt->mnt_pinned) {
                atomic_inc(&mnt->mnt_count);
                mnt->mnt_pinned--;
        }
        spin_unlock(&vfsmount_lock);
}
EXPORT_SYMBOL(mnt_unpin);
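
/*
 * Descriptive note (added; an interpretation of the code above): a pin
 * taken with mnt_pin() lets a subsystem such as BSD process accounting
 * keep the vfsmount alive without holding a regular reference.  When
 * the last regular reference is dropped, mntput_no_expire() converts
 * the pins back into references and calls acct_auto_close_mnt(), so
 * every mnt_pin() is eventually balanced by an mnt_unpin(), which
 * trades one pin back for one reference.
 */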
/* iterator */
static void *m_start(struct seq_file *m, loff_t *pos)
{
        struct namespace *n = m->private;
        struct list_head *p;
        loff_t l = *pos;

        down_read(&namespace_sem);
        list_for_each(p, &n->list)
                if (!l--)
                        return list_entry(p, struct vfsmount, mnt_list);
        return NULL;
}

static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct namespace *n = m->private;
        struct list_head *p = ((struct vfsmount *)v)->mnt_list.next;
        (*pos)++;
        return p == &n->list ? NULL : list_entry(p, struct vfsmount, mnt_list);
}

static void m_stop(struct seq_file *m, void *v)
{
        up_read(&namespace_sem);
}

static inline void mangle(struct seq_file *m, const char *s)
{
        seq_escape(m, s, " \t\n\\");
}

static int show_vfsmnt(struct seq_file *m, void *v)
{
        struct vfsmount *mnt = v;
        int err = 0;
        static struct proc_fs_info {
                int flag;
                char *str;
        } fs_info[] = {
                { MS_SYNCHRONOUS, ",sync" },
                { MS_DIRSYNC, ",dirsync" },
                { MS_MANDLOCK, ",mand" },
                { MS_NOATIME, ",noatime" },
                { MS_NODIRATIME, ",nodiratime" },
                { 0, NULL }
        };
        static struct proc_fs_info mnt_info[] = {
                { MNT_NOSUID, ",nosuid" },
                { MNT_NODEV, ",nodev" },
                { MNT_NOEXEC, ",noexec" },
                { 0, NULL }
        };
        struct proc_fs_info *fs_infop;

        mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
        seq_putc(m, ' ');
        seq_path(m, mnt, mnt->mnt_root, " \t\n\\");
        seq_putc(m, ' ');
        mangle(m, mnt->mnt_sb->s_type->name);
        seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw");
        for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
                if (mnt->mnt_sb->s_flags & fs_infop->flag)
                        seq_puts(m, fs_infop->str);
        }
        for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
                if (mnt->mnt_flags & fs_infop->flag)
                        seq_puts(m, fs_infop->str);
        }
        if (mnt->mnt_sb->s_op->show_options)
                err = mnt->mnt_sb->s_op->show_options(m, mnt);
        seq_puts(m, " 0 0\n");
        return err;
}

struct seq_operations mounts_op = {
        .start  = m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = show_vfsmnt
};
/**
 * may_umount_tree - check if a mount tree is busy
 * @mnt: root of mount tree
 *
 * This is called to check if a tree of mounts has any
 * open files, pwds, chroots or sub mounts that are
 * busy.
 */
int may_umount_tree(struct vfsmount *mnt)
{
        int actual_refs = 0;
        int minimum_refs = 0;
        struct vfsmount *p;
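
        /*
         * Added note (an interpretation of the arithmetic below): an
         * otherwise idle tree of N mounts holds 2*N references in all --
         * each mount's base reference from alloc_vfsmnt(), plus one per
         * child (attach_mnt() makes every child pin its parent) and one
         * that the caller obtained on the tree's root.  Anything above
         * that total indicates an open file, pwd, chroot or other user.
         */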
        spin_lock(&vfsmount_lock);
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                actual_refs += atomic_read(&p->mnt_count);
                minimum_refs += 2;
        }
        spin_unlock(&vfsmount_lock);

        if (actual_refs > minimum_refs)
                return -EBUSY;

        return 0;
}
EXPORT_SYMBOL(may_umount_tree);

/**
 * may_umount - check if a mount point is busy
 * @mnt: root of mount
 *
 * This is called to check if a mount point has any
 * open files, pwds, chroots or sub mounts. If the
 * mount has sub mounts this will return busy
 * regardless of whether the sub mounts are busy.
 *
 * Doesn't take quota and stuff into account. IOW, in some cases it will
 * give false negatives. The main reason why it's here is that we need
 * a non-destructive way to look for easily umountable filesystems.
 */
int may_umount(struct vfsmount *mnt)
{
        if (atomic_read(&mnt->mnt_count) > 2)
                return -EBUSY;
        return 0;
}
EXPORT_SYMBOL(may_umount);

static void release_mounts(struct list_head *head)
{
        struct vfsmount *mnt;
        while (!list_empty(head)) {
                mnt = list_entry(head->next, struct vfsmount, mnt_hash);
                list_del_init(&mnt->mnt_hash);
                if (mnt->mnt_parent != mnt) {
                        struct dentry *dentry;
                        struct vfsmount *m;
                        spin_lock(&vfsmount_lock);
                        dentry = mnt->mnt_mountpoint;
                        m = mnt->mnt_parent;
                        mnt->mnt_mountpoint = mnt->mnt_root;
                        mnt->mnt_parent = mnt;
                        spin_unlock(&vfsmount_lock);
                        dput(dentry);
                        mntput(m);
                }
                mntput(mnt);
        }
}
static void umount_tree(struct vfsmount *mnt, struct list_head *kill)
{
        struct vfsmount *p;

        for (p = mnt; p; p = next_mnt(p, mnt)) {
                list_del(&p->mnt_hash);
                list_add(&p->mnt_hash, kill);
        }

        list_for_each_entry(p, kill, mnt_hash) {
                list_del_init(&p->mnt_expire);
                list_del_init(&p->mnt_list);
                __touch_namespace(p->mnt_namespace);
                p->mnt_namespace = NULL;
                list_del_init(&p->mnt_child);
                if (p->mnt_parent != p)
                        p->mnt_mountpoint->d_mounted--;
        }
}
static int do_umount(struct vfsmount *mnt, int flags)
{
        struct super_block *sb = mnt->mnt_sb;
        int retval;
        LIST_HEAD(umount_list);

        retval = security_sb_umount(mnt, flags);
        if (retval)
                return retval;

        /*
         * Allow userspace to request a mountpoint be expired rather than
         * unmounting unconditionally. Unmount only happens if:
         *  (1) the mark is already set (the mark is cleared by mntput())
         *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
         */
        if (flags & MNT_EXPIRE) {
                if (mnt == current->fs->rootmnt ||
                    flags & (MNT_FORCE | MNT_DETACH))
                        return -EINVAL;

                if (atomic_read(&mnt->mnt_count) != 2)
                        return -EBUSY;

                if (!xchg(&mnt->mnt_expiry_mark, 1))
                        return -EAGAIN;
        }

        /*
         * If we may have to abort operations to get out of this
         * mount, and they will themselves hold resources we must
         * allow the fs to do things. In the Unix tradition of
         * 'Gee, that's tricky, let's do it in userspace' the umount_begin
         * might fail to complete on the first run through as other tasks
         * must return, and the like. That's for the mount program to worry
         * about for the moment.
         */
        lock_kernel();
        if ((flags & MNT_FORCE) && sb->s_op->umount_begin)
                sb->s_op->umount_begin(sb);
        unlock_kernel();

        /*
         * No sense to grab the lock for this test, but test itself looks
         * somewhat bogus. Suggestions for better replacement?
         * Ho-hum... In principle, we might treat that as umount + switch
         * to rootfs. GC would eventually take care of the old vfsmount.
         * Actually it makes sense, especially if rootfs would contain a
         * /reboot - static binary that would close all descriptors and
         * call reboot(9). Then init(8) could umount root and exec /reboot.
         */
        if (mnt == current->fs->rootmnt && !(flags & MNT_DETACH)) {
                /*
                 * Special case for "unmounting" root ...
                 * we just try to remount it readonly.
                 */
                down_write(&sb->s_umount);
                if (!(sb->s_flags & MS_RDONLY)) {
                        lock_kernel();
                        DQUOT_OFF(sb);
                        retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
                        unlock_kernel();
                }
                up_write(&sb->s_umount);
                return retval;
        }

        down_write(&namespace_sem);
        spin_lock(&vfsmount_lock);
        event++;

        retval = -EBUSY;
        if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
                if (!list_empty(&mnt->mnt_list))
                        umount_tree(mnt, &umount_list);
                retval = 0;
        }
        spin_unlock(&vfsmount_lock);
        if (retval)
                security_sb_umount_busy(mnt);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
        return retval;
}
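
/*
 * Illustrative sketch (added; not part of the original file): the
 * two-step MNT_EXPIRE protocol documented in do_umount() as seen from
 * userspace via umount2(2).  The first call only sets the expiry mark
 * and fails with EAGAIN; if nothing touches the mount in the meantime
 * (any mntput() clears the mark), a later call actually unmounts it.
 */
#if 0	/* example only; userspace code, shown here for reference */
#include <errno.h>
#include <sys/mount.h>

static int example_try_expire(const char *path)
{
        if (umount2(path, MNT_EXPIRE) == 0)
                return 0;       /* mark was already set: now unmounted */
        if (errno == EAGAIN)
                return 1;       /* freshly marked: retry on the next sweep */
        return -1;              /* EBUSY or other error */
}
#endif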
/*
 * Now umount can handle mount points as well as block devices.
 * This is important for filesystems which use unnamed block devices.
 *
 * We now support a flag for forced unmount like the other 'big iron'
 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
 */
asmlinkage long sys_umount(char __user *name, int flags)
{
        struct nameidata nd;
        int retval;

        retval = __user_walk(name, LOOKUP_FOLLOW, &nd);
        if (retval)
                goto out;
        retval = -EINVAL;
        if (nd.dentry != nd.mnt->mnt_root)
                goto dput_and_out;
        if (!check_mnt(nd.mnt))
                goto dput_and_out;

        retval = -EPERM;
        if (!capable(CAP_SYS_ADMIN))
                goto dput_and_out;

        retval = do_umount(nd.mnt, flags);
dput_and_out:
        path_release_on_umount(&nd);
out:
        return retval;
}

#ifdef __ARCH_WANT_SYS_OLDUMOUNT
/*
 * The 2.0 compatible umount. No flags.
 */
asmlinkage long sys_oldumount(char __user *name)
{
        return sys_umount(name, 0);
}
#endif
static int mount_is_safe(struct nameidata *nd)
{
        if (capable(CAP_SYS_ADMIN))
                return 0;
        return -EPERM;
#ifdef notyet
        if (S_ISLNK(nd->dentry->d_inode->i_mode))
                return -EPERM;
        if (nd->dentry->d_inode->i_mode & S_ISVTX) {
                if (current->uid != nd->dentry->d_inode->i_uid)
                        return -EPERM;
        }
        if (permission(nd->dentry->d_inode, MAY_WRITE, nd))
                return -EPERM;
        return 0;
#endif
}

static int lives_below_in_same_fs(struct dentry *d, struct dentry *dentry)
{
        while (1) {
                if (d == dentry)
                        return 1;
                if (d == NULL || d == d->d_parent)
                        return 0;
                d = d->d_parent;
        }
}

static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
                                  int flag)
{
        struct vfsmount *res, *p, *q, *r, *s;
        struct nameidata nd;

        res = q = clone_mnt(mnt, dentry, flag);
        if (!q)
                goto Enomem;
        q->mnt_mountpoint = mnt->mnt_mountpoint;

        p = mnt;
        list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
                if (!lives_below_in_same_fs(r->mnt_mountpoint, dentry))
                        continue;

                for (s = r; s; s = next_mnt(s, r)) {
                        while (p != s->mnt_parent) {
                                p = p->mnt_parent;
                                q = q->mnt_parent;
                        }
                        p = s;
                        nd.mnt = q;
                        nd.dentry = p->mnt_mountpoint;
                        q = clone_mnt(p, p->mnt_root, flag);
                        if (!q)
                                goto Enomem;
                        spin_lock(&vfsmount_lock);
                        list_add_tail(&q->mnt_list, &res->mnt_list);
                        attach_mnt(q, &nd);
                        spin_unlock(&vfsmount_lock);
                }
        }
        return res;
Enomem:
        if (res) {
                LIST_HEAD(umount_list);
                spin_lock(&vfsmount_lock);
                umount_tree(res, &umount_list);
                spin_unlock(&vfsmount_lock);
                release_mounts(&umount_list);
        }
        return NULL;
}
static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
{
        int err;
        if (mnt->mnt_sb->s_flags & MS_NOUSER)
                return -EINVAL;

        if (S_ISDIR(nd->dentry->d_inode->i_mode) !=
            S_ISDIR(mnt->mnt_root->d_inode->i_mode))
                return -ENOTDIR;

        err = -ENOENT;
        down(&nd->dentry->d_inode->i_sem);
        if (IS_DEADDIR(nd->dentry->d_inode))
                goto out_unlock;

        err = security_sb_check_sb(mnt, nd);
        if (err)
                goto out_unlock;

        err = -ENOENT;
        spin_lock(&vfsmount_lock);
        if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) {
                struct list_head head;

                attach_mnt(mnt, nd);
                list_add_tail(&head, &mnt->mnt_list);
                list_splice(&head, current->namespace->list.prev);
                err = 0;
                touch_namespace(current->namespace);
        }
        spin_unlock(&vfsmount_lock);
out_unlock:
        up(&nd->dentry->d_inode->i_sem);
        if (!err)
                security_sb_post_addmount(mnt, nd);
        return err;
}

/*
 * recursively change the type of the mountpoint.
 */
static int do_change_type(struct nameidata *nd, int flag)
{
        struct vfsmount *m, *mnt = nd->mnt;
        int recurse = flag & MS_REC;
        int type = flag & ~MS_REC;

        if (nd->dentry != nd->mnt->mnt_root)
                return -EINVAL;

        down_write(&namespace_sem);
        spin_lock(&vfsmount_lock);
        for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
                change_mnt_propagation(m, type);
        spin_unlock(&vfsmount_lock);
        up_write(&namespace_sem);
        return 0;
}
/*
 * do loopback mount.
 */
static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
{
        struct nameidata old_nd;
        struct vfsmount *mnt = NULL;
        int err = mount_is_safe(nd);
        if (err)
                return err;
        if (!old_name || !*old_name)
                return -EINVAL;
        err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
        if (err)
                return err;

        down_write(&namespace_sem);
        err = -EINVAL;
        if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt))
                goto out;

        err = -ENOMEM;
        if (recurse)
                mnt = copy_tree(old_nd.mnt, old_nd.dentry, 0);
        else
                mnt = clone_mnt(old_nd.mnt, old_nd.dentry, 0);

        if (!mnt)
                goto out;

        err = graft_tree(mnt, nd);
        if (err) {
                LIST_HEAD(umount_list);
                spin_lock(&vfsmount_lock);
                umount_tree(mnt, &umount_list);
                spin_unlock(&vfsmount_lock);
                release_mounts(&umount_list);
        }

out:
        up_write(&namespace_sem);
        path_release(&old_nd);
        return err;
}
/*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
 * on it - tough luck.
 */
static int do_remount(struct nameidata *nd, int flags, int mnt_flags,
                      void *data)
{
        int err;
        struct super_block *sb = nd->mnt->mnt_sb;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (!check_mnt(nd->mnt))
                return -EINVAL;

        if (nd->dentry != nd->mnt->mnt_root)
                return -EINVAL;

        down_write(&sb->s_umount);
        err = do_remount_sb(sb, flags, data, 0);
        if (!err)
                nd->mnt->mnt_flags = mnt_flags;
        up_write(&sb->s_umount);
        if (!err)
                security_sb_post_remount(nd->mnt, flags, data);
        return err;
}
static int do_move_mount(struct nameidata *nd, char *old_name)
{
        struct nameidata old_nd, parent_nd;
        struct vfsmount *p;
        int err = 0;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (!old_name || !*old_name)
                return -EINVAL;
        err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
        if (err)
                return err;

        down_write(&namespace_sem);
        while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
                ;
        err = -EINVAL;
        if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt))
                goto out;

        err = -ENOENT;
        down(&nd->dentry->d_inode->i_sem);
        if (IS_DEADDIR(nd->dentry->d_inode))
                goto out1;

        spin_lock(&vfsmount_lock);
        if (!IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
                goto out2;

        err = -EINVAL;
        if (old_nd.dentry != old_nd.mnt->mnt_root)
                goto out2;

        if (old_nd.mnt == old_nd.mnt->mnt_parent)
                goto out2;

        if (S_ISDIR(nd->dentry->d_inode->i_mode) !=
            S_ISDIR(old_nd.dentry->d_inode->i_mode))
                goto out2;

        err = -ELOOP;
        for (p = nd->mnt; p->mnt_parent != p; p = p->mnt_parent)
                if (p == old_nd.mnt)
                        goto out2;
        err = 0;

        detach_mnt(old_nd.mnt, &parent_nd);
        attach_mnt(old_nd.mnt, nd);
        touch_namespace(current->namespace);

        /* if the mount is moved, it should no longer be expired
         * automatically */
        list_del_init(&old_nd.mnt->mnt_expire);
out2:
        spin_unlock(&vfsmount_lock);
out1:
        up(&nd->dentry->d_inode->i_sem);
out:
        up_write(&namespace_sem);
        if (!err)
                path_release(&parent_nd);
        path_release(&old_nd);
        return err;
}
/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 */
static int do_new_mount(struct nameidata *nd, char *type, int flags,
                        int mnt_flags, char *name, void *data)
{
        struct vfsmount *mnt;

        if (!type || !memchr(type, 0, PAGE_SIZE))
                return -EINVAL;

        /* we need capabilities... */
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        mnt = do_kern_mount(type, flags, name, data);
        if (IS_ERR(mnt))
                return PTR_ERR(mnt);

        return do_add_mount(mnt, nd, mnt_flags, NULL);
}
/*
 * add a mount into a namespace's mount tree
 * - provide the option of adding the new mount to an expiration list
 */
int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
                 int mnt_flags, struct list_head *fslist)
{
        int err;

        down_write(&namespace_sem);
        /* Something was mounted here while we slept */
        while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
                ;
        err = -EINVAL;
        if (!check_mnt(nd->mnt))
                goto unlock;

        /* Refuse the same filesystem on the same mount point */
        err = -EBUSY;
        if (nd->mnt->mnt_sb == newmnt->mnt_sb &&
            nd->mnt->mnt_root == nd->dentry)
                goto unlock;

        err = -EINVAL;
        if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
                goto unlock;

        newmnt->mnt_flags = mnt_flags;
        if ((err = graft_tree(newmnt, nd)))
                goto unlock;

        if (fslist) {
                /* add to the specified expiration list */
                spin_lock(&vfsmount_lock);
                list_add_tail(&newmnt->mnt_expire, fslist);
                spin_unlock(&vfsmount_lock);
        }
        up_write(&namespace_sem);
        return 0;

unlock:
        up_write(&namespace_sem);
        mntput(newmnt);
        return err;
}
EXPORT_SYMBOL_GPL(do_add_mount);

static void expire_mount(struct vfsmount *mnt, struct list_head *mounts,
                         struct list_head *umounts)
{
        spin_lock(&vfsmount_lock);

        /*
         * Check if mount is still attached, if not, let whoever holds it deal
         * with the sucker
         */
        if (mnt->mnt_parent == mnt) {
                spin_unlock(&vfsmount_lock);
                return;
        }

        /*
         * Check that it is still dead: the count should now be 2 - as
         * contributed by the vfsmount parent and the mntget above
         */
        if (atomic_read(&mnt->mnt_count) == 2) {
                /* delete from the namespace */
                touch_namespace(mnt->mnt_namespace);
                list_del_init(&mnt->mnt_list);
                mnt->mnt_namespace = NULL;
                umount_tree(mnt, umounts);
                spin_unlock(&vfsmount_lock);
        } else {
                /*
                 * Someone brought it back to life whilst we didn't have any
                 * locks held so return it to the expiration list
                 */
                list_add_tail(&mnt->mnt_expire, mounts);
                spin_unlock(&vfsmount_lock);
        }
}

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
 * here
 */
void mark_mounts_for_expiry(struct list_head *mounts)
{
        struct namespace *namespace;
        struct vfsmount *mnt, *next;
        LIST_HEAD(graveyard);

        if (list_empty(mounts))
                return;

        spin_lock(&vfsmount_lock);

        /* extract from the expiration list every vfsmount that matches the
         * following criteria:
         * - only referenced by its parent vfsmount
         * - still marked for expiry (marked on the last call here; marks are
         *   cleared by mntput())
         */
        list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
                if (!xchg(&mnt->mnt_expiry_mark, 1) ||
                    atomic_read(&mnt->mnt_count) != 1)
                        continue;

                mntget(mnt);
                list_move(&mnt->mnt_expire, &graveyard);
        }

        /*
         * go through the vfsmounts we've just consigned to the graveyard to
         * - check that they're still dead
         * - delete the vfsmount from the appropriate namespace under lock
         * - dispose of the corpse
         */
        while (!list_empty(&graveyard)) {
                LIST_HEAD(umounts);
                mnt = list_entry(graveyard.next, struct vfsmount, mnt_expire);
                list_del_init(&mnt->mnt_expire);

                /* don't do anything if the namespace is dead - all the
                 * vfsmounts from it are going away anyway */
                namespace = mnt->mnt_namespace;
                if (!namespace || !namespace->root)
                        continue;
                get_namespace(namespace);

                spin_unlock(&vfsmount_lock);
                down_write(&namespace_sem);
                expire_mount(mnt, mounts, &umounts);
                up_write(&namespace_sem);
                release_mounts(&umounts);
                mntput(mnt);
                put_namespace(namespace);
                spin_lock(&vfsmount_lock);
        }
        spin_unlock(&vfsmount_lock);
}
EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
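
/*
 * Illustrative sketch (added; an assumption modeled on how automounting
 * filesystems drive this API): the filesystem keeps its own expiration
 * list, hands it to do_add_mount() when it creates an automounted
 * submount, and has a periodic timer sweep the list with
 * mark_mounts_for_expiry().  The list name and timer callback below are
 * made up for the example.
 */
#if 0	/* example only */
static LIST_HEAD(example_automount_list);

static int example_attach_submount(struct vfsmount *newmnt,
                                   struct nameidata *nd)
{
        /* graft newmnt at nd and queue it for expiry sweeps */
        return do_add_mount(newmnt, nd, 0, &example_automount_list);
}

static void example_expiry_timer(unsigned long data)
{
        /* one sweep marks unused mounts; the next one unmounts them */
        mark_mounts_for_expiry(&example_automount_list);
}
#endif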
/*
 * Some copy_from_user() implementations do not return the exact number of
 * bytes remaining to copy on a fault. But copy_mount_options() requires that.
 * Note that this function differs from copy_from_user() in that it will oops
 * on bad values of `to', rather than returning a short copy.
 */
static long exact_copy_from_user(void *to, const void __user * from,
                                 unsigned long n)
{
        char *t = to;
        const char __user *f = from;
        char c;

        if (!access_ok(VERIFY_READ, from, n))
                return n;

        while (n) {
                if (__get_user(c, f)) {
                        memset(t, 0, n);
                        break;
                }
                *t++ = c;
                f++;
                n--;
        }
        return n;
}
int copy_mount_options(const void __user * data, unsigned long *where)
{
        int i;
        unsigned long page;
        unsigned long size;

        *where = 0;
        if (!data)
                return 0;

        if (!(page = __get_free_page(GFP_KERNEL)))
                return -ENOMEM;

        /* We only care that *some* data at the address the user
         * gave us is valid. Just in case, we'll zero
         * the remainder of the page.
         */
        /* copy_from_user cannot cross TASK_SIZE ! */
        size = TASK_SIZE - (unsigned long)data;
        if (size > PAGE_SIZE)
                size = PAGE_SIZE;

        i = size - exact_copy_from_user((void *)page, data, size);
        if (!i) {
                free_page(page);
                return -EFAULT;
        }
        if (i != PAGE_SIZE)
                memset((char *)page + i, 0, PAGE_SIZE - i);
        *where = page;
        return 0;
}
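
/*
 * Added worked example (numbers assumed for illustration): if the user
 * pointer sits 100 bytes below TASK_SIZE, size is clamped to 100; a
 * fault after 60 good bytes makes exact_copy_from_user() return 40, so
 * i = 60 and bytes 60..PAGE_SIZE-1 of the kernel page are zeroed.
 * Only i == 0 - nothing readable at all - is reported as -EFAULT.
 */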
/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
long do_mount(char *dev_name, char *dir_name, char *type_page,
              unsigned long flags, void *data_page)
{
        struct nameidata nd;
        int retval = 0;
        int mnt_flags = 0;

        /* Discard magic */
        if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
                flags &= ~MS_MGC_MSK;

        /* Basic sanity checks */
        if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
                return -EINVAL;
        if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
                return -EINVAL;

        if (data_page)
                ((char *)data_page)[PAGE_SIZE - 1] = 0;

        /* Separate the per-mountpoint flags */
        if (flags & MS_NOSUID)
                mnt_flags |= MNT_NOSUID;
        if (flags & MS_NODEV)
                mnt_flags |= MNT_NODEV;
        if (flags & MS_NOEXEC)
                mnt_flags |= MNT_NOEXEC;
        flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE);

        /* ... and get the mountpoint */
        retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
        if (retval)
                return retval;

        retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page);
        if (retval)
                goto dput_out;

        if (flags & MS_REMOUNT)
                retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
                                    data_page);
        else if (flags & MS_BIND)
                retval = do_loopback(&nd, dev_name, flags & MS_REC);
        else if (flags & (MS_SHARED | MS_PRIVATE))
                retval = do_change_type(&nd, flags);
        else if (flags & MS_MOVE)
                retval = do_move_mount(&nd, dev_name);
        else
                retval = do_new_mount(&nd, type_page, flags, mnt_flags,
                                      dev_name, data_page);
dput_out:
        path_release(&nd);
        return retval;
}
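
/*
 * Illustrative sketch (added; not part of the original file): how the
 * dispatch above maps onto ordinary mount(2) calls from userspace.
 * Paths are made up for the example.
 */
#if 0	/* example only; userspace code, shown here for reference */
#include <sys/mount.h>

static void example_mount_calls(void)
{
        /* new mount: falls through to do_new_mount() */
        mount("none", "/mnt/scratch", "tmpfs", MS_NOSUID | MS_NODEV,
              "size=16m");

        /* bind mount: MS_BIND routes to do_loopback() */
        mount("/mnt/scratch", "/mnt/alias", NULL, MS_BIND, NULL);

        /* move an existing mount: MS_MOVE routes to do_move_mount() */
        mount("/mnt/alias", "/mnt/moved", NULL, MS_MOVE, NULL);

        /* change flags in place: MS_REMOUNT routes to do_remount() */
        mount(NULL, "/mnt/scratch", NULL, MS_REMOUNT | MS_RDONLY, NULL);
}
#endif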
int copy_namespace(int flags, struct task_struct *tsk)
{
        struct namespace *namespace = tsk->namespace;
        struct namespace *new_ns;
        struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
        struct fs_struct *fs = tsk->fs;
        struct vfsmount *p, *q;

        if (!namespace)
                return 0;

        get_namespace(namespace);

        if (!(flags & CLONE_NEWNS))
                return 0;

        if (!capable(CAP_SYS_ADMIN)) {
                put_namespace(namespace);
                return -EPERM;
        }

        new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL);
        if (!new_ns)
                goto out;

        atomic_set(&new_ns->count, 1);
        INIT_LIST_HEAD(&new_ns->list);
        init_waitqueue_head(&new_ns->poll);
        new_ns->event = 0;

        down_write(&namespace_sem);
        /* First pass: copy the tree topology */
        new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root,
                                 CL_EXPIRE);
        if (!new_ns->root) {
                up_write(&namespace_sem);
                kfree(new_ns);
                goto out;
        }
        spin_lock(&vfsmount_lock);
        list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
        spin_unlock(&vfsmount_lock);

        /*
         * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
         * as belonging to new namespace. We have already acquired a private
         * fs_struct, so tsk->fs->lock is not needed.
         */
        p = namespace->root;
        q = new_ns->root;
        while (p) {
                q->mnt_namespace = new_ns;
                if (fs) {
                        if (p == fs->rootmnt) {
                                rootmnt = p;
                                fs->rootmnt = mntget(q);
                        }
                        if (p == fs->pwdmnt) {
                                pwdmnt = p;
                                fs->pwdmnt = mntget(q);
                        }
                        if (p == fs->altrootmnt) {
                                altrootmnt = p;
                                fs->altrootmnt = mntget(q);
                        }
                }
                p = next_mnt(p, namespace->root);
                q = next_mnt(q, new_ns->root);
        }
        up_write(&namespace_sem);

        tsk->namespace = new_ns;

        if (rootmnt)
                mntput(rootmnt);
        if (pwdmnt)
                mntput(pwdmnt);
        if (altrootmnt)
                mntput(altrootmnt);

        put_namespace(namespace);
        return 0;

out:
        put_namespace(namespace);
        return -ENOMEM;
}
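
/*
 * Illustrative sketch (added; not part of the original file): the path
 * above runs when a task forks with CLONE_NEWNS, giving the child a
 * private copy of the mount tree.  From userspace that looks roughly
 * like the following; the stack size and child function are made up.
 */
#if 0	/* example only; userspace code, shown here for reference */
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <sys/mount.h>

static char child_stack[64 * 1024];

static int child_fn(void *arg)
{
        /* mounts done here alter only the child's namespace copy */
        mount("none", "/tmp", "tmpfs", 0, NULL);
        return 0;
}

static void example_clone_newns(void)
{
        clone(child_fn, child_stack + sizeof(child_stack),
              CLONE_NEWNS | SIGCHLD, NULL);
}
#endif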
asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
                          char __user * type, unsigned long flags,
                          void __user * data)
{
        int retval;
        unsigned long data_page;
        unsigned long type_page;
        unsigned long dev_page;
        char *dir_page;

        retval = copy_mount_options(type, &type_page);
        if (retval < 0)
                return retval;

        dir_page = getname(dir_name);
        retval = PTR_ERR(dir_page);
        if (IS_ERR(dir_page))
                goto out1;

        retval = copy_mount_options(dev_name, &dev_page);
        if (retval < 0)
                goto out2;

        retval = copy_mount_options(data, &data_page);
        if (retval < 0)
                goto out3;

        lock_kernel();
        retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
                          flags, (void *)data_page);
        unlock_kernel();
        free_page(data_page);

out3:
        free_page(dev_page);
out2:
        putname(dir_page);
out1:
        free_page(type_page);
        return retval;
}
/*
 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
 * It can block. Requires the big lock held.
 */
void set_fs_root(struct fs_struct *fs, struct vfsmount *mnt,
                 struct dentry *dentry)
{
        struct dentry *old_root;
        struct vfsmount *old_rootmnt;

        write_lock(&fs->lock);
        old_root = fs->root;
        old_rootmnt = fs->rootmnt;
        fs->rootmnt = mntget(mnt);
        fs->root = dget(dentry);
        write_unlock(&fs->lock);
        if (old_root) {
                dput(old_root);
                mntput(old_rootmnt);
        }
}

/*
 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
 * It can block. Requires the big lock held.
 */
void set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
                struct dentry *dentry)
{
        struct dentry *old_pwd;
        struct vfsmount *old_pwdmnt;

        write_lock(&fs->lock);
        old_pwd = fs->pwd;
        old_pwdmnt = fs->pwdmnt;
        fs->pwdmnt = mntget(mnt);
        fs->pwd = dget(dentry);
        write_unlock(&fs->lock);

        if (old_pwd) {
                dput(old_pwd);
                mntput(old_pwdmnt);
        }
}

static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd)
{
        struct task_struct *g, *p;
        struct fs_struct *fs;

        read_lock(&tasklist_lock);
        do_each_thread(g, p) {
                task_lock(p);
                fs = p->fs;
                if (fs) {
                        atomic_inc(&fs->count);
                        task_unlock(p);
                        if (fs->root == old_nd->dentry
                            && fs->rootmnt == old_nd->mnt)
                                set_fs_root(fs, new_nd->mnt, new_nd->dentry);
                        if (fs->pwd == old_nd->dentry
                            && fs->pwdmnt == old_nd->mnt)
                                set_fs_pwd(fs, new_nd->mnt, new_nd->dentry);
                        put_fs_struct(fs);
                } else
                        task_unlock(p);
        } while_each_thread(g, p);
        read_unlock(&tasklist_lock);
}
/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root. The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
asmlinkage long sys_pivot_root(const char __user * new_root,
                               const char __user * put_old)
{
        struct vfsmount *tmp;
        struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd;
        int error;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        lock_kernel();

        error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
                            &new_nd);
        if (error)
                goto out0;
        error = -EINVAL;
        if (!check_mnt(new_nd.mnt))
                goto out1;

        error = __user_walk(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_nd);
        if (error)
                goto out1;

        error = security_sb_pivotroot(&old_nd, &new_nd);
        if (error) {
                path_release(&old_nd);
                goto out1;
        }

        read_lock(&current->fs->lock);
        user_nd.mnt = mntget(current->fs->rootmnt);
        user_nd.dentry = dget(current->fs->root);
        read_unlock(&current->fs->lock);
        down_write(&namespace_sem);
        down(&old_nd.dentry->d_inode->i_sem);
        error = -EINVAL;
        if (!check_mnt(user_nd.mnt))
                goto out2;
        error = -ENOENT;
        if (IS_DEADDIR(new_nd.dentry->d_inode))
                goto out2;
        if (d_unhashed(new_nd.dentry) && !IS_ROOT(new_nd.dentry))
                goto out2;
        if (d_unhashed(old_nd.dentry) && !IS_ROOT(old_nd.dentry))
                goto out2;
        error = -EBUSY;
        if (new_nd.mnt == user_nd.mnt || old_nd.mnt == user_nd.mnt)
                goto out2; /* loop, on the same file system */
        error = -EINVAL;
        if (user_nd.mnt->mnt_root != user_nd.dentry)
                goto out2; /* not a mountpoint */
        if (user_nd.mnt->mnt_parent == user_nd.mnt)
                goto out2; /* not attached */
        if (new_nd.mnt->mnt_root != new_nd.dentry)
                goto out2; /* not a mountpoint */
        if (new_nd.mnt->mnt_parent == new_nd.mnt)
                goto out2; /* not attached */
        tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
        spin_lock(&vfsmount_lock);
        if (tmp != new_nd.mnt) {
                for (;;) {
                        if (tmp->mnt_parent == tmp)
                                goto out3; /* already mounted on put_old */
                        if (tmp->mnt_parent == new_nd.mnt)
                                break;
                        tmp = tmp->mnt_parent;
                }
                if (!is_subdir(tmp->mnt_mountpoint, new_nd.dentry))
                        goto out3;
        } else if (!is_subdir(old_nd.dentry, new_nd.dentry))
                goto out3;
        detach_mnt(new_nd.mnt, &parent_nd);
        detach_mnt(user_nd.mnt, &root_parent);
        attach_mnt(user_nd.mnt, &old_nd);     /* mount old root on put_old */
        attach_mnt(new_nd.mnt, &root_parent); /* mount new_root on / */
        touch_namespace(current->namespace);
        spin_unlock(&vfsmount_lock);
        chroot_fs_refs(&user_nd, &new_nd);
        security_sb_post_pivotroot(&user_nd, &new_nd);
        error = 0;
        path_release(&root_parent);
        path_release(&parent_nd);
out2:
        up(&old_nd.dentry->d_inode->i_sem);
        up_write(&namespace_sem);
        path_release(&user_nd);
        path_release(&old_nd);
out1:
        path_release(&new_nd);
out0:
        unlock_kernel();
        return error;
out3:
        spin_unlock(&vfsmount_lock);
        goto out2;
}
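
/*
 * Illustrative sketch (added; not part of the original file): the
 * classic initramfs-style switch to a new root using the semantics
 * documented above.  There is no glibc wrapper for pivot_root in this
 * era, hence the raw syscall; paths are made up for the example.
 */
#if 0	/* example only; userspace code, shown here for reference */
#include <unistd.h>
#include <sys/syscall.h>

static void example_pivot(void)
{
        chdir("/new_root");                        /* cwd onto the new root */
        syscall(SYS_pivot_root, ".", "old_root");  /* old root -> ./old_root */
        chroot(".");                               /* fix up our own root */
        chdir("/");
}
#endif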
static void __init init_mount_tree(void)
{
        struct vfsmount *mnt;
        struct namespace *namespace;
        struct task_struct *g, *p;

        mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
        if (IS_ERR(mnt))
                panic("Can't create rootfs");
        namespace = kmalloc(sizeof(*namespace), GFP_KERNEL);
        if (!namespace)
                panic("Can't allocate initial namespace");
        atomic_set(&namespace->count, 1);
        INIT_LIST_HEAD(&namespace->list);
        init_waitqueue_head(&namespace->poll);
        namespace->event = 0;
        list_add(&mnt->mnt_list, &namespace->list);
        namespace->root = mnt;
        mnt->mnt_namespace = namespace;

        init_task.namespace = namespace;
        read_lock(&tasklist_lock);
        do_each_thread(g, p) {
                get_namespace(namespace);
                p->namespace = namespace;
        } while_each_thread(g, p);
        read_unlock(&tasklist_lock);

        set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root);
        set_fs_root(current->fs, namespace->root, namespace->root->mnt_root);
}
void __init mnt_init(unsigned long mempages)
{
        struct list_head *d;
        unsigned int nr_hash;
        int i;

        init_rwsem(&namespace_sem);

        mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
                        0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL);

        mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);

        if (!mount_hashtable)
                panic("Failed to allocate mount hash table\n");

        /*
         * Find the power-of-two list-heads that can fit into the allocation..
         * We don't guarantee that "sizeof(struct list_head)" is necessarily
         * a power-of-two.
         */
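        /*
         * Added worked example (sizes assumed for illustration): with a
         * 4096-byte page and an 8-byte struct list_head (32-bit), nr_hash
         * starts at 512; the loop below runs until 512 >> hash_bits == 0,
         * leaving hash_bits = 10, which the decrement trims to 9.  That
         * yields nr_hash = 1 << 9 = 512 entries and hash_mask = 511.
         */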
        nr_hash = PAGE_SIZE / sizeof(struct list_head);
        hash_bits = 0;
        do {
                hash_bits++;
        } while ((nr_hash >> hash_bits) != 0);
        hash_bits--;

        /*
         * Re-calculate the actual number of entries and the mask
         * from the number of bits we can fit.
         */
        nr_hash = 1UL << hash_bits;
        hash_mask = nr_hash - 1;

        printk("Mount-cache hash table entries: %d\n", nr_hash);

        /* And initialize the newly allocated array */
        d = mount_hashtable;
        i = nr_hash;
        do {
                INIT_LIST_HEAD(d);
                d++;
                i--;
        } while (i);

        sysfs_init();
        init_rootfs();
        init_mount_tree();
}
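
/*
 * Added note (an inference from the unlock below): __put_namespace() is
 * entered with vfsmount_lock already held - its caller, put_namespace(),
 * drops the namespace count with atomic_dec_and_lock() against that
 * lock - so the function begins by releasing it before it can take
 * namespace_sem.
 */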
void __put_namespace(struct namespace *namespace)
{
        struct vfsmount *root = namespace->root;
        LIST_HEAD(umount_list);
        namespace->root = NULL;
        spin_unlock(&vfsmount_lock);
        down_write(&namespace_sem);
        spin_lock(&vfsmount_lock);
        umount_tree(root, &umount_list);
        spin_unlock(&vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
        kfree(namespace);
}