xref: /unit/src/nxt_isolation.c (revision 1671:04de10f67b33)
1 /*
2  * Copyright (C) NGINX, Inc.
3  */
4 
5 #include <nxt_main.h>
6 #include <nxt_application.h>
7 #include <nxt_process.h>
8 #include <nxt_isolation.h>
9 
10 #if (NXT_HAVE_PIVOT_ROOT)
11 #include <mntent.h>
12 #endif
13 
14 
15 static nxt_int_t nxt_isolation_set(nxt_task_t *task,
16     nxt_conf_value_t *isolation, nxt_process_t *process);
17 
18 #if (NXT_HAVE_CLONE)
19 static nxt_int_t nxt_isolation_set_namespaces(nxt_task_t *task,
20     nxt_conf_value_t *isolation, nxt_process_t *process);
21 static nxt_int_t nxt_isolation_clone_flags(nxt_task_t *task,
22     nxt_conf_value_t *namespaces, nxt_clone_t *clone);
23 #endif
24 
25 #if (NXT_HAVE_CLONE_NEWUSER)
26 static nxt_int_t nxt_isolation_set_creds(nxt_task_t *task,
27     nxt_conf_value_t *isolation, nxt_process_t *process);
28 static nxt_int_t nxt_isolation_credential_map(nxt_task_t *task,
29     nxt_mp_t *mem_pool, nxt_conf_value_t *map_array,
30     nxt_clone_credential_map_t *map);
31 static nxt_int_t nxt_isolation_vldt_creds(nxt_task_t *task,
32     nxt_process_t *process);
33 #endif
34 
35 #if (NXT_HAVE_ISOLATION_ROOTFS)
36 static nxt_int_t nxt_isolation_set_rootfs(nxt_task_t *task,
37     nxt_conf_value_t *isolation, nxt_process_t *process);
38 static nxt_int_t nxt_isolation_set_automount(nxt_task_t *task,
39     nxt_conf_value_t *isolation, nxt_process_t *process);
40 static nxt_int_t nxt_isolation_set_mounts(nxt_task_t *task,
41     nxt_process_t *process, nxt_str_t *app_type);
42 static nxt_int_t nxt_isolation_set_lang_mounts(nxt_task_t *task,
43     nxt_process_t *process, nxt_array_t *syspaths);
44 static int nxt_cdecl nxt_isolation_mount_compare(const void *v1,
45     const void *v2);
46 static void nxt_isolation_unmount_all(nxt_task_t *task, nxt_process_t *process);
47 
48 #if (NXT_HAVE_PIVOT_ROOT) && (NXT_HAVE_CLONE_NEWNS)
49 static nxt_int_t nxt_isolation_pivot_root(nxt_task_t *task, const char *rootfs);
50 static nxt_int_t nxt_isolation_make_private_mount(nxt_task_t *task,
51     const char *rootfs);
52 nxt_inline int nxt_pivot_root(const char *new_root, const char *old_root);
53 #endif
54 
55 static nxt_int_t nxt_isolation_chroot(nxt_task_t *task, const char *path);
56 #endif
57 
58 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
59 static nxt_int_t nxt_isolation_set_new_privs(nxt_task_t *task,
60     nxt_conf_value_t *isolation, nxt_process_t *process);
61 #endif
62 
63 
64 nxt_int_t
65 nxt_isolation_main_prefork(nxt_task_t *task, nxt_process_t *process,
66     nxt_mp_t *mp)
67 {
68     nxt_int_t              cap_setid;
69     nxt_int_t              ret;
70     nxt_runtime_t          *rt;
71     nxt_common_app_conf_t  *app_conf;
72 
73     rt = task->thread->runtime;
74     app_conf = process->data.app;
75     cap_setid = rt->capabilities.setid;
76 
77     if (app_conf->isolation != NULL) {
78         ret = nxt_isolation_set(task, app_conf->isolation, process);
79         if (nxt_slow_path(ret != NXT_OK)) {
80             return ret;
81         }
82     }
83 
84 #if (NXT_HAVE_CLONE_NEWUSER)
85     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWUSER)) {
86         cap_setid = 1;
87     }
88 #endif
89 
90 #if (NXT_HAVE_ISOLATION_ROOTFS)
91     if (process->isolation.rootfs != NULL) {
92         ret = nxt_isolation_set_mounts(task, process, &app_conf->type);
93         if (nxt_slow_path(ret != NXT_OK)) {
94             return ret;
95         }
96     }
97 #endif
98 
99     if (cap_setid) {
100         ret = nxt_process_creds_set(task, process, &app_conf->user,
101                                     &app_conf->group);
102 
103         if (nxt_slow_path(ret != NXT_OK)) {
104             return ret;
105         }
106 
107     } else {
108         if (!nxt_str_eq(&app_conf->user, (u_char *) rt->user_cred.user,
109                         nxt_strlen(rt->user_cred.user)))
110         {
111             nxt_alert(task, "cannot set user \"%V\" for app \"%V\": "
112                       "missing capabilities", &app_conf->user, &app_conf->name);
113 
114             return NXT_ERROR;
115         }
116 
117         if (app_conf->group.length > 0
118             && !nxt_str_eq(&app_conf->group, (u_char *) rt->group,
119                            nxt_strlen(rt->group)))
120         {
121             nxt_alert(task, "cannot set group \"%V\" for app \"%V\": "
122                             "missing capabilities", &app_conf->group,
123                             &app_conf->name);
124 
125             return NXT_ERROR;
126         }
127     }
128 
129 #if (NXT_HAVE_CLONE_NEWUSER)
130     ret = nxt_isolation_vldt_creds(task, process);
131     if (nxt_slow_path(ret != NXT_OK)) {
132         return ret;
133     }
134 #endif
135 
136     return NXT_OK;
137 }
138 
139 
140 static nxt_int_t
141 nxt_isolation_set(nxt_task_t *task, nxt_conf_value_t *isolation,
142     nxt_process_t *process)
143 {
144 #if (NXT_HAVE_CLONE)
145     if (nxt_slow_path(nxt_isolation_set_namespaces(task, isolation, process)
146                       != NXT_OK))
147     {
148         return NXT_ERROR;
149     }
150 #endif
151 
152 #if (NXT_HAVE_CLONE_NEWUSER)
153     if (nxt_slow_path(nxt_isolation_set_creds(task, isolation, process)
154                       != NXT_OK))
155     {
156         return NXT_ERROR;
157     }
158 #endif
159 
160 #if (NXT_HAVE_ISOLATION_ROOTFS)
161     if (nxt_slow_path(nxt_isolation_set_rootfs(task, isolation, process)
162                       != NXT_OK))
163     {
164         return NXT_ERROR;
165     }
166 
167     if (nxt_slow_path(nxt_isolation_set_automount(task, isolation, process)
168                       != NXT_OK))
169     {
170         return NXT_ERROR;
171     }
172 #endif
173 
174 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
175     if (nxt_slow_path(nxt_isolation_set_new_privs(task, isolation, process)
176                       != NXT_OK))
177     {
178         return NXT_ERROR;
179     }
180 #endif
181 
182     return NXT_OK;
183 }
184 
185 
186 #if (NXT_HAVE_CLONE)
187 
188 static nxt_int_t
189 nxt_isolation_set_namespaces(nxt_task_t *task, nxt_conf_value_t *isolation,
190     nxt_process_t *process)
191 {
192     nxt_int_t         ret;
193     nxt_conf_value_t  *obj;
194 
195     static nxt_str_t  nsname = nxt_string("namespaces");
196 
197     obj = nxt_conf_get_object_member(isolation, &nsname, NULL);
198     if (obj != NULL) {
199         ret = nxt_isolation_clone_flags(task, obj, &process->isolation.clone);
200         if (nxt_slow_path(ret != NXT_OK)) {
201             return NXT_ERROR;
202         }
203     }
204 
205     return NXT_OK;
206 }
207 
208 #endif
209 
210 
211 #if (NXT_HAVE_CLONE_NEWUSER)
212 
213 static nxt_int_t
214 nxt_isolation_set_creds(nxt_task_t *task, nxt_conf_value_t *isolation,
215     nxt_process_t *process)
216 {
217     nxt_int_t         ret;
218     nxt_clone_t       *clone;
219     nxt_conf_value_t  *array;
220 
221     static nxt_str_t uidname = nxt_string("uidmap");
222     static nxt_str_t gidname = nxt_string("gidmap");
223 
224     clone = &process->isolation.clone;
225 
226     array = nxt_conf_get_object_member(isolation, &uidname, NULL);
227     if (array != NULL) {
228         ret = nxt_isolation_credential_map(task, process->mem_pool, array,
229                                            &clone->uidmap);
230 
231         if (nxt_slow_path(ret != NXT_OK)) {
232             return NXT_ERROR;
233         }
234     }
235 
236     array = nxt_conf_get_object_member(isolation, &gidname, NULL);
237     if (array != NULL) {
238         ret = nxt_isolation_credential_map(task, process->mem_pool, array,
239                                            &clone->gidmap);
240 
241         if (nxt_slow_path(ret != NXT_OK)) {
242             return NXT_ERROR;
243         }
244     }
245 
246     return NXT_OK;
247 }
248 
249 
250 static nxt_int_t
251 nxt_isolation_credential_map(nxt_task_t *task, nxt_mp_t *mp,
252     nxt_conf_value_t *map_array, nxt_clone_credential_map_t *map)
253 {
254     nxt_int_t         ret;
255     nxt_uint_t        i;
256     nxt_conf_value_t  *obj;
257 
258     static nxt_conf_map_t  nxt_clone_map_entry_conf[] = {
259         {
260             nxt_string("container"),
261             NXT_CONF_MAP_INT,
262             offsetof(nxt_clone_map_entry_t, container),
263         },
264 
265         {
266             nxt_string("host"),
267             NXT_CONF_MAP_INT,
268             offsetof(nxt_clone_map_entry_t, host),
269         },
270 
271         {
272             nxt_string("size"),
273             NXT_CONF_MAP_INT,
274             offsetof(nxt_clone_map_entry_t, size),
275         },
276     };
277 
278     map->size = nxt_conf_array_elements_count(map_array);
279 
280     if (map->size == 0) {
281         return NXT_OK;
282     }
283 
284     map->map = nxt_mp_alloc(mp, map->size * sizeof(nxt_clone_map_entry_t));
285     if (nxt_slow_path(map->map == NULL)) {
286         return NXT_ERROR;
287     }
288 
289     for (i = 0; i < map->size; i++) {
290         obj = nxt_conf_get_array_element(map_array, i);
291 
292         ret = nxt_conf_map_object(mp, obj, nxt_clone_map_entry_conf,
293                                   nxt_nitems(nxt_clone_map_entry_conf),
294                                   map->map + i);
295         if (nxt_slow_path(ret != NXT_OK)) {
296             nxt_alert(task, "clone map entry map error");
297             return NXT_ERROR;
298         }
299     }
300 
301     return NXT_OK;
302 }
303 
304 
305 static nxt_int_t
306 nxt_isolation_vldt_creds(nxt_task_t *task, nxt_process_t *process)
307 {
308     nxt_int_t         ret;
309     nxt_clone_t       *clone;
310     nxt_credential_t  *creds;
311 
312     clone = &process->isolation.clone;
313     creds = process->user_cred;
314 
315     if (clone->uidmap.size == 0 && clone->gidmap.size == 0) {
316         return NXT_OK;
317     }
318 
319     if (!nxt_is_clone_flag_set(clone->flags, NEWUSER)) {
320         if (nxt_slow_path(clone->uidmap.size > 0)) {
321             nxt_log(task, NXT_LOG_ERR, "\"uidmap\" is set but "
322                     "\"isolation.namespaces.credential\" is false or unset");
323 
324             return NXT_ERROR;
325         }
326 
327         if (nxt_slow_path(clone->gidmap.size > 0)) {
328             nxt_log(task, NXT_LOG_ERR, "\"gidmap\" is set but "
329                     "\"isolation.namespaces.credential\" is false or unset");
330 
331             return NXT_ERROR;
332         }
333 
334         return NXT_OK;
335     }
336 
337     ret = nxt_clone_vldt_credential_uidmap(task, &clone->uidmap, creds);
338     if (nxt_slow_path(ret != NXT_OK)) {
339         return NXT_ERROR;
340     }
341 
342     return nxt_clone_vldt_credential_gidmap(task, &clone->gidmap, creds);
343 }
344 
345 #endif
346 
347 
348 #if (NXT_HAVE_CLONE)
349 
350 static nxt_int_t
351 nxt_isolation_clone_flags(nxt_task_t *task, nxt_conf_value_t *namespaces,
352     nxt_clone_t *clone)
353 {
354     uint32_t          index;
355     nxt_str_t         name;
356     nxt_int_t         flag;
357     nxt_conf_value_t  *value;
358 
359     index = 0;
360 
361     for ( ;; ) {
362         value = nxt_conf_next_object_member(namespaces, &name, &index);
363 
364         if (value == NULL) {
365             break;
366         }
367 
368         flag = 0;
369 
370 #if (NXT_HAVE_CLONE_NEWUSER)
371         if (nxt_str_eq(&name, "credential", 10)) {
372             flag = CLONE_NEWUSER;
373         }
374 #endif
375 
376 #if (NXT_HAVE_CLONE_NEWPID)
377         if (nxt_str_eq(&name, "pid", 3)) {
378             flag = CLONE_NEWPID;
379         }
380 #endif
381 
382 #if (NXT_HAVE_CLONE_NEWNET)
383         if (nxt_str_eq(&name, "network", 7)) {
384             flag = CLONE_NEWNET;
385         }
386 #endif
387 
388 #if (NXT_HAVE_CLONE_NEWUTS)
389         if (nxt_str_eq(&name, "uname", 5)) {
390             flag = CLONE_NEWUTS;
391         }
392 #endif
393 
394 #if (NXT_HAVE_CLONE_NEWNS)
395         if (nxt_str_eq(&name, "mount", 5)) {
396             flag = CLONE_NEWNS;
397         }
398 #endif
399 
400 #if (NXT_HAVE_CLONE_NEWCGROUP)
401         if (nxt_str_eq(&name, "cgroup", 6)) {
402             flag = CLONE_NEWCGROUP;
403         }
404 #endif
405 
406         if (!flag) {
407             nxt_alert(task, "unknown namespace flag: \"%V\"", &name);
408             return NXT_ERROR;
409         }
410 
411         if (nxt_conf_get_boolean(value)) {
412             clone->flags |= flag;
413         }
414     }
415 
416     return NXT_OK;
417 }
418 
419 #endif
420 
421 
422 #if (NXT_HAVE_ISOLATION_ROOTFS)
423 
424 static nxt_int_t
425 nxt_isolation_set_rootfs(nxt_task_t *task, nxt_conf_value_t *isolation,
426     nxt_process_t *process)
427 {
428     nxt_str_t         str;
429     nxt_conf_value_t  *obj;
430 
431     static nxt_str_t  rootfs_name = nxt_string("rootfs");
432 
433     obj = nxt_conf_get_object_member(isolation, &rootfs_name, NULL);
434     if (obj != NULL) {
435         nxt_conf_get_string(obj, &str);
436 
437         if (nxt_slow_path(str.length <= 1 || str.start[0] != '/')) {
438             nxt_log(task, NXT_LOG_ERR, "rootfs requires an absolute path other "
439                     "than \"/\" but given \"%V\"", &str);
440 
441             return NXT_ERROR;
442         }
443 
444         if (str.start[str.length - 1] == '/') {
445             str.length--;
446         }
447 
448         process->isolation.rootfs = nxt_mp_alloc(process->mem_pool,
449                                                  str.length + 1);
450 
451         if (nxt_slow_path(process->isolation.rootfs == NULL)) {
452             return NXT_ERROR;
453         }
454 
455         nxt_memcpy(process->isolation.rootfs, str.start, str.length);
456 
457         process->isolation.rootfs[str.length] = '\0';
458     }
459 
460     return NXT_OK;
461 }
462 
463 
464 static nxt_int_t
465 nxt_isolation_set_automount(nxt_task_t *task, nxt_conf_value_t *isolation,
466     nxt_process_t *process)
467 {
468     nxt_conf_value_t         *conf, *value;
469     nxt_process_automount_t  *automount;
470 
471     static nxt_str_t  automount_name = nxt_string("automount");
472     static nxt_str_t  langdeps_name = nxt_string("language_deps");
473 
474     automount = &process->isolation.automount;
475 
476     automount->language_deps = 1;
477 
478     conf = nxt_conf_get_object_member(isolation, &automount_name, NULL);
479     if (conf != NULL) {
480         value = nxt_conf_get_object_member(conf, &langdeps_name, NULL);
481         if (value != NULL) {
482             automount->language_deps = nxt_conf_get_boolean(value);
483         }
484     }
485 
486     return NXT_OK;
487 }
488 
489 
490 static nxt_int_t
491 nxt_isolation_set_mounts(nxt_task_t *task, nxt_process_t *process,
492     nxt_str_t *app_type)
493 {
494     nxt_int_t              ret, cap_chroot;
495     nxt_runtime_t          *rt;
496     nxt_app_lang_module_t  *lang;
497 
498     rt = task->thread->runtime;
499     cap_chroot = rt->capabilities.chroot;
500     lang = nxt_app_lang_module(rt, app_type);
501 
502     nxt_assert(lang != NULL);
503 
504 #if (NXT_HAVE_CLONE_NEWUSER)
505     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWUSER)) {
506         cap_chroot = 1;
507     }
508 #endif
509 
510     if (!cap_chroot) {
511         nxt_log(task, NXT_LOG_ERR, "The \"rootfs\" field requires privileges");
512         return NXT_ERROR;
513     }
514 
515     ret = nxt_isolation_set_lang_mounts(task, process, lang->mounts);
516     if (nxt_slow_path(ret != NXT_OK)) {
517         return NXT_ERROR;
518     }
519 
520     process->isolation.cleanup = nxt_isolation_unmount_all;
521 
522     return NXT_OK;
523 }
524 
525 
526 static nxt_int_t
527 nxt_isolation_set_lang_mounts(nxt_task_t *task, nxt_process_t *process,
528     nxt_array_t *lang_mounts)
529 {
530     u_char          *p;
531     size_t          i, n, rootfs_len, len;
532     nxt_mp_t        *mp;
533     nxt_array_t     *mounts;
534     const u_char    *rootfs;
535     nxt_fs_mount_t  *mnt, *lang_mnt;
536 
537     mp = process->mem_pool;
538 
539     /* copy to init mem pool */
540     mounts = nxt_array_copy(mp, NULL, lang_mounts);
541     if (mounts == NULL) {
542         return NXT_ERROR;
543     }
544 
545     n = mounts->nelts;
546     mnt = mounts->elts;
547     lang_mnt = lang_mounts->elts;
548 
549     rootfs = process->isolation.rootfs;
550     rootfs_len = nxt_strlen(rootfs);
551 
552     for (i = 0; i < n; i++) {
553         len = nxt_strlen(lang_mnt[i].dst);
554 
555         mnt[i].dst = nxt_mp_alloc(mp, rootfs_len + len + 1);
556         if (nxt_slow_path(mnt[i].dst == NULL)) {
557             return NXT_ERROR;
558         }
559 
560         p = nxt_cpymem(mnt[i].dst, rootfs, rootfs_len);
561         p = nxt_cpymem(p, lang_mnt[i].dst, len);
562         *p = '\0';
563     }
564 
565     mnt = nxt_array_add(mounts);
566     if (nxt_slow_path(mnt == NULL)) {
567         return NXT_ERROR;
568     }
569 
570     mnt->src = (u_char *) "tmpfs";
571     mnt->fstype = (u_char *) "tmpfs";
572     mnt->flags = NXT_MS_NOSUID | NXT_MS_NODEV | NXT_MS_NOEXEC | NXT_MS_RELATIME;
573     mnt->data = (u_char *) "size=1m,mode=777";
574     mnt->builtin = 1;
575 
576     mnt->dst = nxt_mp_nget(mp, rootfs_len + nxt_length("/tmp") + 1);
577     if (nxt_slow_path(mnt->dst == NULL)) {
578         return NXT_ERROR;
579     }
580 
581     p = nxt_cpymem(mnt->dst, rootfs, rootfs_len);
582     p = nxt_cpymem(p, "/tmp", 4);
583     *p = '\0';
584 
585 #if (NXT_HAVE_CLONE_NEWPID) && (NXT_HAVE_CLONE_NEWNS)
586 
587     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWPID)
588         && nxt_is_clone_flag_set(process->isolation.clone.flags, NEWNS))
589     {
590         mnt = nxt_array_add(mounts);
591         if (nxt_slow_path(mnt == NULL)) {
592             return NXT_ERROR;
593         }
594 
595         mnt->fstype = (u_char *) "proc";
596         mnt->src = (u_char *) "proc";
597 
598         mnt->dst = nxt_mp_nget(mp, rootfs_len + nxt_length("/proc") + 1);
599         if (nxt_slow_path(mnt->dst == NULL)) {
600             return NXT_ERROR;
601         }
602 
603         p = nxt_cpymem(mnt->dst, rootfs, rootfs_len);
604         p = nxt_cpymem(p, "/proc", 5);
605         *p = '\0';
606 
607         mnt->data = (u_char *) "";
608         mnt->flags = 0;
609     }
610 #endif
611 
612     qsort(mounts->elts, mounts->nelts, sizeof(nxt_fs_mount_t),
613           nxt_isolation_mount_compare);
614 
615     process->isolation.mounts = mounts;
616 
617     return NXT_OK;
618 }
619 
620 
621 static int nxt_cdecl
622 nxt_isolation_mount_compare(const void *v1, const void *v2)
623 {
624     const nxt_fs_mount_t  *mnt1, *mnt2;
625 
626     mnt1 = v1;
627     mnt2 = v2;
628 
629     return nxt_strlen(mnt1->src) > nxt_strlen(mnt2->src);
630 }
631 
632 
633 void
634 nxt_isolation_unmount_all(nxt_task_t *task, nxt_process_t *process)
635 {
636     size_t                   n;
637     nxt_array_t              *mounts;
638     nxt_runtime_t            *rt;
639     nxt_fs_mount_t           *mnt;
640     nxt_process_automount_t  *automount;
641 
642     rt = task->thread->runtime;
643 
644     if (!rt->capabilities.setid) {
645         return;
646     }
647 
648 #if (NXT_HAVE_CLONE_NEWNS)
649     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWNS)) {
650         return;
651     }
652 #endif
653 
654     nxt_debug(task, "unmount all (%s)", process->name);
655 
656     automount = &process->isolation.automount;
657     mounts = process->isolation.mounts;
658     n = mounts->nelts;
659     mnt = mounts->elts;
660 
661     while (n > 0) {
662         n--;
663 
664         if (mnt[n].builtin && !automount->language_deps) {
665             continue;
666         }
667 
668         nxt_fs_unmount(mnt[n].dst);
669     }
670 }
671 
672 
673 nxt_int_t
674 nxt_isolation_prepare_rootfs(nxt_task_t *task, nxt_process_t *process)
675 {
676     size_t                   i, n;
677     nxt_int_t                ret;
678     struct stat              st;
679     nxt_array_t              *mounts;
680     const u_char             *dst;
681     nxt_fs_mount_t           *mnt;
682     nxt_process_automount_t  *automount;
683 
684     automount = &process->isolation.automount;
685     mounts = process->isolation.mounts;
686 
687     n = mounts->nelts;
688     mnt = mounts->elts;
689 
690     for (i = 0; i < n; i++) {
691         dst = mnt[i].dst;
692 
693         if (mnt[i].builtin && !automount->language_deps) {
694             continue;
695         }
696 
697         if (nxt_slow_path(nxt_memcmp(mnt[i].fstype, "bind", 4) == 0
698                           && stat((const char *) mnt[i].src, &st) != 0))
699         {
700             nxt_log(task, NXT_LOG_WARN, "host path not found: %s", mnt[i].src);
701             continue;
702         }
703 
704         ret = nxt_fs_mkdir_all(dst, S_IRWXU | S_IRWXG | S_IRWXO);
705         if (nxt_slow_path(ret != NXT_OK)) {
706             nxt_alert(task, "mkdir(%s) %E", dst, nxt_errno);
707             goto undo;
708         }
709 
710         ret = nxt_fs_mount(task, &mnt[i]);
711         if (nxt_slow_path(ret != NXT_OK)) {
712             goto undo;
713         }
714     }
715 
716     return NXT_OK;
717 
718 undo:
719 
720     n = i + 1;
721 
722     for (i = 0; i < n; i++) {
723         nxt_fs_unmount(mnt[i].dst);
724     }
725 
726     return NXT_ERROR;
727 }
728 
729 
730 #if (NXT_HAVE_PIVOT_ROOT) && (NXT_HAVE_CLONE_NEWNS)
731 
732 nxt_int_t
733 nxt_isolation_change_root(nxt_task_t *task, nxt_process_t *process)
734 {
735     char       *rootfs;
736     nxt_int_t  ret;
737 
738     rootfs = (char *) process->isolation.rootfs;
739 
740     nxt_debug(task, "change root: %s", rootfs);
741 
742     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWNS)) {
743         ret = nxt_isolation_pivot_root(task, rootfs);
744 
745     } else {
746         ret = nxt_isolation_chroot(task, rootfs);
747     }
748 
749     if (nxt_fast_path(ret == NXT_OK)) {
750         if (nxt_slow_path(chdir("/") < 0)) {
751             nxt_alert(task, "chdir(\"/\") %E", nxt_errno);
752             return NXT_ERROR;
753         }
754     }
755 
756     return ret;
757 }
758 
759 
760 /*
761  * pivot_root(2) can only be safely used with containers, otherwise it can
762  * umount(2) the global root filesystem and screw up the machine.
763  */
764 
765 static nxt_int_t
766 nxt_isolation_pivot_root(nxt_task_t *task, const char *path)
767 {
768     /*
769      * This implementation makes use of a kernel trick that works for ages
770      * and now documented in Linux kernel 5.
771      * https://lore.kernel.org/linux-man/87r24piwhm.fsf@x220.int.ebiederm.org/T/
772      */
773 
774     if (nxt_slow_path(mount("", "/", "", MS_SLAVE|MS_REC, "") != 0)) {
775         nxt_alert(task, "mount(\"/\", MS_SLAVE|MS_REC) failed: %E", nxt_errno);
776         return NXT_ERROR;
777     }
778 
779     if (nxt_slow_path(nxt_isolation_make_private_mount(task, path) != NXT_OK)) {
780         return NXT_ERROR;
781     }
782 
783     if (nxt_slow_path(mount(path, path, "bind", MS_BIND|MS_REC, "") != 0)) {
784         nxt_alert(task, "error bind mounting rootfs %E", nxt_errno);
785         return NXT_ERROR;
786     }
787 
788     if (nxt_slow_path(chdir(path) != 0)) {
789         nxt_alert(task, "failed to chdir(%s) %E", path, nxt_errno);
790         return NXT_ERROR;
791     }
792 
793     if (nxt_slow_path(nxt_pivot_root(".", ".") != 0)) {
794         nxt_alert(task, "failed to pivot_root %E", nxt_errno);
795         return NXT_ERROR;
796     }
797 
798     /*
799      * Demote the oldroot mount to avoid unmounts getting propagated to
800      * the host.
801      */
802     if (nxt_slow_path(mount("", ".", "", MS_SLAVE | MS_REC, NULL) != 0)) {
803         nxt_alert(task, "failed to bind mount rootfs %E", nxt_errno);
804         return NXT_ERROR;
805     }
806 
807     if (nxt_slow_path(umount2(".", MNT_DETACH) != 0)) {
808         nxt_alert(task, "failed to umount old root directory %E", nxt_errno);
809         return NXT_ERROR;
810     }
811 
812     return NXT_OK;
813 }
814 
815 
816 static nxt_int_t
817 nxt_isolation_make_private_mount(nxt_task_t *task, const char *rootfs)
818 {
819     char           *parent_mnt;
820     FILE           *procfile;
821     u_char         **mounts;
822     size_t         len;
823     uint8_t        *shared;
824     nxt_int_t      ret, index, nmounts;
825     struct mntent  *ent;
826 
827     static const char  *mount_path = "/proc/self/mounts";
828 
829     ret = NXT_ERROR;
830     ent = NULL;
831     shared = NULL;
832     procfile = NULL;
833     parent_mnt = NULL;
834 
835     nmounts = 256;
836 
837     mounts = nxt_malloc(nmounts * sizeof(uintptr_t));
838     if (nxt_slow_path(mounts == NULL)) {
839         goto fail;
840     }
841 
842     shared = nxt_malloc(nmounts);
843     if (nxt_slow_path(shared == NULL)) {
844         goto fail;
845     }
846 
847     procfile = setmntent(mount_path, "r");
848     if (nxt_slow_path(procfile == NULL)) {
849         nxt_alert(task, "failed to open %s %E", mount_path, nxt_errno);
850 
851         goto fail;
852     }
853 
854     index = 0;
855 
856 again:
857 
858     for ( ; index < nmounts; index++) {
859         ent = getmntent(procfile);
860         if (ent == NULL) {
861             nmounts = index;
862             break;
863         }
864 
865         mounts[index] = (u_char *) strdup(ent->mnt_dir);
866         shared[index] = hasmntopt(ent, "shared") != NULL;
867     }
868 
869     if (ent != NULL) {
870         /* there are still entries to be read */
871 
872         nmounts *= 2;
873         mounts = nxt_realloc(mounts, nmounts);
874         if (nxt_slow_path(mounts == NULL)) {
875             goto fail;
876         }
877 
878         shared = nxt_realloc(shared, nmounts);
879         if (nxt_slow_path(shared == NULL)) {
880             goto fail;
881         }
882 
883         goto again;
884     }
885 
886     for (index = 0; index < nmounts; index++) {
887         if (nxt_strcmp(mounts[index], rootfs) == 0) {
888             parent_mnt = (char *) rootfs;
889             break;
890         }
891     }
892 
893     if (parent_mnt == NULL) {
894         len = nxt_strlen(rootfs);
895 
896         parent_mnt = nxt_malloc(len + 1);
897         if (parent_mnt == NULL) {
898             goto fail;
899         }
900 
901         nxt_memcpy(parent_mnt, rootfs, len);
902         parent_mnt[len] = '\0';
903 
904         if (parent_mnt[len - 1] == '/') {
905             parent_mnt[len - 1] = '\0';
906             len--;
907         }
908 
909         for ( ;; ) {
910             for (index = 0; index < nmounts; index++) {
911                 if (nxt_strcmp(mounts[index], parent_mnt) == 0) {
912                     goto found;
913                 }
914             }
915 
916             if (len == 1 && parent_mnt[0] == '/') {
917                 nxt_alert(task, "parent mount not found");
918                 goto fail;
919             }
920 
921             /* parent dir */
922             while (parent_mnt[len - 1] != '/' && len > 0) {
923                 len--;
924             }
925 
926             if (nxt_slow_path(len == 0)) {
927                 nxt_alert(task, "parent mount not found");
928                 goto fail;
929             }
930 
931             if (len == 1) {
932                 parent_mnt[len] = '\0';     /* / */
933             } else {
934                 parent_mnt[len - 1] = '\0'; /* /<path> */
935             }
936         }
937     }
938 
939 found:
940 
941     if (shared[index]) {
942         if (nxt_slow_path(mount("", parent_mnt, "", MS_PRIVATE, "") != 0)) {
943             nxt_alert(task, "mount(\"\", \"%s\", MS_PRIVATE) %E", parent_mnt,
944                       nxt_errno);
945 
946             goto fail;
947         }
948     }
949 
950     ret = NXT_OK;
951 
952 fail:
953 
954     if (procfile != NULL) {
955         endmntent(procfile);
956     }
957 
958     if (mounts != NULL) {
959         for (index = 0; index < nmounts; index++) {
960             nxt_free(mounts[index]);
961         }
962 
963         nxt_free(mounts);
964     }
965 
966     if (shared != NULL) {
967         nxt_free(shared);
968     }
969 
970     if (parent_mnt != NULL && parent_mnt != rootfs) {
971         nxt_free(parent_mnt);
972     }
973 
974     return ret;
975 }
976 
977 
978 nxt_inline int
979 nxt_pivot_root(const char *new_root, const char *old_root)
980 {
981     return syscall(__NR_pivot_root, new_root, old_root);
982 }
983 
984 
985 #else /* !(NXT_HAVE_PIVOT_ROOT) || !(NXT_HAVE_CLONE_NEWNS) */
986 
987 
988 nxt_int_t
989 nxt_isolation_change_root(nxt_task_t *task, nxt_process_t *process)
990 {
991     char       *rootfs;
992 
993     rootfs = (char *) process->isolation.rootfs;
994 
995     nxt_debug(task, "change root: %s", rootfs);
996 
997     if (nxt_fast_path(nxt_isolation_chroot(task, rootfs) == NXT_OK)) {
998         if (nxt_slow_path(chdir("/") < 0)) {
999             nxt_alert(task, "chdir(\"/\") %E", nxt_errno);
1000             return NXT_ERROR;
1001         }
1002 
1003         return NXT_OK;
1004     }
1005 
1006     return NXT_ERROR;
1007 }
1008 
1009 #endif
1010 
1011 
1012 static nxt_int_t
1013 nxt_isolation_chroot(nxt_task_t *task, const char *path)
1014 {
1015     if (nxt_slow_path(chroot(path) < 0)) {
1016         nxt_alert(task, "chroot(%s) %E", path, nxt_errno);
1017         return NXT_ERROR;
1018     }
1019 
1020     return NXT_OK;
1021 }
1022 
1023 #endif /* NXT_HAVE_ISOLATION_ROOTFS */
1024 
1025 
1026 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
1027 
1028 static nxt_int_t
1029 nxt_isolation_set_new_privs(nxt_task_t *task, nxt_conf_value_t *isolation,
1030     nxt_process_t *process)
1031 {
1032     nxt_conf_value_t  *obj;
1033 
1034     static nxt_str_t  new_privs_name = nxt_string("new_privs");
1035 
1036     obj = nxt_conf_get_object_member(isolation, &new_privs_name, NULL);
1037     if (obj != NULL) {
1038         process->isolation.new_privs = nxt_conf_get_boolean(obj);
1039     }
1040 
1041     return NXT_OK;
1042 }
1043 
1044 #endif
1045