xref: /unit/src/nxt_isolation.c (revision 1704:6a003e0f3a6e)
1 /*
2  * Copyright (C) NGINX, Inc.
3  */
4 
5 #include <nxt_main.h>
6 #include <nxt_application.h>
7 #include <nxt_process.h>
8 #include <nxt_isolation.h>
9 
10 #if (NXT_HAVE_PIVOT_ROOT)
11 #include <mntent.h>
12 #endif
13 
14 
15 static nxt_int_t nxt_isolation_set(nxt_task_t *task,
16     nxt_conf_value_t *isolation, nxt_process_t *process);
17 
18 #if (NXT_HAVE_CLONE)
19 static nxt_int_t nxt_isolation_set_namespaces(nxt_task_t *task,
20     nxt_conf_value_t *isolation, nxt_process_t *process);
21 static nxt_int_t nxt_isolation_clone_flags(nxt_task_t *task,
22     nxt_conf_value_t *namespaces, nxt_clone_t *clone);
23 #endif
24 
25 #if (NXT_HAVE_CLONE_NEWUSER)
26 static nxt_int_t nxt_isolation_set_creds(nxt_task_t *task,
27     nxt_conf_value_t *isolation, nxt_process_t *process);
28 static nxt_int_t nxt_isolation_credential_map(nxt_task_t *task,
29     nxt_mp_t *mem_pool, nxt_conf_value_t *map_array,
30     nxt_clone_credential_map_t *map);
31 static nxt_int_t nxt_isolation_vldt_creds(nxt_task_t *task,
32     nxt_process_t *process);
33 #endif
34 
35 #if (NXT_HAVE_ISOLATION_ROOTFS)
36 static nxt_int_t nxt_isolation_set_rootfs(nxt_task_t *task,
37     nxt_conf_value_t *isolation, nxt_process_t *process);
38 static nxt_int_t nxt_isolation_set_automount(nxt_task_t *task,
39     nxt_conf_value_t *isolation, nxt_process_t *process);
40 static nxt_int_t nxt_isolation_set_mounts(nxt_task_t *task,
41     nxt_process_t *process, nxt_str_t *app_type);
42 static nxt_int_t nxt_isolation_set_lang_mounts(nxt_task_t *task,
43     nxt_process_t *process, nxt_array_t *syspaths);
44 static int nxt_cdecl nxt_isolation_mount_compare(const void *v1,
45     const void *v2);
46 static void nxt_isolation_unmount_all(nxt_task_t *task, nxt_process_t *process);
47 
48 #if (NXT_HAVE_PIVOT_ROOT) && (NXT_HAVE_CLONE_NEWNS)
49 static nxt_int_t nxt_isolation_pivot_root(nxt_task_t *task, const char *rootfs);
50 static nxt_int_t nxt_isolation_make_private_mount(nxt_task_t *task,
51     const char *rootfs);
52 nxt_inline int nxt_pivot_root(const char *new_root, const char *old_root);
53 #endif
54 
55 static nxt_int_t nxt_isolation_chroot(nxt_task_t *task, const char *path);
56 #endif
57 
58 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
59 static nxt_int_t nxt_isolation_set_new_privs(nxt_task_t *task,
60     nxt_conf_value_t *isolation, nxt_process_t *process);
61 #endif
62 
63 
64 nxt_int_t
65 nxt_isolation_main_prefork(nxt_task_t *task, nxt_process_t *process,
66     nxt_mp_t *mp)
67 {
68     nxt_int_t              cap_setid;
69     nxt_int_t              ret;
70     nxt_runtime_t          *rt;
71     nxt_common_app_conf_t  *app_conf;
72 
73     rt = task->thread->runtime;
74     app_conf = process->data.app;
75     cap_setid = rt->capabilities.setid;
76 
77     if (app_conf->isolation != NULL) {
78         ret = nxt_isolation_set(task, app_conf->isolation, process);
79         if (nxt_slow_path(ret != NXT_OK)) {
80             return ret;
81         }
82     }
83 
84 #if (NXT_HAVE_CLONE_NEWUSER)
85     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWUSER)) {
86         cap_setid = 1;
87     }
88 #endif
89 
90     if (cap_setid) {
91         ret = nxt_process_creds_set(task, process, &app_conf->user,
92                                     &app_conf->group);
93 
94         if (nxt_slow_path(ret != NXT_OK)) {
95             return ret;
96         }
97 
98     } else {
99         if (!nxt_str_eq(&app_conf->user, (u_char *) rt->user_cred.user,
100                         nxt_strlen(rt->user_cred.user)))
101         {
102             nxt_alert(task, "cannot set user \"%V\" for app \"%V\": "
103                       "missing capabilities", &app_conf->user, &app_conf->name);
104 
105             return NXT_ERROR;
106         }
107 
108         if (app_conf->group.length > 0
109             && !nxt_str_eq(&app_conf->group, (u_char *) rt->group,
110                            nxt_strlen(rt->group)))
111         {
112             nxt_alert(task, "cannot set group \"%V\" for app \"%V\": "
113                             "missing capabilities", &app_conf->group,
114                             &app_conf->name);
115 
116             return NXT_ERROR;
117         }
118     }
119 
120 #if (NXT_HAVE_ISOLATION_ROOTFS)
121     if (process->isolation.rootfs != NULL) {
122         nxt_int_t  has_mnt;
123 
124         ret = nxt_isolation_set_mounts(task, process, &app_conf->type);
125         if (nxt_slow_path(ret != NXT_OK)) {
126             return ret;
127         }
128 
129         has_mnt = 0;
130 
131 #if (NXT_HAVE_CLONE_NEWNS)
132         has_mnt = nxt_is_clone_flag_set(process->isolation.clone.flags, NEWNS);
133 #endif
134 
135         if (process->user_cred->uid == 0 && !has_mnt) {
136             nxt_log(task, NXT_LOG_WARN,
137                     "setting user \"root\" with \"rootfs\" is unsafe without "
138                     "\"mount\" namespace isolation");
139         }
140     }
141 #endif
142 
143 #if (NXT_HAVE_CLONE_NEWUSER)
144     ret = nxt_isolation_vldt_creds(task, process);
145     if (nxt_slow_path(ret != NXT_OK)) {
146         return ret;
147     }
148 #endif
149 
150     return NXT_OK;
151 }
152 
153 
154 static nxt_int_t
155 nxt_isolation_set(nxt_task_t *task, nxt_conf_value_t *isolation,
156     nxt_process_t *process)
157 {
158 #if (NXT_HAVE_CLONE)
159     if (nxt_slow_path(nxt_isolation_set_namespaces(task, isolation, process)
160                       != NXT_OK))
161     {
162         return NXT_ERROR;
163     }
164 #endif
165 
166 #if (NXT_HAVE_CLONE_NEWUSER)
167     if (nxt_slow_path(nxt_isolation_set_creds(task, isolation, process)
168                       != NXT_OK))
169     {
170         return NXT_ERROR;
171     }
172 #endif
173 
174 #if (NXT_HAVE_ISOLATION_ROOTFS)
175     if (nxt_slow_path(nxt_isolation_set_rootfs(task, isolation, process)
176                       != NXT_OK))
177     {
178         return NXT_ERROR;
179     }
180 
181     if (nxt_slow_path(nxt_isolation_set_automount(task, isolation, process)
182                       != NXT_OK))
183     {
184         return NXT_ERROR;
185     }
186 #endif
187 
188 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
189     if (nxt_slow_path(nxt_isolation_set_new_privs(task, isolation, process)
190                       != NXT_OK))
191     {
192         return NXT_ERROR;
193     }
194 #endif
195 
196     return NXT_OK;
197 }
198 
199 
200 #if (NXT_HAVE_CLONE)
201 
202 static nxt_int_t
203 nxt_isolation_set_namespaces(nxt_task_t *task, nxt_conf_value_t *isolation,
204     nxt_process_t *process)
205 {
206     nxt_int_t         ret;
207     nxt_conf_value_t  *obj;
208 
209     static nxt_str_t  nsname = nxt_string("namespaces");
210 
211     obj = nxt_conf_get_object_member(isolation, &nsname, NULL);
212     if (obj != NULL) {
213         ret = nxt_isolation_clone_flags(task, obj, &process->isolation.clone);
214         if (nxt_slow_path(ret != NXT_OK)) {
215             return NXT_ERROR;
216         }
217     }
218 
219     return NXT_OK;
220 }
221 
222 #endif
223 
224 
225 #if (NXT_HAVE_CLONE_NEWUSER)
226 
227 static nxt_int_t
228 nxt_isolation_set_creds(nxt_task_t *task, nxt_conf_value_t *isolation,
229     nxt_process_t *process)
230 {
231     nxt_int_t         ret;
232     nxt_clone_t       *clone;
233     nxt_conf_value_t  *array;
234 
235     static nxt_str_t uidname = nxt_string("uidmap");
236     static nxt_str_t gidname = nxt_string("gidmap");
237 
238     clone = &process->isolation.clone;
239 
240     array = nxt_conf_get_object_member(isolation, &uidname, NULL);
241     if (array != NULL) {
242         ret = nxt_isolation_credential_map(task, process->mem_pool, array,
243                                            &clone->uidmap);
244 
245         if (nxt_slow_path(ret != NXT_OK)) {
246             return NXT_ERROR;
247         }
248     }
249 
250     array = nxt_conf_get_object_member(isolation, &gidname, NULL);
251     if (array != NULL) {
252         ret = nxt_isolation_credential_map(task, process->mem_pool, array,
253                                            &clone->gidmap);
254 
255         if (nxt_slow_path(ret != NXT_OK)) {
256             return NXT_ERROR;
257         }
258     }
259 
260     return NXT_OK;
261 }
262 
263 
264 static nxt_int_t
265 nxt_isolation_credential_map(nxt_task_t *task, nxt_mp_t *mp,
266     nxt_conf_value_t *map_array, nxt_clone_credential_map_t *map)
267 {
268     nxt_int_t         ret;
269     nxt_uint_t        i;
270     nxt_conf_value_t  *obj;
271 
272     static nxt_conf_map_t  nxt_clone_map_entry_conf[] = {
273         {
274             nxt_string("container"),
275             NXT_CONF_MAP_INT,
276             offsetof(nxt_clone_map_entry_t, container),
277         },
278 
279         {
280             nxt_string("host"),
281             NXT_CONF_MAP_INT,
282             offsetof(nxt_clone_map_entry_t, host),
283         },
284 
285         {
286             nxt_string("size"),
287             NXT_CONF_MAP_INT,
288             offsetof(nxt_clone_map_entry_t, size),
289         },
290     };
291 
292     map->size = nxt_conf_array_elements_count(map_array);
293 
294     if (map->size == 0) {
295         return NXT_OK;
296     }
297 
298     map->map = nxt_mp_alloc(mp, map->size * sizeof(nxt_clone_map_entry_t));
299     if (nxt_slow_path(map->map == NULL)) {
300         return NXT_ERROR;
301     }
302 
303     for (i = 0; i < map->size; i++) {
304         obj = nxt_conf_get_array_element(map_array, i);
305 
306         ret = nxt_conf_map_object(mp, obj, nxt_clone_map_entry_conf,
307                                   nxt_nitems(nxt_clone_map_entry_conf),
308                                   map->map + i);
309         if (nxt_slow_path(ret != NXT_OK)) {
310             nxt_alert(task, "clone map entry map error");
311             return NXT_ERROR;
312         }
313     }
314 
315     return NXT_OK;
316 }
317 
318 
319 static nxt_int_t
320 nxt_isolation_vldt_creds(nxt_task_t *task, nxt_process_t *process)
321 {
322     nxt_int_t         ret;
323     nxt_clone_t       *clone;
324     nxt_credential_t  *creds;
325 
326     clone = &process->isolation.clone;
327     creds = process->user_cred;
328 
329     if (clone->uidmap.size == 0 && clone->gidmap.size == 0) {
330         return NXT_OK;
331     }
332 
333     if (!nxt_is_clone_flag_set(clone->flags, NEWUSER)) {
334         if (nxt_slow_path(clone->uidmap.size > 0)) {
335             nxt_log(task, NXT_LOG_ERR, "\"uidmap\" is set but "
336                     "\"isolation.namespaces.credential\" is false or unset");
337 
338             return NXT_ERROR;
339         }
340 
341         if (nxt_slow_path(clone->gidmap.size > 0)) {
342             nxt_log(task, NXT_LOG_ERR, "\"gidmap\" is set but "
343                     "\"isolation.namespaces.credential\" is false or unset");
344 
345             return NXT_ERROR;
346         }
347 
348         return NXT_OK;
349     }
350 
351     ret = nxt_clone_vldt_credential_uidmap(task, &clone->uidmap, creds);
352     if (nxt_slow_path(ret != NXT_OK)) {
353         return NXT_ERROR;
354     }
355 
356     return nxt_clone_vldt_credential_gidmap(task, &clone->gidmap, creds);
357 }
358 
359 #endif
360 
361 
362 #if (NXT_HAVE_CLONE)
363 
364 static nxt_int_t
365 nxt_isolation_clone_flags(nxt_task_t *task, nxt_conf_value_t *namespaces,
366     nxt_clone_t *clone)
367 {
368     uint32_t          index;
369     nxt_str_t         name;
370     nxt_int_t         flag;
371     nxt_conf_value_t  *value;
372 
373     index = 0;
374 
375     for ( ;; ) {
376         value = nxt_conf_next_object_member(namespaces, &name, &index);
377 
378         if (value == NULL) {
379             break;
380         }
381 
382         flag = 0;
383 
384 #if (NXT_HAVE_CLONE_NEWUSER)
385         if (nxt_str_eq(&name, "credential", 10)) {
386             flag = CLONE_NEWUSER;
387         }
388 #endif
389 
390 #if (NXT_HAVE_CLONE_NEWPID)
391         if (nxt_str_eq(&name, "pid", 3)) {
392             flag = CLONE_NEWPID;
393         }
394 #endif
395 
396 #if (NXT_HAVE_CLONE_NEWNET)
397         if (nxt_str_eq(&name, "network", 7)) {
398             flag = CLONE_NEWNET;
399         }
400 #endif
401 
402 #if (NXT_HAVE_CLONE_NEWUTS)
403         if (nxt_str_eq(&name, "uname", 5)) {
404             flag = CLONE_NEWUTS;
405         }
406 #endif
407 
408 #if (NXT_HAVE_CLONE_NEWNS)
409         if (nxt_str_eq(&name, "mount", 5)) {
410             flag = CLONE_NEWNS;
411         }
412 #endif
413 
414 #if (NXT_HAVE_CLONE_NEWCGROUP)
415         if (nxt_str_eq(&name, "cgroup", 6)) {
416             flag = CLONE_NEWCGROUP;
417         }
418 #endif
419 
420         if (!flag) {
421             nxt_alert(task, "unknown namespace flag: \"%V\"", &name);
422             return NXT_ERROR;
423         }
424 
425         if (nxt_conf_get_boolean(value)) {
426             clone->flags |= flag;
427         }
428     }
429 
430     return NXT_OK;
431 }
432 
433 #endif
434 
435 
436 #if (NXT_HAVE_ISOLATION_ROOTFS)
437 
438 static nxt_int_t
439 nxt_isolation_set_rootfs(nxt_task_t *task, nxt_conf_value_t *isolation,
440     nxt_process_t *process)
441 {
442     nxt_str_t         str;
443     nxt_conf_value_t  *obj;
444 
445     static nxt_str_t  rootfs_name = nxt_string("rootfs");
446 
447     obj = nxt_conf_get_object_member(isolation, &rootfs_name, NULL);
448     if (obj != NULL) {
449         nxt_conf_get_string(obj, &str);
450 
451         if (nxt_slow_path(str.length <= 1 || str.start[0] != '/')) {
452             nxt_log(task, NXT_LOG_ERR, "rootfs requires an absolute path other "
453                     "than \"/\" but given \"%V\"", &str);
454 
455             return NXT_ERROR;
456         }
457 
458         if (str.start[str.length - 1] == '/') {
459             str.length--;
460         }
461 
462         process->isolation.rootfs = nxt_mp_alloc(process->mem_pool,
463                                                  str.length + 1);
464 
465         if (nxt_slow_path(process->isolation.rootfs == NULL)) {
466             return NXT_ERROR;
467         }
468 
469         nxt_memcpy(process->isolation.rootfs, str.start, str.length);
470 
471         process->isolation.rootfs[str.length] = '\0';
472     }
473 
474     return NXT_OK;
475 }
476 
477 
478 static nxt_int_t
479 nxt_isolation_set_automount(nxt_task_t *task, nxt_conf_value_t *isolation,
480     nxt_process_t *process)
481 {
482     nxt_conf_value_t         *conf, *value;
483     nxt_process_automount_t  *automount;
484 
485     static nxt_str_t  automount_name = nxt_string("automount");
486     static nxt_str_t  langdeps_name = nxt_string("language_deps");
487     static nxt_str_t  tmp_name = nxt_string("tmpfs");
488 
489     automount = &process->isolation.automount;
490 
491     automount->language_deps = 1;
492     automount->tmpfs = 1;
493 
494     conf = nxt_conf_get_object_member(isolation, &automount_name, NULL);
495     if (conf != NULL) {
496         value = nxt_conf_get_object_member(conf, &langdeps_name, NULL);
497         if (value != NULL) {
498             automount->language_deps = nxt_conf_get_boolean(value);
499         }
500 
501         value = nxt_conf_get_object_member(conf, &tmp_name, NULL);
502         if (value != NULL) {
503             automount->tmpfs = nxt_conf_get_boolean(value);
504         }
505     }
506 
507     return NXT_OK;
508 }
509 
510 
511 static nxt_int_t
512 nxt_isolation_set_mounts(nxt_task_t *task, nxt_process_t *process,
513     nxt_str_t *app_type)
514 {
515     nxt_int_t              ret, cap_chroot;
516     nxt_runtime_t          *rt;
517     nxt_app_lang_module_t  *lang;
518 
519     rt = task->thread->runtime;
520     cap_chroot = rt->capabilities.chroot;
521     lang = nxt_app_lang_module(rt, app_type);
522 
523     nxt_assert(lang != NULL);
524 
525 #if (NXT_HAVE_CLONE_NEWUSER)
526     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWUSER)) {
527         cap_chroot = 1;
528     }
529 #endif
530 
531     if (!cap_chroot) {
532         nxt_log(task, NXT_LOG_ERR, "The \"rootfs\" field requires privileges");
533         return NXT_ERROR;
534     }
535 
536     ret = nxt_isolation_set_lang_mounts(task, process, lang->mounts);
537     if (nxt_slow_path(ret != NXT_OK)) {
538         return NXT_ERROR;
539     }
540 
541     process->isolation.cleanup = nxt_isolation_unmount_all;
542 
543     return NXT_OK;
544 }
545 
546 
547 static nxt_int_t
548 nxt_isolation_set_lang_mounts(nxt_task_t *task, nxt_process_t *process,
549     nxt_array_t *lang_mounts)
550 {
551     u_char          *p;
552     size_t          i, n, rootfs_len, len;
553     nxt_mp_t        *mp;
554     nxt_array_t     *mounts;
555     const u_char    *rootfs;
556     nxt_fs_mount_t  *mnt, *lang_mnt;
557 
558     mp = process->mem_pool;
559 
560     /* copy to init mem pool */
561     mounts = nxt_array_copy(mp, NULL, lang_mounts);
562     if (mounts == NULL) {
563         return NXT_ERROR;
564     }
565 
566     n = mounts->nelts;
567     mnt = mounts->elts;
568     lang_mnt = lang_mounts->elts;
569 
570     rootfs = process->isolation.rootfs;
571     rootfs_len = nxt_strlen(rootfs);
572 
573     for (i = 0; i < n; i++) {
574         len = nxt_strlen(lang_mnt[i].dst);
575 
576         mnt[i].dst = nxt_mp_alloc(mp, rootfs_len + len + 1);
577         if (nxt_slow_path(mnt[i].dst == NULL)) {
578             return NXT_ERROR;
579         }
580 
581         p = nxt_cpymem(mnt[i].dst, rootfs, rootfs_len);
582         p = nxt_cpymem(p, lang_mnt[i].dst, len);
583         *p = '\0';
584     }
585 
586     if (process->isolation.automount.tmpfs) {
587         mnt = nxt_array_add(mounts);
588         if (nxt_slow_path(mnt == NULL)) {
589             return NXT_ERROR;
590         }
591 
592         mnt->src = (u_char *) "tmpfs";
593         mnt->name = (u_char *) "tmpfs";
594         mnt->type = NXT_FS_TMP;
595         mnt->flags = (NXT_FS_FLAGS_NOSUID
596                       | NXT_FS_FLAGS_NODEV
597                       | NXT_FS_FLAGS_NOEXEC);
598         mnt->data = (u_char *) "size=1m,mode=777";
599         mnt->builtin = 1;
600         mnt->deps = 0;
601 
602         mnt->dst = nxt_mp_nget(mp, rootfs_len + nxt_length("/tmp") + 1);
603         if (nxt_slow_path(mnt->dst == NULL)) {
604             return NXT_ERROR;
605         }
606 
607         p = nxt_cpymem(mnt->dst, rootfs, rootfs_len);
608         p = nxt_cpymem(p, "/tmp", 4);
609         *p = '\0';
610     }
611 
612     mnt = nxt_array_add(mounts);
613     if (nxt_slow_path(mnt == NULL)) {
614         return NXT_ERROR;
615     }
616 
617     mnt->name = (u_char *) "proc";
618     mnt->type = NXT_FS_PROC;
619     mnt->src = (u_char *) "none";
620     mnt->dst = nxt_mp_nget(mp, rootfs_len + nxt_length("/proc") + 1);
621     if (nxt_slow_path(mnt->dst == NULL)) {
622         return NXT_ERROR;
623     }
624 
625     p = nxt_cpymem(mnt->dst, rootfs, rootfs_len);
626     p = nxt_cpymem(p, "/proc", 5);
627     *p = '\0';
628 
629     mnt->data = (u_char *) "";
630     mnt->flags = NXT_FS_FLAGS_NOEXEC | NXT_FS_FLAGS_NOSUID;
631     mnt->builtin = 1;
632     mnt->deps = 0;
633 
634     qsort(mounts->elts, mounts->nelts, sizeof(nxt_fs_mount_t),
635           nxt_isolation_mount_compare);
636 
637     process->isolation.mounts = mounts;
638 
639     return NXT_OK;
640 }
641 
642 
643 static int nxt_cdecl
644 nxt_isolation_mount_compare(const void *v1, const void *v2)
645 {
646     const nxt_fs_mount_t  *mnt1, *mnt2;
647 
648     mnt1 = v1;
649     mnt2 = v2;
650 
651     return nxt_strlen(mnt1->src) > nxt_strlen(mnt2->src);
652 }
653 
654 
655 void
656 nxt_isolation_unmount_all(nxt_task_t *task, nxt_process_t *process)
657 {
658     size_t                   n;
659     nxt_array_t              *mounts;
660     nxt_runtime_t            *rt;
661     nxt_fs_mount_t           *mnt;
662     nxt_process_automount_t  *automount;
663 
664     rt = task->thread->runtime;
665 
666     if (!rt->capabilities.setid) {
667         return;
668     }
669 
670 #if (NXT_HAVE_CLONE_NEWNS)
671     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWNS)) {
672         return;
673     }
674 #endif
675 
676     nxt_debug(task, "unmount all (%s)", process->name);
677 
678     automount = &process->isolation.automount;
679     mounts = process->isolation.mounts;
680     n = mounts->nelts;
681     mnt = mounts->elts;
682 
683     while (n > 0) {
684         n--;
685 
686         if (mnt[n].deps && !automount->language_deps) {
687             continue;
688         }
689 
690         nxt_fs_unmount(mnt[n].dst);
691     }
692 }
693 
694 
695 nxt_int_t
696 nxt_isolation_prepare_rootfs(nxt_task_t *task, nxt_process_t *process)
697 {
698     size_t                   i, n;
699     nxt_int_t                ret;
700     struct stat              st;
701     nxt_array_t              *mounts;
702     const u_char             *dst;
703     nxt_fs_mount_t           *mnt;
704     nxt_process_automount_t  *automount;
705 
706     automount = &process->isolation.automount;
707     mounts = process->isolation.mounts;
708 
709     n = mounts->nelts;
710     mnt = mounts->elts;
711 
712     for (i = 0; i < n; i++) {
713         dst = mnt[i].dst;
714 
715         if (mnt[i].deps && !automount->language_deps) {
716             continue;
717         }
718 
719         if (nxt_slow_path(mnt[i].type == NXT_FS_BIND
720                           && stat((const char *) mnt[i].src, &st) != 0))
721         {
722             nxt_log(task, NXT_LOG_WARN, "host path not found: %s", mnt[i].src);
723             continue;
724         }
725 
726         ret = nxt_fs_mkdir_all(dst, S_IRWXU | S_IRWXG | S_IRWXO);
727         if (nxt_slow_path(ret != NXT_OK)) {
728             nxt_alert(task, "mkdir(%s) %E", dst, nxt_errno);
729             goto undo;
730         }
731 
732         ret = nxt_fs_mount(task, &mnt[i]);
733         if (nxt_slow_path(ret != NXT_OK)) {
734             goto undo;
735         }
736     }
737 
738     return NXT_OK;
739 
740 undo:
741 
742     n = i + 1;
743 
744     for (i = 0; i < n; i++) {
745         nxt_fs_unmount(mnt[i].dst);
746     }
747 
748     return NXT_ERROR;
749 }
750 
751 
752 #if (NXT_HAVE_PIVOT_ROOT) && (NXT_HAVE_CLONE_NEWNS)
753 
754 nxt_int_t
755 nxt_isolation_change_root(nxt_task_t *task, nxt_process_t *process)
756 {
757     char       *rootfs;
758     nxt_int_t  ret;
759 
760     rootfs = (char *) process->isolation.rootfs;
761 
762     nxt_debug(task, "change root: %s", rootfs);
763 
764     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWNS)) {
765         ret = nxt_isolation_pivot_root(task, rootfs);
766 
767     } else {
768         ret = nxt_isolation_chroot(task, rootfs);
769     }
770 
771     if (nxt_fast_path(ret == NXT_OK)) {
772         if (nxt_slow_path(chdir("/") < 0)) {
773             nxt_alert(task, "chdir(\"/\") %E", nxt_errno);
774             return NXT_ERROR;
775         }
776     }
777 
778     return ret;
779 }
780 
781 
782 /*
783  * pivot_root(2) can only be safely used with containers, otherwise it can
784  * umount(2) the global root filesystem and screw up the machine.
785  */
786 
787 static nxt_int_t
788 nxt_isolation_pivot_root(nxt_task_t *task, const char *path)
789 {
790     /*
791      * This implementation makes use of a kernel trick that works for ages
792      * and now documented in Linux kernel 5.
793      * https://lore.kernel.org/linux-man/87r24piwhm.fsf@x220.int.ebiederm.org/T/
794      */
795 
796     if (nxt_slow_path(mount("", "/", "", MS_SLAVE|MS_REC, "") != 0)) {
797         nxt_alert(task, "mount(\"/\", MS_SLAVE|MS_REC) failed: %E", nxt_errno);
798         return NXT_ERROR;
799     }
800 
801     if (nxt_slow_path(nxt_isolation_make_private_mount(task, path) != NXT_OK)) {
802         return NXT_ERROR;
803     }
804 
805     if (nxt_slow_path(mount(path, path, "bind", MS_BIND|MS_REC, "") != 0)) {
806         nxt_alert(task, "error bind mounting rootfs %E", nxt_errno);
807         return NXT_ERROR;
808     }
809 
810     if (nxt_slow_path(chdir(path) != 0)) {
811         nxt_alert(task, "failed to chdir(%s) %E", path, nxt_errno);
812         return NXT_ERROR;
813     }
814 
815     if (nxt_slow_path(nxt_pivot_root(".", ".") != 0)) {
816         nxt_alert(task, "failed to pivot_root %E", nxt_errno);
817         return NXT_ERROR;
818     }
819 
820     /*
821      * Demote the oldroot mount to avoid unmounts getting propagated to
822      * the host.
823      */
824     if (nxt_slow_path(mount("", ".", "", MS_SLAVE | MS_REC, NULL) != 0)) {
825         nxt_alert(task, "failed to bind mount rootfs %E", nxt_errno);
826         return NXT_ERROR;
827     }
828 
829     if (nxt_slow_path(umount2(".", MNT_DETACH) != 0)) {
830         nxt_alert(task, "failed to umount old root directory %E", nxt_errno);
831         return NXT_ERROR;
832     }
833 
834     return NXT_OK;
835 }
836 
837 
838 static nxt_int_t
839 nxt_isolation_make_private_mount(nxt_task_t *task, const char *rootfs)
840 {
841     char           *parent_mnt;
842     FILE           *procfile;
843     u_char         **mounts;
844     size_t         len;
845     uint8_t        *shared;
846     nxt_int_t      ret, index, nmounts;
847     struct mntent  *ent;
848 
849     static const char  *mount_path = "/proc/self/mounts";
850 
851     ret = NXT_ERROR;
852     ent = NULL;
853     shared = NULL;
854     procfile = NULL;
855     parent_mnt = NULL;
856 
857     nmounts = 256;
858 
859     mounts = nxt_malloc(nmounts * sizeof(uintptr_t));
860     if (nxt_slow_path(mounts == NULL)) {
861         goto fail;
862     }
863 
864     shared = nxt_malloc(nmounts);
865     if (nxt_slow_path(shared == NULL)) {
866         goto fail;
867     }
868 
869     procfile = setmntent(mount_path, "r");
870     if (nxt_slow_path(procfile == NULL)) {
871         nxt_alert(task, "failed to open %s %E", mount_path, nxt_errno);
872 
873         goto fail;
874     }
875 
876     index = 0;
877 
878 again:
879 
880     for ( ; index < nmounts; index++) {
881         ent = getmntent(procfile);
882         if (ent == NULL) {
883             nmounts = index;
884             break;
885         }
886 
887         mounts[index] = (u_char *) strdup(ent->mnt_dir);
888         shared[index] = hasmntopt(ent, "shared") != NULL;
889     }
890 
891     if (ent != NULL) {
892         /* there are still entries to be read */
893 
894         nmounts *= 2;
895         mounts = nxt_realloc(mounts, nmounts);
896         if (nxt_slow_path(mounts == NULL)) {
897             goto fail;
898         }
899 
900         shared = nxt_realloc(shared, nmounts);
901         if (nxt_slow_path(shared == NULL)) {
902             goto fail;
903         }
904 
905         goto again;
906     }
907 
908     for (index = 0; index < nmounts; index++) {
909         if (nxt_strcmp(mounts[index], rootfs) == 0) {
910             parent_mnt = (char *) rootfs;
911             break;
912         }
913     }
914 
915     if (parent_mnt == NULL) {
916         len = nxt_strlen(rootfs);
917 
918         parent_mnt = nxt_malloc(len + 1);
919         if (parent_mnt == NULL) {
920             goto fail;
921         }
922 
923         nxt_memcpy(parent_mnt, rootfs, len);
924         parent_mnt[len] = '\0';
925 
926         if (parent_mnt[len - 1] == '/') {
927             parent_mnt[len - 1] = '\0';
928             len--;
929         }
930 
931         for ( ;; ) {
932             for (index = 0; index < nmounts; index++) {
933                 if (nxt_strcmp(mounts[index], parent_mnt) == 0) {
934                     goto found;
935                 }
936             }
937 
938             if (len == 1 && parent_mnt[0] == '/') {
939                 nxt_alert(task, "parent mount not found");
940                 goto fail;
941             }
942 
943             /* parent dir */
944             while (parent_mnt[len - 1] != '/' && len > 0) {
945                 len--;
946             }
947 
948             if (nxt_slow_path(len == 0)) {
949                 nxt_alert(task, "parent mount not found");
950                 goto fail;
951             }
952 
953             if (len == 1) {
954                 parent_mnt[len] = '\0';     /* / */
955             } else {
956                 parent_mnt[len - 1] = '\0'; /* /<path> */
957             }
958         }
959     }
960 
961 found:
962 
963     if (shared[index]) {
964         if (nxt_slow_path(mount("", parent_mnt, "", MS_PRIVATE, "") != 0)) {
965             nxt_alert(task, "mount(\"\", \"%s\", MS_PRIVATE) %E", parent_mnt,
966                       nxt_errno);
967 
968             goto fail;
969         }
970     }
971 
972     ret = NXT_OK;
973 
974 fail:
975 
976     if (procfile != NULL) {
977         endmntent(procfile);
978     }
979 
980     if (mounts != NULL) {
981         for (index = 0; index < nmounts; index++) {
982             nxt_free(mounts[index]);
983         }
984 
985         nxt_free(mounts);
986     }
987 
988     if (shared != NULL) {
989         nxt_free(shared);
990     }
991 
992     if (parent_mnt != NULL && parent_mnt != rootfs) {
993         nxt_free(parent_mnt);
994     }
995 
996     return ret;
997 }
998 
999 
1000 nxt_inline int
1001 nxt_pivot_root(const char *new_root, const char *old_root)
1002 {
1003     return syscall(__NR_pivot_root, new_root, old_root);
1004 }
1005 
1006 
1007 #else /* !(NXT_HAVE_PIVOT_ROOT) || !(NXT_HAVE_CLONE_NEWNS) */
1008 
1009 
1010 nxt_int_t
1011 nxt_isolation_change_root(nxt_task_t *task, nxt_process_t *process)
1012 {
1013     char       *rootfs;
1014 
1015     rootfs = (char *) process->isolation.rootfs;
1016 
1017     nxt_debug(task, "change root: %s", rootfs);
1018 
1019     if (nxt_fast_path(nxt_isolation_chroot(task, rootfs) == NXT_OK)) {
1020         if (nxt_slow_path(chdir("/") < 0)) {
1021             nxt_alert(task, "chdir(\"/\") %E", nxt_errno);
1022             return NXT_ERROR;
1023         }
1024 
1025         return NXT_OK;
1026     }
1027 
1028     return NXT_ERROR;
1029 }
1030 
1031 #endif
1032 
1033 
1034 static nxt_int_t
1035 nxt_isolation_chroot(nxt_task_t *task, const char *path)
1036 {
1037     if (nxt_slow_path(chroot(path) < 0)) {
1038         nxt_alert(task, "chroot(%s) %E", path, nxt_errno);
1039         return NXT_ERROR;
1040     }
1041 
1042     return NXT_OK;
1043 }
1044 
1045 #endif /* NXT_HAVE_ISOLATION_ROOTFS */
1046 
1047 
1048 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
1049 
1050 static nxt_int_t
1051 nxt_isolation_set_new_privs(nxt_task_t *task, nxt_conf_value_t *isolation,
1052     nxt_process_t *process)
1053 {
1054     nxt_conf_value_t  *obj;
1055 
1056     static nxt_str_t  new_privs_name = nxt_string("new_privs");
1057 
1058     obj = nxt_conf_get_object_member(isolation, &new_privs_name, NULL);
1059     if (obj != NULL) {
1060         process->isolation.new_privs = nxt_conf_get_boolean(obj);
1061     }
1062 
1063     return NXT_OK;
1064 }
1065 
1066 #endif
1067