xref: /unit/src/nxt_isolation.c (revision 2629:116cb969f351)
1 /*
2  * Copyright (C) NGINX, Inc.
3  */
4 
5 #include <nxt_main.h>
6 #include <nxt_application.h>
7 #include <nxt_process.h>
8 #include <nxt_isolation.h>
9 #include <nxt_cgroup.h>
10 
11 #if (NXT_HAVE_MNTENT_H)
12 #include <mntent.h>
13 #endif
14 
15 
16 static nxt_int_t nxt_isolation_set(nxt_task_t *task,
17     nxt_conf_value_t *isolation, nxt_process_t *process);
18 
19 #if (NXT_HAVE_CGROUP)
20 static nxt_int_t nxt_isolation_set_cgroup(nxt_task_t *task,
21     nxt_conf_value_t *isolation, nxt_process_t *process);
22 #endif
23 
24 #if (NXT_HAVE_LINUX_NS)
25 static nxt_int_t nxt_isolation_set_namespaces(nxt_task_t *task,
26     nxt_conf_value_t *isolation, nxt_process_t *process);
27 static nxt_int_t nxt_isolation_clone_flags(nxt_task_t *task,
28     nxt_conf_value_t *namespaces, nxt_clone_t *clone);
29 #endif
30 
31 #if (NXT_HAVE_CLONE_NEWUSER)
32 static nxt_int_t nxt_isolation_set_creds(nxt_task_t *task,
33     nxt_conf_value_t *isolation, nxt_process_t *process);
34 static nxt_int_t nxt_isolation_credential_map(nxt_task_t *task,
35     nxt_mp_t *mem_pool, nxt_conf_value_t *map_array,
36     nxt_clone_credential_map_t *map);
37 static nxt_int_t nxt_isolation_vldt_creds(nxt_task_t *task,
38     nxt_process_t *process);
39 #endif
40 
41 #if (NXT_HAVE_ISOLATION_ROOTFS)
42 static nxt_int_t nxt_isolation_set_rootfs(nxt_task_t *task,
43     nxt_conf_value_t *isolation, nxt_process_t *process);
44 static nxt_int_t nxt_isolation_set_automount(nxt_task_t *task,
45     nxt_conf_value_t *isolation, nxt_process_t *process);
46 static nxt_int_t nxt_isolation_set_mounts(nxt_task_t *task,
47     nxt_process_t *process, nxt_str_t *app_type);
48 static nxt_int_t nxt_isolation_set_lang_mounts(nxt_task_t *task,
49     nxt_process_t *process, nxt_array_t *syspaths);
50 static int nxt_cdecl nxt_isolation_mount_compare(const void *v1,
51     const void *v2);
52 static void nxt_isolation_unmount_all(nxt_task_t *task, nxt_process_t *process);
53 
54 #if (NXT_HAVE_LINUX_PIVOT_ROOT) && (NXT_HAVE_CLONE_NEWNS)
55 static nxt_int_t nxt_isolation_pivot_root(nxt_task_t *task, const char *rootfs);
56 static nxt_int_t nxt_isolation_make_private_mount(nxt_task_t *task,
57     const char *rootfs);
58 nxt_inline int nxt_pivot_root(const char *new_root, const char *old_root);
59 #endif
60 
61 static nxt_int_t nxt_isolation_chroot(nxt_task_t *task, const char *path);
62 #endif
63 
64 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
65 static nxt_int_t nxt_isolation_set_new_privs(nxt_task_t *task,
66     nxt_conf_value_t *isolation, nxt_process_t *process);
67 #endif
68 
69 
70 nxt_int_t
nxt_isolation_main_prefork(nxt_task_t * task,nxt_process_t * process,nxt_mp_t * mp)71 nxt_isolation_main_prefork(nxt_task_t *task, nxt_process_t *process,
72     nxt_mp_t *mp)
73 {
74     nxt_int_t              cap_setid;
75     nxt_int_t              ret;
76     nxt_runtime_t          *rt;
77     nxt_common_app_conf_t  *app_conf;
78 
79     rt = task->thread->runtime;
80     app_conf = process->data.app;
81     cap_setid = rt->capabilities.setid;
82 
83 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
84     process->isolation.new_privs = 1;
85 #endif
86 
87     if (app_conf->isolation != NULL) {
88         ret = nxt_isolation_set(task, app_conf->isolation, process);
89         if (nxt_slow_path(ret != NXT_OK)) {
90             return ret;
91         }
92     }
93 
94 #if (NXT_HAVE_CLONE_NEWUSER)
95     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWUSER)) {
96         cap_setid = 1;
97     }
98 #endif
99 
100     if (cap_setid) {
101         ret = nxt_process_creds_set(task, process, &app_conf->user,
102                                     &app_conf->group);
103 
104         if (nxt_slow_path(ret != NXT_OK)) {
105             return ret;
106         }
107 
108     } else {
109         if (!nxt_str_eq(&app_conf->user, (u_char *) rt->user_cred.user,
110                         nxt_strlen(rt->user_cred.user)))
111         {
112             nxt_alert(task, "cannot set user \"%V\" for app \"%V\": "
113                       "missing capabilities", &app_conf->user, &app_conf->name);
114 
115             return NXT_ERROR;
116         }
117 
118         if (app_conf->group.length > 0
119             && !nxt_str_eq(&app_conf->group, (u_char *) rt->group,
120                            nxt_strlen(rt->group)))
121         {
122             nxt_alert(task, "cannot set group \"%V\" for app \"%V\": "
123                             "missing capabilities", &app_conf->group,
124                             &app_conf->name);
125 
126             return NXT_ERROR;
127         }
128     }
129 
130 #if (NXT_HAVE_ISOLATION_ROOTFS)
131     if (process->isolation.rootfs != NULL) {
132         nxt_int_t  has_mnt;
133 
134         ret = nxt_isolation_set_mounts(task, process, &app_conf->type);
135         if (nxt_slow_path(ret != NXT_OK)) {
136             return ret;
137         }
138 
139 #if (NXT_HAVE_CLONE_NEWNS)
140         has_mnt = nxt_is_clone_flag_set(process->isolation.clone.flags, NEWNS);
141 #else
142         has_mnt = 0;
143 #endif
144 
145         if (process->user_cred->uid == 0 && !has_mnt) {
146             nxt_log(task, NXT_LOG_WARN,
147                     "setting user \"root\" with \"rootfs\" is unsafe without "
148                     "\"mount\" namespace isolation");
149         }
150     }
151 #endif
152 
153 #if (NXT_HAVE_CLONE_NEWUSER)
154     ret = nxt_isolation_vldt_creds(task, process);
155     if (nxt_slow_path(ret != NXT_OK)) {
156         return ret;
157     }
158 #endif
159 
160     return NXT_OK;
161 }
162 
163 
164 static nxt_int_t
nxt_isolation_set(nxt_task_t * task,nxt_conf_value_t * isolation,nxt_process_t * process)165 nxt_isolation_set(nxt_task_t *task, nxt_conf_value_t *isolation,
166     nxt_process_t *process)
167 {
168 #if (NXT_HAVE_CGROUP)
169     if (nxt_slow_path(nxt_isolation_set_cgroup(task, isolation, process)
170                       != NXT_OK))
171     {
172         return NXT_ERROR;
173     }
174 #endif
175 
176 #if (NXT_HAVE_LINUX_NS)
177     if (nxt_slow_path(nxt_isolation_set_namespaces(task, isolation, process)
178                       != NXT_OK))
179     {
180         return NXT_ERROR;
181     }
182 #endif
183 
184 #if (NXT_HAVE_CLONE_NEWUSER)
185     if (nxt_slow_path(nxt_isolation_set_creds(task, isolation, process)
186                       != NXT_OK))
187     {
188         return NXT_ERROR;
189     }
190 #endif
191 
192 #if (NXT_HAVE_ISOLATION_ROOTFS)
193     if (nxt_slow_path(nxt_isolation_set_rootfs(task, isolation, process)
194                       != NXT_OK))
195     {
196         return NXT_ERROR;
197     }
198 
199     if (nxt_slow_path(nxt_isolation_set_automount(task, isolation, process)
200                       != NXT_OK))
201     {
202         return NXT_ERROR;
203     }
204 #endif
205 
206 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
207     if (nxt_slow_path(nxt_isolation_set_new_privs(task, isolation, process)
208                       != NXT_OK))
209     {
210         return NXT_ERROR;
211     }
212 #endif
213 
214     return NXT_OK;
215 }
216 
217 
218 #if (NXT_HAVE_CGROUP)
219 
220 static nxt_int_t
nxt_isolation_set_cgroup(nxt_task_t * task,nxt_conf_value_t * isolation,nxt_process_t * process)221 nxt_isolation_set_cgroup(nxt_task_t *task, nxt_conf_value_t *isolation,
222     nxt_process_t *process)
223 {
224     nxt_str_t         str;
225     nxt_conf_value_t  *obj;
226 
227     static nxt_str_t  cgname = nxt_string("cgroup");
228     static nxt_str_t  path = nxt_string("path");
229 
230     obj = nxt_conf_get_object_member(isolation, &cgname, NULL);
231     if (obj == NULL) {
232         return NXT_OK;
233     }
234 
235     obj = nxt_conf_get_object_member(obj, &path, NULL);
236     if (obj == NULL) {
237         return NXT_ERROR;
238     }
239 
240     nxt_conf_get_string(obj, &str);
241     process->isolation.cgroup.path = nxt_mp_alloc(process->mem_pool,
242                                                   str.length + 1);
243     nxt_memcpy(process->isolation.cgroup.path, str.start, str.length);
244     process->isolation.cgroup.path[str.length] = '\0';
245 
246     process->isolation.cgroup_cleanup = nxt_cgroup_cleanup;
247 
248     return NXT_OK;
249 }
250 
251 #endif
252 
253 
254 #if (NXT_HAVE_LINUX_NS)
255 
256 static nxt_int_t
nxt_isolation_set_namespaces(nxt_task_t * task,nxt_conf_value_t * isolation,nxt_process_t * process)257 nxt_isolation_set_namespaces(nxt_task_t *task, nxt_conf_value_t *isolation,
258     nxt_process_t *process)
259 {
260     nxt_int_t         ret;
261     nxt_conf_value_t  *obj;
262 
263     static nxt_str_t  nsname = nxt_string("namespaces");
264 
265     obj = nxt_conf_get_object_member(isolation, &nsname, NULL);
266     if (obj != NULL) {
267         ret = nxt_isolation_clone_flags(task, obj, &process->isolation.clone);
268         if (nxt_slow_path(ret != NXT_OK)) {
269             return NXT_ERROR;
270         }
271     }
272 
273     return NXT_OK;
274 }
275 
276 #endif
277 
278 
279 #if (NXT_HAVE_CLONE_NEWUSER)
280 
281 static nxt_int_t
nxt_isolation_set_creds(nxt_task_t * task,nxt_conf_value_t * isolation,nxt_process_t * process)282 nxt_isolation_set_creds(nxt_task_t *task, nxt_conf_value_t *isolation,
283     nxt_process_t *process)
284 {
285     nxt_int_t         ret;
286     nxt_clone_t       *clone;
287     nxt_conf_value_t  *array;
288 
289     static nxt_str_t uidname = nxt_string("uidmap");
290     static nxt_str_t gidname = nxt_string("gidmap");
291 
292     clone = &process->isolation.clone;
293 
294     array = nxt_conf_get_object_member(isolation, &uidname, NULL);
295     if (array != NULL) {
296         ret = nxt_isolation_credential_map(task, process->mem_pool, array,
297                                            &clone->uidmap);
298 
299         if (nxt_slow_path(ret != NXT_OK)) {
300             return NXT_ERROR;
301         }
302     }
303 
304     array = nxt_conf_get_object_member(isolation, &gidname, NULL);
305     if (array != NULL) {
306         ret = nxt_isolation_credential_map(task, process->mem_pool, array,
307                                            &clone->gidmap);
308 
309         if (nxt_slow_path(ret != NXT_OK)) {
310             return NXT_ERROR;
311         }
312     }
313 
314     return NXT_OK;
315 }
316 
317 
318 static nxt_int_t
nxt_isolation_credential_map(nxt_task_t * task,nxt_mp_t * mp,nxt_conf_value_t * map_array,nxt_clone_credential_map_t * map)319 nxt_isolation_credential_map(nxt_task_t *task, nxt_mp_t *mp,
320     nxt_conf_value_t *map_array, nxt_clone_credential_map_t *map)
321 {
322     nxt_int_t         ret;
323     nxt_uint_t        i;
324     nxt_conf_value_t  *obj;
325 
326     static nxt_conf_map_t  nxt_clone_map_entry_conf[] = {
327         {
328             nxt_string("container"),
329             NXT_CONF_MAP_INT64,
330             offsetof(nxt_clone_map_entry_t, container),
331         },
332 
333         {
334             nxt_string("host"),
335             NXT_CONF_MAP_INT64,
336             offsetof(nxt_clone_map_entry_t, host),
337         },
338 
339         {
340             nxt_string("size"),
341             NXT_CONF_MAP_INT64,
342             offsetof(nxt_clone_map_entry_t, size),
343         },
344     };
345 
346     map->size = nxt_conf_array_elements_count(map_array);
347 
348     if (map->size == 0) {
349         return NXT_OK;
350     }
351 
352     map->map = nxt_mp_alloc(mp, map->size * sizeof(nxt_clone_map_entry_t));
353     if (nxt_slow_path(map->map == NULL)) {
354         return NXT_ERROR;
355     }
356 
357     for (i = 0; i < map->size; i++) {
358         obj = nxt_conf_get_array_element(map_array, i);
359 
360         ret = nxt_conf_map_object(mp, obj, nxt_clone_map_entry_conf,
361                                   nxt_nitems(nxt_clone_map_entry_conf),
362                                   map->map + i);
363         if (nxt_slow_path(ret != NXT_OK)) {
364             nxt_alert(task, "clone map entry map error");
365             return NXT_ERROR;
366         }
367     }
368 
369     return NXT_OK;
370 }
371 
372 
373 static nxt_int_t
nxt_isolation_vldt_creds(nxt_task_t * task,nxt_process_t * process)374 nxt_isolation_vldt_creds(nxt_task_t *task, nxt_process_t *process)
375 {
376     nxt_int_t         ret;
377     nxt_clone_t       *clone;
378     nxt_credential_t  *creds;
379 
380     clone = &process->isolation.clone;
381     creds = process->user_cred;
382 
383     if (clone->uidmap.size == 0 && clone->gidmap.size == 0) {
384         return NXT_OK;
385     }
386 
387     if (!nxt_is_clone_flag_set(clone->flags, NEWUSER)) {
388         if (nxt_slow_path(clone->uidmap.size > 0)) {
389             nxt_log(task, NXT_LOG_ERR, "\"uidmap\" is set but "
390                     "\"isolation.namespaces.credential\" is false or unset");
391 
392             return NXT_ERROR;
393         }
394 
395         if (nxt_slow_path(clone->gidmap.size > 0)) {
396             nxt_log(task, NXT_LOG_ERR, "\"gidmap\" is set but "
397                     "\"isolation.namespaces.credential\" is false or unset");
398 
399             return NXT_ERROR;
400         }
401 
402         return NXT_OK;
403     }
404 
405     ret = nxt_clone_vldt_credential_uidmap(task, &clone->uidmap, creds);
406     if (nxt_slow_path(ret != NXT_OK)) {
407         return NXT_ERROR;
408     }
409 
410     return nxt_clone_vldt_credential_gidmap(task, &clone->gidmap, creds);
411 }
412 
413 #endif
414 
415 
416 #if (NXT_HAVE_LINUX_NS)
417 
418 static nxt_int_t
nxt_isolation_clone_flags(nxt_task_t * task,nxt_conf_value_t * namespaces,nxt_clone_t * clone)419 nxt_isolation_clone_flags(nxt_task_t *task, nxt_conf_value_t *namespaces,
420     nxt_clone_t *clone)
421 {
422     uint32_t          index;
423     nxt_str_t         name;
424     nxt_int_t         flag;
425     nxt_conf_value_t  *value;
426 
427     index = 0;
428 
429     for ( ;; ) {
430         value = nxt_conf_next_object_member(namespaces, &name, &index);
431 
432         if (value == NULL) {
433             break;
434         }
435 
436         flag = 0;
437 
438 #if (NXT_HAVE_CLONE_NEWUSER)
439         if (nxt_str_eq(&name, "credential", 10)) {
440             flag = CLONE_NEWUSER;
441         }
442 #endif
443 
444 #if (NXT_HAVE_CLONE_NEWPID)
445         if (nxt_str_eq(&name, "pid", 3)) {
446             flag = CLONE_NEWPID;
447         }
448 #endif
449 
450 #if (NXT_HAVE_CLONE_NEWNET)
451         if (nxt_str_eq(&name, "network", 7)) {
452             flag = CLONE_NEWNET;
453         }
454 #endif
455 
456 #if (NXT_HAVE_CLONE_NEWUTS)
457         if (nxt_str_eq(&name, "uname", 5)) {
458             flag = CLONE_NEWUTS;
459         }
460 #endif
461 
462 #if (NXT_HAVE_CLONE_NEWNS)
463         if (nxt_str_eq(&name, "mount", 5)) {
464             flag = CLONE_NEWNS;
465         }
466 #endif
467 
468 #if (NXT_HAVE_CLONE_NEWCGROUP)
469         if (nxt_str_eq(&name, "cgroup", 6)) {
470             flag = CLONE_NEWCGROUP;
471         }
472 #endif
473 
474         if (!flag) {
475             nxt_alert(task, "unknown namespace flag: \"%V\"", &name);
476             return NXT_ERROR;
477         }
478 
479         if (nxt_conf_get_boolean(value)) {
480             clone->flags |= flag;
481         }
482     }
483 
484     return NXT_OK;
485 }
486 
487 #endif
488 
489 
490 #if (NXT_HAVE_ISOLATION_ROOTFS)
491 
492 static nxt_int_t
nxt_isolation_set_rootfs(nxt_task_t * task,nxt_conf_value_t * isolation,nxt_process_t * process)493 nxt_isolation_set_rootfs(nxt_task_t *task, nxt_conf_value_t *isolation,
494     nxt_process_t *process)
495 {
496     nxt_str_t         str;
497     nxt_conf_value_t  *obj;
498 
499     static nxt_str_t  rootfs_name = nxt_string("rootfs");
500 
501     obj = nxt_conf_get_object_member(isolation, &rootfs_name, NULL);
502     if (obj != NULL) {
503         nxt_conf_get_string(obj, &str);
504 
505         if (nxt_slow_path(str.length <= 1 || str.start[0] != '/')) {
506             nxt_log(task, NXT_LOG_ERR, "rootfs requires an absolute path other "
507                     "than \"/\" but given \"%V\"", &str);
508 
509             return NXT_ERROR;
510         }
511 
512         if (str.start[str.length - 1] == '/') {
513             str.length--;
514         }
515 
516         process->isolation.rootfs = nxt_mp_alloc(process->mem_pool,
517                                                  str.length + 1);
518 
519         if (nxt_slow_path(process->isolation.rootfs == NULL)) {
520             return NXT_ERROR;
521         }
522 
523         nxt_memcpy(process->isolation.rootfs, str.start, str.length);
524 
525         process->isolation.rootfs[str.length] = '\0';
526     }
527 
528     return NXT_OK;
529 }
530 
531 
532 static nxt_int_t
nxt_isolation_set_automount(nxt_task_t * task,nxt_conf_value_t * isolation,nxt_process_t * process)533 nxt_isolation_set_automount(nxt_task_t *task, nxt_conf_value_t *isolation,
534     nxt_process_t *process)
535 {
536     nxt_conf_value_t         *conf, *value;
537     nxt_process_automount_t  *automount;
538 
539     static nxt_str_t  automount_name = nxt_string("automount");
540     static nxt_str_t  langdeps_name = nxt_string("language_deps");
541     static nxt_str_t  tmp_name = nxt_string("tmpfs");
542     static nxt_str_t  proc_name = nxt_string("procfs");
543 
544     automount = &process->isolation.automount;
545 
546     automount->language_deps = 1;
547     automount->tmpfs = 1;
548     automount->procfs = 1;
549 
550     conf = nxt_conf_get_object_member(isolation, &automount_name, NULL);
551     if (conf != NULL) {
552         value = nxt_conf_get_object_member(conf, &langdeps_name, NULL);
553         if (value != NULL) {
554             automount->language_deps = nxt_conf_get_boolean(value);
555         }
556 
557         value = nxt_conf_get_object_member(conf, &tmp_name, NULL);
558         if (value != NULL) {
559             automount->tmpfs = nxt_conf_get_boolean(value);
560         }
561 
562         value = nxt_conf_get_object_member(conf, &proc_name, NULL);
563         if (value != NULL) {
564             automount->procfs = nxt_conf_get_boolean(value);
565         }
566     }
567 
568     return NXT_OK;
569 }
570 
571 
572 static nxt_int_t
nxt_isolation_set_mounts(nxt_task_t * task,nxt_process_t * process,nxt_str_t * app_type)573 nxt_isolation_set_mounts(nxt_task_t *task, nxt_process_t *process,
574     nxt_str_t *app_type)
575 {
576     nxt_int_t              ret, cap_chroot;
577     nxt_runtime_t          *rt;
578     nxt_app_lang_module_t  *lang;
579 
580     rt = task->thread->runtime;
581     cap_chroot = rt->capabilities.chroot;
582     lang = nxt_app_lang_module(rt, app_type);
583 
584     nxt_assert(lang != NULL);
585 
586 #if (NXT_HAVE_CLONE_NEWUSER)
587     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWUSER)) {
588         cap_chroot = 1;
589     }
590 #endif
591 
592     if (!cap_chroot) {
593         nxt_log(task, NXT_LOG_ERR, "The \"rootfs\" field requires privileges");
594         return NXT_ERROR;
595     }
596 
597     ret = nxt_isolation_set_lang_mounts(task, process, lang->mounts);
598     if (nxt_slow_path(ret != NXT_OK)) {
599         return NXT_ERROR;
600     }
601 
602     process->isolation.cleanup = nxt_isolation_unmount_all;
603 
604     return NXT_OK;
605 }
606 
607 
608 static nxt_int_t
nxt_isolation_set_lang_mounts(nxt_task_t * task,nxt_process_t * process,nxt_array_t * lang_mounts)609 nxt_isolation_set_lang_mounts(nxt_task_t *task, nxt_process_t *process,
610     nxt_array_t *lang_mounts)
611 {
612     u_char          *p;
613     size_t          i, n, rootfs_len, len;
614     nxt_mp_t        *mp;
615     nxt_array_t     *mounts;
616     const u_char    *rootfs;
617     nxt_fs_mount_t  *mnt, *lang_mnt;
618 
619     mp = process->mem_pool;
620 
621     /* copy to init mem pool */
622     mounts = nxt_array_copy(mp, NULL, lang_mounts);
623     if (mounts == NULL) {
624         return NXT_ERROR;
625     }
626 
627     n = mounts->nelts;
628     mnt = mounts->elts;
629     lang_mnt = lang_mounts->elts;
630 
631     rootfs = process->isolation.rootfs;
632     rootfs_len = nxt_strlen(rootfs);
633 
634     for (i = 0; i < n; i++) {
635         len = nxt_strlen(lang_mnt[i].dst);
636 
637         mnt[i].dst = nxt_mp_alloc(mp, rootfs_len + len + 1);
638         if (nxt_slow_path(mnt[i].dst == NULL)) {
639             return NXT_ERROR;
640         }
641 
642         p = nxt_cpymem(mnt[i].dst, rootfs, rootfs_len);
643         p = nxt_cpymem(p, lang_mnt[i].dst, len);
644         *p = '\0';
645     }
646 
647     if (process->isolation.automount.tmpfs) {
648         mnt = nxt_array_add(mounts);
649         if (nxt_slow_path(mnt == NULL)) {
650             return NXT_ERROR;
651         }
652 
653         mnt->src = (u_char *) "tmpfs";
654         mnt->name = (u_char *) "tmpfs";
655         mnt->type = NXT_FS_TMP;
656         mnt->flags = (NXT_FS_FLAGS_NOSUID
657                       | NXT_FS_FLAGS_NODEV
658                       | NXT_FS_FLAGS_NOEXEC);
659         mnt->data = (u_char *) "size=1m,mode=1777";
660         mnt->builtin = 1;
661         mnt->deps = 0;
662 
663         mnt->dst = nxt_mp_nget(mp, rootfs_len + nxt_length("/tmp") + 1);
664         if (nxt_slow_path(mnt->dst == NULL)) {
665             return NXT_ERROR;
666         }
667 
668         p = nxt_cpymem(mnt->dst, rootfs, rootfs_len);
669         p = nxt_cpymem(p, "/tmp", 4);
670         *p = '\0';
671     }
672 
673     if (process->isolation.automount.procfs) {
674         mnt = nxt_array_add(mounts);
675         if (nxt_slow_path(mnt == NULL)) {
676             return NXT_ERROR;
677         }
678 
679         mnt->name = (u_char *) "proc";
680         mnt->type = NXT_FS_PROC;
681         mnt->src = (u_char *) "none";
682         mnt->dst = nxt_mp_nget(mp, rootfs_len + nxt_length("/proc") + 1);
683         if (nxt_slow_path(mnt->dst == NULL)) {
684             return NXT_ERROR;
685         }
686 
687         p = nxt_cpymem(mnt->dst, rootfs, rootfs_len);
688         p = nxt_cpymem(p, "/proc", 5);
689         *p = '\0';
690 
691         mnt->data = (u_char *) "";
692         mnt->flags = NXT_FS_FLAGS_NOEXEC | NXT_FS_FLAGS_NOSUID;
693         mnt->builtin = 1;
694         mnt->deps = 0;
695     }
696 
697     qsort(mounts->elts, mounts->nelts, sizeof(nxt_fs_mount_t),
698           nxt_isolation_mount_compare);
699 
700     process->isolation.mounts = mounts;
701 
702     return NXT_OK;
703 }
704 
705 
706 static int nxt_cdecl
nxt_isolation_mount_compare(const void * v1,const void * v2)707 nxt_isolation_mount_compare(const void *v1, const void *v2)
708 {
709     const nxt_fs_mount_t  *mnt1, *mnt2;
710 
711     mnt1 = v1;
712     mnt2 = v2;
713 
714     return nxt_strlen(mnt1->src) > nxt_strlen(mnt2->src);
715 }
716 
717 
718 void
nxt_isolation_unmount_all(nxt_task_t * task,nxt_process_t * process)719 nxt_isolation_unmount_all(nxt_task_t *task, nxt_process_t *process)
720 {
721     size_t                   n;
722     nxt_array_t              *mounts;
723     nxt_runtime_t            *rt;
724     nxt_fs_mount_t           *mnt;
725     nxt_process_automount_t  *automount;
726 
727     rt = task->thread->runtime;
728 
729     if (!rt->capabilities.setid) {
730         return;
731     }
732 
733     nxt_debug(task, "unmount all (%s)", process->name);
734 
735     automount = &process->isolation.automount;
736     mounts = process->isolation.mounts;
737     n = mounts->nelts;
738     mnt = mounts->elts;
739 
740     while (n > 0) {
741         n--;
742 
743         if (mnt[n].deps && !automount->language_deps) {
744             continue;
745         }
746 
747         nxt_fs_unmount(mnt[n].dst);
748     }
749 }
750 
751 
752 nxt_int_t
nxt_isolation_prepare_rootfs(nxt_task_t * task,nxt_process_t * process)753 nxt_isolation_prepare_rootfs(nxt_task_t *task, nxt_process_t *process)
754 {
755     size_t                   i, n;
756     nxt_int_t                ret;
757     struct stat              st;
758     nxt_array_t              *mounts;
759     const u_char             *dst;
760     nxt_fs_mount_t           *mnt;
761     nxt_process_automount_t  *automount;
762 
763     automount = &process->isolation.automount;
764     mounts = process->isolation.mounts;
765 
766     n = mounts->nelts;
767     mnt = mounts->elts;
768 
769     for (i = 0; i < n; i++) {
770         dst = mnt[i].dst;
771 
772         if (mnt[i].deps && !automount->language_deps) {
773             continue;
774         }
775 
776         if (nxt_slow_path(mnt[i].type == NXT_FS_BIND
777                           && stat((const char *) mnt[i].src, &st) != 0))
778         {
779             nxt_log(task, NXT_LOG_WARN, "host path not found: %s", mnt[i].src);
780             continue;
781         }
782 
783         ret = nxt_fs_mkdir_all(dst, S_IRWXU | S_IRWXG | S_IRWXO);
784         if (nxt_slow_path(ret != NXT_OK)) {
785             nxt_alert(task, "mkdir(%s) %E", dst, nxt_errno);
786             goto undo;
787         }
788 
789         ret = nxt_fs_mount(task, &mnt[i]);
790         if (nxt_slow_path(ret != NXT_OK)) {
791             goto undo;
792         }
793     }
794 
795     return NXT_OK;
796 
797 undo:
798 
799     n = i + 1;
800 
801     for (i = 0; i < n; i++) {
802         nxt_fs_unmount(mnt[i].dst);
803     }
804 
805     return NXT_ERROR;
806 }
807 
808 
809 #if (NXT_HAVE_LINUX_PIVOT_ROOT) && (NXT_HAVE_CLONE_NEWNS)
810 
811 nxt_int_t
nxt_isolation_change_root(nxt_task_t * task,nxt_process_t * process)812 nxt_isolation_change_root(nxt_task_t *task, nxt_process_t *process)
813 {
814     char       *rootfs;
815     nxt_int_t  ret;
816 
817     rootfs = (char *) process->isolation.rootfs;
818 
819     nxt_debug(task, "change root: %s", rootfs);
820 
821     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWNS)) {
822         ret = nxt_isolation_pivot_root(task, rootfs);
823 
824     } else {
825         ret = nxt_isolation_chroot(task, rootfs);
826     }
827 
828     if (nxt_fast_path(ret == NXT_OK)) {
829         if (nxt_slow_path(chdir("/") < 0)) {
830             nxt_alert(task, "chdir(\"/\") %E", nxt_errno);
831             return NXT_ERROR;
832         }
833     }
834 
835     return ret;
836 }
837 
838 
839 /*
840  * pivot_root(2) can only be safely used with containers, otherwise it can
841  * umount(2) the global root filesystem and screw up the machine.
842  */
843 
844 static nxt_int_t
nxt_isolation_pivot_root(nxt_task_t * task,const char * path)845 nxt_isolation_pivot_root(nxt_task_t *task, const char *path)
846 {
847     /*
848      * This implementation makes use of a kernel trick that works for ages
849      * and now documented in Linux kernel 5.
850      * https://lore.kernel.org/linux-man/87r24piwhm.fsf@x220.int.ebiederm.org/T/
851      */
852 
853     if (nxt_slow_path(mount("", "/", "", MS_SLAVE|MS_REC, "") != 0)) {
854         nxt_alert(task, "mount(\"/\", MS_SLAVE|MS_REC) failed: %E", nxt_errno);
855         return NXT_ERROR;
856     }
857 
858     if (nxt_slow_path(nxt_isolation_make_private_mount(task, path) != NXT_OK)) {
859         return NXT_ERROR;
860     }
861 
862     if (nxt_slow_path(mount(path, path, "bind", MS_BIND|MS_REC, "") != 0)) {
863         nxt_alert(task, "error bind mounting rootfs %E", nxt_errno);
864         return NXT_ERROR;
865     }
866 
867     if (nxt_slow_path(chdir(path) != 0)) {
868         nxt_alert(task, "failed to chdir(%s) %E", path, nxt_errno);
869         return NXT_ERROR;
870     }
871 
872     if (nxt_slow_path(nxt_pivot_root(".", ".") != 0)) {
873         nxt_alert(task, "failed to pivot_root %E", nxt_errno);
874         return NXT_ERROR;
875     }
876 
877     /*
878      * Demote the oldroot mount to avoid unmounts getting propagated to
879      * the host.
880      */
881     if (nxt_slow_path(mount("", ".", "", MS_SLAVE | MS_REC, NULL) != 0)) {
882         nxt_alert(task, "failed to bind mount rootfs %E", nxt_errno);
883         return NXT_ERROR;
884     }
885 
886     if (nxt_slow_path(umount2(".", MNT_DETACH) != 0)) {
887         nxt_alert(task, "failed to umount old root directory %E", nxt_errno);
888         return NXT_ERROR;
889     }
890 
891     return NXT_OK;
892 }
893 
894 
895 static nxt_int_t
nxt_isolation_make_private_mount(nxt_task_t * task,const char * rootfs)896 nxt_isolation_make_private_mount(nxt_task_t *task, const char *rootfs)
897 {
898     char           *parent_mnt;
899     FILE           *procfile;
900     u_char         **mounts;
901     size_t         len;
902     uint8_t        *shared;
903     nxt_int_t      ret, index, nmounts;
904     struct mntent  *ent;
905 
906     static const char  *mount_path = "/proc/self/mounts";
907 
908     ret = NXT_ERROR;
909     ent = NULL;
910     shared = NULL;
911     procfile = NULL;
912     parent_mnt = NULL;
913 
914     nmounts = 256;
915 
916     mounts = nxt_malloc(nmounts * sizeof(uintptr_t));
917     if (nxt_slow_path(mounts == NULL)) {
918         goto fail;
919     }
920 
921     shared = nxt_malloc(nmounts);
922     if (nxt_slow_path(shared == NULL)) {
923         goto fail;
924     }
925 
926     procfile = setmntent(mount_path, "r");
927     if (nxt_slow_path(procfile == NULL)) {
928         nxt_alert(task, "failed to open %s %E", mount_path, nxt_errno);
929 
930         goto fail;
931     }
932 
933     index = 0;
934 
935 again:
936 
937     for ( ; index < nmounts; index++) {
938         ent = getmntent(procfile);
939         if (ent == NULL) {
940             nmounts = index;
941             break;
942         }
943 
944         mounts[index] = (u_char *) strdup(ent->mnt_dir);
945         shared[index] = hasmntopt(ent, "shared") != NULL;
946     }
947 
948     if (ent != NULL) {
949         /* there are still entries to be read */
950 
951         nmounts *= 2;
952         mounts = nxt_realloc(mounts, nmounts);
953         if (nxt_slow_path(mounts == NULL)) {
954             goto fail;
955         }
956 
957         shared = nxt_realloc(shared, nmounts);
958         if (nxt_slow_path(shared == NULL)) {
959             goto fail;
960         }
961 
962         goto again;
963     }
964 
965     for (index = 0; index < nmounts; index++) {
966         if (nxt_strcmp(mounts[index], rootfs) == 0) {
967             parent_mnt = (char *) rootfs;
968             break;
969         }
970     }
971 
972     if (parent_mnt == NULL) {
973         len = nxt_strlen(rootfs);
974 
975         parent_mnt = nxt_malloc(len + 1);
976         if (parent_mnt == NULL) {
977             goto fail;
978         }
979 
980         nxt_memcpy(parent_mnt, rootfs, len);
981         parent_mnt[len] = '\0';
982 
983         if (parent_mnt[len - 1] == '/') {
984             parent_mnt[len - 1] = '\0';
985             len--;
986         }
987 
988         for ( ;; ) {
989             for (index = 0; index < nmounts; index++) {
990                 if (nxt_strcmp(mounts[index], parent_mnt) == 0) {
991                     goto found;
992                 }
993             }
994 
995             if (len == 1 && parent_mnt[0] == '/') {
996                 nxt_alert(task, "parent mount not found");
997                 goto fail;
998             }
999 
1000             /* parent dir */
1001             while (parent_mnt[len - 1] != '/' && len > 0) {
1002                 len--;
1003             }
1004 
1005             if (nxt_slow_path(len == 0)) {
1006                 nxt_alert(task, "parent mount not found");
1007                 goto fail;
1008             }
1009 
1010             if (len == 1) {
1011                 parent_mnt[len] = '\0';     /* / */
1012             } else {
1013                 parent_mnt[len - 1] = '\0'; /* /<path> */
1014             }
1015         }
1016     }
1017 
1018 found:
1019 
1020     if (shared[index]) {
1021         if (nxt_slow_path(mount("", parent_mnt, "", MS_PRIVATE, "") != 0)) {
1022             nxt_alert(task, "mount(\"\", \"%s\", MS_PRIVATE) %E", parent_mnt,
1023                       nxt_errno);
1024 
1025             goto fail;
1026         }
1027     }
1028 
1029     ret = NXT_OK;
1030 
1031 fail:
1032 
1033     if (procfile != NULL) {
1034         endmntent(procfile);
1035     }
1036 
1037     if (mounts != NULL) {
1038         for (index = 0; index < nmounts; index++) {
1039             nxt_free(mounts[index]);
1040         }
1041 
1042         nxt_free(mounts);
1043     }
1044 
1045     if (shared != NULL) {
1046         nxt_free(shared);
1047     }
1048 
1049     if (parent_mnt != NULL && parent_mnt != rootfs) {
1050         nxt_free(parent_mnt);
1051     }
1052 
1053     return ret;
1054 }
1055 
1056 
1057 nxt_inline int
nxt_pivot_root(const char * new_root,const char * old_root)1058 nxt_pivot_root(const char *new_root, const char *old_root)
1059 {
1060     return syscall(SYS_pivot_root, new_root, old_root);
1061 }
1062 
1063 
1064 #else /* !(NXT_HAVE_LINUX_PIVOT_ROOT) || !(NXT_HAVE_CLONE_NEWNS) */
1065 
1066 
1067 nxt_int_t
nxt_isolation_change_root(nxt_task_t * task,nxt_process_t * process)1068 nxt_isolation_change_root(nxt_task_t *task, nxt_process_t *process)
1069 {
1070     char       *rootfs;
1071 
1072     rootfs = (char *) process->isolation.rootfs;
1073 
1074     nxt_debug(task, "change root: %s", rootfs);
1075 
1076     if (nxt_fast_path(nxt_isolation_chroot(task, rootfs) == NXT_OK)) {
1077         if (nxt_slow_path(chdir("/") < 0)) {
1078             nxt_alert(task, "chdir(\"/\") %E", nxt_errno);
1079             return NXT_ERROR;
1080         }
1081 
1082         return NXT_OK;
1083     }
1084 
1085     return NXT_ERROR;
1086 }
1087 
1088 #endif
1089 
1090 
1091 static nxt_int_t
nxt_isolation_chroot(nxt_task_t * task,const char * path)1092 nxt_isolation_chroot(nxt_task_t *task, const char *path)
1093 {
1094     if (nxt_slow_path(chroot(path) < 0)) {
1095         nxt_alert(task, "chroot(%s) %E", path, nxt_errno);
1096         return NXT_ERROR;
1097     }
1098 
1099     return NXT_OK;
1100 }
1101 
1102 #endif /* NXT_HAVE_ISOLATION_ROOTFS */
1103 
1104 
1105 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
1106 
1107 static nxt_int_t
nxt_isolation_set_new_privs(nxt_task_t * task,nxt_conf_value_t * isolation,nxt_process_t * process)1108 nxt_isolation_set_new_privs(nxt_task_t *task, nxt_conf_value_t *isolation,
1109     nxt_process_t *process)
1110 {
1111     nxt_conf_value_t  *obj;
1112 
1113     static nxt_str_t  new_privs_name = nxt_string("new_privs");
1114 
1115     obj = nxt_conf_get_object_member(isolation, &new_privs_name, NULL);
1116     if (obj != NULL) {
1117         process->isolation.new_privs = nxt_conf_get_boolean(obj);
1118     }
1119 
1120     return NXT_OK;
1121 }
1122 
1123 #endif
1124