b1d99b4c81f6d033efbd46c93943e65c26ca171f
[multipath-tools/.git] / libmultipath / checkers / rbd.c
1 /*
2  * Copyright (c) 2016 Red Hat
3  * Copyright (c) 2004 Christophe Varoqui
4  *
5  * Code based off of tur.c and ceph's krbd.cc
6  */
7 #define _GNU_SOURCE
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <string.h>
11 #include <unistd.h>
12 #include <fcntl.h>
13 #include <errno.h>
14 #include <pthread.h>
15 #include <libudev.h>
16 #include <ifaddrs.h>
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <sys/ioctl.h>
20 #include <sys/time.h>
21 #include <sys/wait.h>
22
23 #include "rados/librados.h"
24
25 #include "structs.h"
26 #include "checkers.h"
27
28 #include "../libmultipath/debug.h"
29 #include "../libmultipath/util.h"
30 #include "../libmultipath/time-util.h"
31 #include "../libmultipath/util.h"
32
struct rbd_checker_context;

/*
 * Signature shared by the routines that rbd_exec_fn() runs either inline
 * or in the checker thread: they receive the checker context and a
 * message buffer, and return a PATH_* state.
 */
typedef int (thread_fn)(struct rbd_checker_context *ct, char *msg);

/*
 * Format a checker message into msg, bounded by CHECKER_MSG_LEN.
 *
 * The expansion is a single expression with no trailing semicolon, so the
 * macro can be used as the only statement of an if/else arm; the caller
 * supplies the terminating ';'.
 */
#define RBD_MSG(msg, fmt, args...) snprintf(msg, CHECKER_MSG_LEN, fmt, ##args)

/* Bit tested against the rbd bus device's "features" sysfs attribute. */
#define RBD_FEATURE_EXCLUSIVE_LOCK      (1 << 2)
39
40 struct rbd_checker_context {
41         int rbd_bus_id;
42         char *client_addr;
43         char *config_info;
44         char *snap;
45         char *pool;
46         char *image;
47         char *username;
48         int remapped;
49         int blacklisted;
50         unsigned lock_on_read:1;
51
52         rados_t cluster;
53
54         int state;
55         int running;
56         time_t time;
57         thread_fn *fn;
58         pthread_t thread;
59         pthread_mutex_t lock;
60         pthread_cond_t active;
61         pthread_spinlock_t hldr_lock;
62         int holders;
63         char message[CHECKER_MSG_LEN];
64 };
65
66 int libcheck_init(struct checker * c)
67 {
68         struct rbd_checker_context *ct;
69         struct udev_device *block_dev;
70         struct udev_device *bus_dev;
71         struct udev *udev;
72         struct stat sb;
73         const char *block_name, *addr, *config_info, *features_str;
74         const char *image, *pool, *snap, *username;
75         uint64_t features = 0;
76         char sysfs_path[PATH_SIZE];
77         int ret;
78
79         ct = malloc(sizeof(struct rbd_checker_context));
80         if (!ct)
81                 return 1;
82         memset(ct, 0, sizeof(struct rbd_checker_context));
83         ct->holders = 1;
84         pthread_cond_init_mono(&ct->active);
85         pthread_mutex_init(&ct->lock, NULL);
86         pthread_spin_init(&ct->hldr_lock, PTHREAD_PROCESS_PRIVATE);
87         c->context = ct;
88
89         /*
90          * The rbd block layer sysfs device is not linked to the rbd bus
91          * device that we interact with, so figure that out now.
92          */
93         if (fstat(c->fd, &sb) != 0)
94                 goto free_ct;
95
96         udev = udev_new();
97         if (!udev)
98                 goto free_ct;
99
100         block_dev = udev_device_new_from_devnum(udev, 'b', sb.st_rdev);
101         if (!block_dev)
102                 goto free_udev;
103
104         block_name  = udev_device_get_sysname(block_dev);
105         ret = sscanf(block_name, "rbd%d", &ct->rbd_bus_id);
106
107         udev_device_unref(block_dev);
108         if (ret != 1)
109                 goto free_udev;
110
111         snprintf(sysfs_path, sizeof(sysfs_path), "/sys/bus/rbd/devices/%d",
112                  ct->rbd_bus_id);
113         bus_dev = udev_device_new_from_syspath(udev, sysfs_path);
114         if (!bus_dev)
115                 goto free_udev;
116
117         addr = udev_device_get_sysattr_value(bus_dev, "client_addr");
118         if (!addr) {
119                 condlog(0, "rbd%d: Could not find client_addr in rbd sysfs. "
120                         "Try updating kernel", ct->rbd_bus_id);
121                 goto free_dev;
122         }
123
124         ct->client_addr = strdup(addr);
125         if (!ct->client_addr)
126                 goto free_dev;
127
128         features_str = udev_device_get_sysattr_value(bus_dev, "features");
129         if (!features_str)
130                 goto free_addr;
131         features = strtoll(features_str, NULL, 16);
132         if (!(features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
133                 condlog(3, "rbd%d: Exclusive lock not set.", ct->rbd_bus_id);
134                 goto free_addr;
135         }
136
137         config_info = udev_device_get_sysattr_value(bus_dev, "config_info");
138         if (!config_info)
139                 goto free_addr;
140
141         if (!strstr(config_info, "noshare")) {
142                 condlog(3, "rbd%d: Only nonshared clients supported.",
143                         ct->rbd_bus_id);
144                 goto free_addr;
145         }
146
147         if (strstr(config_info, "lock_on_read"))
148                 ct->lock_on_read = 1;
149
150         ct->config_info = strdup(config_info);
151         if (!ct->config_info)
152                 goto free_addr;
153
154         username = strstr(config_info, "name=");
155         if (username) {
156                 char *end;
157                 int len;
158
159                 username += 5;
160                 end = strchr(username, ',');
161                 if (!end)
162                         goto free_info;
163                 len = end - username;
164
165                 ct->username = malloc(len + 1);
166                 if (!ct->username)
167                         goto free_info;
168                 strncpy(ct->username, username, len);
169                 ct->username[len] = '\0';
170         }
171
172         image = udev_device_get_sysattr_value(bus_dev, "name");
173         if (!image)
174                 goto free_username;
175
176         ct->image = strdup(image);
177         if (!ct->image)
178                 goto free_username;
179
180         pool = udev_device_get_sysattr_value(bus_dev, "pool");
181         if (!pool)
182                 goto free_image;
183
184         ct->pool = strdup(pool);
185         if (!ct->pool)
186                 goto free_image;
187
188         snap = udev_device_get_sysattr_value(bus_dev, "current_snap");
189         if (!snap)
190                 goto free_pool;
191
192         if (strcmp("-", snap)) {
193                 ct->snap = strdup(snap);
194                 if (!ct->snap)
195                         goto free_pool;
196         }
197
198         if (rados_create(&ct->cluster, NULL) < 0) {
199                 condlog(0, "rbd%d: Could not create rados cluster",
200                         ct->rbd_bus_id);
201                 goto free_snap;
202         }
203
204         if (rados_conf_read_file(ct->cluster, NULL) < 0) {
205                 condlog(0, "rbd%d: Could not read rados conf", ct->rbd_bus_id);
206                 goto shutdown_rados;
207         }
208
209         ret = rados_connect(ct->cluster);
210         if (ret < 0) {
211                 condlog(0, "rbd%d: Could not connect to rados cluster",
212                         ct->rbd_bus_id);
213                 goto shutdown_rados;
214         }
215
216         udev_device_unref(bus_dev);
217         udev_unref(udev);
218
219         condlog(3, "rbd%d checker init %s %s/%s@%s %s", ct->rbd_bus_id,
220                 ct->client_addr, ct->pool, ct->image, ct->snap ? ct->snap : "-",
221                 ct->username ? ct->username : "none");
222         return 0;
223
224 shutdown_rados:
225         rados_shutdown(ct->cluster);
226 free_snap:
227         if (ct->snap)
228                 free(ct->snap);
229 free_pool:
230         free(ct->pool);
231 free_image:
232         free(ct->image);
233 free_username:
234         if (ct->username)
235                 free(ct->username);
236 free_info:
237         free(ct->config_info);
238 free_addr:
239         free(ct->client_addr);
240 free_dev:
241         udev_device_unref(bus_dev);
242 free_udev:
243         udev_unref(udev);
244 free_ct:
245         free(ct);
246         return 1;
247 }
248
249 static void cleanup_context(struct rbd_checker_context *ct)
250 {
251         pthread_mutex_destroy(&ct->lock);
252         pthread_cond_destroy(&ct->active);
253         pthread_spin_destroy(&ct->hldr_lock);
254
255         rados_shutdown(ct->cluster);
256
257         if (ct->username)
258                 free(ct->username);
259         if (ct->snap)
260                 free(ct->snap);
261         free(ct->pool);
262         free(ct->image);
263         free(ct->config_info);
264         free(ct->client_addr);
265         free(ct);
266 }
267
268 void libcheck_free(struct checker * c)
269 {
270         if (c->context) {
271                 struct rbd_checker_context *ct = c->context;
272                 int holders;
273                 pthread_t thread;
274
275                 pthread_spin_lock(&ct->hldr_lock);
276                 ct->holders--;
277                 holders = ct->holders;
278                 thread = ct->thread;
279                 pthread_spin_unlock(&ct->hldr_lock);
280                 if (holders)
281                         pthread_cancel(thread);
282                 else
283                         cleanup_context(ct);
284                 c->context = NULL;
285         }
286 }
287
288 static int rbd_is_blacklisted(struct rbd_checker_context *ct, char *msg)
289 {
290         char *addr_tok, *start, *save;
291         const char *cmd[2];
292         char *blklist, *stat;
293         size_t blklist_len, stat_len;
294         int ret;
295         char *end;
296
297         cmd[0] = "{\"prefix\": \"osd blacklist ls\"}";
298         cmd[1] = NULL;
299
300         ret = rados_mon_command(ct->cluster, (const char **)cmd, 1, "", 0,
301                                 &blklist, &blklist_len, &stat, &stat_len);
302         if (ret < 0) {
303                 RBD_MSG(msg, "checker failed: mon command failed %d", ret);
304                 return ret;
305         }
306
307         if (!blklist || !blklist_len)
308                 goto free_bufs;
309
310         /*
311          * parse list of addrs with the format
312          * ipv4:port/nonce date time\n
313          * or
314          * [ipv6]:port/nonce date time\n
315          */
316         ret = 0;
317         for (start = blklist; ; start = NULL) {
318                 addr_tok = strtok_r(start, "\n", &save);
319                 if (!addr_tok || !strlen(addr_tok))
320                         break;
321
322                 end = strchr(addr_tok, ' ');
323                 if (!end) {
324                         RBD_MSG(msg, "checker failed: invalid blacklist %s",
325                                  addr_tok);
326                         break;
327                 }
328                 *end = '\0';
329
330                 if (!strcmp(addr_tok, ct->client_addr)) {
331                         ct->blacklisted = 1;
332                         RBD_MSG(msg, "%s is blacklisted", ct->client_addr);
333                         ret = 1;
334                         break;
335                 }
336         }
337
338 free_bufs:
339         rados_buffer_free(blklist);
340         rados_buffer_free(stat);
341         return ret;
342 }
343
344 static int rbd_check(struct rbd_checker_context *ct, char *msg)
345 {
346         if (ct->blacklisted || rbd_is_blacklisted(ct, msg) == 1)
347                 return PATH_DOWN;
348
349         RBD_MSG(msg, "checker reports path is up");
350         /*
351          * Path may have issues, but the ceph cluster is at least
352          * accepting IO, so we can attempt to do IO.
353          *
354          * TODO: in future versions, we can run other tests to
355          * verify OSDs and networks.
356          */
357         return PATH_UP;
358 }
359
360 static int sysfs_write_rbd_bus(const char *which, const char *buf,
361                                size_t buf_len)
362 {
363         char sysfs_path[PATH_SIZE];
364         int fd;
365         int r;
366
367         /* we require newer kernels so single_major should always be there */
368         snprintf(sysfs_path, sizeof(sysfs_path),
369                  "/sys/bus/rbd/%s_single_major", which);
370         fd = open(sysfs_path, O_WRONLY);
371         if (fd < 0)
372                 return -errno;
373
374         r = safe_write(fd, buf, buf_len);
375         close(fd);
376         return r;
377 }
378
379 static int rbd_remap(struct rbd_checker_context *ct)
380 {
381         char *argv[11];
382         pid_t pid;
383         int ret = 0, i = 0;
384         int status;
385
386         pid = fork();
387         switch (pid) {
388         case 0:
389                 argv[i++] = "rbd";
390                 argv[i++] = "map";
391                 if (ct->lock_on_read)
392                         argv[i++] = "-o noshare,lock_on_read";
393                 else
394                         argv[i++] = "-o noshare";
395                 if (ct->username) {
396                         argv[i++] = "--id";
397                         argv[i++] = ct->username;
398                 }
399                 argv[i++] = "--pool";
400                 argv[i++] = ct->pool;
401                 if (ct->snap) {
402                         argv[i++] = "--snap";
403                         argv[i++] = ct->snap;
404                 }
405                 argv[i++] = ct->image;
406                 argv[i] = NULL;
407
408                 ret = execvp(argv[0], argv);
409                 condlog(0, "rbd%d: Error executing rbd: %s", ct->rbd_bus_id,
410                         strerror(errno));
411                 exit(-1);
412         case -1:
413                 condlog(0, "rbd%d: fork failed: %s", ct->rbd_bus_id,
414                         strerror(errno));
415                 return -1;
416         default:
417                 ret = -1;
418                 wait(&status);
419                 if (WIFEXITED(status)) {
420                         status = WEXITSTATUS(status);
421                         if (status == 0)
422                                 ret = 0;
423                         else
424                                 condlog(0, "rbd%d: failed with %d",
425                                         ct->rbd_bus_id, status);
426                 }
427         }
428
429         return ret;
430 }
431
/*
 * Write buf to the rbd bus "remove" attribute; returns the result of
 * sysfs_write_rbd_bus().
 */
static int sysfs_write_rbd_remove(const char *buf, int buf_len)
{
        static const char bus_op[] = "remove";

        return sysfs_write_rbd_bus(bus_op, buf, buf_len);
}
436
437 static int rbd_rm_blacklist(struct rbd_checker_context *ct)
438 {
439         const char *cmd[2];
440         char *stat, *cmd_str;
441         size_t stat_len;
442         int ret;
443
444         ret = asprintf(&cmd_str, "{\"prefix\": \"osd blacklist\", \"blacklistop\": \"rm\", \"addr\": \"%s\"}",
445                        ct->client_addr);
446         if (ret == -1)
447                 return -ENOMEM;
448
449         cmd[0] = cmd_str;
450         cmd[1] = NULL;
451
452         ret = rados_mon_command(ct->cluster, (const char **)cmd, 1, "", 0,
453                                 NULL, NULL, &stat, &stat_len);
454         if (ret < 0) {
455                 condlog(1, "rbd%d: repair failed to remove blacklist for %s %d",
456                         ct->rbd_bus_id, ct->client_addr, ret);
457                 goto free_cmd;
458         }
459
460         condlog(1, "rbd%d: repair rm blacklist for %s",
461                ct->rbd_bus_id, ct->client_addr);
462         free(stat);
463 free_cmd:
464         free(cmd_str);
465         return ret;
466 }
467
468 static int rbd_repair(struct rbd_checker_context *ct, char *msg)
469 {
470         char del[17];
471         int ret;
472
473         if (!ct->blacklisted)
474                 return PATH_UP;
475
476         if (!ct->remapped) {
477                 ret = rbd_remap(ct);
478                 if (ret) {
479                         RBD_MSG(msg, "repair failed to remap. Err %d", ret);
480                         return PATH_DOWN;
481                 }
482         }
483         ct->remapped = 1;
484
485         snprintf(del, sizeof(del), "%d force", ct->rbd_bus_id);
486         ret = sysfs_write_rbd_remove(del, strlen(del) + 1);
487         if (ret) {
488                 RBD_MSG(msg, "repair failed to clean up. Err %d", ret);
489                 return PATH_DOWN;
490         }
491
492         ret = rbd_rm_blacklist(ct);
493         if (ret) {
494                 RBD_MSG(msg, "repair could not remove blacklist entry. Err %d",
495                         ret);
496                 return PATH_DOWN;
497         }
498
499         ct->remapped = 0;
500         ct->blacklisted = 0;
501
502         RBD_MSG(msg, "has been repaired");
503         return PATH_UP;
504 }
505
506 #define rbd_thread_cleanup_push(ct) pthread_cleanup_push(cleanup_func, ct)
507 #define rbd_thread_cleanup_pop(ct) pthread_cleanup_pop(1)
508
509 static void cleanup_func(void *data)
510 {
511         int holders;
512         struct rbd_checker_context *ct = data;
513         pthread_spin_lock(&ct->hldr_lock);
514         ct->holders--;
515         holders = ct->holders;
516         ct->thread = 0;
517         pthread_spin_unlock(&ct->hldr_lock);
518         if (!holders)
519                 cleanup_context(ct);
520 }
521
522 static void *rbd_thread(void *ctx)
523 {
524         struct rbd_checker_context *ct = ctx;
525         int state;
526
527         condlog(3, "rbd%d: thread starting up", ct->rbd_bus_id);
528
529         ct->message[0] = '\0';
530         /* This thread can be canceled, so setup clean up */
531         rbd_thread_cleanup_push(ct)
532
533         /* checker start up */
534         pthread_mutex_lock(&ct->lock);
535         ct->state = PATH_PENDING;
536         pthread_mutex_unlock(&ct->lock);
537
538         state = ct->fn(ct, ct->message);
539
540         /* checker done */
541         pthread_mutex_lock(&ct->lock);
542         ct->state = state;
543         pthread_cond_signal(&ct->active);
544         pthread_mutex_unlock(&ct->lock);
545
546         condlog(3, "rbd%d: thead finished, state %s", ct->rbd_bus_id,
547                 checker_state_name(state));
548         rbd_thread_cleanup_pop(ct);
549         return ((void *)0);
550 }
551
552 static void rbd_timeout(struct timespec *tsp)
553 {
554         clock_gettime(CLOCK_MONOTONIC, tsp);
555         tsp->tv_nsec += 1000 * 1000; /* 1 millisecond */
556         normalize_timespec(tsp);
557 }
558
559 static int rbd_exec_fn(struct checker *c, thread_fn *fn)
560 {
561         struct rbd_checker_context *ct = c->context;
562         struct timespec tsp;
563         pthread_attr_t attr;
564         int rbd_status, r;
565
566         if (c->sync)
567                 return fn(ct, c->message);
568         /*
569          * Async mode
570          */
571         r = pthread_mutex_lock(&ct->lock);
572         if (r != 0) {
573                 condlog(2, "rbd%d: mutex lock failed with %d", ct->rbd_bus_id,
574                         r);
575                 MSG(c, "rbd%d: thread failed to initialize", ct->rbd_bus_id);
576                 return PATH_WILD;
577         }
578
579         if (ct->running) {
580                 /* Check if checker is still running */
581                 if (ct->thread) {
582                         condlog(3, "rbd%d: thread not finished",
583                                 ct->rbd_bus_id);
584                         rbd_status = PATH_PENDING;
585                 } else {
586                         /* checker done */
587                         ct->running = 0;
588                         rbd_status = ct->state;
589                         strncpy(c->message, ct->message, CHECKER_MSG_LEN);
590                         c->message[CHECKER_MSG_LEN - 1] = '\0';
591                 }
592                 pthread_mutex_unlock(&ct->lock);
593         } else {
594                 /* Start new checker */
595                 ct->state = PATH_UNCHECKED;
596                 ct->fn = fn;
597                 pthread_spin_lock(&ct->hldr_lock);
598                 ct->holders++;
599                 pthread_spin_unlock(&ct->hldr_lock);
600                 setup_thread_attr(&attr, 32 * 1024, 1);
601                 r = pthread_create(&ct->thread, &attr, rbd_thread, ct);
602                 if (r) {
603                         pthread_mutex_unlock(&ct->lock);
604                         ct->thread = 0;
605                         ct->holders--;
606                         condlog(3, "rbd%d failed to start rbd thread, using sync mode",
607                                 ct->rbd_bus_id);
608                         return fn(ct, c->message);
609                 }
610                 pthread_attr_destroy(&attr);
611                 rbd_timeout(&tsp);
612                 r = pthread_cond_timedwait(&ct->active, &ct->lock, &tsp);
613                 rbd_status = ct->state;
614                 strncpy(c->message, ct->message,CHECKER_MSG_LEN);
615                 c->message[CHECKER_MSG_LEN -1] = '\0';
616                 pthread_mutex_unlock(&ct->lock);
617
618                 if (ct->thread &&
619                     (rbd_status == PATH_PENDING || rbd_status == PATH_UNCHECKED)) {
620                         condlog(3, "rbd%d: thread still running",
621                                 ct->rbd_bus_id);
622                         ct->running = 1;
623                         rbd_status = PATH_PENDING;
624                 }
625         }
626
627         return rbd_status;
628 }
629
630 void libcheck_repair(struct checker * c)
631 {
632         struct rbd_checker_context *ct = c->context;
633
634         if (!ct || !ct->blacklisted)
635                 return;
636         rbd_exec_fn(c, rbd_repair);
637 }
638
639 int libcheck_check(struct checker * c)
640 {
641         struct rbd_checker_context *ct = c->context;
642
643         if (!ct)
644                 return PATH_UNCHECKED;
645
646         if (ct->blacklisted)
647                 return PATH_DOWN;
648
649         return rbd_exec_fn(c, rbd_check);
650 }