/* $Header: /CVSROOT/public/scylla-charybdis/md5backup/md5backup.c,v 1.27 2006/10/03 21:16:33 tino Exp $ * * MD5 based harddrive to harddrive backup utility. * Compared to my other utilities it is incredible fast, * as it is designed for speed. * * It should run over NFS too, but this never has been tested. * * Copyright (C)2003-2005 by Valentin Hilbig * * This is release early code. Use at own risk. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * $Log: md5backup.c,v $ * Revision 1.27 2006/10/03 21:16:33 tino * See Changelog, commit for dist * * Revision 1.26 2005/03/04 00:45:47 tino * Medium internal restructuring and cleanups. * Some routines have been moved into tinolib. * * Revision 1.25 2005/03/03 18:23:54 tino * commit for new release * * Revision 1.24 2005/03/03 00:29:41 tino * working version * * Revision 1.23 2005/03/02 23:32:11 tino * first version for multi filestore * * Revision 1.22 2005/02/20 15:43:09 tino * commit for release * * Revision 1.21 2004/10/05 03:02:13 tino * "nice", security lack fixed, new sparse files handling, bin/compare.sh * Details see ChangeLog * * Revision 1.20 2004/09/29 00:01:40 tino * cosmetic changes, see ChangeLog * * Revision 1.19 2004/08/22 05:58:02 Administrator * Bug removed: CMP falsely returned true in case one file was truncated. 
* * Revision 1.18 2004/07/25 09:16:41 tino * see ChangeLog: bugfixes in bin/dobackup.sh and tinolib * * Revision 1.17 2004/07/05 23:53:52 tino * va_copy not defined in all systems * * Revision 1.16 2004/06/27 15:17:31 tino * removed superfluous ignore and printf * * Revision 1.15 2004/06/19 12:31:29 tino * minor improvements * * Revision 1.14 2004/06/18 23:51:16 tino * see ChangeLog * * Revision 1.13 2004/06/12 11:17:46 tino * removed too frequent printing of unneccessary warnings * * Revision 1.12 2004/05/23 10:58:35 tino * once again, access to (same) feed pointer. Now globally fixed I hope. * * Revision 1.11 2004/05/23 07:55:45 tino * harmless bug fixed: access to already freed memory * * Revision 1.10 2004/05/21 03:50:09 tino * known files are now linked into linked_file_store, too. * Needed to add warnings, as currently this solution is not perfect. * * Revision 1.9 2004/05/04 17:51:47 tino * O_DIRECT does not work, fix does not work either, so it's disabled now * * Revision 1.8 2004/05/01 04:37:36 tino * now it works as expected * * Revision 1.7 2004/05/01 02:15:01 tino * New feature: =/PATH/ * * Revision 1.6 2004/01/12 03:55:02 tino * Includes, excludes added but not tested. * * Revision 1.5 2004/01/11 22:26:08 tino * Version and compile date in usage now * * Revision 1.4 2004/01/11 22:04:32 tino * Some last bugs fixed * * Revision 1.3 2004/01/11 20:29:29 tino * Release candidate: Added features: * Autoignore and stay in local filesystem * * Revision 1.2 2004/01/07 12:05:09 tino * Scripts improved, some spelling corrections, and CVS headers added. * md5backup.c now has a sanity check which slows down things factor 10. 
*/

#define _GNU_SOURCE 1

#include "tino/file.h"
#include "tino/dir.h"
#include "tino/debug.h"
#include "tino/fatal.h"
#include "tino/alloc.h"
#include "tino/slist.h"
#include "tino/str.h"
#include "tino/strwild.h"
#include "tino/sysfix.h"

#include "md5.h"

/* NOTE(review): the following three directives carry no header name in
 * this copy of the file (it looks like the <...> part got lost).
 * Restore them from the upstream source before compiling.
 */
#include
#include
#include

#include "md5backup_version.h"

/* I added checks because I thought they are useful, but I was wrong:
 * These checks slow down the process by factors (10 or more)!
 * You really don't want and need this.  Trust me ;)
 * Instead:
 * From time to time, and in case check.sh finds errors,
 * just delete the database to get a fresh full backup.
 * (FYI the database is "/target/path/"dbm/"nodename")
 */
#undef MD5BACKUP_SANITY_CHECKS

/* All global state of the program lives in this single structure:
 * open handles, statistic counters, work lists and the configuration
 * which init() fills in.
 */
static struct
  {
    /* Runtime handles */
    GDBM_FILE db;
    FILE *log, *md5;

    /* Runtime global counts */
    time_t start, last;
    int targets; /* Number of targets scanned so far */
    int ignored; /* Ignored objects (files+dirs) */
    int errors; /* Errors like unreadable files */
    int warnings; /* Warning like ignored sparse files */
    int notes; /* Notes like found sparse files */
    int dirs; /* Directories scanned */
    int examined; /* Examined files (db read accesses) */
    int changed; /* Number of changes found */
    int known; /* Number of files already known */
    int doubled; /* Number of doubled known files */
    int added; /* Number of backupped files */
    unsigned long long copy; /* Number of bytes copied to temp file */
    unsigned long long stored; /* Number of bytes kept in file store */

    /* Runtime information */
    int target_length;
    const char *target;
    int entriesleft;

    /* Runtime buffers */
    TINO_SLIST dirlist;
    TINO_SLIST linkpath_list;
    TINO_GLIST ignore_list_dir, ignore_list_file;
    TINO_SLIST ignore_wild_dir, ignore_wild_file;
    TINO_SLIST target_list, outpaths;
    size_t bigbuf_max;
    void *bigbuf;
    int o_direct, create_file, create_dir;

    /* Basic config */
    const char *path, *node;
    int width, nice;
    long long sparse_max;
    int sparse_percent;

    /* Calculated config */
    char dbfile[PATH_MAX];
    char outpath[PATH_MAX], tmpfile[PATH_MAX];
    char logfile[PATH_MAX], md5file[PATH_MAX];
  } config;

/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/* Console output stuff */
/* shall use tinolib here */

/* Blank the progress line: print config.width spaces followed by a
 * carriage return and flush stdout.
 */
static void
progress_clear(void)
{
  /* Should only be sent to a tty */
  printf("%*s\r", config.width, "");
  fflush(stdout);
}

/* Print the one-line progress indicator to stdout.
 *
 * poll!=0: do nothing if the last update happened within the same
 * second (config.last), so the function can be called very often.
 *
 * The line shows the elapsed time as two numbers (seconds/60 and
 * seconds%60), the remaining directory list entries plus the entries
 * left in the current directory, followed by the global counters and
 * the stored/copied byte counts shifted right by 20 bits (MB).
 */
static void
progress(int poll)
{
  char buf[200];
  int n;
  time_t now;

  now = time(NULL);
  if (poll && config.last==now)
    return;
  config.last = now;
  now -= config.start;
  n = snprintf(buf, sizeof buf, "%02d:%02d %d+%d t%d i%d E%d W%d N%d d%d f%d m%d o%d c%d n%d %Ld/%LdMB ",
               (int)(now/60), (int)(now%60),
               config.dirlist->count, config.entriesleft,
               config.targets, config.ignored,
               config.errors, config.warnings, config.notes,
               config.dirs, config.examined, config.changed,
               config.doubled, config.known, config.added,
               config.stored>>20, config.copy>>20);
  fputs(buf, stdout);
  n = config.target_length-config.width+n;
  if (n<0)
    n = 0;
  /* NOTE(review): this copy of the file is damaged from here on.  The
   * end of progress(), everything that followed it (presumably the
   * log()/note()/warn()/verror() helpers which other code calls) and
   * the head of assemble() are missing; only the last statements of
   * assemble() survived.  The tokens are kept exactly as found,
   * restore the region from the upstream source.
   */
  if (config.target && n=max)
    tino_fatal("argument too long");
  return s;
}

/* Return nonzero if file exists and is a regular file.
 * lstat64() is used, so a symbolic link is examined itself, not its
 * destination.
 */
static int
existsfile(const char *file)
{
  struct stat64 st;

  return (!lstat64(file, &st) && S_ISREG(st.st_mode));
}

/* Return nonzero if dir exists and is a directory.
 * stat() is used here, so symbolic links are followed.
 */
static int
existsdir(const char *dir)
{
  struct stat s;

  if (stat(dir, &s))
    return 0;
  return S_ISDIR(s.st_mode);
}

/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/* Database stuff */

/* One database record as used in memory.
 * In the database it is stored as text, see dbstore().
 */
struct data
  {
    unsigned long long mtime; /* modification time which was backed up */
    int count; /* counter part of the file store name */
    char md5[34]; /* 32 hex digits plus terminating NUL */
  };

/* Look up name in the database.
 *
 * Increments config.examined on each call.
 *
 * Returns 1 and fills *data if a record was found and could be
 * parsed.  Returns 0 if there is no record, or (after reporting
 * "database corrupt") if the record is unusable.
 *
 * NOTE(review): the parsing part in the middle of this function is
 * damaged in this copy of the file (text is missing, which also took
 * the declarations of tim, cnt and p1 with it).  The tokens are kept
 * exactly as found, restore them from the upstream source.  The
 * visible code never frees d.dptr - check if the lost part did.
 */
static int
dbfetch(const char *name, struct data *data)
{
  char tmp[32+32+32];
  datum d;

  config.examined++;
  d.dptr = (char *)name;
  d.dsize = strlen(name);
  d = gdbm_fetch(config.db, d);
  if (d.dptr==0)
    return 0;
  xDP(("db(%s) %d '%.*s'", name, d.dsize, d.dsize, d.dptr));
  if (d.dsizemtime = tim;
          data->count = (int)cnt;
          memcpy(data->md5, tmp+p1, 32);
          data->md5[32] = 0;
          return 1;
        }
    }
  tino_err("database corrupt for %s", name);
  return 0;
}

/* Store (or replace) the record of name in the database.
 *
 * The value is the text "mtime count md5", built with assemble().
 *
 * Returns 1 on success.  Returns 0 after reporting the error if
 * gdbm_store() fails.
 */
static int
dbstore(const char *name, struct data *data)
{
  char tmp[32+32+32];
  datum k,v;

  assemble(tmp, sizeof tmp, "%Lu %Lu %s", (unsigned long long)data->mtime, (unsigned long long)data->count, data->md5);
  k.dptr = (char *)name;
  k.dsize = strlen(name);
  v.dptr = tmp;
  v.dsize = strlen(tmp);
  if (!gdbm_store(config.db, k, v, GDBM_REPLACE))
    return 1;
  tino_err("database not writeable for %s", name);
  return 0;
}

/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/* Physical low level IO */

/* Compare the next len bytes which can be read from fd against buf.
 *
 * Returns:
 * <0 on read error
 * 0  if all bytes are identical
 * >0 if the data differs or fd hits EOF too early
 *
 * NOTE(review): the loop header is damaged in this copy of the file
 * (the text between "pos" and "sizeof blk" is missing).  The tokens
 * are kept exactly as found, restore them from the upstream source.
 */
static int
readcmp(int fd, const char *buf, size_t len)
{
  int pos, got, max;
  char blk[BUFSIZ*16];

  for (pos=0; possizeof blk)
        max = sizeof blk;
      got = tino_file_read(fd, blk, max);
      if (got<=0)
        {
          if (got)
            return -1;
          return 1;
        }
      if (got>max)
        tino_fatal("syscall defect, read more than allowed");
      if (memcmp(buf+pos, blk, got))
        return 1;
    }
  return 0;
}

/* Compare open FD to backup tmpfile
 *
 * fd is read through config.bigbuf, chunk by chunk, and each chunk is
 * checked against config.tmpfile with readcmp().  If everything
 * matched, one more read on the tmpfile checks that it is at EOF, too.
 * name is only used for the debug output.
 *
 * Problems with the tmpfile itself (open, read, close) are fatal.
 *
 * Returns:
 * <0 on error (read error on fd)
 * 0 if identical
 * >0 if differs
 */
static int
compare_file(int fd, const char *name)
{
  int cmp;
  int res;
  int got;

  xDP(("compare_file(%d,%s)", fd,name));
  if ((cmp=tino_file_open(config.tmpfile, O_RDONLY|O_NOFOLLOW))<0)
    tino_fatal("cannot read tmpfile: %s", config.tmpfile);
  res = 0;
  while ((got=tino_file_read(fd, config.bigbuf, config.bigbuf_max))>0)
    if ((res=readcmp(cmp, config.bigbuf, got))!=0)
      {
        if (res<0)
          tino_fatal("read error: %s", config.tmpfile);
        break;
      }
  /* NOTE(review): this return leaves cmp open.  The only caller in
   * this file (compare_or_link) treats a negative result as fatal, so
   * the leak has no effect today.
   */
  if (got<0)
    return -1;
  if (!res)
    {
      char tmp[BUFSIZ];

      /* Check for EOF on both files:
       * fd is at EOF here, so any byte left in the tmpfile is a
       * difference (res becomes >0).
       */
      if ((res=tino_file_read(cmp, tmp, sizeof tmp))<0)
        tino_fatal("read error: %s", config.tmpfile);
    }
  if (close(cmp))
    tino_fatal("close error: %s", config.tmpfile);
  return res;
}

/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/* md5backup helpers */

/* Build the name of a file in a file store:
 *
 *   pref/XX/YY/md5.count.md5
 *
 * where XX are the first two and YY the next two characters of md5.
 * The name is written into out (len bytes, assemble() does the length
 * check) and out is returned.
 */
static char *
md5filename(char *out, size_t len, const char *pref, const char *md5, int count)
{
  assemble(out, len, "%s/%c%c/%c%c/%s.%d.md5", pref, md5[0], md5[1], md5[2], md5[3], md5, count);
  return out;
}

/* Arguments handed from outpath_scan() to outpath_iterate() */
struct outpath_iterate_type
  {
    char *buf; /* where to build the filename */
    size_t len; /* size of buf */
    const char *md5;
    int count;
  };

/* Iteration callback of outpath_scan():
 * Build the name of the wanted file below the filestore s into p->buf
 * and return nonzero if it exists as regular file.
 */
static int
outpath_iterate(const char *s, void *x)
{
  const struct outpath_iterate_type *p=x;

  return existsfile(md5filename(p->buf, p->len, s, p->md5, p->count));
}

/* Look for the file md5/count in the file stores.
 *
 * Returns:
 * 0  found in the local store (config.outpath)
 * 1  tino_slist_iterate_0() over config.outpaths returned >0
 *    (presumably: found in one of the additional stores - this
 *    depends on the tinolib semantics, not visible here)
 * -1 not found
 *
 * buf receives the last filename which was tried.
 */
static int
outpath_scan(char *buf, size_t len, const char *md5, int count)
{
  struct outpath_iterate_type x;

  if (existsfile(md5filename(buf, len, config.outpath, md5, count)))
    return 0;

  x.buf = buf;
  x.len = len;
  x.md5 = md5;
  x.count = count;
  if (tino_slist_iterate_0(config.outpaths, outpath_iterate, &x)>0)
    return 1;
  return -1;
}

#ifdef MD5BACKUP_SANITY_CHECKS
/* Test if md5 file still is present
 * !!!UNTESTED COMPLETELY!!!
 *
 * Only compiled if MD5BACKUP_SANITY_CHECKS is defined.
 */
static int
md5exists(const char *md5, int count)
{
  char out[PATH_MAX];

  return outpath_scan(out, sizeof out, md5, count)>=0;
}
#endif

/* Hardlink file to out.
 *
 * If link() fails with ENOENT, the two directory levels above out are
 * created (mode config.create_dir; a created directory is logged if
 * logit is set) and the link is tried a second time.  This requires
 * out to be something md5filename() has created.
 *
 * Returns 0 on success, -1 on error with errno set by link().
 */
static int
md5link(const char *file, const char *out, int logit)
{
  char tmp[PATH_MAX];

  if (!link(file, out))
    return 0;
  if (errno!=ENOENT)
    return -1;

  /* Reduce arg by 2 slashes: the upper directory */
  tino_strrcut(tino_strrcut(tino_strxcpy(tmp, out, sizeof tmp), '/'), '/');
  if (!mkdir(tmp, config.create_dir) && logit)
    log("created directory %s", tmp);

  /* Reduce out by 1 slash: the lower directory */
  tino_strrcut(tino_strxcpy(tmp, out, sizeof tmp), '/');
  if (!mkdir(tmp, config.create_dir) && logit)
    log("created directory %s", tmp);

  return link(file, out);
}

/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/ /* ignore handling */ static void ignore_add(const char *name) { struct stat64 st; if (*name=='?') { if (name[strlen(name)-1]!='/') tino_slist_add(config.ignore_wild_file, name+1); else tino_slist_add(config.ignore_wild_dir, name+1); } else if (stat64(name, &st)) note("cannot stat ignore argument: %s", name); else { TINO_GLIST_ENT ent; TINO_GLIST g; if (S_ISDIR(st.st_mode)) g = config.ignore_list_dir; else if (S_ISREG(st.st_mode)) g = config.ignore_list_file; else { note("unsupported ignore argument: %s", name); return; } ent = tino_glist_add(g); *(struct stat64 *)ent->data = st; } } static void ignore_add2(const char *path, const char *name) { char tmp[PATH_MAX]; ignore_add(assemble(tmp, sizeof tmp, "%s%s", path, name)); } static int ignore_wild_cmp(const char *wild, const void *s) { return tino_strwildcmp(s, wild)==0; } static int is_ignored(TINO_GLIST list, TINO_SLIST slist, const char *name, const struct stat64 *st1, const char *type) { TINO_GLIST_ENT ent; dev_t dev = st1->st_dev; ino_t ino = st1->st_ino; int hit; /* Hope the optimizer makes this a goto *eg* */ hit = 0; for (ent=list->list; ent; ent=ent->next) { struct stat64 *st2; st2 = ent->data; if (dev==st2->st_dev && ino==st2->st_ino) { hit = 1; break; } } if (hit || tino_slist_iterate_0_c(slist, ignore_wild_cmp, name)) { #if 0 progress_clear(); printf("ignore %s: %s\n", type, name); #endif config.ignored++; log("ignore %s %s", type, name); return 1; } return 0; } /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ 
/**********************************************************************/ /**********************************************************************/ /* linkpath handling (linked file store) */ struct linkpath_user { jmp_buf jmp; const char *md5; int count; const char *file; int force; }; /* add a linkpath (linked file store) * where new files are hardlinked to via the iterate-function below */ static void linkpath_add(const char *path) { if (!existsdir(path)) tino_fatal("hardlink directory does not exist: %s", path); ignore_add(path); tino_slist_add(config.linkpath_list, path); } static void linkpath_iterate_fn(const char *path, void *usr /* struct linkpath_user * */) { struct linkpath_user *u=usr; char tmp[PATH_MAX]; md5filename(tmp, sizeof tmp, path, u->md5, u->count); if (md5link(u->file, tmp, 0)) { if (errno!=EEXIST) tino_fatal("cannot hardlink to linked file store: %s", tmp); /* File exists. * If we must hardlink, then we have to look for another counter. * Not that I expect that this bend will ever be taken! */ if (u->force) longjmp(u->jmp, 1); /* We are in the linked file store where an old file still * exists. Warn if the files are not hardlinked. * Perhaps elliminate the longjmp above later, too, * however this would break the iteration in situations, * I don't want to break it for today. */ if (tino_file_lstat_diff(tmp, u->file)) warn("existing mismatch in file store: %s", tmp); } } /* Link file to the file store under an md5 sum and count. * * force!=0: return error if destination exists. Actually this is a * bug workaround, it should be the case, that we can detect if a * hardlink fails because both files are identical. If not we have an * error (always), else we can silently assume the hardlink is ok. 
* * return 0 on success, !=0 otherwise */ static int linkpath_run(const char *file, const char *md5, int count, int force) { struct linkpath_user u; u.md5 = md5; u.count = count; u.file = file; u.force = force; if (setjmp(u.jmp)) return -1; tino_slist_iterate(config.linkpath_list, linkpath_iterate_fn, (void *)&u); return 0; } /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ /* md5backup specific */ /* Do a copy from source to tmpfile * with an md5 sum in between. * * XXX add support for sparse files XXX */ static long long md5copy(const char *name, char *md5) { MD5_CTX ctx; char digest[16]; int got, i; int in, out; int errstate; long long cnt; /* Cache the output as it's likely we must compare it */ if ((out=tino_file_create(config.tmpfile, O_WRONLY|O_NOFOLLOW, config.create_file))<0) { tino_err("cannot create temporary %s", config.tmpfile); return -1ll; } /* Don't cache the input as it's not likely read again */ if ((in=tino_file_open(name, O_RDONLY|O_NOFOLLOW|config.o_direct))<0) { close(out); tino_err("cannot open %s", name); return -1ll; } MD5_Init(&ctx); errstate = 0; cnt = 0; while ((got=tino_file_read(in, config.bigbuf, config.bigbuf_max))>0) { MD5_Update(&ctx, config.bigbuf, got); if (tino_file_write_all(out, config.bigbuf, got)!=got) { if (errno==ENOSPC) { unlink(config.tmpfile); errno = ENOSPC; tino_fatal("disk full at tmpfile: %s", config.tmpfile); } /* Hm .. what kind of nonpermanent reoccurring error? * NFS softmount dead? 
*/ tino_err("cannot write to tmpfile: %s", config.tmpfile); errstate = 1; break; } cnt += got; } if (got<0) { tino_err("read error on input: %s", name); errstate = 1; } if (close(in)) { tino_err("error closing input: %s", name); errstate = 1; } if (close(out)) { tino_err("error closing tmpfile: %s", config.tmpfile); errstate = 1; } if (errstate) { unlink(config.tmpfile); return -1ll; } MD5_Final(digest, &ctx); for (i=0; i<16; i++) { *md5++ = "0123456789abcdef"[(digest[i]>>4)&0xf]; *md5++ = "0123456789abcdef"[(digest[i]>>0)&0xf]; } *md5 = 0; config.copy += cnt; return cnt; } /* Write the md5sum compatible log entry * Complex, as md5sum may use escaped syntax. */ static void md5log(const char *name, const char *md5) { int i; /* Escaped syntax on files containing \n or \\ */ for (i=0; name[i]; i++) if (name[i]=='\\' || name[i]=='\n') { fprintf(config.md5, "\\%s *", md5); for (i=0; name[i]; i++) if (name[i]=='\n') { fputc('\\', config.md5); fputc('n', config.md5); continue; } else { if (name[i]=='\\') fputc('\\', config.md5); fputc(name[i], config.md5); } fputc('\n', config.md5); } /* Ordinary output */ fprintf(config.md5, "%s *%s\n", md5, name); /* I often interrupt md5backup, which left incomplete lines to the * md5 logfile. This should fix it: */ if (fflush(config.md5)) tino_fatal("cannot append to md5 log %s", config.md5file); } /* Returns: * <0 hardlink failed or retryable error * ==0 mismatch (funny, I would rather suspect a corrupt file system) * >0 match or successful hardlink done * * Actually, this routine now is too complex and should be rewritten. * Important things are going on here for the main strategy, but this * is hidden in this deep and nested function call. To make it even * more non-understandable, there now is outpath_scan which is an * incomplete hack to support multi file store. 
*/
static int
compare_or_link(const char *md5, int count)
{
  char out[PATH_MAX];
  int in, multi;

  xDP(("compare_or_link(%s,%d)", md5,count));
  /* multi: 0 = file store entry exists locally, >0 = it was found via
   * the additional file stores, <0 = no such entry yet
   */
  if ((multi=outpath_scan(out, sizeof out, md5, count))>=0)
    {
      if ((in=tino_file_open(out, O_RDONLY|O_NOFOLLOW|config.o_direct))>=0)
        {
          int diff;

          /* XXX FIXME XXX
           *
           * Check for compressed type here in future.
           *
           * Note that the files are not stored compressed (yet), but
           * it shall be allowed that you compress the file store.
           *
           * Compressions supported shall be:
           *
           * gzip    the standard
           * bzip2   best compression and error correction
           * lzo     raw speed
           * builtin future (dd bs=4096 skip=1 | bunzip2)
           */
          000;

          /* This implicitly knows that the comparison is done to
           * config.tmpfile
           */
          diff = compare_file(in, out);

          /* In case of error we really don't want to continue as
           * something must be badly broken on the target.
           */
          if (diff<0)
            tino_fatal("read error in backup file: %s", out);
          if (close(in))
            tino_fatal("close error in backup file: %s", out);

          /* Same md5 but different contents:
           * The caller has to try the next count.
           */
          if (diff)
            return 0;

          /* if the file is in another outpath we must remember it
           * locally again, so only a local match is finished here
           */
          if (!multi)
            {
              /* We now have an already known file.  Hardlink it to
               * the linked file store.  A failure is only reported,
               * the file counts as known anyway.
               */
              if (linkpath_run(out, md5, count, 0))
                tino_err("cannot dup to linkpath: %s", out);
              config.known++;
              return 1;
            }
        }
      else if (multi || errno!=ENOENT)
        {
          /* Quite not funny, we hit some unexpected error.  Be
           * conservative and don't try to disturb any more.
           *
           * Only a local file which vanished between the scan and
           * the open (ENOENT) is tolerated and handled below.
           */
          tino_fatal("unexpected error accessing file: %s", out);
        }
    }

  /* out may be pointing to the additional file store.
   * Set it back to the local out path.
   */
  md5filename(out, sizeof out, config.outpath, md5, count);

  /* First link the tmpfile into the linked file stores (an existing
   * destination is an error there), then into the local file store.
   */
  if (linkpath_run(config.tmpfile, md5, count, 1))
    return -1;
  if (!md5link(config.tmpfile, out, 1))
    {
      /* Identical contents in an additional file store counts as
       * doubled, everything else is a new file.
       */
      if (multi>0)
        config.doubled++;
      else
        config.added++;
      return 1;
    }
  return -1;
}

/* A wrapper over compare_or_link to provide NFS compatibility.
 *
 * This routine is implicite bullshit.  The whole process is not
 * intuitively clear and should be rewritten.
Also the function name
 * is misleading.
 *
 * What it does:  Starting at count 0, find the first count where the
 * tmpfile either equals the stored file or can be hardlinked as a new
 * file into the store.  This count is returned.  The tmpfile is
 * removed afterwards.
 */
static int
md5hunt(const char *md5)
{
  int count;

  xDP(("md5hunt(%s)", md5));
  for (count=0;; count++)
    {
      int ret, reloop;

      /* According to the NFS documentation the NFS server might go
       * out of business right before telling us that a hardlink was
       * done.  So we have to reloop a bit here.
       *
       * Hopefully the subsystem waits long enough such that the fatal
       * never happens.
       */
      for (reloop=0; (ret=compare_or_link(md5, count))<0; reloop++)
        if (reloop>4)
          tino_fatal("hardlink failed: %s %d", md5, count);

      /* We had a match,
       * Flush the tmpfile
       *
       * (ret==0 is a mismatch under this count: try the next one)
       */
      if (ret>0)
        {
          tino_file_flush(config.tmpfile);
          break;
        }
    }
  /* In either case,
   * we do not need the tmpfile any more
   */
  unlink(config.tmpfile);
  return count;
}

/* Do the backup.  Actually, this always is incremental.
 *
 * name is the file, mtime its current modification time.  Nothing is
 * done if the database already knows the file with the same mtime.
 *
 * The process is not defined clearly enough and spread over all the
 * above routines.  That's bad.  The main strategy shall be written
 * clearly into one single function.
 */
static void
backup(const char *name, unsigned long long mtime)
{
  struct data d;
  char md5[34];
  long long bytes;

  xDP(("backup(%s,%Lu)", name, mtime));
  if (dbfetch(name, &d) && d.mtime==mtime
#ifdef MD5BACKUP_SANITY_CHECKS
      && md5exists(d.md5, d.count)
#endif
      )
    {
      xDP(("backup() not newer"));
      return;
    }

  config.changed++;

  /* First:
   *
   * Copy the file to a tempfile to save the contents.  md5 is a
   * return value.  This is currently not aware of sparse files.
   */
  if ((bytes=md5copy(name, md5))!=-1ll)
    {
      int cnt;
      int known;

      xDP(("backup() copied %s", md5));
      md5log(name, md5);

      /* ARGH .. this is quite a hack actually ..
       * compare_or_link() increments config.known if the file was
       * found in the local store.  By comparing the counter below we
       * see if this happened.
       */
      known = config.known;

      /* Second:
       *
       * Hunt for a known md5 file which is equal to the temp file.
       *
       * Third (sadly it's done in the second step, too):
       *
       * If none found, create one.
       */
      cnt = md5hunt(md5);
      if (cnt>=0)
        {
          d.mtime = mtime;
          d.count = cnt;
          strcpy(d.md5, md5);
          if (dbstore(name, &d))
            {
              if (known==config.known)
                {
                  /* Round up to full KB.
                   * This commonly underestimates it a little bit as
                   * FSes usually don't have 1 KB pages
                   */
                  bytes += 1023ll;
                  bytes &= ~1023ll;
                  config.stored += bytes;
                  log("md5 %LdKB %s %d %s", bytes>>10, md5, cnt, name);
                }
              else
                log("known md5 %s %d %s", md5, cnt, name);
              return;
            }
        }
    }
  log("failed backup %s", name);
}

/* dirlist_add is a poor and misleading name.
 *
 * Examine one entry:
 *
 * If it's ignored, ignore it.
 * If it's a directory, add it to the directory list to process later.
 * If it's a file, backup it.
 *
 * Everything which is neither a directory nor a regular file is
 * silently skipped.
 */
static void
dirlist_add(const char *name, const struct stat64 *st)
{
  if (S_ISDIR(st->st_mode))
    {
      if (!is_ignored(config.ignore_list_dir, config.ignore_wild_dir, name, st, "dir"))
        tino_slist_add(config.dirlist, name);
    }
  else if (S_ISREG(st->st_mode))
    {
      if (!is_ignored(config.ignore_list_file, config.ignore_wild_file, name, st, "file"))
        {
          /* Decide here if a file is sparse.  These type of files
           * need special treatment, as those usually have a huge size
           * (possibly terabytes) with just nearly no data in it.
           *
           * Idea: Do not backup big (over config.sparse_max) sparse
           * files with just too few data (below the threshold
           * config.sparse_percent) in it - only report them.
           * Otherwise just backup them normally.
           *
           * Also sparse files with nothing in it are skipped, too.
           *
           * NOTE(review): the following condition is damaged in this
           * copy of the file (text is missing between "512ll" and
           * "st_size").  The tokens are kept exactly as found,
           * restore them from the upstream source.
           */
          if (st->st_blocks*512llst_size)
            {
              if (!st->st_size ||
                  (st->st_size>config.sparse_max &&
                   (st->st_blocks * 512ll / config.sparse_percent * 100 ) < st->st_size))
                {
                  warn("ignoring sparse file: %s", name);
                  return;
                }
              note("sparse file: %s", name);
              /* Missing special treatment of sparse files.  Backup
               * shall be informed about the fact that this is a
               * sparse file.
               */
              000; /** XXX TODO XXX */
            }
          backup(name, st->st_mtime);
        }
    }
}

/* Here is the old commentary:
 *
 * I don't like this, that stat is part of following routine.  However
 * this is proof of concept code, so this is ok for now.
* * My new comment: * * For a mature production system, the complete process has to be * redefined and made more clear. Maintainability must be increased. * And tinolib must be redefined to allow somethnig better than slists * and similar crap. * * What it does: * * Iterate over the directory list and read in (completely) one * directory at a time. Then process the entries in this directory * via "dirlist_add" (I did not find a better name yet). */ static void md5backup(const char *targ) { const char *dir; struct stat64 st; dev_t dev; xDP(("md5backup(%s)", targ->name)); if (stat64(targ, &st)) { tino_err("cannot stat target: %s", targ); return; } config.targets++; config.target = targ; config.target_length = strlen(targ); dev = st.st_dev; progress_clear(); log("start %s %s %s", config.path, config.node, targ); /* dirlist_add either adds directories to the list, * or does the backup of the files. * * Bad name for the routine. * * If you look for the path to the backup routine, follow * there. */ dirlist_add(targ, &st); for (; (dir=tino_slist_get(config.dirlist))!=0; tino_slist_free(dir)) { TINO_SLIST slist; int i; const char *file; /* read in the directory */ config.dirs++; progress(0); slist = tino_dir_read(dir); for (i=0; (file=tino_slist_get(slist))!=0; tino_slist_free(file)) { char buf[PATH_MAX+2]; tino_strxcpy(buf, dir, sizeof buf); if (!*buf || buf[strlen(buf)-1]!='/') tino_strxcat(buf, "/", sizeof buf); tino_strxcat(buf, file, sizeof buf); if (strlen(buf)>=PATH_MAX) { tino_err("filename too deep: %s/%s", dir, file); continue; } if (!(++i&127)) progress(1); if (lstat64(buf, &st)) { tino_err("cannot stat: %s", buf); continue; } config.entriesleft = slist->count; xDP(("md5backup() name=%s", buf)); /* dirlist_add either adds directories to the list, * or does the backup of the files. * * Bad name for the routine. * * If you look for the path to the backup routine, follow * there. 
*/ if (st.st_dev==dev) dirlist_add(buf, &st); } tino_slist_destroy(slist); xDP(("md5backup() next")); } config.target = 0; } /* Perhaps, in a distant future, the initialization can be influenced * via a config file .. */ static void init(const char *path, const char *node) { char tmp[PATH_MAX]; int i; time(&config.start); config.dirlist = tino_slist_new(); config.linkpath_list = tino_slist_new(); config.ignore_list_dir = tino_glist_new(sizeof(struct stat64)); config.ignore_list_file = tino_glist_new(sizeof(struct stat64)); config.ignore_wild_dir = tino_slist_new(); config.ignore_wild_file = tino_slist_new(); config.target_list = tino_slist_new(); config.outpaths = tino_slist_new(); config.path = path; config.node = node; assemble(config.logfile, sizeof config.logfile, "%slog/%s", path, node); if ((config.log=fopen(config.logfile, "a"))==0) tino_fatal("cannot append to logfile %s", config.logfile); log("md5backup version %s compiled %s", MD5BACKUP_VERSION, __DATE__); ignore_add(config.logfile); ignore_add2(path, "log"); assemble(config.outpath, sizeof config.outpath, "%sout", path); if (!existsdir(config.outpath)) tino_fatal("output directory does not exist: %s", config.outpath); ignore_add(config.outpath); assemble(config.tmpfile, sizeof config.tmpfile, "%stmp/%s.%d", path, node, getpid()); ignore_add2(path, "tmp"); assemble(config.md5file, sizeof config.md5file, "%smd5/%s", path, node); if ((config.md5=fopen(config.md5file, "a"))==0) tino_fatal("cannot append to md5sum %s", config.md5file); ignore_add(config.md5file); ignore_add2(path, "md5"); assemble(config.dbfile, sizeof config.dbfile, "%sdbm/%s", path, node); if ((config.db=gdbm_open(config.dbfile, BUFSIZ, GDBM_WRCREAT|GDBM_SYNC, 0775, NULL))==0) tino_fatal("cannot open database: %s: %s", config.dbfile, gdbm_strerror(gdbm_errno)); ignore_add(config.dbfile); ignore_add2(path, "dbm"); /* Try to open a buffer suitable for O_DIRECT IO. * This must be aligned to a multiple of the sector size. 
* and IO must start on a sector boundary. * * Actually this is not done here exactly. * This code works only for filesystems up to a sector size of * the system's page size, which is 4096. * I don't know of any FS with a bigger sector size. * * Question which comes in mind: * What happens with sparse files? * * XXX FIXME XXX it does not work, don't know why */ config.bigbuf_max = 1<<20; #if 0 /* does not work, stay away */ config.o_direct = O_DIRECT; if (posix_memalign(&config.bigbuf, (size_t)sysconf(_SC_PAGE_SIZE), config.bigbuf_max)) #endif { config.bigbuf = tino_alloc(config.bigbuf_max); config.o_direct = 0; } config.width = 79; config.nice = 15; config.create_file = 0440; config.create_dir = 0750; config.sparse_max = 0x20000; /* 128 KB */ config.sparse_percent = 75; /* minimum fill */ /* Scan for outN files (n starting from 0) */ for (i=0; existsdir(assemble(tmp, sizeof tmp, "%sout%i/", path, i)); i++) { note("additional filestore out%i", i); tino_slist_add(config.outpaths, tmp); } } static void cleanup(void) { FILE *tmp; time_t now; gdbm_close(config.db); if (fclose(config.md5)) tino_err("close md5sum %s", config.md5file); unlink(config.tmpfile); config.copy += (1<<20)-1; /* round up */ config.stored += (1<<20)-1; /* round up */ time(&now); now -= config.start; log("ready %d:%02d, args=%d ign=%d err=%d warn=%d note=%d dir=%d file=%d mod=%d old=%d copy=%d new=%d MB=%Ld/%Ld", (int)(now/60), (int)(now%60), config.targets, config.ignored, config.errors, config.warnings, config.notes, config.dirs, config.examined, config.changed, config.known, config.doubled, config.added, config.stored>>20, config.copy>>20); progress_clear(); printf("%02d:%02d args=%d ign=%d err=%d warn=%d note=%d dir=%d file=%d mod=%d old=%d copy=%d new=%d %Ld/%LdMB\n", (int)(now/60), (int)(now%60), config.targets, config.ignored, config.errors, config.warnings, config.notes, config.dirs, config.examined, config.changed, config.known, config.doubled, config.added, config.stored>>20, 
config.copy>>20); tmp = config.log; config.log = 0; if (fclose(tmp)) tino_err("close logfile %s", config.logfile); } static void process_targets(void) { const char *s; for (; (s=tino_slist_get(config.target_list))!=0; tino_slist_free(s)) md5backup(s); } static void target(const char *arg, int level) { FILE *fd; char *tmp; switch (*arg) { case 0: case '#': break; case '=': linkpath_add(arg+1); break; case '@': if (++level>20) tino_err("recoursion too deep: %s", arg+1); fd = stdin; if (strcmp(arg, "@-") && (fd=fopen(arg+1, "rt"))==NULL) { tino_err("cannot open target list file: %s", arg+1); break; } tmp = tino_strdup(arg+1); while (fgets(config.bigbuf, config.bigbuf_max, fd)) { size_t len; if ((len=strlen(config.bigbuf))>0) { if (((char *)config.bigbuf)[--len]=='\n') ((char *)config.bigbuf)[len] = 0; /* This possibly invalidates arg */ target(config.bigbuf, level); } } if (ferror(fd)) tino_err("close target list file: %s", tmp); if (fd!=stdin && fclose(fd)) tino_err("close target list file: %s", tmp); free(tmp); break; case '-': ignore_add(arg+1); break; default: tino_slist_add(config.target_list, arg); break; } } int main(int argc, char **argv) { int i; if (argc<3) { printf("Usage: %s /backup/path/ nodename target...\n" "\tIncrementally backup files to backup path given.\n" "\tVersion %s compiled %s\n" "\n" "\tDon't forget the trailing / in the /backup/path/.\n" "\tNodename should be `hostname -f`.\n" "\tIf target is directory it is scanned for files.\n" "\tDirectory scan will not cross mount points.\n" "\tIf target is -path this is an (global) exclude.\n" "\tIf target is @file this file is read for targets.\n" "\t\tHere file can be - to be read from stdin.\n" "\tIf target is =path all newly created \"out/...\" files\n" "\twill be hardlinked there before they are moved to out/.\n" "\tThe sub-structure below this path is arbitrary!\n" "\n" "\tBackup structure below /backup/path/ (quick info):\n" "\tdbm/nodename\tDBM file (not needed for restore)\n" 
"\tlog/nodename\t(*) Logfile of action\n" "\tmd5/nodename\t(*) File suitable for md5sum --check\n" "\tout/... \t(*) Files are stored under their content's MD5 sum.\n" "\toutN/tlike out, readonly (N is a number starting from 0)\n" "\ttmp/ \tTemporary directory, should be empty\n" "\t(*) needed for restore (not programmed yet!)\n" , argv[0], MD5BACKUP_VERSION, __DATE__); return 1; } tino_verror_fn = verror; init(argv[1], argv[2]); nice(config.nice); /* Dump some parameters als second log line */ for (i=3; i