import io
import os
from datetime import datetime
import re
import tempfile
import mgzip as gzip
import tarfile
import sqlite3
import pathlib
import hashlib
import collections as coll

from b2sdk.v2 import B2Api

# Local module expected to provide encrypt_file(key, in_file, out_file)
# and decrypt_file(key, in_file, out_file).
from crypt import encrypt_file, decrypt_file

ZFILL = 5       # width of the zero-padded archive names
BACKBLAZE = 100  # storage back-end identifiers
LOCAL = 101


class Backup:
    key = None
    buk = None
    tarball_size = "50M"
    save_location = ""

    def backblaze(self, key, bdd=None, app_key_id=None, app_key=None, bucket_id=None):
        """Store encrypted archives in a Backblaze B2 bucket."""
        self.key = key
        self.save_mode = BACKBLAZE
        if None in (app_key_id, app_key, bucket_id):
            raise ValueError("Some arguments are not filled")
        self.b2 = B2Api()
        self.b2.authorize_account("production", app_key_id, app_key)
        self.buk = self.b2.get_bucket_by_id(bucket_id)
        self.bdd = DataBase(bdd)
        return self

    def local(self, key, bdd=None, save_location=None):
        """Store encrypted archives in a local directory."""
        self.key = key
        self.save_mode = LOCAL
        if None in (save_location, bdd):
            raise ValueError("Some arguments are not filled")
        self.save_location = save_location
        self.bdd = DataBase(bdd)
        return self

    def update(self, path, recurse=True):
        self.clear(path, recurse=recurse)
        self.__save(path, recurse=recurse)
        return self

    def save(self, path, recurse=True):
        self.__save(path, recurse=recurse)
        return self

    def clear(self, path, recurse=True):
        """Drop database entries for files that no longer exist, then remove orphans."""
        files = self.bdd.get_files(path)
        if not recurse:
            files = [f for f in files if pathlib.Path(f['path']).parent == pathlib.Path(path)]
        for file in files:
            file_path = pathlib.Path(file['path'])
            if not os.path.isfile(file_path):
                # File was deleted on disk: forget it
                print("Clear deleted file :", file['path'])
                self.bdd.delete_file(file)
        orphans = self.bdd.get_orphan_crypt()
        for orphan in orphans:
            # Delete encrypted archives that no longer back any file
            self.__delete_file(str(orphan['id']).zfill(ZFILL))
            self.bdd.delete_crypt(orphan['id'])
        nocrypts = self.bdd.get_file_no_crypt()
        for nocrypt in nocrypts:
            self.bdd.delete_file(nocrypt)
        return self

    def check(self, path, recurse=True):
        """Verify the stored archives against their recorded SHA-1 sums."""
        crypts = self.bdd.get_crypts(path)
        for crypt_row in crypts:
            if crypt_row['id'] is not None:
                encrypted_file = self.__download_file(str(crypt_row['id']).zfill(ZFILL))
                file_hash = get_hash(encrypted_file)
                if crypt_row['sha1sum'] != file_hash:
                    print("Hash mismatch", str(crypt_row['id']).zfill(ZFILL))
                    self.__delete_file(str(crypt_row['id']).zfill(ZFILL))
                    self.bdd.delete_crypt(crypt_row['id'])
        return self

    def __save(self, path, recurse=True):
        tarball_size = parse_size(self.tarball_size)
        files = []
        for f in os.listdir(path):
            uri = os.path.join(path, f)
            if os.path.isfile(uri):
                size = os.path.getsize(uri)
                m_date = datetime.fromtimestamp(os.path.getmtime(uri)).strftime("%Y-%m-%d %H:%M:%S.%f")
                c_date = datetime.fromtimestamp(os.path.getctime(uri)).strftime("%Y-%m-%d %H:%M:%S.%f")
                if size > tarball_size:
                    # Large file: compress and encrypt it on its own
                    crypt_id = self.bdd.add([{'name': f, 'path': pathlib.Path(uri).as_posix(), 'size': size,
                                              'm_date': m_date, 'c_date': c_date}])
                    if crypt_id is not None:
                        print("Proceed", uri, ' ==> ', crypt_id)
                        enc = crypt(compress(uri), self.key)
                        self.bdd.set_crypt_attr(crypt_id, compress_mode="gz", sha1sum=get_hash(enc))
                        print("  Size :", get_size(enc))
                        self.__upload_file(enc, file_name=str(crypt_id).zfill(ZFILL))
                else:
                    files.append({'name': f, 'path': pathlib.Path(uri).as_posix(), 'size': size,
                                  'm_date': m_date, 'c_date': c_date})
            elif os.path.isdir(uri) and recurse:
                self.__save(uri, recurse=recurse)
        if len(files) > 0:
            # Small files: bundle them into one tarball before compressing and encrypting
            crypt_id = self.bdd.add(files)
            if crypt_id is not None:
                print("Proceed", path, ":", [file['name'] for file in files], ' ==> ', crypt_id)
                tarball = tar([file['path'] for file in files])
                enc = crypt(compress(tarball), self.key)
                self.bdd.set_crypt_attr(crypt_id, compress_mode="tar.gz", sha1sum=get_hash(enc))
                print("  Size :", get_size(enc))
                self.__upload_file(enc, file_name=str(crypt_id).zfill(ZFILL))
    def recover_file(self, paths, parents=False, save_path=os.getcwd()):
        """Download, verify and decrypt the archives backing the given paths."""
        files = self.bdd.get_crypt_name(paths)
        for file in files:
            if file['crypt_id'] is not None:
                encrypted_file = self.__download_file(str(file['crypt_id']).zfill(ZFILL))
                file_hash = get_hash(encrypted_file)
                if file['sha1sum'] == file_hash:
                    # Recreate the original directory layout under save_path when asked to
                    dest_path = os.path.join(save_path, os.path.dirname(file['path'])) if parents else save_path
                    uncompress(uncrypt(encrypted_file, self.key), file['name'], dest_path, file['compress_mode'])
                    print("Recovered :", file['name'])
                else:
                    print("Checksum doesn't match:")
                    print("{} {}".format(file['sha1sum'], "BDD"))
                    print("{} {}".format(file_hash, "File"))
                    print()

    def __upload_file(self, file, file_name):
        if self.save_mode == BACKBLAZE:
            file.seek(0)  # read from the start regardless of previous hashing
            self.buk.upload_bytes(file.read(), file_name)
        elif self.save_mode == LOCAL:
            save(file, os.path.join(self.save_location, file_name))

    def __download_file(self, file):
        dl = tempfile.SpooledTemporaryFile()
        if self.save_mode == BACKBLAZE:
            self.buk.download_file_by_name(file).save(dl)
        elif self.save_mode == LOCAL:
            try:
                with open(os.path.join(self.save_location, file), 'rb') as infile:
                    dl.write(infile.read())
            except FileNotFoundError:
                print("File", file, "not found")
        return dl

    def __delete_file(self, file):
        if self.save_mode == BACKBLAZE:
            file_info = self.buk.get_file_info_by_name(file)
            self.buk.delete_file_version(file_info.id_, file)
        elif self.save_mode == LOCAL:
            try:
                os.remove(os.path.join(self.save_location, file))
            except FileNotFoundError:
                print("File", file, "not found")


def get_size(in_file):
    """Return a human readable size for a path or an in-memory temporary file."""
    if type(in_file) is str:
        filesize = os.path.getsize(in_file)
    elif type(in_file) is tempfile.SpooledTemporaryFile:
        in_file.seek(0, 2)
        filesize = in_file.tell()
        in_file.seek(0)
    return human_size(filesize, decimal_places=1, unit='si')


def get_hash(infile):
    """SHA-1 of a path or an in-memory temporary file."""
    if type(infile) is str:
        with open(infile, 'rb') as file:
            return hashlib.sha1(file.read()).hexdigest()
    elif type(infile) is tempfile.SpooledTemporaryFile:
        infile.seek(0)
        return hashlib.sha1(infile.read()).hexdigest()


def tar(files):
    """Bundle the given paths into an uncompressed in-memory tar archive."""
    tarball = tempfile.SpooledTemporaryFile()
    with tarfile.open(fileobj=tarball, mode='w') as archive:
        for file in files:
            archive.add(file, arcname=os.path.basename(file))
    return tarball


def untar(tar_file, file, save_path):
    """Extract a single member from a tar archive (path or in-memory file)."""
    if type(tar_file) is tempfile.SpooledTemporaryFile:
        tar_file.seek(0)
        archive = tarfile.open(fileobj=tar_file, mode='r')
    else:
        archive = tarfile.open(tar_file, 'r')
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    archive.extract(file, path=save_path)
    archive.close()


def compress(file):
    """Gzip a path or an in-memory temporary file into a new in-memory file."""
    if type(file) is str:
        infile = open(file, 'rb')
    elif type(file) is tempfile.SpooledTemporaryFile:
        file.seek(0)
        infile = file
    compressed_file = tempfile.SpooledTemporaryFile()
    with gzip.open(compressed_file, 'wb') as zipfile:
        zipfile.write(infile.read())
    if type(file) is str:
        infile.close()
    return compressed_file


def uncompress(data, file_name, save_path, compress_mode):
    """Reverse compress()/tar() according to the recorded compress_mode ("gz" or "tar.gz")."""
    modes = compress_mode.split('.')
    if 'gz' in modes:
        data = ungz(data)
    if 'tar' in modes:
        untar(data, file_name, save_path)
    else:
        save(data, os.path.join(save_path, file_name))


def ungz(data):
    if type(data) is tempfile.SpooledTemporaryFile:
        data.seek(0)
    decompressed_file = tempfile.SpooledTemporaryFile()
    with gzip.open(data, 'rb') as zipfile:
        decompressed_file.write(zipfile.read())
    return decompressed_file
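# The helper below is not part of the original module: it is a minimal sketch
# showing how tar()/compress()/uncompress() are meant to round-trip a file
# entirely through in-memory temporary files. The file name "demo.txt" and its
# content are placeholders.
def _roundtrip_example():
    with tempfile.TemporaryDirectory() as src_dir, tempfile.TemporaryDirectory() as out_dir:
        src = os.path.join(src_dir, "demo.txt")
        with open(src, "w") as f:
            f.write("hello backup")
        packed = compress(tar([src]))                       # tar then gzip, in memory
        uncompress(packed, "demo.txt", out_dir, "tar.gz")   # ungzip then extract
        with open(os.path.join(out_dir, "demo.txt")) as f:
            assert f.read() == "hello backup"
        print("tar/compress/uncompress round trip OK")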
def crypt(file, key):
    """Encrypt an in-memory file with the local crypt module."""
    encrypted_file = tempfile.SpooledTemporaryFile()
    encrypt_file(key, file, encrypted_file)
    return encrypted_file


def uncrypt(file, key):
    """Decrypt an in-memory file with the local crypt module."""
    decrypted_file = tempfile.SpooledTemporaryFile()
    decrypt_file(key, file, decrypted_file)
    return decrypted_file


def save(file, save_path):
    """Write an open binary file object to save_path in 64 KiB chunks."""
    if not os.path.isdir(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    if isinstance(file, (io.BufferedRandom, tempfile.SpooledTemporaryFile)):
        file.seek(0)
        with open(save_path, 'wb') as outfile:
            while chunk := file.read(64 * 1024):
                outfile.write(chunk)
    else:
        print("Unable to save " + str(file) + " of type " + str(type(file)))
    return


class DataBase:
    def __init__(self, base_file):
        self.conn = sqlite3.connect(base_file)
        self.__create_table()

    def __del__(self):
        self.conn.commit()
        self.conn.close()

    def __create_table(self):
        cursor = self.conn.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS files(
                id INTEGER PRIMARY KEY UNIQUE NOT NULL,
                name TEXT,
                path TEXT,
                size INTEGER,
                m_date DATE,
                c_date DATE,
                crypt_id INTEGER,
                CONSTRAINT files_crypt_FK FOREIGN KEY (crypt_id) REFERENCES crypt(id)
            )
        """)
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS crypt(
                id INTEGER PRIMARY KEY UNIQUE NOT NULL,
                compress_mode TEXT,
                sha1sum TEXT
            )
        """)
        # Return rows as dicts for every cursor created from now on
        self.conn.row_factory = dict_factory
        self.conn.commit()

    def get_crypt_name(self, list_file):
        """Map each path to the crypt archive that backs it (crypt_id is None if unknown)."""
        cursor = self.conn.cursor()
        crypt_list = []
        for path in list_file:
            path = pathlib.PureWindowsPath(path).as_posix()
            cursor.execute("""SELECT crypt_id, compress_mode, sha1sum FROM files
                              INNER JOIN crypt ON files.crypt_id = crypt.id
                              WHERE path=?""", (path,))
            retval = cursor.fetchone()
            try:
                crypt_list.append({'name': os.path.basename(path), 'path': path,
                                   'crypt_id': retval['crypt_id'],
                                   'compress_mode': retval['compress_mode'],
                                   'sha1sum': retval['sha1sum']})
            except TypeError:
                # Path unknown to the database
                crypt_list.append({'name': os.path.basename(path), 'path': path, 'crypt_id': None})
        return crypt_list

    def get_crypts(self, path):
        cursor = self.conn.cursor()
        cursor.execute("""SELECT DISTINCT crypt.id, crypt.sha1sum FROM crypt
                          LEFT JOIN files ON files.crypt_id = crypt.id
                          WHERE path LIKE ?""", (path + "%", ))
        return cursor.fetchall()

    def __get_crypt_id(self, list_file):
        """Pick the crypt id to (re)use for this batch of files."""
        cursor = self.conn.cursor()
        crypt_id_list = []
        for file in list_file:
            cursor.execute("""SELECT crypt_id FROM files WHERE name=? AND path=?""",
                           (file['name'], file['path']))
            try:
                crypt_id_list.append(cursor.fetchone()['crypt_id'])
            except TypeError:
                pass
        if len(crypt_id_list) != 0:
            crypt_id = most_common(crypt_id_list)
        else:
            # None of the files is known yet: allocate a fresh id
            cursor.execute("""SELECT IFNULL(max(id) + 1, 0) as crypt_id FROM crypt""")
            return cursor.fetchone()['crypt_id']
        names = [file['name'] for file in list_file]
        paths = [file['path'] for file in list_file]
        name_marks = ', '.join('?' * len(names))
        path_marks = ', '.join('?' * len(paths))
        cursor.execute(f"""SELECT 1 FROM files WHERE crypt_id=?
                           AND name NOT IN ({name_marks})
                           AND path NOT IN ({path_marks})""",
                       (crypt_id, *names, *paths))
        neighbour = cursor.fetchall()
        if len(neighbour) > 0:
            # The archive also backs other files: don't overwrite it, allocate a new id
            cursor.execute("""SELECT IFNULL(max(id) + 1, 0) as crypt_id FROM crypt""")
            return cursor.fetchone()['crypt_id']
        else:
            # If the files point to different archives, regroup them under the same id
            if len(set(crypt_id_list)) > 1:
                cursor.execute(f"""UPDATE files SET crypt_id=?
                                   WHERE name IN ({name_marks})
                                   AND path IN ({path_marks})""",
                               (crypt_id, *names, *paths))
            return crypt_id
    def get_files(self, path):
        cursor = self.conn.cursor()
        cursor.execute("""SELECT id, name, path FROM files WHERE path LIKE ?""", (path + "%", ))
        list_file = cursor.fetchall()
        return list_file

    def delete_file(self, file):
        cursor = self.conn.cursor()
        cursor.execute("""DELETE FROM files WHERE id=? AND name=? AND path=?""",
                       (file['id'], file['name'], file['path']))
        self.conn.commit()

    def get_orphan_crypt(self):
        """Crypt rows that no file references any more."""
        cursor = self.conn.cursor()
        cursor.execute("""SELECT crypt.id FROM crypt
                          LEFT JOIN files ON files.crypt_id = crypt.id
                          WHERE files.id IS NULL""")
        return cursor.fetchall()

    def get_file_no_crypt(self):
        """File rows whose crypt archive is missing."""
        cursor = self.conn.cursor()
        cursor.execute("""SELECT files.id, files.name, files.path FROM files
                          LEFT JOIN crypt ON files.crypt_id = crypt.id
                          WHERE crypt.id IS NULL""")
        return cursor.fetchall()

    def delete_crypt(self, crypt_id):
        cursor = self.conn.cursor()
        cursor.execute("""DELETE FROM crypt WHERE id=?""", (crypt_id,))
        self.conn.commit()

    def exist(self, file):
        cursor = self.conn.cursor()
        cursor.execute("""SELECT EXISTS (SELECT 1 FROM files WHERE name=? AND path=?) as exist""",
                       (file['name'], file['path']))
        return cursor.fetchone()['exist']

    def modified(self, file):
        """Return True if the file's size or dates differ from the stored row."""
        cursor = self.conn.cursor()
        cursor.execute("""SELECT name, path, size, m_date, c_date FROM files WHERE name=? AND path=?""",
                       (file['name'], file['path']))
        bdd_file = cursor.fetchone()
        return file != bdd_file

    def set_crypt_attr(self, crypt_id, compress_mode=None, sha1sum=None):
        cursor = self.conn.cursor()
        cursor.execute("""UPDATE crypt SET compress_mode=?, sha1sum=? WHERE id=?""",
                       (compress_mode, sha1sum, crypt_id))
        self.conn.commit()

    def add(self, list_file):
        """Insert or update the given files; return the crypt id to use, or None if nothing changed."""
        cursor = self.conn.cursor()
        crypt_id = self.__get_crypt_id(list_file)
        cursor.execute("""SELECT IFNULL(max(id) + 1, 0) as files_id FROM files""")
        file_id = cursor.fetchone()['files_id']
        proceed = False
        for file in list_file:
            if self.exist(file):
                if self.modified(file):
                    cursor.execute("""UPDATE files SET size=?, m_date=?, c_date=?, crypt_id=?
                                      WHERE name=? AND path=?""",
                                   (file['size'], file['m_date'], file['c_date'], crypt_id,
                                    file['name'], file['path']))
                    proceed = True
            else:
                cursor.execute("""INSERT INTO files VALUES(?, ?, ?, ?, ?, ?, ?)""",
                               (file_id, file['name'], file['path'], file['size'],
                                file['m_date'], file['c_date'], crypt_id))
                file_id += 1
                proceed = True
        if proceed:
            cursor.execute("""INSERT OR IGNORE INTO crypt (id) VALUES(?)""", (crypt_id,))
            self.conn.commit()
            return crypt_id
        else:
            return None
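# Hypothetical demo, not part of the original module: exercises DataBase against
# an in-memory SQLite database to show the dict rows produced by dict_factory.
# The file metadata below is made up.
def _database_example():
    db = DataBase(":memory:")
    crypt_id = db.add([{'name': 'a.txt', 'path': 'demo/a.txt', 'size': 12,
                        'm_date': '2024-01-01 00:00:00.000000',
                        'c_date': '2024-01-01 00:00:00.000000'}])
    db.set_crypt_attr(crypt_id, compress_mode="gz",
                      sha1sum="da39a3ee5e6b4b0d3255bfef95601890afd80709")
    print(db.get_files('demo'))               # [{'id': 0, 'name': 'a.txt', 'path': 'demo/a.txt'}]
    print(db.get_crypt_name(['demo/a.txt']))  # crypt_id, compress_mode and sha1sum for that path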
def human_size(size, decimal_places=0, unit=None):
    """Format a byte count with binary ('iec'), decimal ('si') or short unit names."""
    unit_tab = ['B', 'K', 'M', 'G', 'T']
    base = 1024.0
    if unit == 'iec':
        base = 1024.0
        unit_tab = ['B', 'KiB', 'MiB', 'GiB', 'TiB']
    elif unit == 'si':
        base = 1000.0
        unit_tab = ['B', 'KB', 'MB', 'GB', 'TB']
    for unit in unit_tab:
        if size < base:
            break
        size /= base
    return f"{size:.{decimal_places}f}{unit}"


def parse_size(size):
    """Parse strings such as "50M" into a number of bytes; a bare number is read as KiB."""
    units = {"B": 1, "K": 2**10, "M": 2**20, "G": 2**30, "T": 2**40}
    if size[-1].isdigit():
        size = size + 'K'
    number, unit = re.match(r"([0-9]+)([BKMGT])", size, re.I).groups()
    return int(float(number) * units[unit.upper()])


def most_frequent(lst):
    return max(set(lst), key=lst.count)


def most_common(lst):
    """Return the most frequent item; ties are broken by keeping the smallest one."""
    if len(set(lst)) == 1:
        return lst[0]
    data = coll.Counter(lst)
    best_count = max(data.values())
    return min(item for item, count in data.items() if count == best_count)


def dict_factory(cursor, row):
    """sqlite3 row factory returning each row as a column-name -> value dict."""
    d = {}
    for idx, col in enumerate(cursor.description):
        d[col[0]] = row[idx]
    return d
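# Minimal usage sketch, not part of the original module. The key, database file
# and paths below are placeholders; the key format is whatever the local crypt
# module's encrypt_file()/decrypt_file() expect.
if __name__ == "__main__":
    _roundtrip_example()
    _database_example()
    # backup = Backup().local(key=b"<key for crypt.encrypt_file>",
    #                         bdd="backup.sqlite3",
    #                         save_location="/path/to/vault")
    # backup.update("/path/to/data")                    # back up new and changed files
    # backup.check("/path/to/data")                     # verify stored archives
    # backup.recover_file(["/path/to/data/file.txt"])   # restore a single file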