#!/usr/bin/env python3
import scriptconfig as scfg
"""
The following describes a DVC Error I ran into.
The cache files were corrupted, and dvc does not seem to have a way to
check for this (so I wrote this script).
After I removed the corrupted cache files on machine2, I tried to push
them from machine1 (with a good cache) to machine2, but dvc didn't realize
that machine1 had files that machine2 was missing. Perhaps this is because
the missing files were behind a .dir object in the cache?
It would be good to get a minimal working example (MWE) for this.
"""
import ubelt as ub
[docs]
class DvcCacheValidateCLI(scfg.DataConfig):
    """
    Checks for corruption in the dvc cache.

    For every sidecar file found under ``path``, each cache file it resolves
    to is checked for existence and (optionally) re-hashed with md5 so the
    digest can be compared against the location the file is stored at.
    """
    __command__ = 'cache_validate'

    path = scfg.Value(None, help='path to a dvc repo or file to validate the cache for', position=1)

    @classmethod
    def main(cls, cmdline=1, **kwargs):
        """
        Validate the cache entries referenced by the sidecars under ``path``.

        Args:
            cmdline (int | bool): forwarded to ``cls.cli`` as ``cmdline``.
            **kwargs: forwarded to ``cls.cli`` as ``data``.

        Ignore:
            from simple_dvc.cache_validate import *
            path = '/home/joncrall/remote/toothbrush/data/dvc-repos/smart_data_dvc-ssd/Drop7-Cropped2GSD'
            path = '/data/joncrall/dvc-repos/smart_data_dvc/Drop7-Cropped2GSD'
            cmdline = 0
            kwargs = dict(path=path)
            cls = DvcCacheValidateCLI

        Example:
            >>> # xdoctest: +SKIP
            >>> from simple_dvc.cache_validate import *  # NOQA
            >>> cmdline = 0
            >>> kwargs = dict(path='.')
            >>> cls = DvcCacheValidateCLI
            >>> cls.main(cmdline=cmdline, **kwargs)
        """
        import rich
        config = cls.cli(cmdline=cmdline, data=kwargs, strict=True)
        rich.print('config = ' + ub.urepr(config, nl=1))

        from simple_dvc import SimpleDVC
        dvc = SimpleDVC.coerce(config.path)

        rich.print(f'config.path: [link={config.path}]{config.path}[/link]')
        rich.print(f'dvc.dvc_root: [link={dvc.dvc_root}]{dvc.dvc_root}[/link]')
        rich.print(f'dvc.cache_dir: [link={dvc.cache_dir}]{dvc.cache_dir}[/link]')

        # TODO: better way to list all the cache files associated with a
        # directory or a dvc file.
        path = ub.Path(config.path).absolute()
        sidecar_paths = list(ub.ProgIter(dvc.sidecar_paths(path), desc='list sidecars'))

        # Developer toggle: when False the (slow) md5 re-hash is skipped and
        # existing files are only recorded as "maybe valid".
        corrupt_checks = True

        # Each entry is a dict with 'cache_fpath' and 'sidecar_fpath' keys.
        corrupt_fpaths = []
        valid_fpaths = []
        missing_fpaths = []
        maybe_valid_fpaths = []

        for sidecar_fpath in ub.ProgIter(sidecar_paths, verbose=3, desc='iter sidecars'):
            print('sidecar_fpath = {}'.format(ub.urepr(sidecar_fpath, nl=1)))
            for cache_fpath in dvc.resolve_cache_paths(sidecar_fpath):
                item = {'cache_fpath': cache_fpath, 'sidecar_fpath': sidecar_fpath}

                if not cache_fpath.exists():
                    print('issue with cache_fpath = {}'.format(ub.urepr(cache_fpath, nl=1)))
                    missing_fpaths.append(item)
                    continue

                if not corrupt_checks:
                    maybe_valid_fpaths.append(item)
                    continue

                # A cache file is expected to live at <md5[0:2]>/<md5[2:]>,
                # with a '.dir' extension appended for directory entries.
                # Any mismatch means the content no longer matches its name.
                md5_hash = ub.hash_file(cache_fpath, hasher='md5')
                prefix, suffix = md5_hash[0:2], md5_hash[2:]
                if cache_fpath.name.endswith('.dir'):
                    suffix = suffix + '.dir'

                file_prefix = cache_fpath.parent.name
                file_suffix = cache_fpath.name
                if prefix != file_prefix or suffix != file_suffix:
                    print('CORRUPT FILE: ' + str(cache_fpath))
                    corrupt_fpaths.append(item)
                else:
                    valid_fpaths.append(item)

        # Developer toggle: set to 1 to remove the corrupted cache files.
        do_delete = 0
        if do_delete:
            for item in corrupt_fpaths:
                # The list holds item dicts, so delete the path stored in the
                # item rather than calling delete on the dict itself.
                item['cache_fpath'].delete()

        if 0:
            from kwutil.copy_manager import CopyManager
            cman = CopyManager()
            seen_ = set()
            # A hacky fixup for missing files
            existing_cache_dir = ub.Path('/home/joncrall/remote/toothbrush/data/dvc-repos/smart_data_dvc/.dvc/cache')
            for info in ub.ProgIter(missing_fpaths):
                rel_fpath = info['cache_fpath'].relative_to(dvc.cache_dir)
                # Path objects have no ``startswith``; compare the leading
                # path components instead.
                if rel_fpath.parts[:2] != ('files', 'md5'):
                    rel_fpath = ub.Path('files/md5') / rel_fpath
                other_fpath = (existing_cache_dir / rel_fpath)
                this_fpath = dvc.cache_dir / rel_fpath
                if this_fpath.exists():
                    continue
                if this_fpath in seen_:
                    print("skip")
                    continue
                seen_.add(this_fpath)
                assert other_fpath.exists()
                assert info['sidecar_fpath'].exists()
                cman.submit(other_fpath, this_fpath)
            print(f'Copying {len(cman)} files')
            cman.run()
[docs]
def find_cached_fpaths(dvc, dpath):
    """
    Lazily yield every cache path resolved from the sidecars found in a
    directory.

    Args:
        dvc: object providing ``find_sidecar_paths`` and
            ``resolve_cache_paths`` (presumably a SimpleDVC instance).
        dpath: passed through to ``dvc.find_sidecar_paths``.

    Yields:
        each item produced by ``dvc.resolve_cache_paths`` for each sidecar,
        in the order the sidecars are produced.
    """
    for sidecar in dvc.find_sidecar_paths(dpath):
        sidecar_fpath = ub.Path(sidecar)
        for cache_fpath in dvc.resolve_cache_paths(sidecar_fpath):
            yield cache_fpath
r"""
Notes:
raw_bands/QA_C001/ave_B05_B06_B07_B09_B8A_blue_cirrus_coastal_green_nir_red_swir16_swir22/ave_20170301T070000Z_049_B05_B06_B07_B09_B8A_blue_cirrus_coastal_green_nir_red_swir16_swir22_S2_595ec869d8df6b23_a313a3e080fccf50.tif
tofix = '''
63/9de8ed3afd3f739e652b28f0c4fcbc
d3/40f349ada479900e95ee0028bf1695
67/c9d784622d3eaed3608d46c941e22d
64/9b8af71be219bec8bf370ce65c4235
51/7de88c346d6493225013594da02643
57/113aa658d600f7e49242e06df1a346
1d/4a2102b434cf17cc61e3071d62190f
96/6154ab7ed358cebb77822f7020a784
22/6fee2363722bcbd2b7599c290a9199
d3/5eea786fe4a20ead51dff651df6fef
'''.strip().split('\n')
p1 = '/home/joncrall/remote/namek/data/dvc-repos/smart_data_dvc/.dvc/cache/files/md5/'
p2 = '/media/joncrall/flash1/smart_data_dvc/.dvc/cache/files/md5/'
for s in tofix:
src = 'namek:' + p1 + s
dst = p2 + s
command = f'scp {src} {dst}'
# command = f'rm {dst}'
print(command)
rsync -avpr
63/9de8ed3afd3f739e652b28f0c4fcbc
63/9de8ed3afd3f739e652b28f0c4fcbc
/media/joncrall/flash1/smart_data_dvc/.dvc/cache/files/md5/d3/40f349ada479900e95ee0028bf1695
/media/joncrall/flash1/smart_data_dvc/.dvc/cache/files/md5/67/c9d784622d3eaed3608d46c941e22d
/media/joncrall/flash1/smart_data_dvc/.dvc/cache/files/md5/64/9b8af71be219bec8bf370ce65c4235
/media/joncrall/flash1/smart_data_dvc/.dvc/cache/files/md5/51/7de88c346d6493225013594da02643
/media/joncrall/flash1/smart_data_dvc/.dvc/cache/files/md5/57/113aa658d600f7e49242e06df1a346
/media/joncrall/flash1/smart_data_dvc/.dvc/cache/files/md5/1d/4a2102b434cf17cc61e3071d62190f
/media/joncrall/flash1/smart_data_dvc/.dvc/cache/files/md5/96/6154ab7ed358cebb77822f7020a784
/media/joncrall/flash1/smart_data_dvc/.dvc/cache/files/md5/22/6fee2363722bcbd2b7599c290a9199
/media/joncrall/flash1/smart_data_dvc/.dvc/cache/files/md5/d3/5eea786fe4a20ead51dff651df6fef
"""
# Module-level aliases for the CLI class and its entry point.
__cli__ = DvcCacheValidateCLI
main = DvcCacheValidateCLI.main

if __name__ == '__main__':
    # CommandLine:
    #     python ~/code/watch/dev/poc/dvc_cache_validate.py
    #     python -m dvc_cache_validate
    main()