Unable to call untar_data on complete MNIST dataset URL

Hey guys, when I try to use the untar_data function and point it to the URL of the complete dataset, I get the following error:


InvalidHeaderError Traceback (most recent call last)
/opt/conda/envs/fastai/lib/python3.7/tarfile.py in next(self)
2288 try:
-> 2289 tarinfo = self.tarinfo.fromtarfile(self)
2290 except EOFHeaderError as e:

/opt/conda/envs/fastai/lib/python3.7/tarfile.py in fromtarfile(cls, tarfile)
1094 buf = tarfile.fileobj.read(BLOCKSIZE)
-> 1095 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
1096 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE

/opt/conda/envs/fastai/lib/python3.7/tarfile.py in frombuf(cls, buf, encoding, errors)
1038 if chksum not in calc_chksums(buf):
-> 1039 raise InvalidHeaderError("bad checksum")
1040

InvalidHeaderError: bad checksum

During handling of the above exception, another exception occurred:

ReadError Traceback (most recent call last)
in
----> 1 path_train = untar_data('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', dest = '/MNIST_full')
2 #path_train_labels = untar_data('http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz')
3 #path_test = untar_data('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz')
4 #path_test_labels = untar_data('http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz')

/opt/conda/envs/fastai/lib/python3.7/site-packages/fastai2/data/external.py in untar_data(url, fname, dest, c_key, force_download, extract_func)
229 if _get_check(url) and _check_file(fname) != _get_check(url):
230 print(f"File downloaded is broken. Remove {fname} and try again.")
--> 231 extract_func(fname, dest.parent)
232 return dest

/opt/conda/envs/fastai/lib/python3.7/site-packages/fastai2/data/external.py in file_extract(fname, dest)
202 if dest is None: dest = Path(fname).parent
203 fname = str(fname)
--> 204 if fname.endswith('gz'): tarfile.open(fname, 'r:gz').extractall(dest)
205 elif fname.endswith('zip'): zipfile.ZipFile(fname ).extractall(dest)
206 else: raise Exception(f'Unrecognized archive: {fname}')

/opt/conda/envs/fastai/lib/python3.7/tarfile.py in open(cls, name, mode, fileobj, bufsize, **kwargs)
1589 else:
1590 raise CompressionError("unknown compression type %r" % comptype)
-> 1591 return func(name, filemode, fileobj, **kwargs)
1592
1593 elif "|" in mode:

/opt/conda/envs/fastai/lib/python3.7/tarfile.py in gzopen(cls, name, mode, fileobj, compresslevel, **kwargs)
1643
1644 try:
-> 1645 t = cls.taropen(name, mode, fileobj, **kwargs)
1646 except OSError:
1647 fileobj.close()

/opt/conda/envs/fastai/lib/python3.7/tarfile.py in taropen(cls, name, mode, fileobj, **kwargs)
1619 if mode not in ("r", "a", "w", "x"):
1620 raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
-> 1621 return cls(name, mode, fileobj, **kwargs)
1622
1623 @classmethod

/opt/conda/envs/fastai/lib/python3.7/tarfile.py in __init__(self, name, mode, fileobj, format, tarinfo, dereference, ignore_zeros, encoding, errors, pax_headers, debug, errorlevel, copybufsize)
1482 if self.mode == "r":
1483 self.firstmember = None
-> 1484 self.firstmember = self.next()
1485
1486 if self.mode == "a":

/opt/conda/envs/fastai/lib/python3.7/tarfile.py in next(self)
2299 continue
2300 elif self.offset == 0:
-> 2301 raise ReadError(str(e))
2302 except EmptyHeaderError:
2303 if self.offset == 0:

ReadError: bad checksum

What am I doing wrong?

Bad checksum means a corrupted file, probably in your case meaning that it didn’t download properly. Try deleting it and downloading it again.

2 Likes

Thanks for replying Joe.

The files in question are the full MNIST dataset, found on the official website of Yann LeCun: http://yann.lecun.com/exdb/mnist/
Could it be that the file is corrupt, or is there something else at play here?

I have also tried downloading the dataset, uploading it to my server, and using fastai’s file_extract() function, but I am still getting this bad checksum error.

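Oh, it’s a gzip file, not a tar file. As the traceback shows, file_extract() passes any file name ending in “gz” to tarfile.open(), which then fails with “bad checksum” because the decompressed data is not a tar archive. You need to use gzip.open().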

1 Like

Hi

My issue with untar_data is different.
I installed everything with conda (version 4.9.2) in a new environment.

Running Jupyter notebooks in VS Code, everything seems OK (import fastai worked), but when I try to import a dataset with untar_data it gives me an error:

ImportError                               Traceback (most recent call last)

in
----> 1 from fastai.data import untar_data
2 path = untar_data(URLs.PETS)
3 path

ImportError: cannot import name ‘untar_data’ from ‘fastai.data’
(C:\Users------\miniconda3\envs\fastai_env\lib\site-packages\fastai\data\__init__.py)

Please advise.