Unable to call untar_data on complete MNIST dataset URL

Hey guys, when I try to use the untar_data function and point it to the URL of the complete dataset, I get the following error:

InvalidHeaderError Traceback (most recent call last)
/opt/conda/envs/fastai/lib/python3.7/tarfile.py in next(self)
2288 try:
-> 2289 tarinfo = self.tarinfo.fromtarfile(self)
2290 except EOFHeaderError as e:

/opt/conda/envs/fastai/lib/python3.7/tarfile.py in fromtarfile(cls, tarfile)
1094 buf = tarfile.fileobj.read(BLOCKSIZE)
-> 1095 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
1096 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE

/opt/conda/envs/fastai/lib/python3.7/tarfile.py in frombuf(cls, buf, encoding, errors)
1038 if chksum not in calc_chksums(buf):
-> 1039 raise InvalidHeaderError(“bad checksum”)

InvalidHeaderError: bad checksum

During handling of the above exception, another exception occurred:

ReadError Traceback (most recent call last)
----> 1 path_train = untar_data(‘http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz’, dest = ‘/MNIST_full’)
2 #path_train_labels = untar_data(‘http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz’)
3 #path_test = untar_data(‘http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz’)
4 #path_test_labels = untar_data(‘http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz’)

/opt/conda/envs/fastai/lib/python3.7/site-packages/fastai2/data/external.py in untar_data(url, fname, dest, c_key, force_download, extract_func)
229 if _get_check(url) and _check_file(fname) != _get_check(url):
230 print(f"File downloaded is broken. Remove {fname} and try again.")
–> 231 extract_func(fname, dest.parent)
232 return dest

/opt/conda/envs/fastai/lib/python3.7/site-packages/fastai2/data/external.py in file_extract(fname, dest)
202 if dest is None: dest = Path(fname).parent
203 fname = str(fname)
–> 204 if fname.endswith(‘gz’): tarfile.open(fname, ‘r:gz’).extractall(dest)
205 elif fname.endswith(‘zip’): zipfile.ZipFile(fname ).extractall(dest)
206 else: raise Exception(f’Unrecognized archive: {fname}’)

/opt/conda/envs/fastai/lib/python3.7/tarfile.py in open(cls, name, mode, fileobj, bufsize, **kwargs)
1589 else:
1590 raise CompressionError(“unknown compression type %r” % comptype)
-> 1591 return func(name, filemode, fileobj, **kwargs)
1593 elif “|” in mode:

/opt/conda/envs/fastai/lib/python3.7/tarfile.py in gzopen(cls, name, mode, fileobj, compresslevel, **kwargs)
1644 try:
-> 1645 t = cls.taropen(name, mode, fileobj, **kwargs)
1646 except OSError:
1647 fileobj.close()

/opt/conda/envs/fastai/lib/python3.7/tarfile.py in taropen(cls, name, mode, fileobj, **kwargs)
1619 if mode not in (“r”, “a”, “w”, “x”):
1620 raise ValueError(“mode must be ‘r’, ‘a’, ‘w’ or ‘x’”)
-> 1621 return cls(name, mode, fileobj, **kwargs)
1623 @classmethod

/opt/conda/envs/fastai/lib/python3.7/tarfile.py in __init__(self, name, mode, fileobj, format, tarinfo, dereference, ignore_zeros, encoding, errors, pax_headers, debug, errorlevel, copybufsize)
1482 if self.mode == “r”:
1483 self.firstmember = None
-> 1484 self.firstmember = self.next()
1486 if self.mode == “a”:

/opt/conda/envs/fastai/lib/python3.7/tarfile.py in next(self)
2299 continue
2300 elif self.offset == 0:
-> 2301 raise ReadError(str(e))
2302 except EmptyHeaderError:
2303 if self.offset == 0:

ReadError: bad checksum

What am I doing wrong?

Bad checksum means a corrupted file, probably in your case meaning that it didn’t download properly. Try deleting it and downloading it again.


Thanks for replying Joe.

The files in question are the full MNIST dataset, found on the official website of Yann LeCun: http://yann.lecun.com/exdb/mnist/
Could it be that the file is corrupt, or is there something else at play here?

I have also tried downloading the dataset, uploading it to my server, and using fastai’s file_extract() function, but I am still getting this bad checksum error.

Oh, it’s a gzip file, not a tar file. You need to use gzip.open().

1 Like


My issue with untar_data is a different one.
I actually installed everything with conda (version 4.9.2) in a new environment.

Running Jupyter notebooks in VS Code, everything seems OK (import fastai worked), but when I try to import a dataset with untar_data it gives me an error:

ImportError                               Traceback (most recent call last)

----> 1 from fastai.data import untar_data
2 path = untar_data(URLs.PETS)
3 path

ImportError: cannot import name ‘untar_data’ from ‘fastai.data’

please advise me