Using torch.quantization

I’m trying to use the new torch.quantization features to quantize my model before JIT-exporting it, but I’m not seeing any improvement in inference speed. Does anyone know the proper way to do this? I’m testing on one of the new AWS G4 GPU instances (NVIDIA T4).

Here’s a minimal reproducible example (using any fastai-trained CNN):

import numpy as np
import PIL
import torch
from timeit import default_timer as timer
from time import sleep
from torchvision import transforms

# Load model
# NOTE: `learn` and `model_path` are not defined anywhere in this snippet; they
# must already exist in the session (presumably a fastai Learner and the path
# of its saved weights -- confirm against your setup).
learn.load(model_path);

# Preprocessing pipeline applied to every synthetic test image:
# resize -> convert to tensor -> per-channel normalization.
image_size = 224

# Integer codes handed to Resize's `interpolation` argument.
resize_type = dict(nearest=0, lanczos=1, bilinear=2, bicubic=3)

# Per-channel mean/std (these look like the commonly used ImageNet statistics).
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

preprocessor = transforms.Compose([
  transforms.Resize(image_size, interpolation=resize_type['bilinear']),
  transforms.ToTensor(),
  normalize,
])

# Build a list of random "images", each preprocessed into a 1-image batch.
test_img_count = 100
test_images = []
for _ in range(test_img_count):
  # Random uint8 pixels in [0, 255], laid out as (height, width, channel).
  imarray = np.random.randint(0, 256, (image_size, image_size, 3))
  img = PIL.Image.fromarray(imarray.astype('uint8')).convert('RGB')
  img_tensor = preprocessor(img).unsqueeze(0)  # add the leading batch dimension
  if torch.cuda.is_available():
    img_tensor = img_tensor.to(torch.device('cuda'))
  # Append unconditionally: previously this sat inside the CUDA check, so the
  # list stayed empty (and every timing loop was a no-op) on CPU-only machines.
  test_images.append(img_tensor)
    
# Trace the float32 model on the GPU and serialize it as TorchScript.
output_path = 'model_jit.pth'
learn.model.eval()  # switch to inference mode before tracing
trace_input = torch.ones(1, 3, image_size, image_size).cuda()
fp32_model = learn.model.float()
jit_model = torch.jit.trace(fp32_model, trace_input)
torch.jit.save(jit_model, output_path)

# Test standard model speed a few times
regular_model = torch.jit.load(output_path, map_location=torch.device('cuda'))
regular_model.eval()
with torch.no_grad():  # inference only: skip autograd bookkeeping
  for _ in range(3):
    # CUDA kernels are launched asynchronously, so drain the queue before
    # starting and before stopping the clock; otherwise the timer does not
    # cover all of the GPU work that was queued.
    torch.cuda.synchronize()
    start = timer()
    for image in test_images:
      predict_values = regular_model(image)
    torch.cuda.synchronize()
    print(f'Regular JIT model time: {timer()-start}')

######### Quantized version ######### 

# Reload model
learn.load(model_path);
model = learn.model

# Quantize model.
# quantize_dynamic returns the quantized model (it is not in-place unless
# inplace=True is passed), so the result must be captured; discarding it
# leaves `model` in plain float32, which is why speed and file size did not
# change. Dynamic quantization is a CPU feature, so everything below stays on
# the CPU.
# NOTE: per the PyTorch docs, dynamic quantization by default only swaps
# Linear and recurrent layers; convolution layers are left in float32, so a
# CNN will see limited benefit -- static quantization is the route for convs.
quantized = torch.quantization.quantize_dynamic(model.cpu().eval(), dtype=torch.qint8)

# JIT export quantized model (traced on CPU, where the int8 kernels run)
quantized_output_path = 'model_quantized_jit.pth'
trace_input = torch.ones(1, 3, image_size, image_size)
jit_model = torch.jit.trace(quantized, trace_input)
torch.jit.save(jit_model, quantized_output_path)

# Test quantized model speed a few times.
# For a fair comparison, time the float32 model on the CPU as well: this run
# is CPU-bound while the "regular" run above uses the GPU.
quantized_model = torch.jit.load(quantized_output_path, map_location=torch.device('cpu'))
quantized_model.eval()
cpu_images = [image.cpu() for image in test_images]  # inputs must match the model's device
with torch.no_grad():  # inference only: skip autograd bookkeeping
  for _ in range(5):
    start = timer()
    for image in cpu_images:
      predict_values = quantized_model(image)
    print(f'Quantized JIT model time (CPU): {timer()-start}')

I end up getting nearly identical inference times for the standard and quantized versions. Am I quantizing the model incorrectly, or am I misunderstanding some other aspect of this?

2 Likes

Also the saved net size in megabytes is the same.

It looks like PyTorch has added experimental quantization tutorials:

https://pytorch.org/tutorials/advanced/dynamic_quantization_tutorial.html
https://pytorch.org/tutorials/advanced/static_quantization_tutorial.html