DUC and HDC
Understanding Convolution for Semantic Segmentation
Recent advances in deep learning, especially deep convolutional neural networks (CNNs), have led to significant improvements over previous semantic segmentation systems. Here we show how to improve pixel-wise semantic segmentation by manipulating convolution-related operations that are of both theoretical and practical value. First, we design dense upsampling convolution (DUC) to generate pixel-level prediction, which is able to capture and decode more detailed information that is generally missing in bilinear upsampling. Second, we propose a hybrid dilated convolution (HDC) framework in the encoding phase. This framework 1) effectively enlarges the receptive fields (RF) of the network to aggregate global information; 2) alleviates what we call the “gridding issue” caused by the standard dilated convolution operation. We evaluate our approaches thoroughly on the Cityscapes dataset and achieve a state-of-the-art result of 80.1% mIoU on the test set at the time of submission. We also achieve state-of-the-art results on the KITTI road estimation benchmark and the PASCAL VOC 2012 segmentation task.
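
To make the gridding issue concrete, below is a minimal 1-D sketch written for this page (it is not code from the paper or the repositories). It stacks three dilated convolutions with all-ones weights and checks which input positions can influence the centre output: with a constant dilation rate of 2 the receptive field contains periodic holes, whereas HDC-style rates 1, 2, 5 cover it as one contiguous block.

import torch
from torch import nn

def coverage(dilations, n=25):
    # Feed n unit impulses (one per input position) through a stack of 3-tap
    # dilated convolutions with all-ones weights; an input position reaches the
    # centre output iff the centre response to its impulse is non-zero.
    x = torch.eye(n).unsqueeze(1)  # shape (n, 1, n): n impulses, 1 channel each
    for d in dilations:
        conv = nn.Conv1d(1, 1, kernel_size=3, padding=d, dilation=d, bias=False)
        nn.init.ones_(conv.weight)
        x = conv(x)
    return (x[:, 0, n // 2] > 0).int()

print(coverage([2, 2, 2]))  # within the receptive field, every other position is 0: gridding
print(coverage([1, 2, 5]))  # covered positions form one contiguous block around the centre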
Implementations
TuSimple-DUC
This repository accompanies Understanding Convolution for Semantic Segmentation (WACV 2018), which achieved state-of-the-art results on the Cityscapes, PASCAL VOC 2012, and KITTI road benchmarks.

PyTorch for Semantic Segmentation
import torch
from torch import nn
from torchvision import models

from .config import res152_path


class _DenseUpsamplingConvModule(nn.Module):
    def __init__(self, down_factor, in_dim, num_classes):
        super(_DenseUpsamplingConvModule, self).__init__()
        # Predict down_factor^2 * num_classes channels at low resolution, then
        # rearrange them into a full-resolution score map with PixelShuffle.
        upsample_dim = (down_factor ** 2) * num_classes
        self.conv = nn.Conv2d(in_dim, upsample_dim, kernel_size=3, padding=1)
        self.bn = nn.BatchNorm2d(upsample_dim)
        self.relu = nn.ReLU(inplace=True)
        self.pixel_shuffle = nn.PixelShuffle(down_factor)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.pixel_shuffle(x)
        return x


class ResNetDUC(nn.Module):
    # the input image size should be a multiple of 8
    def __init__(self, num_classes, pretrained=True):
        super(ResNetDUC, self).__init__()
        resnet = models.resnet152()
        if pretrained:
            resnet.load_state_dict(torch.load(res152_path))
        self.layer0 = nn.Sequential(resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool)
        self.layer1 = resnet.layer1
        self.layer2 = resnet.layer2
        self.layer3 = resnet.layer3
        self.layer4 = resnet.layer4

        # Remove the strides of layer3 and layer4 and compensate with dilated
        # convolutions, so the backbone keeps an output stride of 8.
        for n, m in self.layer3.named_modules():
            if 'conv2' in n:
                m.dilation = (2, 2)
                m.padding = (2, 2)
                m.stride = (1, 1)
            elif 'downsample.0' in n:
                m.stride = (1, 1)
        for n, m in self.layer4.named_modules():
            if 'conv2' in n:
                m.dilation = (4, 4)
                m.padding = (4, 4)
                m.stride = (1, 1)
            elif 'downsample.0' in n:
                m.stride = (1, 1)
        self.duc = _DenseUpsamplingConvModule(8, 2048, num_classes)

    def forward(self, x):
        x = self.layer0(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.duc(x)
        return x


class ResNetDUCHDC(nn.Module):
    # the input image size should be a multiple of 8
    def __init__(self, num_classes, pretrained=True):
        super(ResNetDUCHDC, self).__init__()
        resnet = models.resnet152()
        if pretrained:
            resnet.load_state_dict(torch.load(res152_path))
        self.layer0 = nn.Sequential(resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool)
        self.layer1 = resnet.layer1
        self.layer2 = resnet.layer2
        self.layer3 = resnet.layer3
        self.layer4 = resnet.layer4

        # Remove the strides of layer3 and layer4 (output stride stays at 8).
        for n, m in self.layer3.named_modules():
            if 'conv2' in n or 'downsample.0' in n:
                m.stride = (1, 1)
        for n, m in self.layer4.named_modules():
            if 'conv2' in n or 'downsample.0' in n:
                m.stride = (1, 1)

        # HDC: cycle through increasing dilation rates instead of repeating a
        # single rate, which alleviates the gridding issue.
        layer3_group_config = [1, 2, 5, 9]
        for idx in range(len(self.layer3)):
            self.layer3[idx].conv2.dilation = (layer3_group_config[idx % 4], layer3_group_config[idx % 4])
            self.layer3[idx].conv2.padding = (layer3_group_config[idx % 4], layer3_group_config[idx % 4])
        layer4_group_config = [5, 9, 17]
        for idx in range(len(self.layer4)):
            self.layer4[idx].conv2.dilation = (layer4_group_config[idx], layer4_group_config[idx])
            self.layer4[idx].conv2.padding = (layer4_group_config[idx], layer4_group_config[idx])
        self.duc = _DenseUpsamplingConvModule(8, 2048, num_classes)

    def forward(self, x):
        x = self.layer0(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.duc(x)
        return x
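
A minimal usage sketch (written for this page, not part of the repository): passing pretrained=False skips loading the res152_path checkpoint, and the DUC head returns a score map at the full input resolution. The 19 classes below are an assumption matching Cityscapes.

# Usage sketch; assumes pretrained=False so no checkpoint file is required.
model = ResNetDUCHDC(num_classes=19, pretrained=False)  # e.g. 19 Cityscapes classes
model.eval()
with torch.no_grad():
    logits = model(torch.randn(1, 3, 512, 512))  # input size must be a multiple of 8
print(logits.shape)  # torch.Size([1, 19, 512, 512]) -- full-resolution prediction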
GitHub: https://github.com/ZijunDeng/pytorch-semantic-segmentation
