DilatedNet
Multi-Scale Context Aggregation by Dilated Convolutions
State-of-the-art models for semantic segmentation are based on adaptations of convolutional networks that had originally been designed for image classification. However, dense prediction problems such as semantic segmentation are structurally different from image classification. In this work, we develop a new convolutional network module that is specifically designed for dense prediction. The presented module uses dilated convolutions to systematically aggregate multi-scale contextual information without losing resolution. The architecture is based on the fact that dilated convolutions support exponential expansion of the receptive field without loss of resolution or coverage. We show that the presented context module increases the accuracy of state-of-the-art semantic segmentation systems. In addition, we examine the adaptation of image classification networks to dense prediction and show that simplifying the adapted network can increase accuracy.
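To make the receptive-field claim concrete, the short sketch below (illustrative only, not code from either repository) stacks 3x3 convolutions with dilations 1, 2, 4, 8: with padding equal to the dilation, the feature-map resolution never changes, while the receptive field grows exponentially to 3, 7, 15, and 31 pixels.
import torch
import torch.nn as nn
# Illustrative sketch: stacked 3x3 convolutions with dilations 1, 2, 4, 8.
# Padding equal to the dilation preserves spatial resolution, while each layer
# adds 2 * dilation pixels to the receptive field: 3, 7, 15, 31.
layers, receptive_field = [], 1
for dilation in (1, 2, 4, 8):
    layers.append(nn.Conv2d(1, 1, kernel_size=3, padding=dilation,
                            dilation=dilation, bias=False))
    receptive_field += 2 * dilation
    print('dilation', dilation, '-> receptive field', receptive_field)
x = torch.randn(1, 1, 64, 64)
y = nn.Sequential(*layers)(x)
print(tuple(x.shape), '->', tuple(y.shape))  # (1, 1, 64, 64) -> (1, 1, 64, 64)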
Implementations
Overview
This code provides various models combining dilated convolutions with residual networks. Our models can achieve better performance with fewer parameters than ResNet on image classification and semantic segmentation. A minimal usage sketch follows the model constructors at the end of the code below.
import pdb
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
BatchNorm = nn.BatchNorm2d
# __all__ = ['DRN', 'drn26', 'drn42', 'drn58']
webroot = 'https://tigress-web.princeton.edu/~fy/drn/models/'
model_urls = {
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'drn-c-26': webroot + 'drn_c_26-ddedf421.pth',
'drn-c-42': webroot + 'drn_c_42-9d336e8c.pth',
'drn-c-58': webroot + 'drn_c_58-0a53a92c.pth',
'drn-d-22': webroot + 'drn_d_22-4bd2f8ea.pth',
'drn-d-38': webroot + 'drn_d_38-eebb45f0.pth',
'drn-d-54': webroot + 'drn_d_54-0e0534ff.pth',
'drn-d-105': webroot + 'drn_d_105-12b40979.pth'
}
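# Note: only the variants listed in model_urls above have pretrained weights
# wired up in this file; calling e.g. drn_d_24, drn_d_40, drn_d_56 or drn_d_107
# below with pretrained=True will raise a KeyError.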
def conv3x3(in_planes, out_planes, stride=1, padding=1, dilation=1):
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=padding, bias=False, dilation=dilation)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None,
dilation=(1, 1), residual=True):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride,
padding=dilation[0], dilation=dilation[0])
self.bn1 = BatchNorm(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes,
padding=dilation[1], dilation=dilation[1])
self.bn2 = BatchNorm(planes)
self.downsample = downsample
self.stride = stride
self.residual = residual
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
if self.residual:
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None,
dilation=(1, 1), residual=True):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = BatchNorm(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=dilation[1], bias=False,
dilation=dilation[1])
self.bn2 = BatchNorm(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = BatchNorm(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class DRN(nn.Module):
def __init__(self, block, layers, num_classes=1000,
channels=(16, 32, 64, 128, 256, 512, 512, 512),
out_map=False, out_middle=False, pool_size=28, arch='D'):
super(DRN, self).__init__()
self.inplanes = channels[0]
self.out_map = out_map
self.out_dim = channels[-1]
self.out_middle = out_middle
self.arch = arch
if arch == 'C':
self.conv1 = nn.Conv2d(3, channels[0], kernel_size=7, stride=1,
padding=3, bias=False)
self.bn1 = BatchNorm(channels[0])
self.relu = nn.ReLU(inplace=True)
self.layer1 = self._make_layer(
BasicBlock, channels[0], layers[0], stride=1)
self.layer2 = self._make_layer(
BasicBlock, channels[1], layers[1], stride=2)
elif arch == 'D':
self.layer0 = nn.Sequential(
nn.Conv2d(3, channels[0], kernel_size=7, stride=1, padding=3,
bias=False),
BatchNorm(channels[0]),
nn.ReLU(inplace=True)
)
self.layer1 = self._make_conv_layers(
channels[0], layers[0], stride=1)
self.layer2 = self._make_conv_layers(
channels[1], layers[1], stride=2)
self.layer3 = self._make_layer(block, channels[2], layers[2], stride=2)
self.layer4 = self._make_layer(block, channels[3], layers[3], stride=2)
self.layer5 = self._make_layer(block, channels[4], layers[4],
dilation=2, new_level=False)
self.layer6 = None if layers[5] == 0 else \
self._make_layer(block, channels[5], layers[5], dilation=4,
new_level=False)
if arch == 'C':
self.layer7 = None if layers[6] == 0 else \
self._make_layer(BasicBlock, channels[6], layers[6], dilation=2,
new_level=False, residual=False)
self.layer8 = None if layers[7] == 0 else \
self._make_layer(BasicBlock, channels[7], layers[7], dilation=1,
new_level=False, residual=False)
elif arch == 'D':
self.layer7 = None if layers[6] == 0 else \
self._make_conv_layers(channels[6], layers[6], dilation=2)
self.layer8 = None if layers[7] == 0 else \
self._make_conv_layers(channels[7], layers[7], dilation=1)
if num_classes > 0:
self.avgpool = nn.AvgPool2d(pool_size)
self.fc = nn.Conv2d(self.out_dim, num_classes, kernel_size=1,
stride=1, padding=0, bias=True)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, BatchNorm):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1, dilation=1,
new_level=True, residual=True):
assert dilation == 1 or dilation % 2 == 0
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
BatchNorm(planes * block.expansion),
)
layers = list()
layers.append(block(
self.inplanes, planes, stride, downsample,
dilation=(1, 1) if dilation == 1 else (
dilation // 2 if new_level else dilation, dilation),
residual=residual))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes, residual=residual,
dilation=(dilation, dilation)))
return nn.Sequential(*layers)
def _make_conv_layers(self, channels, convs, stride=1, dilation=1):
modules = []
for i in range(convs):
modules.extend([
nn.Conv2d(self.inplanes, channels, kernel_size=3,
stride=stride if i == 0 else 1,
padding=dilation, bias=False, dilation=dilation),
BatchNorm(channels),
nn.ReLU(inplace=True)])
self.inplanes = channels
return nn.Sequential(*modules)
def forward(self, x):
y = list()
if self.arch == 'C':
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
elif self.arch == 'D':
x = self.layer0(x)
x = self.layer1(x)
y.append(x)
x = self.layer2(x)
y.append(x)
x = self.layer3(x)
y.append(x)
x = self.layer4(x)
y.append(x)
x = self.layer5(x)
y.append(x)
if self.layer6 is not None:
x = self.layer6(x)
y.append(x)
if self.layer7 is not None:
x = self.layer7(x)
y.append(x)
if self.layer8 is not None:
x = self.layer8(x)
y.append(x)
if self.out_map:
x = self.fc(x)
else:
x = self.avgpool(x)
x = self.fc(x)
x = x.view(x.size(0), -1)
if self.out_middle:
return x, y
else:
return x
class DRN_A(nn.Module):
def __init__(self, block, layers, num_classes=1000):
self.inplanes = 64
super(DRN_A, self).__init__()
self.out_dim = 512 * block.expansion
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=1,
dilation=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
dilation=4)
self.avgpool = nn.AvgPool2d(28, stride=1)
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, BatchNorm):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1, dilation=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes,
dilation=(dilation, dilation)))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def drn_a_50(pretrained=False, **kwargs):
model = DRN_A(Bottleneck, [3, 4, 6, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
return model
def drn_c_26(pretrained=False, **kwargs):
model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 1, 1], arch='C', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-c-26']))
return model
def drn_c_42(pretrained=False, **kwargs):
model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 1, 1], arch='C', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-c-42']))
return model
def drn_c_58(pretrained=False, **kwargs):
model = DRN(Bottleneck, [1, 1, 3, 4, 6, 3, 1, 1], arch='C', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-c-58']))
return model
def drn_d_22(pretrained=False, **kwargs):
model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 1, 1], arch='D', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-d-22']))
return model
def drn_d_24(pretrained=False, **kwargs):
model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 2, 2], arch='D', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-d-24']))
return model
def drn_d_38(pretrained=False, **kwargs):
model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 1, 1], arch='D', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-d-38']))
return model
def drn_d_40(pretrained=False, **kwargs):
model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 2, 2], arch='D', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-d-40']))
return model
def drn_d_54(pretrained=False, **kwargs):
model = DRN(Bottleneck, [1, 1, 3, 4, 6, 3, 1, 1], arch='D', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-d-54']))
return model
def drn_d_56(pretrained=False, **kwargs):
model = DRN(Bottleneck, [1, 1, 3, 4, 6, 3, 2, 2], arch='D', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-d-56']))
return model
def drn_d_105(pretrained=False, **kwargs):
model = DRN(Bottleneck, [1, 1, 3, 4, 23, 3, 1, 1], arch='D', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-d-105']))
return model
def drn_d_107(pretrained=False, **kwargs):
model = DRN(Bottleneck, [1, 1, 3, 4, 23, 3, 2, 2], arch='D', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-d-107']))
return model
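As mentioned in the overview, here is a minimal usage sketch for the constructors above. It assumes the module is saved as drn.py; the class counts and the 224x224 input are illustrative. With the default pool_size=28 and the network's output stride of 8, a 224x224 input pools down to a single logit vector, while out_map=True skips pooling and returns a class-score map at 1/8 resolution.
import torch
import drn  # assumes the module above is saved as drn.py
# Image classification: pooled logits.
model = drn.drn_d_22(pretrained=False, num_classes=1000)
model.eval()
with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)  # torch.Size([1, 1000])
# Dense prediction: out_map=True returns a class-score map at 1/8 resolution.
seg = drn.drn_d_22(pretrained=False, num_classes=19, out_map=True)
seg.eval()
with torch.no_grad():
    score_map = seg(torch.randn(1, 3, 224, 224))
print(score_map.shape)  # torch.Size([1, 19, 28, 28])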
GitHub: https://github.com/fyu/drn#semantic-image-segmentataion
Multi-Scale Context Aggregation by Dilated Convolutions
Properties of dilated convolutions are discussed in our ICLR 2016 conference paper. This repository contains the network definitions and the trained models. You can use this code together with vanilla Caffe to segment images using the pre-trained models. If you want to train the models yourself, please check out the training document.
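For rough orientation, inference with a released model typically looks like the sketch below. The prototxt and caffemodel file names, the 'data' and 'prob' blob names, and the 900x900 input size are assumptions; the repository's prediction script is the authoritative reference, and a real input must be a mean-subtracted BGR image rather than random data.
import caffe
import numpy as np
caffe.set_mode_cpu()  # or caffe.set_mode_gpu()
net = caffe.Net('dilation_deploy.prototxt',  # hypothetical file name
                'dilation.caffemodel',       # hypothetical file name
                caffe.TEST)
# A real input would be a mean-subtracted BGR image in CHW layout, padded to
# the deploy input size; random data stands in for it here.
image = np.random.rand(3, 900, 900).astype(np.float32)
net.blobs['data'].reshape(1, *image.shape)
net.blobs['data'].data[0] = image
net.forward()
prob = net.blobs['prob'].data[0]  # per-class probability maps (assumed blob name)
prediction = prob.argmax(axis=0)  # per-pixel class labels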
If you are looking for dilation models with state-of-the-art performance and Python implementation, please check out Dilated Residual Networks.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, division
from caffe import layers as L
from caffe import params as P
__author__ = 'Fisher Yu'
__copyright__ = 'Copyright (c) 2016, Fisher Yu'
__email__ = 'i@yf.io'
__license__ = 'MIT'
def make_image_label_data(image_list_path, label_list_path, batch_size,
mirror, crop_size, mean_pixel,
label_stride=8, margin=186):
label_dim = (crop_size - margin * 2) // label_stride
data, label = L.ImageLabelData(
transform_param=dict(mirror=mirror, mean_value=mean_pixel,
crop_size=crop_size),
image_label_data_param=dict(
image_list_path=image_list_path, label_list_path=label_list_path,
shuffle=True, batch_size=batch_size,
padding=P.ImageLabelData.REFLECT,
label_slice=dict(dim=[label_dim, label_dim],
stride=[label_stride, label_stride],
offset=[margin, margin])),
ntop=2)
return data, label
def make_bin_label_data(bin_list_path, label_list_path, batch_size,
label_shape, label_stride):
data, label = L.BinLabelData(
bin_label_data_param=dict(
bin_list_path=bin_list_path, label_list_path=label_list_path,
shuffle=True, batch_size=batch_size,
label_slice=dict(stride=[label_stride, label_stride],
dim=label_shape)),
ntop=2)
return data, label
def make_input_data(input_size, channels=3):
return L.Input(input_param=dict(shape=dict(
dim=[1, channels, input_size[0], input_size[1]])))
def make_softmax_loss(bottom, label):
return L.SoftmaxWithLoss(bottom, label,
loss_param=dict(ignore_label=255,
normalization=P.Loss.VALID))
def make_accuracy(bottom, label):
return L.Accuracy(bottom, label, accuracy_param=dict(ignore_label=255))
def make_prob(bottom):
return L.Softmax(bottom)
def make_upsample(bottom, num_classes):
return L.Deconvolution(
bottom,
param=[dict(lr_mult=0, decay_mult=0)],
convolution_param=dict(
bias_term=False, num_output=num_classes, kernel_size=16, stride=8,
group=num_classes, pad=4, weight_filler=dict(type="bilinear")))
def build_frontend_vgg(net, bottom, num_classes):
prev_layer = bottom
num_convolutions = [2, 2, 3, 3, 3]
dilations = [0, 0, 0, 0, 2, 4]
for l in range(5):
num_outputs = min(64 * 2 ** l, 512)
for i in range(0, num_convolutions[l]):
conv_name = 'conv{0}_{1}'.format(l+1, i+1)
relu_name = 'relu{0}_{1}'.format(l+1, i+1)
if dilations[l] == 0:
setattr(net, conv_name,
L.Convolution(
prev_layer,
param=[dict(lr_mult=1, decay_mult=1),
dict(lr_mult=2, decay_mult=0)],
convolution_param=dict(num_output=num_outputs,
kernel_size=3)))
else:
setattr(net, conv_name,
L.Convolution(
prev_layer,
param=[dict(lr_mult=1, decay_mult=1),
dict(lr_mult=2, decay_mult=0)],
convolution_param=dict(num_output=num_outputs,
kernel_size=3,
dilation=dilations[l])))
setattr(net, relu_name,
L.ReLU(getattr(net, conv_name), in_place=True))
prev_layer = getattr(net, relu_name)
if dilations[l+1] == 0:
pool_name = 'pool{0}'.format(l+1)
setattr(net, pool_name, L.Pooling(
prev_layer, pool=P.Pooling.MAX, kernel_size=2, stride=2))
prev_layer = getattr(net, pool_name)
net.fc6 = L.Convolution(
prev_layer,
param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)],
convolution_param=dict(num_output=4096, kernel_size=7,
dilation=dilations[5]))
net.relu6 = L.ReLU(net.fc6, in_place=True)
net.drop6 = L.Dropout(net.relu6, in_place=True, dropout_ratio=0.5)
net.fc7 = L.Convolution(
net.drop6,
param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)],
convolution_param=dict(num_output=4096, kernel_size=1))
net.relu7 = L.ReLU(net.fc7, in_place=True)
net.drop7 = L.Dropout(net.relu7, in_place=True, dropout_ratio=0.5)
net.final = L.Convolution(
net.drop7,
param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)],
convolution_param=dict(
num_output=num_classes, kernel_size=1,
weight_filler=dict(type='gaussian', std=0.001),
bias_filler=dict(type='constant', value=0)))
return net.final, 'final'
def build_context(net, bottom, num_classes, layers=8):
prev_layer = bottom
multiplier = 1
for i in range(1, 3):
conv_name = 'ctx_conv1_{}'.format(i)
relu_name = 'ctx_relu1_{}'.format(i)
setattr(net, conv_name,
L.Convolution(
*([] if prev_layer is None else [prev_layer]),
param=[dict(lr_mult=1, decay_mult=1),
dict(lr_mult=2, decay_mult=0)],
convolution_param=dict(
num_output=num_classes * multiplier, kernel_size=3,
pad=1,
weight_filler=dict(type='identity',
num_groups=num_classes, std=0.01),
bias_filler=dict(type='constant', value=0))))
setattr(net, relu_name,
L.ReLU(getattr(net, conv_name), in_place=True))
prev_layer = getattr(net, relu_name)
for i in range(2, layers - 2):
dilation = 2 ** (i - 1)
multiplier = 1
conv_name = 'ctx_conv{}_1'.format(i)
relu_name = 'ctx_relu{}_1'.format(i)
setattr(net, conv_name,
L.Convolution(
prev_layer,
param=[dict(lr_mult=1, decay_mult=1),
dict(lr_mult=2, decay_mult=0)],
convolution_param=dict(
num_output=num_classes * multiplier, kernel_size=3,
dilation=dilation, pad=dilation,
weight_filler=dict(type='identity',
num_groups=num_classes,
std=0.01 / multiplier),
bias_filler=dict(type='constant', value=0))))
setattr(net, relu_name,
L.ReLU(getattr(net, conv_name), in_place=True))
prev_layer = getattr(net, relu_name)
net.ctx_fc1 = L.Convolution(
prev_layer,
param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)],
convolution_param=dict(
num_output=num_classes * multiplier, kernel_size=3, pad=1,
weight_filler=dict(type='identity',
num_groups=num_classes,
std=0.01 / multiplier),
bias_filler=dict(type='constant', value=0)))
net.ctx_fc1_relu = L.ReLU(net.ctx_fc1, in_place=True)
net.ctx_final = L.Convolution(
net.ctx_fc1_relu,
param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)],
convolution_param=dict(
num_output=num_classes, kernel_size=1,
weight_filler=dict(type='identity',
num_groups=num_classes,
std=0.01 / multiplier),
bias_filler=dict(type='constant', value=0)))
return net.ctx_final, 'ctx_final'
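The front-end and context builders above return their top blob and its name so the pieces can be chained inside a caffe.NetSpec. The sketch below assembles a deploy-style front end plus context network and writes out the prototxt; the 21-class (Pascal VOC) setting, the 900x900 input size, and the output path are assumptions, and the repository's own network-assembly entry points remain the authoritative recipe. Note that the identity weight filler and the ImageLabelData layer referenced above are custom extensions and may not be available in every Caffe build.
import caffe
net_spec = caffe.NetSpec()
net_spec.data = make_input_data((900, 900))  # assumed input size
last, _ = build_frontend_vgg(net_spec, net_spec.data, num_classes=21)
last, _ = build_context(net_spec, last, num_classes=21, layers=8)
net_spec.upsample = make_upsample(last, num_classes=21)
net_spec.prob = make_prob(net_spec.upsample)
with open('deploy.prototxt', 'w') as f:  # hypothetical output path
    f.write(str(net_spec.to_proto()))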
GitHub: https://github.com/fyu/dilation/blob/master/network.py
This repository contains multiple implementations of DilatedNet in different languages:
GitHub: https://github.com/mrgloom/awesome-semantic-segmentation