# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
- """MobileNet v1.
- MobileNet is a general architecture and can be used for multiple use cases.
- Depending on the use case, it can use different input layer size and different
- head (for example: embeddings, localization and classification).
- As described in https://arxiv.org/abs/1704.04861.
- MobileNets: Efficient Convolutional Neural Networks for
- Mobile Vision Applications
- Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang,
- Tobias Weyand, Marco Andreetto, Hartwig Adam
- 100% Mobilenet V1 (base) with input size 224x224:
- See mobilenet_v1()
- Layer params macs
- --------------------------------------------------------------------------------
- MobilenetV1/Conv2d_0/Conv2D: 864 10,838,016
- MobilenetV1/Conv2d_1_depthwise/depthwise: 288 3,612,672
- MobilenetV1/Conv2d_1_pointwise/Conv2D: 2,048 25,690,112
- MobilenetV1/Conv2d_2_depthwise/depthwise: 576 1,806,336
- MobilenetV1/Conv2d_2_pointwise/Conv2D: 8,192 25,690,112
- MobilenetV1/Conv2d_3_depthwise/depthwise: 1,152 3,612,672
- MobilenetV1/Conv2d_3_pointwise/Conv2D: 16,384 51,380,224
- MobilenetV1/Conv2d_4_depthwise/depthwise: 1,152 903,168
- MobilenetV1/Conv2d_4_pointwise/Conv2D: 32,768 25,690,112
- MobilenetV1/Conv2d_5_depthwise/depthwise: 2,304 1,806,336
- MobilenetV1/Conv2d_5_pointwise/Conv2D: 65,536 51,380,224
- MobilenetV1/Conv2d_6_depthwise/depthwise: 2,304 451,584
- MobilenetV1/Conv2d_6_pointwise/Conv2D: 131,072 25,690,112
- MobilenetV1/Conv2d_7_depthwise/depthwise: 4,608 903,168
- MobilenetV1/Conv2d_7_pointwise/Conv2D: 262,144 51,380,224
- MobilenetV1/Conv2d_8_depthwise/depthwise: 4,608 903,168
- MobilenetV1/Conv2d_8_pointwise/Conv2D: 262,144 51,380,224
- MobilenetV1/Conv2d_9_depthwise/depthwise: 4,608 903,168
- MobilenetV1/Conv2d_9_pointwise/Conv2D: 262,144 51,380,224
- MobilenetV1/Conv2d_10_depthwise/depthwise: 4,608 903,168
- MobilenetV1/Conv2d_10_pointwise/Conv2D: 262,144 51,380,224
- MobilenetV1/Conv2d_11_depthwise/depthwise: 4,608 903,168
- MobilenetV1/Conv2d_11_pointwise/Conv2D: 262,144 51,380,224
- MobilenetV1/Conv2d_12_depthwise/depthwise: 4,608 225,792
- MobilenetV1/Conv2d_12_pointwise/Conv2D: 524,288 25,690,112
- MobilenetV1/Conv2d_13_depthwise/depthwise: 9,216 451,584
- MobilenetV1/Conv2d_13_pointwise/Conv2D: 1,048,576 51,380,224
- --------------------------------------------------------------------------------
- Total: 3,185,088 567,716,352
- 75% Mobilenet V1 (base) with input size 128x128:
- See mobilenet_v1_075()
- Layer params macs
- --------------------------------------------------------------------------------
- MobilenetV1/Conv2d_0/Conv2D: 648 2,654,208
- MobilenetV1/Conv2d_1_depthwise/depthwise: 216 884,736
- MobilenetV1/Conv2d_1_pointwise/Conv2D: 1,152 4,718,592
- MobilenetV1/Conv2d_2_depthwise/depthwise: 432 442,368
- MobilenetV1/Conv2d_2_pointwise/Conv2D: 4,608 4,718,592
- MobilenetV1/Conv2d_3_depthwise/depthwise: 864 884,736
- MobilenetV1/Conv2d_3_pointwise/Conv2D: 9,216 9,437,184
- MobilenetV1/Conv2d_4_depthwise/depthwise: 864 221,184
- MobilenetV1/Conv2d_4_pointwise/Conv2D: 18,432 4,718,592
- MobilenetV1/Conv2d_5_depthwise/depthwise: 1,728 442,368
- MobilenetV1/Conv2d_5_pointwise/Conv2D: 36,864 9,437,184
- MobilenetV1/Conv2d_6_depthwise/depthwise: 1,728 110,592
- MobilenetV1/Conv2d_6_pointwise/Conv2D: 73,728 4,718,592
- MobilenetV1/Conv2d_7_depthwise/depthwise: 3,456 221,184
- MobilenetV1/Conv2d_7_pointwise/Conv2D: 147,456 9,437,184
- MobilenetV1/Conv2d_8_depthwise/depthwise: 3,456 221,184
- MobilenetV1/Conv2d_8_pointwise/Conv2D: 147,456 9,437,184
- MobilenetV1/Conv2d_9_depthwise/depthwise: 3,456 221,184
- MobilenetV1/Conv2d_9_pointwise/Conv2D: 147,456 9,437,184
- MobilenetV1/Conv2d_10_depthwise/depthwise: 3,456 221,184
- MobilenetV1/Conv2d_10_pointwise/Conv2D: 147,456 9,437,184
- MobilenetV1/Conv2d_11_depthwise/depthwise: 3,456 221,184
- MobilenetV1/Conv2d_11_pointwise/Conv2D: 147,456 9,437,184
- MobilenetV1/Conv2d_12_depthwise/depthwise: 3,456 55,296
- MobilenetV1/Conv2d_12_pointwise/Conv2D: 294,912 4,718,592
- MobilenetV1/Conv2d_13_depthwise/depthwise: 6,912 110,592
- MobilenetV1/Conv2d_13_pointwise/Conv2D: 589,824 9,437,184
- --------------------------------------------------------------------------------
- Total: 1,800,144 106,002,432
- """

# Tensorflow mandates these.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import namedtuple
import functools

import tensorflow as tf

slim = tf.contrib.slim
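

# Illustrative sketch (not from the original file): the params/macs columns in
# the module docstring tables can be reproduced by hand. For a conv layer,
# params = kernel_h * kernel_w * in_channels * out_channels (per-channel for a
# depthwise conv), and macs = params * output_h * output_w. The helper below
# is hypothetical and exists only to demonstrate the arithmetic for the first
# layers of the 100%, 224x224 network.
def _example_layer_param_counts():
  # Conv2d_0: regular 3x3 conv, 3 input channels, 32 filters, stride 2,
  # so the output is 112x112.
  conv0_params = 3 * 3 * 3 * 32          # 864
  conv0_macs = conv0_params * 112 * 112  # 10,838,016
  # Conv2d_1_depthwise: one 3x3 filter per channel over 32 channels.
  dw1_params = 3 * 3 * 32                # 288
  dw1_macs = dw1_params * 112 * 112      # 3,612,672
  # Conv2d_1_pointwise: 1x1 conv from 32 to 64 channels.
  pw1_params = 1 * 1 * 32 * 64           # 2,048
  pw1_macs = pw1_params * 112 * 112      # 25,690,112
  return [(conv0_params, conv0_macs), (dw1_params, dw1_macs),
          (pw1_params, pw1_macs)]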


# Conv and DepthSepConv namedtuples define layers of the MobileNet
# architecture:
#   Conv defines a 3x3 convolution layer.
#   DepthSepConv defines a 3x3 depthwise convolution followed by a
#     1x1 convolution.
#   stride is the stride of the convolution.
#   depth is the number of channels or filters in a layer.
Conv = namedtuple('Conv', ['kernel', 'stride', 'depth'])
DepthSepConv = namedtuple('DepthSepConv', ['kernel', 'stride', 'depth'])

# _CONV_DEFS specifies the MobileNet body.
_CONV_DEFS = [
    Conv(kernel=[3, 3], stride=2, depth=32),
    DepthSepConv(kernel=[3, 3], stride=1, depth=64),
    DepthSepConv(kernel=[3, 3], stride=2, depth=128),
    DepthSepConv(kernel=[3, 3], stride=1, depth=128),
    DepthSepConv(kernel=[3, 3], stride=2, depth=256),
    DepthSepConv(kernel=[3, 3], stride=1, depth=256),
    DepthSepConv(kernel=[3, 3], stride=2, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=2, depth=1024),
    DepthSepConv(kernel=[3, 3], stride=1, depth=1024)
]
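

# Illustrative sketch (not from the original file): because conv_defs is just
# a list of Conv/DepthSepConv namedtuples, a custom (here hypothetical)
# shallower body can be passed to mobilenet_v1_base or mobilenet_v1, e.g.
# mobilenet_v1_base(images, final_endpoint='Conv2d_3_pointwise',
#                   conv_defs=_EXAMPLE_SHALLOW_CONV_DEFS).
_EXAMPLE_SHALLOW_CONV_DEFS = [
    Conv(kernel=[3, 3], stride=2, depth=32),
    DepthSepConv(kernel=[3, 3], stride=1, depth=64),
    DepthSepConv(kernel=[3, 3], stride=2, depth=128),
    DepthSepConv(kernel=[3, 3], stride=1, depth=128),
]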


def mobilenet_v1_base(inputs,
                      final_endpoint='Conv2d_13_pointwise',
                      min_depth=8,
                      depth_multiplier=1.0,
                      conv_defs=None,
                      output_stride=None,
                      scope=None):
  """Mobilenet v1.

  Constructs a Mobilenet v1 network from inputs to the given final endpoint.

  Args:
    inputs: a tensor of shape [batch_size, height, width, channels].
    final_endpoint: specifies the endpoint to construct the network up to. It
      can be one of ['Conv2d_0', 'Conv2d_1_pointwise', 'Conv2d_2_pointwise',
      'Conv2d_3_pointwise', 'Conv2d_4_pointwise', 'Conv2d_5_pointwise',
      'Conv2d_6_pointwise', 'Conv2d_7_pointwise', 'Conv2d_8_pointwise',
      'Conv2d_9_pointwise', 'Conv2d_10_pointwise', 'Conv2d_11_pointwise',
      'Conv2d_12_pointwise', 'Conv2d_13_pointwise'].
    min_depth: Minimum depth value (number of channels) for all convolution
      ops. Enforced when depth_multiplier < 1, and not an active constraint
      when depth_multiplier >= 1.
    depth_multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    conv_defs: A list of ConvDef namedtuples specifying the net architecture.
    output_stride: An integer that specifies the requested ratio of input to
      output spatial resolution. If not None, then we invoke atrous
      convolution if necessary to prevent the network from reducing the
      spatial resolution of the activation maps. Allowed values are 8
      (accurate fully convolutional mode), 16 (fast fully convolutional mode)
      and 32 (classification mode).
    scope: Optional variable_scope.

  Returns:
    tensor_out: output tensor corresponding to the final_endpoint.
    end_points: a set of activations for external use, for example summaries
      or losses.

  Raises:
    ValueError: if final_endpoint is not set to one of the predefined values,
      or depth_multiplier <= 0, or the target output_stride is not allowed.
  """
  # Used to find thinned depths for each layer.
  depth = lambda d: max(int(d * depth_multiplier), min_depth)
  end_points = {}

  if depth_multiplier <= 0:
    raise ValueError('depth_multiplier is not greater than zero.')

  if conv_defs is None:
    conv_defs = _CONV_DEFS

  if output_stride is not None and output_stride not in [8, 16, 32]:
    raise ValueError('Only allowed output_stride values are 8, 16, 32.')

  with tf.variable_scope(scope, 'MobilenetV1', [inputs]):
    with slim.arg_scope([slim.conv2d, slim.separable_conv2d], padding='SAME'):
      # The current_stride variable keeps track of the output stride of the
      # activations, i.e., the running product of convolution strides up to
      # the current network layer. This allows us to invoke atrous convolution
      # whenever applying the next convolution would result in the activations
      # having output stride larger than the target output_stride.
      current_stride = 1

      # The atrous convolution rate parameter.
      rate = 1

      net = inputs
      for i, conv_def in enumerate(conv_defs):
        end_point_base = 'Conv2d_%d' % i

        if output_stride is not None and current_stride == output_stride:
          # If we have reached the target output_stride, then we need to
          # employ atrous convolution with stride=1 and multiply the atrous
          # rate by the current unit's stride for use in subsequent layers.
          layer_stride = 1
          layer_rate = rate
          rate *= conv_def.stride
        else:
          layer_stride = conv_def.stride
          layer_rate = 1
          current_stride *= conv_def.stride

        if isinstance(conv_def, Conv):
          end_point = end_point_base
          net = slim.conv2d(net, depth(conv_def.depth), conv_def.kernel,
                            stride=conv_def.stride,
                            normalizer_fn=slim.batch_norm,
                            scope=end_point)
          end_points[end_point] = net
          if end_point == final_endpoint:
            return net, end_points

        elif isinstance(conv_def, DepthSepConv):
          end_point = end_point_base + '_depthwise'

          # By passing filters=None, separable_conv2d produces only a
          # depthwise convolution layer.
          net = slim.separable_conv2d(net, None, conv_def.kernel,
                                      depth_multiplier=1,
                                      stride=layer_stride,
                                      rate=layer_rate,
                                      normalizer_fn=slim.batch_norm,
                                      scope=end_point)
          end_points[end_point] = net
          if end_point == final_endpoint:
            return net, end_points

          end_point = end_point_base + '_pointwise'
          net = slim.conv2d(net, depth(conv_def.depth), [1, 1],
                            stride=1,
                            normalizer_fn=slim.batch_norm,
                            scope=end_point)
          end_points[end_point] = net
          if end_point == final_endpoint:
            return net, end_points
        else:
          # Conv and DepthSepConv are the only supported layer types; report
          # the offending namedtuple class by name. (The namedtuples have no
          # ltype field, so the original conv_def.ltype would itself raise.)
          raise ValueError('Unknown convolution type %s for layer %d'
                           % (type(conv_def).__name__, i))
  raise ValueError('Unknown final endpoint %s' % final_endpoint)
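

# Illustrative sketch (not from the original file): requesting
# output_stride=16 keeps the activations at 1/16 of the input resolution for
# dense-prediction heads; strides that would push the network past 1/16 are
# replaced by rate-2 atrous convolutions in the loop above. Assumes a TF 1.x
# runtime.
def _example_base_with_atrous():
  images = tf.placeholder(tf.float32, [1, 224, 224, 3])
  net, end_points = mobilenet_v1_base(images, output_stride=16)
  # net is 14x14x1024 here instead of the usual 7x7x1024.
  return net, end_points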


def mobilenet_v1(inputs,
                 num_classes=1000,
                 dropout_keep_prob=0.999,
                 is_training=True,
                 min_depth=8,
                 depth_multiplier=1.0,
                 conv_defs=None,
                 prediction_fn=tf.contrib.layers.softmax,
                 spatial_squeeze=True,
                 reuse=None,
                 scope='MobilenetV1',
                 global_pool=False):
  """Mobilenet v1 model for classification.

  Args:
    inputs: a tensor of shape [batch_size, height, width, channels].
    num_classes: number of predicted classes. If 0 or None, the logits layer
      is omitted and the input features to the logits layer (before dropout)
      are returned instead.
    dropout_keep_prob: the percentage of activation values that are retained.
    is_training: whether the model is being trained.
    min_depth: Minimum depth value (number of channels) for all convolution
      ops. Enforced when depth_multiplier < 1, and not an active constraint
      when depth_multiplier >= 1.
    depth_multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    conv_defs: A list of ConvDef namedtuples specifying the net architecture.
    prediction_fn: a function to get predictions out of logits.
    spatial_squeeze: if True, logits is of shape [B, C]; if False, logits is
      of shape [B, 1, 1, C], where B is batch_size and C is the number of
      classes.
    reuse: whether or not the network and its variables should be reused. To
      be able to reuse them, 'scope' must be given.
    scope: Optional variable_scope.
    global_pool: Optional boolean flag to control the avgpooling before the
      logits layer. If False or unset, pooling is done with a fixed window
      that reduces default-sized inputs to 1x1, while larger inputs lead to
      larger outputs. If True, any input size is pooled down to 1x1.

  Returns:
    net: a 2D Tensor with the logits (pre-softmax activations) if num_classes
      is a non-zero integer, or the non-dropped-out input to the logits layer
      if num_classes is 0 or None.
    end_points: a dictionary from components of the network to the
      corresponding activation.

  Raises:
    ValueError: Input rank is invalid.
  """
  input_shape = inputs.get_shape().as_list()
  if len(input_shape) != 4:
    raise ValueError('Invalid input tensor rank, expected 4, was: %d' %
                     len(input_shape))

  with tf.variable_scope(scope, 'MobilenetV1', [inputs], reuse=reuse) as scope:
    with slim.arg_scope([slim.batch_norm, slim.dropout],
                        is_training=is_training):
      net, end_points = mobilenet_v1_base(inputs, scope=scope,
                                          min_depth=min_depth,
                                          depth_multiplier=depth_multiplier,
                                          conv_defs=conv_defs)
      with tf.variable_scope('Logits'):
        if global_pool:
          # Global average pooling.
          net = tf.reduce_mean(net, [1, 2], keep_dims=True, name='global_pool')
          end_points['global_pool'] = net
        else:
          # Pooling with a fixed kernel size.
          kernel_size = _reduced_kernel_size_for_small_input(net, [7, 7])
          net = slim.avg_pool2d(net, kernel_size, padding='VALID',
                                scope='AvgPool_1a')
          end_points['AvgPool_1a'] = net
        if not num_classes:
          return net, end_points
        # 1 x 1 x 1024
        net = slim.dropout(net, keep_prob=dropout_keep_prob, scope='Dropout_1b')
        logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
                             normalizer_fn=None, scope='Conv2d_1c_1x1')
        if spatial_squeeze:
          logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
      end_points['Logits'] = logits
      if prediction_fn:
        end_points['Predictions'] = prediction_fn(logits, scope='Predictions')
  return logits, end_points
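

# Illustrative sketch (not from the original file): wiring the classifier for
# inference under the default arg scope (defined further below). num_classes
# is set to 1001 here on the assumption of an ImageNet head with a background
# class; adjust it to your label set.
def _example_classification():
  size = mobilenet_v1.default_image_size
  images = tf.placeholder(tf.float32, [None, size, size, 3])
  with slim.arg_scope(mobilenet_v1_arg_scope(is_training=False)):
    logits, end_points = mobilenet_v1(images, num_classes=1001,
                                      is_training=False)
  # end_points['Predictions'] holds the post-softmax class probabilities.
  return logits, end_points['Predictions']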


mobilenet_v1.default_image_size = 224


def wrapped_partial(func, *args, **kwargs):
  partial_func = functools.partial(func, *args, **kwargs)
  functools.update_wrapper(partial_func, func)
  return partial_func


mobilenet_v1_075 = wrapped_partial(mobilenet_v1, depth_multiplier=0.75)
mobilenet_v1_050 = wrapped_partial(mobilenet_v1, depth_multiplier=0.50)
mobilenet_v1_025 = wrapped_partial(mobilenet_v1, depth_multiplier=0.25)
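

# Note (illustrative, not from the original file): functools.update_wrapper
# copies __name__, __doc__ and __dict__ from mobilenet_v1 onto each partial,
# so the preset variants keep the original metadata, including
# default_image_size (which was assigned before the partials were built):
#   mobilenet_v1_075.__name__           -> 'mobilenet_v1'
#   mobilenet_v1_075.default_image_size -> 224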


def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
  """Define kernel size which is automatically reduced for small input.

  If the shape of the input images is unknown at graph construction time this
  function assumes that the input images are large enough.

  Args:
    input_tensor: input tensor of size [batch_size, height, width, channels].
    kernel_size: desired kernel size of length 2: [kernel_height, kernel_width]

  Returns:
    a list with the reduced kernel size [kernel_height, kernel_width].
  """
  shape = input_tensor.get_shape().as_list()
  if shape[1] is None or shape[2] is None:
    kernel_size_out = kernel_size
  else:
    kernel_size_out = [min(shape[1], kernel_size[0]),
                       min(shape[2], kernel_size[1])]
  return kernel_size_out
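

# Illustrative sketch (not from the original file): for a 128x128 input the
# base network's final feature map is 4x4 (128 / 32), so the requested [7, 7]
# pooling window is reduced to [4, 4]; with unknown spatial dimensions the
# kernel is passed through unchanged.
def _example_reduced_kernel():
  small = tf.placeholder(tf.float32, [1, 4, 4, 1024])
  unknown = tf.placeholder(tf.float32, [1, None, None, 1024])
  assert _reduced_kernel_size_for_small_input(small, [7, 7]) == [4, 4]
  assert _reduced_kernel_size_for_small_input(unknown, [7, 7]) == [7, 7]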


def mobilenet_v1_arg_scope(is_training=True,
                           weight_decay=0.00004,
                           stddev=0.09,
                           regularize_depthwise=False):
  """Defines the default MobilenetV1 arg scope.

  Args:
    is_training: Whether or not we're training the model.
    weight_decay: The weight decay to use for regularizing the model.
    stddev: The standard deviation of the truncated normal weight initializer.
    regularize_depthwise: Whether or not to apply regularization on the
      depthwise convolutions.

  Returns:
    An `arg_scope` to use for the mobilenet v1 model.
  """
  batch_norm_params = {
      'is_training': is_training,
      'center': True,
      'scale': True,
      'decay': 0.9997,
      'epsilon': 0.001,
  }

  # Set weight_decay for weights in Conv and DepthSepConv layers.
  weights_init = tf.truncated_normal_initializer(stddev=stddev)
  regularizer = tf.contrib.layers.l2_regularizer(weight_decay)
  if regularize_depthwise:
    depthwise_regularizer = regularizer
  else:
    depthwise_regularizer = None
  with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
                      weights_initializer=weights_init,
                      activation_fn=tf.nn.relu6, normalizer_fn=slim.batch_norm):
    with slim.arg_scope([slim.batch_norm], **batch_norm_params):
      with slim.arg_scope([slim.conv2d], weights_regularizer=regularizer):
        with slim.arg_scope([slim.separable_conv2d],
                            weights_regularizer=depthwise_regularizer) as sc:
          return sc
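

# Illustrative sketch (not from the original file): a minimal training setup
# that applies the arg scope above so every conv picks up the truncated-normal
# initializer, relu6, batch norm and L2 regularization. The loss wiring is a
# hypothetical example; `images` and `one_hot_labels` are assumed inputs.
def _example_training_setup(images, one_hot_labels, num_classes=1001):
  with slim.arg_scope(mobilenet_v1_arg_scope(is_training=True)):
    logits, _ = mobilenet_v1(images, num_classes=num_classes,
                             is_training=True)
  tf.losses.softmax_cross_entropy(one_hot_labels, logits)
  # get_total_loss adds the regularization losses collected by the arg
  # scope's weight regularizers to the cross-entropy above.
  return tf.losses.get_total_loss()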