# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""MobileNet v1.

MobileNet is a general architecture and can be used for multiple use cases.
Depending on the use case, it can use different input layer sizes and
different heads (for example: embeddings, localization and classification).

As described in https://arxiv.org/abs/1704.04861.

  MobileNets: Efficient Convolutional Neural Networks for
  Mobile Vision Applications
  Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang,
  Tobias Weyand, Marco Andreetto, Hartwig Adam

100% Mobilenet V1 (base) with input size 224x224:

See mobilenet_v1()

Layer                                           params         macs
-------------------------------------------------------------------
MobilenetV1/Conv2d_0/Conv2D:                       864   10,838,016
MobilenetV1/Conv2d_1_depthwise/depthwise:          288    3,612,672
MobilenetV1/Conv2d_1_pointwise/Conv2D:           2,048   25,690,112
MobilenetV1/Conv2d_2_depthwise/depthwise:          576    1,806,336
MobilenetV1/Conv2d_2_pointwise/Conv2D:           8,192   25,690,112
MobilenetV1/Conv2d_3_depthwise/depthwise:        1,152    3,612,672
MobilenetV1/Conv2d_3_pointwise/Conv2D:          16,384   51,380,224
MobilenetV1/Conv2d_4_depthwise/depthwise:        1,152      903,168
MobilenetV1/Conv2d_4_pointwise/Conv2D:          32,768   25,690,112
MobilenetV1/Conv2d_5_depthwise/depthwise:        2,304    1,806,336
MobilenetV1/Conv2d_5_pointwise/Conv2D:          65,536   51,380,224
MobilenetV1/Conv2d_6_depthwise/depthwise:        2,304      451,584
MobilenetV1/Conv2d_6_pointwise/Conv2D:         131,072   25,690,112
MobilenetV1/Conv2d_7_depthwise/depthwise:        4,608      903,168
MobilenetV1/Conv2d_7_pointwise/Conv2D:         262,144   51,380,224
MobilenetV1/Conv2d_8_depthwise/depthwise:        4,608      903,168
MobilenetV1/Conv2d_8_pointwise/Conv2D:         262,144   51,380,224
MobilenetV1/Conv2d_9_depthwise/depthwise:        4,608      903,168
MobilenetV1/Conv2d_9_pointwise/Conv2D:         262,144   51,380,224
MobilenetV1/Conv2d_10_depthwise/depthwise:       4,608      903,168
MobilenetV1/Conv2d_10_pointwise/Conv2D:        262,144   51,380,224
MobilenetV1/Conv2d_11_depthwise/depthwise:       4,608      903,168
MobilenetV1/Conv2d_11_pointwise/Conv2D:        262,144   51,380,224
MobilenetV1/Conv2d_12_depthwise/depthwise:       4,608      225,792
MobilenetV1/Conv2d_12_pointwise/Conv2D:        524,288   25,690,112
MobilenetV1/Conv2d_13_depthwise/depthwise:       9,216      451,584
MobilenetV1/Conv2d_13_pointwise/Conv2D:      1,048,576   51,380,224
-------------------------------------------------------------------
Total:                                       3,185,088  567,716,352

75% Mobilenet V1 (base) with input size 128x128:

See mobilenet_v1_075()

Layer                                           params         macs
-------------------------------------------------------------------
MobilenetV1/Conv2d_0/Conv2D:                       648    2,654,208
MobilenetV1/Conv2d_1_depthwise/depthwise:          216      884,736
MobilenetV1/Conv2d_1_pointwise/Conv2D:           1,152    4,718,592
MobilenetV1/Conv2d_2_depthwise/depthwise:          432      442,368
MobilenetV1/Conv2d_2_pointwise/Conv2D:           4,608    4,718,592
MobilenetV1/Conv2d_3_depthwise/depthwise:          864      884,736
MobilenetV1/Conv2d_3_pointwise/Conv2D:           9,216    9,437,184
MobilenetV1/Conv2d_4_depthwise/depthwise:          864      221,184
MobilenetV1/Conv2d_4_pointwise/Conv2D:          18,432    4,718,592
MobilenetV1/Conv2d_5_depthwise/depthwise:        1,728      442,368
MobilenetV1/Conv2d_5_pointwise/Conv2D:          36,864    9,437,184
MobilenetV1/Conv2d_6_depthwise/depthwise:        1,728      110,592
MobilenetV1/Conv2d_6_pointwise/Conv2D:          73,728    4,718,592
MobilenetV1/Conv2d_7_depthwise/depthwise:        3,456      221,184
MobilenetV1/Conv2d_7_pointwise/Conv2D:         147,456    9,437,184
MobilenetV1/Conv2d_8_depthwise/depthwise:        3,456      221,184
MobilenetV1/Conv2d_8_pointwise/Conv2D:         147,456    9,437,184
MobilenetV1/Conv2d_9_depthwise/depthwise:        3,456      221,184
MobilenetV1/Conv2d_9_pointwise/Conv2D:         147,456    9,437,184
MobilenetV1/Conv2d_10_depthwise/depthwise:       3,456      221,184
MobilenetV1/Conv2d_10_pointwise/Conv2D:        147,456    9,437,184
MobilenetV1/Conv2d_11_depthwise/depthwise:       3,456      221,184
MobilenetV1/Conv2d_11_pointwise/Conv2D:        147,456    9,437,184
MobilenetV1/Conv2d_12_depthwise/depthwise:       3,456       55,296
MobilenetV1/Conv2d_12_pointwise/Conv2D:        294,912    4,718,592
MobilenetV1/Conv2d_13_depthwise/depthwise:       6,912      110,592
MobilenetV1/Conv2d_13_pointwise/Conv2D:        589,824    9,437,184
-------------------------------------------------------------------
Total:                                       1,800,144  106,002,432
"""
# Tensorflow mandates these.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import namedtuple
import functools

import tensorflow as tf

slim = tf.contrib.slim

# The Conv and DepthSepConv namedtuples define layers of the MobileNet
# architecture:
# Conv defines 3x3 convolution layers.
# DepthSepConv defines 3x3 depthwise convolution followed by 1x1 convolution.
# stride is the stride of the convolution.
# depth is the number of channels or filters in a layer.
Conv = namedtuple('Conv', ['kernel', 'stride', 'depth'])
DepthSepConv = namedtuple('DepthSepConv', ['kernel', 'stride', 'depth'])

# _CONV_DEFS specifies the MobileNet body.
_CONV_DEFS = [
    Conv(kernel=[3, 3], stride=2, depth=32),
    DepthSepConv(kernel=[3, 3], stride=1, depth=64),
    DepthSepConv(kernel=[3, 3], stride=2, depth=128),
    DepthSepConv(kernel=[3, 3], stride=1, depth=128),
    DepthSepConv(kernel=[3, 3], stride=2, depth=256),
    DepthSepConv(kernel=[3, 3], stride=1, depth=256),
    DepthSepConv(kernel=[3, 3], stride=2, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=2, depth=1024),
    DepthSepConv(kernel=[3, 3], stride=1, depth=1024)
]
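

# Illustrative helper (not part of the original library): recomputes the
# params/MACs tables in the module docstring directly from a conv-def list.
# It assumes 'SAME' padding and counts only the conv weights (no batch norm,
# biases, or classification head); it is meant purely as a sanity check.
def _count_params_and_macs(conv_defs=None, image_size=224, in_channels=3,
                           depth_multiplier=1.0, min_depth=8):
  conv_defs = conv_defs if conv_defs is not None else _CONV_DEFS
  depth = lambda d: max(int(d * depth_multiplier), min_depth)
  size = image_size
  total_params = 0
  total_macs = 0
  for conv_def in conv_defs:
    # With 'SAME' padding the spatial size shrinks by the layer stride.
    size = (size + conv_def.stride - 1) // conv_def.stride
    out_channels = depth(conv_def.depth)
    k_h, k_w = conv_def.kernel
    if isinstance(conv_def, Conv):
      params = k_h * k_w * in_channels * out_channels
    else:
      # DepthSepConv: k x k depthwise followed by 1x1 pointwise.
      params = k_h * k_w * in_channels + in_channels * out_channels
    total_params += params
    total_macs += params * size * size
    in_channels = out_channels
  return total_params, total_macs

# With the defaults this reproduces the totals of the first table above:
#   _count_params_and_macs() == (3185088, 567716352)
# and the 75% / 128x128 settings reproduce the second:
#   _count_params_and_macs(image_size=128, depth_multiplier=0.75)
#       == (1800144, 106002432)
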

def mobilenet_v1_base(inputs,
                      final_endpoint='Conv2d_13_pointwise',
                      min_depth=8,
                      depth_multiplier=1.0,
                      conv_defs=None,
                      output_stride=None,
                      scope=None):
  """Mobilenet v1.

  Constructs a Mobilenet v1 network from inputs to the given final endpoint.

  Args:
    inputs: a tensor of shape [batch_size, height, width, channels].
    final_endpoint: specifies the endpoint to construct the network up to. It
      can be one of ['Conv2d_0', 'Conv2d_1_pointwise', 'Conv2d_2_pointwise',
      'Conv2d_3_pointwise', 'Conv2d_4_pointwise', 'Conv2d_5_pointwise',
      'Conv2d_6_pointwise', 'Conv2d_7_pointwise', 'Conv2d_8_pointwise',
      'Conv2d_9_pointwise', 'Conv2d_10_pointwise', 'Conv2d_11_pointwise',
      'Conv2d_12_pointwise', 'Conv2d_13_pointwise'].
    min_depth: Minimum depth value (number of channels) for all convolution
      ops. Enforced when depth_multiplier < 1, and not an active constraint
      when depth_multiplier >= 1.
    depth_multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or the computation cost of the model.
    conv_defs: A list of ConvDef namedtuples specifying the net architecture.
    output_stride: An integer that specifies the requested ratio of input to
      output spatial resolution. If not None, then we invoke atrous convolution
      if necessary to prevent the network from reducing the spatial resolution
      of the activation maps. Allowed values are 8 (accurate fully
      convolutional mode), 16 (fast fully convolutional mode), 32
      (classification mode).
    scope: Optional variable_scope.

  Returns:
    tensor_out: output tensor corresponding to the final_endpoint.
    end_points: a set of activations for external use, for example summaries
      or losses.

  Raises:
    ValueError: if final_endpoint is not set to one of the predefined values,
      or depth_multiplier <= 0, or the target output_stride is not allowed.
  """
  # Used to find thinned depths for each layer.
  depth = lambda d: max(int(d * depth_multiplier), min_depth)
  end_points = {}

  if depth_multiplier <= 0:
    raise ValueError('depth_multiplier is not greater than zero.')

  if conv_defs is None:
    conv_defs = _CONV_DEFS

  if output_stride is not None and output_stride not in [8, 16, 32]:
    raise ValueError('Only allowed output_stride values are 8, 16, 32.')

  with tf.variable_scope(scope, 'MobilenetV1', [inputs]):
    with slim.arg_scope([slim.conv2d, slim.separable_conv2d], padding='SAME'):
      # The current_stride variable keeps track of the output stride of the
      # activations, i.e., the running product of convolution strides up to the
      # current network layer. This allows us to invoke atrous convolution
      # whenever applying the next convolution would result in the activations
      # having output stride larger than the target output_stride.
      current_stride = 1

      # The atrous convolution rate parameter.
      rate = 1

      net = inputs
      for i, conv_def in enumerate(conv_defs):
        end_point_base = 'Conv2d_%d' % i

        if output_stride is not None and current_stride == output_stride:
          # If we have reached the target output_stride, then we need to employ
          # atrous convolution with stride=1 and multiply the atrous rate by
          # the current unit's stride for use in subsequent layers.
          layer_stride = 1
          layer_rate = rate
          rate *= conv_def.stride
        else:
          layer_stride = conv_def.stride
          layer_rate = 1
          current_stride *= conv_def.stride

        if isinstance(conv_def, Conv):
          end_point = end_point_base
          net = slim.conv2d(net, depth(conv_def.depth), conv_def.kernel,
                            stride=conv_def.stride,
                            normalizer_fn=slim.batch_norm,
                            scope=end_point)
          end_points[end_point] = net
          if end_point == final_endpoint:
            return net, end_points

        elif isinstance(conv_def, DepthSepConv):
          end_point = end_point_base + '_depthwise'

          # By passing filters=None
          # separable_conv2d produces only a depthwise convolution layer.
          net = slim.separable_conv2d(net, None, conv_def.kernel,
                                      depth_multiplier=1,
                                      stride=layer_stride,
                                      rate=layer_rate,
                                      normalizer_fn=slim.batch_norm,
                                      scope=end_point)
          end_points[end_point] = net
          if end_point == final_endpoint:
            return net, end_points

          end_point = end_point_base + '_pointwise'
          net = slim.conv2d(net, depth(conv_def.depth), [1, 1],
                            stride=1,
                            normalizer_fn=slim.batch_norm,
                            scope=end_point)
          end_points[end_point] = net
          if end_point == final_endpoint:
            return net, end_points
        else:
          raise ValueError('Unknown convolution type %s for layer %d'
                           % (type(conv_def), i))
  raise ValueError('Unknown final endpoint %s' % final_endpoint)
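

# Illustrative usage sketch (not part of the original file): use the base
# network as a dense feature extractor. The placeholder shape and endpoint
# below are example choices, not requirements.
def _example_feature_extractor():
  images = tf.placeholder(tf.float32, [1, 224, 224, 3])
  with slim.arg_scope(mobilenet_v1_arg_scope(is_training=False)):
    net, end_points = mobilenet_v1_base(images,
                                        final_endpoint='Conv2d_13_pointwise',
                                        output_stride=16)
  # With output_stride=16 the last stride-2 layer runs with stride 1 and an
  # atrous rate of 2, so `net` is 14x14x1024 rather than 7x7x1024.
  return net, end_points
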

def mobilenet_v1(inputs,
                 num_classes=1000,
                 dropout_keep_prob=0.999,
                 is_training=True,
                 min_depth=8,
                 depth_multiplier=1.0,
                 conv_defs=None,
                 prediction_fn=tf.contrib.layers.softmax,
                 spatial_squeeze=True,
                 reuse=None,
                 scope='MobilenetV1',
                 global_pool=False):
  """Mobilenet v1 model for classification.

  Args:
    inputs: a tensor of shape [batch_size, height, width, channels].
    num_classes: number of predicted classes. If 0 or None, the logits layer
      is omitted and the input features to the logits layer (before dropout)
      are returned instead.
    dropout_keep_prob: the percentage of activation values that are retained.
    is_training: whether or not the model is being trained.
    min_depth: Minimum depth value (number of channels) for all convolution
      ops. Enforced when depth_multiplier < 1, and not an active constraint
      when depth_multiplier >= 1.
    depth_multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or the computation cost of the model.
    conv_defs: A list of ConvDef namedtuples specifying the net architecture.
    prediction_fn: a function to get predictions out of logits.
    spatial_squeeze: if True, logits is of shape [B, C]; if False, logits is
      of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
    reuse: whether or not the network and its variables should be reused. To
      be able to reuse, 'scope' must be given.
    scope: Optional variable_scope.
    global_pool: Optional boolean flag to control the avgpooling before the
      logits layer. If false or unset, pooling is done with a fixed window
      that reduces default-sized inputs to 1x1, while larger inputs lead to
      larger outputs. If true, any input size is pooled down to 1x1.

  Returns:
    net: a 2D Tensor with the logits (pre-softmax activations) if num_classes
      is a non-zero integer, or the non-dropped-out input to the logits layer
      if num_classes is 0 or None.
    end_points: a dictionary from components of the network to the
      corresponding activation.

  Raises:
    ValueError: Input rank is invalid.
  """
  input_shape = inputs.get_shape().as_list()
  if len(input_shape) != 4:
    raise ValueError('Invalid input tensor rank, expected 4, was: %d' %
                     len(input_shape))

  with tf.variable_scope(scope, 'MobilenetV1', [inputs], reuse=reuse) as scope:
    with slim.arg_scope([slim.batch_norm, slim.dropout],
                        is_training=is_training):
      net, end_points = mobilenet_v1_base(inputs, scope=scope,
                                          min_depth=min_depth,
                                          depth_multiplier=depth_multiplier,
                                          conv_defs=conv_defs)
      with tf.variable_scope('Logits'):
        if global_pool:
          # Global average pooling.
          net = tf.reduce_mean(net, [1, 2], keep_dims=True, name='global_pool')
          end_points['global_pool'] = net
        else:
          # Pooling with a fixed kernel size.
          kernel_size = _reduced_kernel_size_for_small_input(net, [7, 7])
          net = slim.avg_pool2d(net, kernel_size, padding='VALID',
                                scope='AvgPool_1a')
          end_points['AvgPool_1a'] = net
        if not num_classes:
          return net, end_points
        # 1 x 1 x 1024
        net = slim.dropout(net, keep_prob=dropout_keep_prob, scope='Dropout_1b')
        logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
                             normalizer_fn=None, scope='Conv2d_1c_1x1')
        if spatial_squeeze:
          logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
      end_points['Logits'] = logits
      if prediction_fn:
        end_points['Predictions'] = prediction_fn(logits, scope='Predictions')
  return logits, end_points

mobilenet_v1.default_image_size = 224


def wrapped_partial(func, *args, **kwargs):
  partial_func = functools.partial(func, *args, **kwargs)
  functools.update_wrapper(partial_func, func)
  return partial_func


mobilenet_v1_075 = wrapped_partial(mobilenet_v1, depth_multiplier=0.75)
mobilenet_v1_050 = wrapped_partial(mobilenet_v1, depth_multiplier=0.50)
mobilenet_v1_025 = wrapped_partial(mobilenet_v1, depth_multiplier=0.25)
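

# Illustrative usage sketch (not part of the original file): build the
# 0.50-width classifier in inference mode. The placeholder shape and the
# num_classes value are example choices.
def _example_classifier():
  images = tf.placeholder(tf.float32, [None, 224, 224, 3])
  with slim.arg_scope(mobilenet_v1_arg_scope(is_training=False)):
    logits, end_points = mobilenet_v1_050(images, num_classes=1001,
                                          is_training=False)
  # logits has shape [batch, 1001]; end_points['Predictions'] holds the
  # softmax probabilities produced by prediction_fn.
  return logits, end_points
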

def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
  """Define kernel size which is automatically reduced for small input.

  If the shape of the input images is unknown at graph construction time this
  function assumes that the input images are large enough.

  Args:
    input_tensor: input tensor of size [batch_size, height, width, channels].
    kernel_size: desired kernel size of length 2: [kernel_height, kernel_width]

  Returns:
    a list of length 2 with the (possibly reduced) kernel size.
  """
  shape = input_tensor.get_shape().as_list()
  if shape[1] is None or shape[2] is None:
    kernel_size_out = kernel_size
  else:
    kernel_size_out = [min(shape[1], kernel_size[0]),
                       min(shape[2], kernel_size[1])]
  return kernel_size_out
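
# Worked example (illustrative): with the default 224x224 input the
# Conv2d_13_pointwise map is 224/32 = 7x7, so the requested [7, 7] window is
# kept; with a 128x128 input the map is 128/32 = 4x4 and the window shrinks
# to [4, 4], e.g.:
#   _reduced_kernel_size_for_small_input(
#       tf.placeholder(tf.float32, [1, 4, 4, 1024]), [7, 7])  # -> [4, 4]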


def mobilenet_v1_arg_scope(is_training=True,
                           weight_decay=0.00004,
                           stddev=0.09,
                           regularize_depthwise=False):
  """Defines the default MobilenetV1 arg scope.

  Args:
    is_training: Whether or not we're training the model.
    weight_decay: The weight decay to use for regularizing the model.
    stddev: The standard deviation of the truncated normal weight initializer.
    regularize_depthwise: Whether or not to apply regularization to the
      depthwise weights.

  Returns:
    An `arg_scope` to use for the mobilenet v1 model.
  """
  batch_norm_params = {
      'is_training': is_training,
      'center': True,
      'scale': True,
      'decay': 0.9997,
      'epsilon': 0.001,
  }

  # Set weight_decay for weights in Conv and DepthSepConv layers.
  weights_init = tf.truncated_normal_initializer(stddev=stddev)
  regularizer = tf.contrib.layers.l2_regularizer(weight_decay)
  if regularize_depthwise:
    depthwise_regularizer = regularizer
  else:
    depthwise_regularizer = None
  with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
                      weights_initializer=weights_init,
                      activation_fn=tf.nn.relu6, normalizer_fn=slim.batch_norm):
    with slim.arg_scope([slim.batch_norm], **batch_norm_params):
      with slim.arg_scope([slim.conv2d], weights_regularizer=regularizer):
        with slim.arg_scope([slim.separable_conv2d],
                            weights_regularizer=depthwise_regularizer) as sc:
          return sc
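

# Illustrative usage sketch (not part of the original file): construct a
# training graph under the arg scope so every conv layer picks up the
# truncated-normal initializer, L2 regularizer, ReLU6 activation, and batch
# norm defaults defined above. The placeholder shape is an example value.
def _example_training_graph():
  images = tf.placeholder(tf.float32, [32, 224, 224, 3])
  with slim.arg_scope(mobilenet_v1_arg_scope(is_training=True,
                                             regularize_depthwise=True)):
    logits, _ = mobilenet_v1(images, num_classes=1001, is_training=True)
  return logits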