Just for a little of background, I trained a custom object detector model using your train.py
code. After that I tested it on inference.py
, I filled out the necessary terminal flags to make sure that my model is used.
import gi
from model import MobileDetectNetModel
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
gi.require_version('Gst', '1.0')
@plac.annotations(
inference_type=("Type of inference to test (TF, FP32, FP16, INT8)", 'option', 'T', str),
batch_size=("Size of the TensorRT batch", 'option', 'B', int),
weights=("Model weights", 'option', 'W', str),
test_path=("Test images path", 'option', 'I', str),
merge=("Test images only: Merge detected regions", 'flag', 'm', bool),
stage=("Test images only: Augmentation training stage", 'option', 's', str),
limit=("Test images only: Max number of images to run inference on", 'option', 'l', int),
confidence=("Test images only: Minimum confidence in coverage to draw bbox", "option", "c", float),
visualize=("Visualize the inference", "option", "V", bool),
camera=("Use camera feed. Ignores test_path. Boolean.", "option", "C", bool)
)
# Set inference_type to FP16 to use TensorRT
def main(inference_type: str = "FP16",
batch_size: int = 1,
test_path: str = None,
weights: str = None,
merge: bool = False,
stage: str = "test",
limit: int = 20,
confidence: float = 0.1,
visualize: bool = True,
camera: bool = False):
keras_model = MobileDetectNetModel.complete_model()
if weights is not None:
keras_model.load_weights(weights, by_name=True)
images_done = 0
if test_path is not None:
# import cv2
if stage != 'test':
from generator import MobileDetectNetSequence
seq = MobileDetectNetSequence.create_augmenter(stage)
else:
seq = None
images_full = []
images_input = []
images_scale = []
for r, d, f in os.walk(test_path):
for file in f:
image_full = cv2.imread(os.path.join(r, file))
image_input = cv2.resize(image_full, (224, 224))
scale_width = image_full.shape[1] / 224
scale_height = image_full.shape[0] / 224
images_scale.append((scale_width, scale_height))
if stage != 'test':
seq_det = seq.to_deterministic()
image_aug = (seq_det.augment_image(image_input).astype(np.float32) / 127.5) - 1.
else:
image_aug = image_input.astype(np.float32) / 127.5 - 1.
images_full.append(image_full)
images_input.append(image_aug)
images_done += 1
if images_done == limit:
break
if images_done == limit:
break
x_test = np.array(images_input)
else:
#x_test = np.random.random((limit, 224, 224, 3))
x_test = np.random.random((224, 224, 3))
# x_test = np.random.random((224, 224, 3))
x_cold = np.random.random((batch_size, 224, 224, 3))
print(f'Inference Type is {inference_type}')
if inference_type == 'K':
keras_model.predict(x_cold)
t0 = time.time()
model_outputs = keras_model.predict(x_test)
t1 = time.time()
elif inference_type == 'TF':
tf_engine = keras_model.tf_engine()
tf_engine.infer(x_cold)
t0 = time.time()
model_outputs = tf_engine.infer(x_test)
t1 = time.time()
elif inference_type == 'FP32':
tftrt_engine = keras_model.tftrt_engine(precision='FP32', batch_size=batch_size)
tftrt_engine.infer(x_cold)
t0 = time.time()
model_outputs = tftrt_engine.infer(x_test)
t1 = time.time()
# WE ARE USING THIS INFERENCE TYPE, TFTRT
elif inference_type == 'FP16':
tftrt_engine = keras_model.tftrt_engine(precision='FP16', batch_size=batch_size)
tftrt_engine.infer(x_cold)
#t0 = time.time()
#model_outputs = tftrt_engine.infer(x_test)
#t1 = time.time()
elif inference_type == 'INT8':
tftrt_engine = keras_model.tftrt_engine(precision='INT8', batch_size=batch_size)
tftrt_engine.infer(x_cold)
t0 = time.time()
model_outputs = tftrt_engine.infer(x_test)
t1 = time.time()
else:
raise ValueError("Invalid inference type")
#print('Time: ', t1 - t0)
#print('FPS: ', x_test.shape[0]/(t1 - t0))
if not visualize:
return
# if len(model_outputs) == 2:
# classes, bboxes = model_outputs
# TF / TensorRT models won't output regions (not useful for production)
#elif len(model_outputs) == 3:
# regions, bboxes, classes = model_outputs
#else:
# raise ValueError("Invalid model length output")
if test_path is not None and camera is False:
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
# get colormap
ncolors = 256
color_array = plt.get_cmap('viridis')(range(ncolors))
# change alpha values
color_array[:, -1] = np.linspace(0.0, 1.0, ncolors)
# create a colormap object
map_object = LinearSegmentedColormap.from_list(name='viridis_alpha', colors=color_array)
# register this new colormap with matplotlib
plt.register_cmap(cmap=map_object)
for idx in range(0, len(images_full)):
rectangles = []
# Does this only get the first 7 items?
for y in range(0, 7):
for x in range(0, 7):
if classes[idx, y, x, 0] >= confidence:
rect = [
int(bboxes[idx, int(y), int(x), 0] * 224),
int(bboxes[idx, int(y), int(x), 1] * 224),
int(bboxes[idx, int(y), int(x), 2] * 224),
int(bboxes[idx, int(y), int(x), 3] * 224)]
rectangles.append(rect)
if merge:
rectangles, merges = cv2.groupRectangles(rectangles, 1, eps=0.75)
scale_width, scale_height = images_scale[idx]
for rect in rectangles:
cv2.rectangle(images_full[idx],
(int(rect[0]*scale_width), int(rect[1]*scale_height)),
(int(rect[2]*scale_width), int(rect[3]*scale_height)),
(0, 255, 0), 5)
plt.imshow(cv2.cvtColor(images_full[idx], cv2.COLOR_BGR2RGB), alpha=1.0, aspect='auto')
plt.imshow(
cv2.resize(classes[idx].reshape((7, 7)),
(images_full[idx].shape[1], images_full[idx].shape[0])),
interpolation='nearest', alpha=0.5, cmap='viridis_alpha', aspect='auto')
plt.show()
font = cv2.FONT_HERSHEY_SIMPLEX
bottomLeftCornerOfText = (10, 500)
fontScale = 1
fontColor = (255, 255, 255)
lineType = 2
if camera is True:
print('camera flag detected!')
#cap = cv2.VideoCapture("nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)1280, height=(int)720, format=(string)NV12, framerate=(fraction)21/1 ! nvvidconv flip-method=2 ! video/x-raw, format=(string)BGRx, width=(int)960, height=(int)616 ! videoconvert ! video/x-raw, format=(string)BGR ! appsink")
cap = cv2.VideoCapture("nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)1280, height=(int)720, format=(string)NV12, framerate=(fraction)60/1 ! nvvidconv flip-method=2 ! video/x-raw, format=(string)BGRx, width=(int)960, height=(int)616 ! videoconvert ! appsink")
if cap.isOpened():
cv2.namedWindow("demo")
while True:
ret_val, image_np = cap.read()
image_raw = image_np
#print(f'*** original shape is {image_np.shape}')
# Expand the dimensions
image_np_expanded = np.expand_dims(image_np, axis=0)
#print(f'*** image expanded shape is {image_np_expanded.shape}')
images_full = []
images_input = []
images_scale = []
dim = (224, 224)
image_input = cv2.resize(image_raw, (224, 224))
#image_input = image_np_expanded
#print(f'image_raw shape is = {image_input.shape}')
image_full = np.expand_dims(image_input, axis=0)
#print(f'image_full shape after expanding is = {image_full.shape}')
#scale_width = image_full.shape[1] / 224
#scale_height = image_full.shape[0] / 224
#images_scale.append((scale_width, scale_height))
if stage != 'test':
seq_det = seq.to_deterministic()
image_aug = (seq_det.augment_image(image_input).astype(np.float32) / 127.5) - 1.
else:
image_aug = image_input.astype(np.float32) / 127.5 - 1.
#images_full.append(image_full)
#images_full.append(image_aug)
t0 = time.time()
#print(f'shape of image full before sending to ')
model_outputs = tftrt_engine.infer(image_full)
t1 = time.time()
rectangles = []
#print(f'length of model_outputs is = {len(model_outputs)}')
if len(model_outputs) == 2:
classes, bboxes = model_outputs
# TF / TensorRT models won't output regions (not useful for production)
elif len(model_outputs) == 3:
regions, bboxes, classes = model_outputs
else:
raise ValueError("Invalid model length output")
framerate = 1.0/(t1 - t0)
#print('Time: ', t1 - t0)
#print('FPS: ', framerate)
print()
for y in range(0, 7):
for x in range(0, 7):
#print(f'confidence is = {classes[0, y, x, 0]}')
if classes[0, y, x, 0] >= confidence:
#print('confidence is enough!')
rect = [
int(bboxes[0, int(y), int(x), 0] * 224),
int(bboxes[0, int(y), int(x), 1] * 224),
int(bboxes[0, int(y), int(x), 2] * 224),
int(bboxes[0, int(y), int(x), 3] * 224)]
print(f'rectangle is = {rect}')
rectangles.append(rect)
#else:
# print('confidence not high enough')
rectangles, merges = cv2.groupRectangles(rectangles, 1, eps=0.75)
#scale_width, scale_height = images_scale[idx]
if len(rectangles) > 0:
print(f'rectangle count is = {len(rectangles)}')
for rect in rectangles:
cv2.rectangle(image_raw,
(int(rect[0]), int(rect[1])),
(int(rect[2]), int(rect[3])),
(0, 255, 0), 5)
cv2.putText(image_raw, "FPS: {0:.2f}".format(framerate), bottomLeftCornerOfText, font, fontScale, fontColor, lineType)
cv2.imshow("demo", image_raw)
if cv2.waitKey(1) == ord('q'):
break
else:
print('camera open failed')
cv2.destroyAllWindows()
if __name__ == '__main__':
plac.call(main)
Basically what happens is that I get the captured frame, and run it through inference. When I run the script (again, using my own model), the camera feed opens just fine, BUT when I make it view a photo of the object I trained it on, the same photos from the folder that I test it with, it doesn't detect my object anymore.