
2024 RoboCup@Home 比赛现场code工作记录

约 835 个字 • 324 行代码



# `answer` is the dict returned by the Baidu API call; its 'person_info' entry
# holds one record per person in the image, so index 0 is the first person.
person1 = answer['person_info'][0]['attributes']

# Attribute keys to read from the first person's record.
seleted_attributes = [
    "gender",              # gender
    "upper_color",         # color of the upper-body clothing
    "upper_wear",          # type of upper-body clothing
    "upper_wear_texture",  # texture of the upper-body clothing
    "lower_color",         # color of the lower-body clothing
    "lower_wear",          # type of lower-body clothing
    "glasses",             # whether the person wears glasses
    "cellphone",           # whether the person is using a phone
]

results = []
for attribute in seleted_attributes:
    # NOTE(review): the loop body was missing from these notes. This collects
    # the raw entry of every selected attribute; confirm whether a sub-field
    # of the entry is what is actually wanted.
    results.append(person1[attribute])


调用百度api返回的 answer (字典)中,'person_info' 中包含图像中所有人的信息,所以取第1个人的指定的属性


官方说明文档给出的示例 Python (intelrealsense.com)

这个示例比较有用 opencv_viewer_example

## License: Apache 2.0. See LICENSE file in root directory.
## Copyright(c) 2015-2017 Intel Corporation. All Rights Reserved.

##      Open CV and Numpy integration        ##

import pyrealsense2 as rs
import numpy as np
import cv2

# Configure depth and color streams
pipeline = rs.pipeline()
config = rs.config()

# Get device product line for setting a supporting resolution
pipeline_wrapper = rs.pipeline_wrapper(pipeline)
pipeline_profile = config.resolve(pipeline_wrapper)
device = pipeline_profile.get_device()
device_product_line = str(device.get_info(rs.camera_info.product_line))

# A color stream is enabled below, so stop early when no RGB sensor exists
# instead of failing later while starting the pipeline.
found_rgb = False
for s in device.sensors:
    if s.get_info(rs.camera_info.name) == 'RGB Camera':
        found_rgb = True
        break
if not found_rgb:
    print("The demo requires Depth camera with Color sensor")
    raise SystemExit(0)

config.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 30)
config.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30)

# Start streaming
pipeline.start(config)

try:
    while True:

        # Wait for a coherent pair of frames: depth and color
        frames = pipeline.wait_for_frames()
        depth_frame = frames.get_depth_frame()
        color_frame = frames.get_color_frame()
        if not depth_frame or not color_frame:
            # Incomplete pair: skip it and wait for the next frame set.
            continue

        # Convert images to numpy arrays
        depth_image = np.asanyarray(depth_frame.get_data())
        color_image = np.asanyarray(color_frame.get_data())

        # NOTE(review): the display / processing part of the example is not
        # reproduced in these notes; use depth_image and color_image here.

finally:

    # Stop streaming
    pipeline.stop()


import pyrealsense2 as rs
import numpy as np
import cv2

# Configure depth and color streams
pipeline = rs.pipeline()
config = rs.config()

# Get device product line for setting a supporting resolution
pipeline_wrapper = rs.pipeline_wrapper(pipeline)
pipeline_profile = config.resolve(pipeline_wrapper)
device = pipeline_profile.get_device()
device_product_line = str(device.get_info(rs.camera_info.product_line))

# A color stream is enabled below, so stop early when no RGB sensor exists.
found_rgb = False
for s in device.sensors:
    if s.get_info(rs.camera_info.name) == 'RGB Camera':
        found_rgb = True
        break
if not found_rgb:
    print("The demo requires Depth camera with Color sensor")
    raise SystemExit(0)

config.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 30)
config.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30)

# Start streaming
pipeline.start(config)

# Wait for a coherent pair of frames: depth and color
frames = pipeline.wait_for_frames()
depth_frame = frames.get_depth_frame()
color_frame = frames.get_color_frame()
if not depth_frame or not color_frame:
    # NOTE(review): the original branch body was missing from these notes.
    # Without both frames the conversions below cannot work, so fail loudly;
    # confirm whether a retry was intended instead.
    raise RuntimeError("Could not get both a depth frame and a color frame")

# Convert images to numpy arrays
depth_image = np.asanyarray(depth_frame.get_data())
color_image = np.asanyarray(color_frame.get_data())


results = model(source=...)

使用YOLOv8进行预测,得到的 results 是一个列表,列表中的元素都是 ultralytics.engine.results.Results 类,


class Results(SimpleClass):
    """
    A class for storing and manipulating inference results.

    Attributes:
        orig_img (numpy.ndarray): Original image as a numpy array.
        orig_shape (tuple): Original image shape in (height, width) format.
        boxes (Boxes, optional): Object containing detection bounding boxes.
        masks (Masks, optional): Object containing detection masks.
        probs (Probs, optional): Object containing class probabilities for classification tasks.
        keypoints (Keypoints, optional): Object containing detected keypoints for each object.
        speed (dict): Dictionary of preprocess, inference, and postprocess speeds (ms/image).
        names (dict): Dictionary of class names.
        path (str): Path to the image file.
    """


如果只是对一张图片进行检测的话,那么 results 中只会有一个元素(即 results[0] )。

比较重要的东西是 boxes 和 names (其他感觉暂时用不上),

names 是一个字典,键是类别的序号,值是对应的类别的名称,因此只要是使用同一个训练好的模型权重进行检测,那么 names 都是一样的(是在训练模型时通过yaml文件设置的),


{0: 'chip', 1: 'biscuit', 2: 'bread', 3: 'sprite', 4: 'cola', 5: 'water', 6: 'dishsoap', 7: 'handwash', 8: 'shampoo', 9: 'cookie', 10: 'lays', 11: 'orange juice'}

boxes 是一个 ultralytics.engine.results.Boxes 类,


class Boxes(BaseTensor):
    """
    Manages detection boxes, providing easy access and manipulation of box coordinates, confidence scores, class
    identifiers, and optional tracking IDs. Supports multiple formats for box coordinates, including both absolute and
    normalized forms.

    Attributes:
        data (torch.Tensor): The raw tensor containing detection boxes and their associated data.
        orig_shape (tuple): The original image size as a tuple (height, width), used for normalization.
        is_track (bool): Indicates whether tracking IDs are included in the box data.

    Properties:
        xyxy (torch.Tensor | numpy.ndarray): Boxes in [x1, y1, x2, y2] format.
        conf (torch.Tensor | numpy.ndarray): Confidence scores for each box.
        cls (torch.Tensor | numpy.ndarray): Class labels for each box.
        id (torch.Tensor | numpy.ndarray, optional): Tracking IDs for each box, if available.
        xywh (torch.Tensor | numpy.ndarray): Boxes in [x, y, width, height] format, calculated on demand.
        xyxyn (torch.Tensor | numpy.ndarray): Normalized [x1, y1, x2, y2] boxes, relative to `orig_shape`.
        xywhn (torch.Tensor | numpy.ndarray): Normalized [x, y, width, height] boxes, relative to `orig_shape`.

    Methods:
        cpu(): Moves the boxes to CPU memory.
        numpy(): Converts the boxes to a numpy array format.
        cuda(): Moves the boxes to CUDA (GPU) memory.
        to(device, dtype=None): Moves the boxes to the specified device.
    """

boxes 中比较重要一些的就是 xyxy (或者 xywh )、conf 和 cls (都是 torch.Tensor 类型,可以使用 .tolist() 方法转换成列表,其中的元素按照顺序一一对应),

xyxy 和 xywh 记录检测出的物品在图像中的位置(像素坐标),即刚好框住物品的一个矩形框,xyxy 是对角的两个点的坐标(左上角和右下角),xywh 是中心点的坐标以及矩形框的宽和高,

conf 是检测出的物品的置信度(范围为0-1),

cls 是检测出的物品对应的类别的序号

一个从 results 中获取检测出的类别的名称的示例

results = model(source=...)

# results[0] is the Results object of the (single) input image.
names = results[0].names  # dict: class index -> class name
ids = results[0].boxes.cls.tolist()  # class index of every detection
boxes = results[0].boxes.xyxy.tolist()  # xyxy box of every detection
conf = results[0].boxes.conf.tolist()  # confidence of every detection

# Names of the detected classes, in the same order as ids / boxes / conf.
objs = [names[i] for i in ids]


import cv2

...  # 配置和启动摄像头

index = 1  # running number used for the next saved file name

while True:
    frame = ...  # read an image from the camera

    cv2.imshow("frame", frame)

    # Poll the keyboard exactly once per frame: every cv2.waitKey call consumes
    # a key event, so a second call could miss the key that was just pressed.
    key = cv2.waitKey(1) & 0xFF

    # press 'y' to take a picture and save it, named like 00001.jpg, 00002.jpg, ...
    if key == ord('y'):
        cv2.imwrite(f"{index:05d}.jpg", frame)
        index += 1
    elif key == ord('q'):
        # NOTE(review): the branch body was missing from these notes;
        # 'q' is assumed to quit the preview loop.
        break

这个代码的作用是,显示摄像头当前实时的图像,按 Y 键则保存当前的图像


大致的任务是需要返回出 最大、最小、最左、最右、三个最轻、三个最重 的东西,而由于以前的代码太💩了,所以临时重新写了一下(比较粗糙,算是半成品),

from ultralytics import YOLO
import cv2
from random import randint

# Load the model
model = YOLO("517best_m.pt")

# set the order of the names: every name maps to its position (rank) in its list
size_names = ['sprite', 'cola', 'water', 'orange juice', 'lays', 'chip', 'dishsoap', 'biscuit', 'cookie', 'handwash', 'shampoo', 'bread']
size_order = {name: rank for rank, name in enumerate(size_names)}
weight_names = ['sprite', 'cola', 'water', 'orange juice', 'handwash', 'dishsoap', 'shampoo', 'lays', 'chip', 'cookie', 'biscuit', 'bread']
weight_order = {name: rank for rank, name in enumerate(weight_names)}

def do_detect(model, source="color.jpg"):
    """
    Detect objects in an image

    Args:
        model: the YOLOv8 model to be used for detection
        source: the image passed to the model; defaults to "color.jpg"

    Returns:
        names: dict of names of the detected objects
        ids: list of ids of the detected objects
        boxes: list of xyxy of the detected objects
        conf: list of confidences of the detected objects
    """
    results = model(source=source)

    # Only the first entry of the returned list is read.
    detection = results[0]
    names = detection.names  # dict of names
    ids = detection.boxes.cls.tolist()  # list of ids
    boxes = detection.boxes.xyxy.tolist()  # list of xyxy of boxes
    conf = detection.boxes.conf.tolist()  # list of confidences

    return names, ids, boxes, conf

# process different adjs
def la(names, ids, boxes, conf):
    """Handler for the "la" adjective, dispatched from do_gpsr.

    NOTE(review): the body is missing from these notes, so this returns None;
    presumably it should select the largest detected object - confirm.
    """

def sma(names, ids, boxes, conf):
    """Handler for the "sma" adjective, dispatched from do_gpsr.

    NOTE(review): the body is missing from these notes, so this returns None;
    presumably it should select the smallest detected object - confirm.
    """

def th_he(names, ids, boxes, conf):
    """Handler for the "th_he" adjective, dispatched from do_gpsr.

    NOTE(review): the body is missing from these notes, so this returns None;
    presumably it should select the three heaviest detected objects - confirm.
    """

def th_li(names, ids, boxes, conf):
    """Handler for the "th_li" adjective, dispatched from do_gpsr.

    NOTE(review): the body is missing from these notes, so this returns None;
    presumably it should select the three lightest detected objects - confirm.
    """

def ri_mo(names, ids, boxes, conf):
    """Handler for the "ri_mo" adjective, dispatched from do_gpsr.

    NOTE(review): the body is missing from these notes, so this returns None;
    presumably it should select the rightmost detected object - confirm.
    """

def le_mo(names, ids, boxes, conf):
    """Handler for the "le_mo" adjective, dispatched from do_gpsr.

    NOTE(review): the body is missing from these notes, so this returns None;
    presumably it should select the leftmost detected object - confirm.
    """

def result_filter(names, ids, boxes, conf, min_conf=0.5):
    """
    Only keep the objects whose confidence is larger than 0.5.

    Args:
        names: dict of class names; passed through unchanged
        ids: list of ids of the detected objects
        boxes: list of boxes of the detected objects, parallel to ids
        conf: list of confidences of the detected objects, parallel to ids
        min_conf: exclusive lower bound on the confidence; defaults to 0.5

    Returns:
        names, ids, boxes, conf with the low-confidence detections removed;
        the three lists stay aligned and keep their original order.
    """
    # Filter the three parallel lists together so they cannot get out of step.
    kept = [(i, b, c) for i, b, c in zip(ids, boxes, conf) if c > min_conf]

    kept_ids = [i for i, _, _ in kept]
    kept_boxes = [b for _, b, _ in kept]
    kept_conf = [c for _, _, c in kept]

    return names, kept_ids, kept_boxes, kept_conf

def do_gpsr(adj):
    """
    Detect objects and answer a query about them.

    Args:
        adj: the query keyword; one of "la", "sma", "th_he", "th_li",
            "ri_mo", "le_mo" (forwarded to the handler of the same name)
            or "count"

    Returns:
        The handler's result for a handler keyword, the list of detected
        class names for "count", and None when nothing is detected or the
        keyword is unknown.
    """
    # start the camera and get the image
    # NOTE(review): the capture step is not shown in these notes; do_detect
    # reads its default image, so that file must already exist.

    # detect objects in the image (`model` is the module-level YOLO model)
    names, ids, boxes, conf = do_detect(model)

    names, ids, boxes, conf = result_filter(names, ids, boxes, conf)

    if not ids:
        print("No objects detected")
        # Nothing left to rank or count, so stop instead of calling a handler
        # with empty lists.
        return None

    adj2func = {
        "la": la,
        "sma": sma,
        "th_he": th_he,
        "th_li": th_li,
        "ri_mo": ri_mo,
        "le_mo": le_mo,
    }

    if adj in adj2func:
        return adj2func[adj](names, ids, boxes, conf)
    if adj == "count":
        return [names[i] for i in ids]
    return None




gesture_recognizer.ipynb - Colab (google.com)


Python 手势识别指南 | Google AI Edge | Google for Developers

import mediapipe as mp
import cv2
# from visualize_util import display_batch_of_images_with_gestures_and_hand_landmarks

BaseOptions = mp.tasks.BaseOptions
GestureRecognizer = mp.tasks.vision.GestureRecognizer
GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions
VisionRunningMode = mp.tasks.vision.RunningMode

# Create a gesture recognizer instance with the image mode:
# NOTE(review): the option arguments were missing from these notes; the model
# path below is a placeholder - point it at the downloaded model bundle.
options = GestureRecognizerOptions(
    base_options=BaseOptions(model_asset_path='gesture_recognizer.task'),
    running_mode=VisionRunningMode.IMAGE)

cap = cv2.VideoCapture(0)

try:
    # The context manager closes the recognizer when the loop ends.
    with GestureRecognizer.create_from_options(options) as recognizer:
        while True:
            ret, frame = cap.read()

            if not ret:
                print('No frame')
                # No image to show or classify, so leave the loop.
                break

            cv2.imshow('image', frame)
            cv2.imwrite("color.jpg", frame)

            # Load the input image from an image file.
            mp_image = mp.Image.create_from_file('color.jpg')
            # Perform gesture recognition on the provided single image.
            # The gesture recognizer must be created with the image mode.
            gesture_recognition_result = recognizer.recognize(mp_image)

            # print(gesture_recognition_result)
            if gesture_recognition_result.gestures:
                # NOTE(review): the branch body was missing from these notes;
                # report the best-scoring gesture of the first detected hand.
                top_gesture = gesture_recognition_result.gestures[0][0]
                print(top_gesture.category_name, top_gesture.score)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Release the camera and close the preview window even on errors.
    cap.release()
    cv2.destroyAllWindows()




0 - Unrecognized gesture, label: Unknown
1 - Closed fist, label: Closed_Fist
2 - Open palm, label: Open_Palm
3 - Pointing up, label: Pointing_Up
4 - Thumbs down, label: Thumb_Down
5 - Thumbs up, label: Thumb_Up
6 - Victory, label: Victory
7 - Love, label: ILoveYou


最后更新: 2024-09-17
创建日期: 2024-06-22
