|
270 | 270 | "## Zero-Shot Classification with RAM and Tag2Text" |
271 | 271 | ] |
272 | 272 | }, |
| 273 | + { |
| 274 | + "cell_type": "code", |
| 275 | + "execution_count": 70, |
| 276 | + "id": "a6c6331c-179f-4614-b761-f6dcbadb724f", |
| 277 | + "metadata": { |
| 278 | + "tags": [] |
| 279 | + }, |
| 280 | + "outputs": [ |
| 281 | + { |
| 282 | + "name": "stderr", |
| 283 | + "output_type": "stream", |
| 284 | + "text": [ |
| 285 | + "INFO:fastdup.models.ram:Loading model checkpoint from - /home/dnth/ram_swin_large_14m.pth\n", |
| 286 | + "INFO:fastdup.models.ram:Model loaded to device - cuda\n" |
| 287 | + ] |
| 288 | + } |
| 289 | + ], |
| 290 | + "source": [ |
| 291 | + "from fastdup.models_ram import RecognizeAnythingModel\n", |
| 292 | + "\n", |
| 293 | + "model = RecognizeAnythingModel()\n", |
| 294 | + "result = model.run_inference(\"coco_minitrain_25k/images/val2017/000000382734.jpg\")" |
| 295 | + ] |
| 296 | + }, |
| 297 | + { |
| 298 | + "cell_type": "code", |
| 299 | + "execution_count": 71, |
| 300 | + "id": "420c05ad-da3b-4b5f-b76e-3f4c49bba510", |
| 301 | + "metadata": { |
| 302 | + "tags": [] |
| 303 | + }, |
| 304 | + "outputs": [ |
| 305 | + { |
| 306 | + "data": { |
| 307 | + "text/plain": [ |
| 308 | + "'bath . bathroom . doorway . drain . floor . glass door . room . screen door . shower . white'" |
| 309 | + ] |
| 310 | + }, |
| 311 | + "execution_count": 71, |
| 312 | + "metadata": {}, |
| 313 | + "output_type": "execute_result" |
| 314 | + } |
| 315 | + ], |
| 316 | + "source": [ |
| 317 | + "result" |
| 318 | + ] |
| 319 | + }, |
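| | + {
| | + "cell_type": "markdown",
| | + "id": "ram-tags-to-list-md",
| | + "metadata": {},
| | + "source": [
| | + "The tags come back as a single string. A minimal sketch to split it into a Python list, assuming the separator stays `\" . \"` as in the output above:"
| | + ]
| | + },
| | + {
| | + "cell_type": "code",
| | + "execution_count": null,
| | + "id": "ram-tags-to-list-code",
| | + "metadata": {},
| | + "outputs": [],
| | + "source": [
| | + "# Split the dot-separated RAM tag string into individual tags.\n",
| | + "tags = [tag.strip() for tag in result.split(\" . \")]\n",
| | + "tags"
| | + ]
| | + },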
| 320 | + { |
| 321 | + "cell_type": "code", |
| 322 | + "execution_count": 73, |
| 323 | + "id": "c01bf8fa-21f6-4bee-b513-28ce3c839165", |
| 324 | + "metadata": { |
| 325 | + "tags": [] |
| 326 | + }, |
| 327 | + "outputs": [ |
| 328 | + { |
| 329 | + "name": "stderr", |
| 330 | + "output_type": "stream", |
| 331 | + "text": [ |
| 332 | + "INFO:fastdup.model.tag2text:Loading model checkpoint from - /home/dnth/tag2text_swin_14m.pth\n", |
| 333 | + "INFO:fastdup.model.tag2text:Model loaded to device - cuda\n" |
| 334 | + ] |
| 335 | + } |
| 336 | + ], |
| 337 | + "source": [ |
| 338 | + "from fastdup.models_tag2text import Tag2TextModel\n", |
| 339 | + "model = Tag2TextModel()\n", |
| 340 | + "result = model.run_inference(\"coco_minitrain_25k/images/val2017/000000382734.jpg\")" |
| 341 | + ] |
| 342 | + }, |
| 343 | + { |
| 344 | + "cell_type": "code", |
| 345 | + "execution_count": 74, |
| 346 | + "id": "02f68243-bcdf-404c-8b22-742d1706d194", |
| 347 | + "metadata": { |
| 348 | + "tags": [] |
| 349 | + }, |
| 350 | + "outputs": [ |
| 351 | + { |
| 352 | + "data": { |
| 353 | + "text/plain": [ |
| 354 | + "('room | floor | bathroom | shower | wall | toilet | green | white',\n", |
| 355 | + " None,\n", |
| 356 | + " 'a bathroom with green walls and a white toilet')" |
| 357 | + ] |
| 358 | + }, |
| 359 | + "execution_count": 74, |
| 360 | + "metadata": {}, |
| 361 | + "output_type": "execute_result" |
| 362 | + } |
| 363 | + ], |
| 364 | + "source": [ |
| 365 | + "result" |
| 366 | + ] |
| 367 | + }, |
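| | + {
| | + "cell_type": "markdown",
| | + "id": "tag2text-unpack-md",
| | + "metadata": {},
| | + "source": [
| | + "Tag2Text returns a 3-tuple: a pipe-separated tag string, a middle element (here `None`), and a caption. A minimal sketch to unpack it, assuming the order shown above:"
| | + ]
| | + },
| | + {
| | + "cell_type": "code",
| | + "execution_count": null,
| | + "id": "tag2text-unpack-code",
| | + "metadata": {},
| | + "outputs": [],
| | + "source": [
| | + "# Unpack the Tag2Text result: tags, an unused middle slot, and a caption.\n",
| | + "tags_str, _, caption = result\n",
| | + "tags = [tag.strip() for tag in tags_str.split(\"|\")]\n",
| | + "print(tags)\n",
| | + "print(caption)"
| | + ]
| | + },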
273 | 368 | { |
274 | 369 | "cell_type": "code", |
275 | 370 | "execution_count": 37, |
|
612 | 707 | "## Zero-Shot Detection with Grounding DINO" |
613 | 708 | ] |
614 | 709 | }, |
| 710 | + { |
| 711 | + "cell_type": "code", |
| 712 | + "execution_count": 53, |
| 713 | + "id": "5cc1e1b8-799c-461f-b08e-a95590be9a60", |
| 714 | + "metadata": { |
| 715 | + "tags": [] |
| 716 | + }, |
| 717 | + "outputs": [ |
| 718 | + { |
| 719 | + "name": "stderr", |
| 720 | + "output_type": "stream", |
| 721 | + "text": [ |
| 722 | + "INFO:fastdup.models.grounding_dino:Loading model checkpoint from - /home/dnth/groundingdino_swint_ogc.pth\n" |
| 723 | + ] |
| 724 | + }, |
| 725 | + { |
| 726 | + "name": "stdout", |
| 727 | + "output_type": "stream", |
| 728 | + "text": [ |
| 729 | + "final text_encoder_type: bert-base-uncased\n" |
| 730 | + ] |
| 731 | + }, |
| 732 | + { |
| 733 | + "name": "stderr", |
| 734 | + "output_type": "stream", |
| 735 | + "text": [ |
| 736 | + "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']\n", |
| 737 | + "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", |
| 738 | + "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", |
| 739 | + "INFO:fastdup.models.grounding_dino:Model loaded on device - cuda\n" |
| 740 | + ] |
| 741 | + }, |
| 742 | + { |
| 743 | + "name": "stdout", |
| 744 | + "output_type": "stream", |
| 745 | + "text": [ |
| 746 | + "final text_encoder_type: bert-base-uncased\n" |
| 747 | + ] |
| 748 | + } |
| 749 | + ], |
| 750 | + "source": [ |
| 751 | + "from fastdup.models_grounding_dino import GroundingDINO\n", |
| 752 | + "\n", |
| 753 | + "model = GroundingDINO()\n", |
| 754 | + "results = model.run_inference(image_path=\"coco_minitrain_25k/images/val2017/000000449996.jpg\",\n", |
| 755 | + " text_prompt=\"air field . airliner . plane . airport . airport runway . airport terminal . jet . land . park . raceway . sky . tarmac . terminal\",\n", |
| 756 | + " box_threshold=0.3,\n", |
| 757 | + " text_threshold=0.25)" |
| 758 | + ] |
| 759 | + }, |
| 760 | + { |
| 761 | + "cell_type": "code", |
| 762 | + "execution_count": 54, |
| 763 | + "id": "9fdd1d2f-4df4-422d-89a7-bbec9bf8a1dc", |
| 764 | + "metadata": { |
| 765 | + "tags": [] |
| 766 | + }, |
| 767 | + "outputs": [ |
| 768 | + { |
| 769 | + "data": { |
| 770 | + "text/plain": [ |
| 771 | + "{'labels': ['sky',\n", |
| 772 | + " 'airport terminal',\n", |
| 773 | + " 'plane',\n", |
| 774 | + " 'airliner',\n", |
| 775 | + " 'jet',\n", |
| 776 | + " 'jet',\n", |
| 777 | + " 'tarmac'],\n", |
| 778 | + " 'scores': [0.5286, 0.3451, 0.3822, 0.4872, 0.3853, 0.3502, 0.3026],\n", |
| 779 | + " 'boxes': [(1.47, 1.45, 638.46, 241.37),\n", |
| 780 | + " (329.38, 291.55, 468.1, 319.69),\n", |
| 781 | + " (142.03, 247.3, 261.96, 296.55),\n", |
| 782 | + " (443.6, 111.93, 495.47, 130.84),\n", |
| 783 | + " (113.85, 290.28, 246.55, 340.23),\n", |
| 784 | + " (391.59, 271.73, 465.1, 295.48),\n", |
| 785 | + " (2.35, 277.69, 637.63, 425.32)]}" |
| 786 | + ] |
| 787 | + }, |
| 788 | + "execution_count": 54, |
| 789 | + "metadata": {}, |
| 790 | + "output_type": "execute_result" |
| 791 | + } |
| 792 | + ], |
| 793 | + "source": [ |
| 794 | + "results" |
| 795 | + ] |
| 796 | + }, |
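| | + {
| | + "cell_type": "markdown",
| | + "id": "dino-filter-md",
| | + "metadata": {},
| | + "source": [
| | + "`labels`, `scores` and `boxes` are parallel lists. A minimal sketch that zips them together and keeps only the higher-confidence detections; the 0.5 cut-off is an arbitrary value chosen for illustration:"
| | + ]
| | + },
| | + {
| | + "cell_type": "code",
| | + "execution_count": null,
| | + "id": "dino-filter-code",
| | + "metadata": {},
| | + "outputs": [],
| | + "source": [
| | + "# Pair each label with its score and box, keeping confident detections only.\n",
| | + "# The 0.5 threshold is an arbitrary illustrative choice.\n",
| | + "confident = [\n",
| | + "    (label, score, box)\n",
| | + "    for label, score, box in zip(results[\"labels\"], results[\"scores\"], results[\"boxes\"])\n",
| | + "    if score >= 0.5\n",
| | + "]\n",
| | + "confident"
| | + ]
| | + },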
615 | 797 | { |
616 | 798 | "cell_type": "code", |
617 | 799 | "execution_count": 41, |
|
903 | 1085 | "## Zero-Shot Segmentation with SAM" |
904 | 1086 | ] |
905 | 1087 | }, |
| 1088 | + { |
| 1089 | + "cell_type": "markdown", |
| 1090 | + "id": "dc58c743-d8e3-45b8-ae32-7cfc6474afd1", |
| 1091 | + "metadata": {}, |
| 1092 | + "source": [ |
| 1093 | + "For a single image and a single bounding box."
| 1094 | + ] |
| 1095 | + }, |
| 1096 | + { |
| 1097 | + "cell_type": "code", |
| 1098 | + "execution_count": 68, |
| 1099 | + "id": "2eaaf4f7-9ff8-46f5-89b6-d9568be7b625", |
| 1100 | + "metadata": { |
| 1101 | + "tags": [] |
| 1102 | + }, |
| 1103 | + "outputs": [ |
| 1104 | + { |
| 1105 | + "name": "stderr", |
| 1106 | + "output_type": "stream", |
| 1107 | + "text": [ |
| 1108 | + "INFO:fastdup.model.sam:Loading model checkpoint from - /home/dnth/sam_vit_h_4b8939.pth\n" |
| 1109 | + ] |
| 1110 | + } |
| 1111 | + ], |
| 1112 | + "source": [ |
| 1113 | + "from fastdup.models_sam import SegmentAnythingModel\n", |
| 1114 | + "import torch\n", |
| 1115 | + "\n", |
| 1116 | + "model = SegmentAnythingModel()\n", |
| 1117 | + "result = model.run_inference(image_path=\"coco_minitrain_25k/images/val2017/000000449996.jpg\", bboxes=torch.tensor((1.47, 1.45, 638.46, 241.37)))" |
| 1118 | + ] |
| 1119 | + }, |
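| | + {
| | + "cell_type": "markdown",
| | + "id": "sam-mask-overlay-md",
| | + "metadata": {},
| | + "source": [
| | + "A sketch to overlay the predicted mask on the source image with matplotlib. This assumes `result` is a single H x W binary mask (possibly a `torch` tensor on the GPU); the actual return type of `run_inference` may differ."
| | + ]
| | + },
| | + {
| | + "cell_type": "code",
| | + "execution_count": null,
| | + "id": "sam-mask-overlay-code",
| | + "metadata": {},
| | + "outputs": [],
| | + "source": [
| | + "import matplotlib.pyplot as plt\n",
| | + "from PIL import Image\n",
| | + "\n",
| | + "# ASSUMPTION: `result` is one H x W binary mask; move it off the GPU\n",
| | + "# and convert to numpy before plotting if it is a torch tensor.\n",
| | + "mask = result.squeeze().cpu().numpy() if hasattr(result, \"cpu\") else result\n",
| | + "\n",
| | + "image = Image.open(\"coco_minitrain_25k/images/val2017/000000449996.jpg\")\n",
| | + "plt.imshow(image)\n",
| | + "plt.imshow(mask, alpha=0.5)  # semi-transparent mask overlay\n",
| | + "plt.axis(\"off\")\n",
| | + "plt.show()"
| | + ]
| | + },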
| 1120 | + { |
| 1121 | + "cell_type": "markdown", |
| 1122 | + "id": "d137da5d-ac81-4af1-9b5a-1b4d7b79464d", |
| 1123 | + "metadata": {}, |
| 1124 | + "source": [ |
| 1125 | + "For multiple images and multiple bounding boxes in a DataFrame." |
| 1126 | + ] |
| 1127 | + }, |
906 | 1128 | { |
907 | 1129 | "cell_type": "code", |
908 | 1130 | "execution_count": 45, |
|
946 | 1168 | "plot_annotations(df, image_col='filename', tags_col='ram_tags', bbox_col='grounding_dino_bboxes', scores_col='grounding_dino_scores', labels_col='grounding_dino_labels', masks_col='sam_masks')" |
947 | 1169 | ] |
948 | 1170 | }, |
949 | | - { |
950 | | - "cell_type": "code", |
951 | | - "execution_count": null, |
952 | | - "id": "cdfccccf-2b7d-47e1-bc77-c759bee7177c", |
953 | | - "metadata": {}, |
954 | | - "outputs": [], |
955 | | - "source": [] |
956 | | - }, |
957 | | - { |
958 | | - "cell_type": "code", |
959 | | - "execution_count": null, |
960 | | - "id": "47f2cb08-7bf5-4deb-bf97-0b80f8072f94", |
961 | | - "metadata": {}, |
962 | | - "outputs": [], |
963 | | - "source": [] |
964 | | - }, |
965 | | - { |
966 | | - "cell_type": "code", |
967 | | - "execution_count": 30, |
968 | | - "id": "70eff0bb-77d7-4f87-86d9-9d7d78888181", |
969 | | - "metadata": { |
970 | | - "tags": [] |
971 | | - }, |
972 | | - "outputs": [], |
973 | | - "source": [ |
974 | | - "from fastdup.models_grounding_dino import GroundingDINO" |
975 | | - ] |
976 | | - }, |
977 | | - { |
978 | | - "cell_type": "code", |
979 | | - "execution_count": 31, |
980 | | - "id": "b9937a36-42e9-4b07-81ec-7f1786bc9420", |
981 | | - "metadata": { |
982 | | - "tags": [] |
983 | | - }, |
984 | | - "outputs": [], |
985 | | - "source": [ |
986 | | - "from fastdup.models_ram import RecognizeAnythingModel" |
987 | | - ] |
988 | | - }, |
989 | | - { |
990 | | - "cell_type": "code", |
991 | | - "execution_count": 32, |
992 | | - "id": "1cc05bd1-d2b3-45a9-be7d-0978b5e1e0ca", |
993 | | - "metadata": { |
994 | | - "tags": [] |
995 | | - }, |
996 | | - "outputs": [], |
997 | | - "source": [ |
998 | | - "from fastdup.models_tag2text import Tag2TextModel" |
999 | | - ] |
1000 | | - }, |
1001 | | - { |
1002 | | - "cell_type": "code", |
1003 | | - "execution_count": 33, |
1004 | | - "id": "8679b319-a659-48ec-8338-dbc917877bd4", |
1005 | | - "metadata": { |
1006 | | - "tags": [] |
1007 | | - }, |
1008 | | - "outputs": [], |
1009 | | - "source": [ |
1010 | | - "from fastdup.models_sam import SegmentAnythingModel" |
1011 | | - ] |
1012 | | - }, |
1013 | | - { |
1014 | | - "cell_type": "code", |
1015 | | - "execution_count": null, |
1016 | | - "id": "f0312455-26ba-451c-b62a-ccfe7fabf80b", |
1017 | | - "metadata": {}, |
1018 | | - "outputs": [], |
1019 | | - "source": [] |
1020 | | - }, |
1021 | 1171 | { |
1022 | 1172 | "cell_type": "markdown", |
1023 | 1173 | "id": "0f4b137a-25ea-44ab-b44e-ed734428de86", |
|
1039 | 1189 | "convert_to_coco_format(df, bbox_col='grounding_dino_bboxes', label_col='grounding_dino_labels', json_filename='grounding_dino_annot_coco_format.json')" |
1040 | 1190 | ] |
1041 | 1191 | }, |
1042 | | - { |
1043 | | - "cell_type": "code", |
1044 | | - "execution_count": null, |
1045 | | - "id": "0b9f42d3-ef18-4e3d-b981-c624bb2b52a9", |
1046 | | - "metadata": {}, |
1047 | | - "outputs": [], |
1048 | | - "source": [] |
1049 | | - }, |
1050 | | - { |
1051 | | - "cell_type": "code", |
1052 | | - "execution_count": null, |
1053 | | - "id": "d99d5c9b-1f4a-4b3b-b534-41447de54a89", |
1054 | | - "metadata": {}, |
1055 | | - "outputs": [], |
1056 | | - "source": [] |
1057 | | - }, |
1058 | | - { |
1059 | | - "cell_type": "code", |
1060 | | - "execution_count": null, |
1061 | | - "id": "53d98947-60a3-4ff6-a900-3bd154a45302", |
1062 | | - "metadata": {}, |
1063 | | - "outputs": [], |
1064 | | - "source": [] |
1065 | | - }, |
1066 | 1192 | { |
1067 | 1193 | "cell_type": "markdown", |
1068 | 1194 | "id": "4bf9291f-c022-44f6-be5b-6bbb798f0c55", |
|