|
168 | 168 | "hide_score": false, |
169 | 169 | "quarantine": false, |
170 | 170 | "link_flair_text_color": "light", |
171 | | - "upvote_ratio": 0.82, |
| 171 | + "upvote_ratio": 0.87, |
172 | 172 | "author_flair_background_color": null, |
173 | | - "ups": 13, |
| 173 | + "ups": 21, |
174 | 174 | "domain": "reddit.com", |
175 | 175 | "media_embed": {}, |
176 | 176 | "thumbnail_width": 140, |
|
200 | 200 | }, |
201 | 201 | "link_flair_text": "Discussion", |
202 | 202 | "can_mod_post": false, |
203 | | - "score": 13, |
| 203 | + "score": 21, |
204 | 204 | "approved_by": null, |
205 | 205 | "is_created_from_ads_ui": false, |
206 | 206 | "author_premium": false, |
|
256 | 256 | "report_reasons": null, |
257 | 257 | "author": "Old_Mathematician107", |
258 | 258 | "discussion_type": null, |
259 | | - "num_comments": 2, |
| 259 | + "num_comments": 4, |
260 | 260 | "send_replies": true, |
261 | 261 | "media": null, |
262 | 262 | "contest_mode": false, |
|
265 | 265 | "permalink": "/r/LocalLLaMA/comments/1lsi0gj/opensourced_image_description_models_object/", |
266 | 266 | "stickied": false, |
267 | 267 | "url": "https://www.reddit.com/gallery/1lsi0gj", |
268 | | - "subreddit_subscribers": 494986, |
| 268 | + "subreddit_subscribers": 495396, |
269 | 269 | "created_utc": 1751743870, |
270 | 270 | "num_crossposts": 0, |
271 | 271 | "mod_reports": [], |
|
299 | 299 | "subreddit": "LocalLLaMA", |
300 | 300 | "author_flair_template_id": null, |
301 | 301 | "likes": null, |
302 | | - "replies": "", |
| 302 | + "replies": { |
| 303 | + "kind": "Listing", |
| 304 | + "data": { |
| 305 | + "after": null, |
| 306 | + "dist": null, |
| 307 | + "modhash": "", |
| 308 | + "geo_filter": "", |
| 309 | + "children": [ |
| 310 | + { |
| 311 | + "kind": "t1", |
| 312 | + "data": { |
| 313 | + "subreddit_id": "t5_81eyvm", |
| 314 | + "approved_at_utc": null, |
| 315 | + "author_is_blocked": false, |
| 316 | + "comment_type": null, |
| 317 | + "awarders": [], |
| 318 | + "mod_reason_by": null, |
| 319 | + "banned_by": null, |
| 320 | + "author_flair_type": "text", |
| 321 | + "total_awards_received": 0, |
| 322 | + "subreddit": "LocalLLaMA", |
| 323 | + "author_flair_template_id": null, |
| 324 | + "likes": null, |
| 325 | + "replies": "", |
| 326 | + "user_reports": [], |
| 327 | + "saved": false, |
| 328 | + "id": "n1mpf1g", |
| 329 | + "banned_at_utc": null, |
| 330 | + "mod_reason_title": null, |
| 331 | + "gilded": 0, |
| 332 | + "archived": false, |
| 333 | + "collapsed_reason_code": null, |
| 334 | + "no_follow": true, |
| 335 | + "author": "Old_Mathematician107", |
| 336 | + "can_mod_post": false, |
| 337 | + "created_utc": 1751807991, |
| 338 | + "send_replies": true, |
| 339 | + "parent_id": "t1_n1mg2ig", |
| 340 | + "score": 2, |
| 341 | + "author_fullname": "t2_dlk476nn6", |
| 342 | + "removal_reason": null, |
| 343 | + "approved_by": null, |
| 344 | + "mod_note": null, |
| 345 | + "all_awardings": [], |
| 346 | + "body": "Hi, thanks a lot. Making it 100% local is one of the end goals, but it is quite hard task, because you need to find strong enough VLM to understand the structure and long inputs (screenshot and its description) and light enough to run on phones. But making it 100% text only is possible but I think it will decrease its accuracy. So, the best way is to use VLM.\n\nTo run VLM locally you need to have very good, fine-tuned VLM on this specific tasks (agentic capabilities). It is actually quite hard but I think it is possible.\n\nYes, actually I don't use accessibility trees, adbs etc. Only screenshot and accessibility services to do the tasks remotely. So, it is vision-only and can be used in prod (if you invest enough money on renting backend servers and improve UI/UX of agentic app).\n\nDataset for YOLO was prepared by me, it consists of 486 images (train) and 60 for testing. For dataset I created bounding boxes for all 4 classes (View, ImageView, Text, Line). Screenshots used in this dataset are mostly screenshots from popular apps like youtube music, whatsapp etc. and apps that I made for various clients and companies throughout my career.", |
| 347 | + "edited": false, |
| 348 | + "top_awarded_type": null, |
| 349 | + "author_flair_css_class": null, |
| 350 | + "name": "t1_n1mpf1g", |
| 351 | + "is_submitter": true, |
| 352 | + "downs": 0, |
| 353 | + "author_flair_richtext": [], |
| 354 | + "author_patreon_flair": false, |
| 355 | + "body_html": "<div class=\"md\"><p>Hi, thanks a lot. Making it 100% local is one of the end goals, but it is quite hard task, because you need to find strong enough VLM to understand the structure and long inputs (screenshot and its description) and light enough to run on phones. But making it 100% text only is possible but I think it will decrease its accuracy. So, the best way is to use VLM.</p>\n\n<p>To run VLM locally you need to have very good, fine-tuned VLM on this specific tasks (agentic capabilities). It is actually quite hard but I think it is possible.</p>\n\n<p>Yes, actually I don&#39;t use accessibility trees, adbs etc. Only screenshot and accessibility services to do the tasks remotely. So, it is vision-only and can be used in prod (if you invest enough money on renting backend servers and improve UI/UX of agentic app).</p>\n\n<p>Dataset for YOLO was prepared by me, it consists of 486 images (train) and 60 for testing. For dataset I created bounding boxes for all 4 classes (View, ImageView, Text, Line). Screenshots used in this dataset are mostly screenshots from popular apps like youtube music, whatsapp etc. and apps that I made for various clients and companies throughout my career.</p>\n</div>", |
| 356 | + "gildings": {}, |
| 357 | + "collapsed_reason": null, |
| 358 | + "distinguished": null, |
| 359 | + "associated_award": null, |
| 360 | + "stickied": false, |
| 361 | + "author_premium": false, |
| 362 | + "can_gild": false, |
| 363 | + "link_id": "t3_1lsi0gj", |
| 364 | + "unrepliable_reason": null, |
| 365 | + "author_flair_text_color": null, |
| 366 | + "score_hidden": false, |
| 367 | + "permalink": "/r/LocalLLaMA/comments/1lsi0gj/opensourced_image_description_models_object/n1mpf1g/", |
| 368 | + "subreddit_type": "public", |
| 369 | + "locked": false, |
| 370 | + "report_reasons": null, |
| 371 | + "created": 1751807991, |
| 372 | + "author_flair_text": null, |
| 373 | + "treatment_tags": [], |
| 374 | + "collapsed": false, |
| 375 | + "subreddit_name_prefixed": "r/LocalLLaMA", |
| 376 | + "controversiality": 0, |
| 377 | + "depth": 1, |
| 378 | + "author_flair_background_color": null, |
| 379 | + "collapsed_because_crowd_control": null, |
| 380 | + "mod_reports": [], |
| 381 | + "num_reports": null, |
| 382 | + "ups": 2 |
| 383 | + } |
| 384 | + } |
| 385 | + ], |
| 386 | + "before": null |
| 387 | + } |
| 388 | + }, |
| 389 | + "user_reports": [], |
| 390 | + "saved": false, |
| 391 | + "id": "n1mg2ig", |
| 392 | + "banned_at_utc": null, |
| 393 | + "mod_reason_title": null, |
| 394 | + "gilded": 0, |
| 395 | + "archived": false, |
| 396 | + "collapsed_reason_code": null, |
| 397 | + "no_follow": true, |
| 398 | + "author": "phhusson", |
| 399 | + "can_mod_post": false, |
| 400 | + "created_utc": 1751804177, |
| 401 | + "send_replies": true, |
| 402 | + "parent_id": "t3_1lsi0gj", |
| 403 | + "score": 2, |
| 404 | + "author_fullname": "t2_qwewv", |
| 405 | + "approved_by": null, |
| 406 | + "mod_note": null, |
| 407 | + "all_awardings": [], |
| 408 | + "collapsed": false, |
| 409 | + "body": "Cool. Congrats on the release. Do you think deki provides enough information so that a text-only LLM would work? I'm curious if an on-device gemma 3n 4b would be able to do /some/ stuff (not fully agentic, but maybe some hands-free control)\n\nCould you share what's in your YOLO trainset? Did you use accessibility/uiautomator APIs to dump the structure of various apps?\n\nHave you tried your YOLO on out-of-distribution apps? (for instance some apps don't expose anything on accessibility/uiautomator)", |
| 410 | + "edited": false, |
| 411 | + "top_awarded_type": null, |
| 412 | + "author_flair_css_class": null, |
| 413 | + "name": "t1_n1mg2ig", |
| 414 | + "is_submitter": false, |
| 415 | + "downs": 0, |
| 416 | + "author_flair_richtext": [], |
| 417 | + "author_patreon_flair": false, |
| 418 | + "body_html": "<div class=\"md\"><p>Cool. Congrats on the release. Do you think deki provides enough information so that a text-only LLM would work? I&#39;m curious if an on-device gemma 3n 4b would be able to do /some/ stuff (not fully agentic, but maybe some hands-free control)</p>\n\n<p>Could you share what&#39;s in your YOLO trainset? Did you use accessibility/uiautomator APIs to dump the structure of various apps?</p>\n\n<p>Have you tried your YOLO on out-of-distribution apps? (for instance some apps don&#39;t expose anything on accessibility/uiautomator)</p>\n</div>", |
| 419 | + "removal_reason": null, |
| 420 | + "collapsed_reason": null, |
| 421 | + "distinguished": null, |
| 422 | + "associated_award": null, |
| 423 | + "stickied": false, |
| 424 | + "author_premium": false, |
| 425 | + "can_gild": false, |
| 426 | + "gildings": {}, |
| 427 | + "unrepliable_reason": null, |
| 428 | + "author_flair_text_color": null, |
| 429 | + "score_hidden": false, |
| 430 | + "permalink": "/r/LocalLLaMA/comments/1lsi0gj/opensourced_image_description_models_object/n1mg2ig/", |
| 431 | + "subreddit_type": "public", |
| 432 | + "locked": false, |
| 433 | + "report_reasons": null, |
| 434 | + "created": 1751804177, |
| 435 | + "author_flair_text": null, |
| 436 | + "treatment_tags": [], |
| 437 | + "link_id": "t3_1lsi0gj", |
| 438 | + "subreddit_name_prefixed": "r/LocalLLaMA", |
| 439 | + "controversiality": 0, |
| 440 | + "depth": 0, |
| 441 | + "author_flair_background_color": null, |
| 442 | + "collapsed_because_crowd_control": null, |
| 443 | + "mod_reports": [], |
| 444 | + "num_reports": null, |
| 445 | + "ups": 2 |
| 446 | + } |
| 447 | + }, |
| 448 | + { |
| 449 | + "kind": "t1", |
| 450 | + "data": { |
| 451 | + "subreddit_id": "t5_81eyvm", |
| 452 | + "approved_at_utc": null, |
| 453 | + "author_is_blocked": false, |
| 454 | + "comment_type": null, |
| 455 | + "awarders": [], |
| 456 | + "mod_reason_by": null, |
| 457 | + "banned_by": null, |
| 458 | + "author_flair_type": "text", |
| 459 | + "total_awards_received": 0, |
| 460 | + "subreddit": "LocalLLaMA", |
| 461 | + "author_flair_template_id": null, |
| 462 | + "likes": null, |
| 463 | + "replies": { |
| 464 | + "kind": "Listing", |
| 465 | + "data": { |
| 466 | + "after": null, |
| 467 | + "dist": null, |
| 468 | + "modhash": "", |
| 469 | + "geo_filter": "", |
| 470 | + "children": [ |
| 471 | + { |
| 472 | + "kind": "t1", |
| 473 | + "data": { |
| 474 | + "subreddit_id": "t5_81eyvm", |
| 475 | + "approved_at_utc": null, |
| 476 | + "author_is_blocked": false, |
| 477 | + "comment_type": null, |
| 478 | + "awarders": [], |
| 479 | + "mod_reason_by": null, |
| 480 | + "banned_by": null, |
| 481 | + "author_flair_type": "text", |
| 482 | + "total_awards_received": 0, |
| 483 | + "subreddit": "LocalLLaMA", |
| 484 | + "author_flair_template_id": null, |
| 485 | + "likes": null, |
| 486 | + "replies": "", |
| 487 | + "user_reports": [], |
| 488 | + "saved": false, |
| 489 | + "id": "n1m2qes", |
| 490 | + "banned_at_utc": null, |
| 491 | + "mod_reason_title": null, |
| 492 | + "gilded": 0, |
| 493 | + "archived": false, |
| 494 | + "collapsed_reason_code": null, |
| 495 | + "no_follow": true, |
| 496 | + "author": "Old_Mathematician107", |
| 497 | + "can_mod_post": false, |
| 498 | + "created_utc": 1751797371, |
| 499 | + "send_replies": true, |
| 500 | + "parent_id": "t1_n1l6ppy", |
| 501 | + "score": 1, |
| 502 | + "author_fullname": "t2_dlk476nn6", |
| 503 | + "removal_reason": null, |
| 504 | + "approved_by": null, |
| 505 | + "mod_note": null, |
| 506 | + "all_awardings": [], |
| 507 | + "body": "Thanks! Yeah, just screenshots. No accessibility trees or something, only screenshots", |
| 508 | + "edited": false, |
| 509 | + "top_awarded_type": null, |
| 510 | + "author_flair_css_class": null, |
| 511 | + "name": "t1_n1m2qes", |
| 512 | + "is_submitter": true, |
| 513 | + "downs": 0, |
| 514 | + "author_flair_richtext": [], |
| 515 | + "author_patreon_flair": false, |
| 516 | + "body_html": "<div class=\"md\"><p>Thanks! Yeah, just screenshots. No accessibility trees or something, only screenshots</p>\n</div>", |
| 517 | + "gildings": {}, |
| 518 | + "collapsed_reason": null, |
| 519 | + "distinguished": null, |
| 520 | + "associated_award": null, |
| 521 | + "stickied": false, |
| 522 | + "author_premium": false, |
| 523 | + "can_gild": false, |
| 524 | + "link_id": "t3_1lsi0gj", |
| 525 | + "unrepliable_reason": null, |
| 526 | + "author_flair_text_color": null, |
| 527 | + "score_hidden": false, |
| 528 | + "permalink": "/r/LocalLLaMA/comments/1lsi0gj/opensourced_image_description_models_object/n1m2qes/", |
| 529 | + "subreddit_type": "public", |
| 530 | + "locked": false, |
| 531 | + "report_reasons": null, |
| 532 | + "created": 1751797371, |
| 533 | + "author_flair_text": null, |
| 534 | + "treatment_tags": [], |
| 535 | + "collapsed": false, |
| 536 | + "subreddit_name_prefixed": "r/LocalLLaMA", |
| 537 | + "controversiality": 0, |
| 538 | + "depth": 1, |
| 539 | + "author_flair_background_color": null, |
| 540 | + "collapsed_because_crowd_control": null, |
| 541 | + "mod_reports": [], |
| 542 | + "num_reports": null, |
| 543 | + "ups": 1 |
| 544 | + } |
| 545 | + } |
| 546 | + ], |
| 547 | + "before": null |
| 548 | + } |
| 549 | + }, |
303 | 550 | "user_reports": [], |
304 | 551 | "saved": false, |
305 | 552 | "id": "n1l6ppy", |
|
0 commit comments