-
Notifications
You must be signed in to change notification settings - Fork 568
[MVEB] PE-AV Model, Kinetics400 Dataset, RavdessAV Dataset #4199
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 26 commits
2b411bb
fd0ce74
a65f505
ecca13e
8210f82
f4e0ece
8f67fb7
80d9217
c01d591
287d47c
66c108f
b24794b
82ccf4d
6979034
4af8520
fa753b4
f5e7a8f
32f3b4f
77e964a
f1b7989
95d75d9
e59f283
5321b3c
7b36363
05cd7f6
23c3135
7dabd1c
7238edc
6ef8678
0a3aa0f
52d70a3
412da86
5e719bd
c6855a3
3168318
68747c2
fa9c3d6
b9273d9
7907adb
75bc5c7
4c87896
cb39536
61c775f
400925b
64c94b5
a131a89
73bf160
978622e
939eefa
57eb8d9
ac7484f
bb68de2
91cada2
56c243f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -193,7 +193,23 @@ def evaluate( | |
| ds = self.dataset[hf_subset] | ||
|
|
||
| if isinstance(ds, Dataset | DatasetDict): | ||
| ds = ds.select_columns([self.label_column_name, self.input_column_name]) | ||
| # Keep label and input columns, plus any columns required by | ||
| # the task's declared modalities (e.g., audio for va2c tasks) | ||
| modality_to_column = { | ||
| "video": "video", | ||
| "audio": "audio", | ||
| "image": "image", | ||
| } | ||
|
||
| columns_to_keep = {self.label_column_name, self.input_column_name} | ||
| if isinstance(ds, DatasetDict): | ||
| available = set(next(iter(ds.values())).column_names) | ||
| else: | ||
| available = set(ds.column_names) | ||
AdnanElAssadi56 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| for mod in self.metadata.modalities: | ||
| col = modality_to_column.get(mod) | ||
| if col and col in available: | ||
| columns_to_keep.add(col) | ||
| ds = ds.select_columns(list(columns_to_keep)) | ||
| eval_function = ( | ||
| self._evaluate_subset | ||
| if not self.is_cross_validation | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -201,6 +201,15 @@ | |||
| "Any2AnyRetrieval", | ||||
| ) | ||||
|
|
||||
| MVEB_TASK_TYPE = ( | ||||
| "VideoClassification", | ||||
| "VideoClustering", | ||||
| "VideoPairClassification", | ||||
| "VideoZeroshotClassification", | ||||
| "VideoCentricQA", | ||||
| "Any2AnyRetrieval", | ||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Duplicated?
Suggested change
|
||||
| ) | ||||
|
|
||||
|
|
||||
| _TASK_TYPE = ( | ||||
| ( | ||||
|
|
@@ -219,6 +228,7 @@ | |||
| ) | ||||
| + MIEB_TASK_TYPE | ||||
| + MAEB_TASK_TYPE | ||||
| + MVEB_TASK_TYPE | ||||
| ) | ||||
|
|
||||
| TaskType = Literal[_TASK_TYPE] # type: ignore[valid-type] | ||||
|
|
@@ -246,7 +256,20 @@ | |||
| "a2at", | ||||
| "t2at", | ||||
| "at2at", | ||||
| "v2v", | ||||
| "v2c", | ||||
| "v2t", | ||||
| "t2v", | ||||
| "vt2t", | ||||
| "vt2v", | ||||
| "v2vt", | ||||
| "t2vt", | ||||
| "vt2vt", | ||||
| "va2c", | ||||
| "va2t", | ||||
| "vat2t", | ||||
| "v2a", | ||||
| "a2v", | ||||
| ] | ||||
| """The category of the task. | ||||
|
|
||||
|
|
@@ -270,7 +293,20 @@ | |||
| 18. a2at: audio to audio+text | ||||
| 19. t2at: text to audio+text | ||||
| 20. at2at: audio+text to audio+text | ||||
| 21. v2t: video to text | ||||
| 21. v2v: video to video | ||||
| 22. v2c: video to category | ||||
| 23. v2t: video to text | ||||
| 24. t2v: text to video | ||||
| 25. vt2t: video+text to text | ||||
| 26. vt2v: video+text to video | ||||
| 27. v2vt: video to video+text | ||||
| 28. t2vt: text to video+text | ||||
| 29. vt2vt: video+text to video+text | ||||
| 30. va2c: video+audio to category | ||||
| 31. va2t: video+audio to text | ||||
| 32. vat2t: video+audio+text to text | ||||
| 33. v2a: video to audio | ||||
| 34. a2v: audio to video | ||||
| """ | ||||
|
|
||||
| AnnotatorType = Literal[ | ||||
|
|
||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we consider doing a more principled refactor here, as discussed in #4182?
That issue also showed how the current approach can lead to some odd interactions between modalities.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I'll do it a bit later