myang333 committed 13 days ago

Commit

e857f97

verified ·

1 Parent(s): 4c8894b

Mirror LanguageBind source at upstream commit 7070c53375661cdb235801176b564b45f96f0648

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +17 -0
1/1 +1 -0
DATASETS.md +66 -0
DATASET_LICENSE +400 -0
LICENSE +21 -0
README.md +422 -0
TRAIN_AND_VALIDATE.md +214 -0
a_cls/class_labels_indices.csv +528 -0
a_cls/dataloader.py +100 -0
a_cls/datasets.py +93 -0
a_cls/filter_eval_audio.py +21 -0
a_cls/precision.py +12 -0
a_cls/stats.py +57 -0
a_cls/util.py +306 -0
a_cls/zero_shot.py +234 -0
a_cls/zero_shot_classifier.py +111 -0
a_cls/zero_shot_metadata.py +184 -0
a_cls/zeroshot_cls.py +46 -0
al_ret/data_dataloaders.py +28 -0
al_ret/dataloader_msrvtt_retrieval.py +114 -0
al_ret/datasets.py +137 -0
al_ret/metrics.py +70 -0
al_ret/precision.py +12 -0
al_ret/retrieval.py +266 -0
al_ret/util.py +73 -0
al_ret/zero_shot.py +91 -0
assets/audio/0.wav +3 -0
assets/audio/1.wav +3 -0
assets/demo.png +3 -0
assets/depth/0.png +3 -0
assets/depth/1.png +3 -0
assets/emergency.jpg +3 -0
assets/iclr_dataset_sample.jpg +3 -0
assets/image/0.jpg +0 -0
assets/image/1.jpg +0 -0
assets/languagebind.jpg +3 -0
assets/languagebind_frame.jpg +3 -0
assets/languagebind_result.jpg +3 -0
assets/languge_result.jpg +3 -0
assets/logo.jpg +3 -0
assets/logo_languagebind.png +3 -0
assets/res1.jpg +0 -0
assets/res2.jpg +0 -0
assets/result1.jpg +3 -0
assets/sota.jpg +3 -0
assets/thermal/0.jpg +0 -0
assets/thermal/1.jpg +0 -0
assets/video/0.mp4 +3 -0
assets/video/1.mp4 +3 -0
d_cls/cp_zero_shot_metadata.py +117 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,20 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/audio/0.wav filter=lfs diff=lfs merge=lfs -text
+assets/audio/1.wav filter=lfs diff=lfs merge=lfs -text
+assets/demo.png filter=lfs diff=lfs merge=lfs -text
+assets/depth/0.png filter=lfs diff=lfs merge=lfs -text
+assets/depth/1.png filter=lfs diff=lfs merge=lfs -text
+assets/emergency.jpg filter=lfs diff=lfs merge=lfs -text
+assets/iclr_dataset_sample.jpg filter=lfs diff=lfs merge=lfs -text
+assets/languagebind.jpg filter=lfs diff=lfs merge=lfs -text
+assets/languagebind_frame.jpg filter=lfs diff=lfs merge=lfs -text
+assets/languagebind_result.jpg filter=lfs diff=lfs merge=lfs -text
+assets/languge_result.jpg filter=lfs diff=lfs merge=lfs -text
+assets/logo.jpg filter=lfs diff=lfs merge=lfs -text
+assets/logo_languagebind.png filter=lfs diff=lfs merge=lfs -text
+assets/result1.jpg filter=lfs diff=lfs merge=lfs -text
+assets/sota.jpg filter=lfs diff=lfs merge=lfs -text
+assets/video/0.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/video/1.mp4 filter=lfs diff=lfs merge=lfs -text

1/1 ADDED Viewed

	@@ -0,0 +1 @@


1	+ 1

DATASETS.md ADDED Viewed

	@@ -0,0 +1,66 @@

+## Sample data
+We are releasing sample data here, covering videos, text from various sources, depth, and infrared, so that anyone interested can adapt the code to train on their own data.
+<div align="center">
+<table border="1" width="100%">
+    <tr align="center">
+        <th></th><th>Baidu Yun</th><th>Google Cloud</th><th>Peking University Yun</th>
+    </tr>
+    <tr align="center">
+        <td>DATA</td><td><a href="https://pan.baidu.com/s/1MnQUO6xrMPE5HAwveAdtZA?pwd=5ug9">Link</a></td><td><a href="https://drive.google.com/file/d/1p7y_0H3c84VbWpI-zx_m_mgn84uTZTdO/view?usp=drive_link">Link</a></td><td><a href="https://disk.pku.edu.cn:443/link/B6BDBDDCC616D47126DD0FF568CAF6CD">Link</a></td>
+    </tr>
+    <tr align="center">
+        <td>ANNOTATION</td><td><a href="https://pan.baidu.com/s/1uxxx_67VWy25q7CDilLsHA?pwd=37j3">Link</a></td><td><a href="https://drive.google.com/file/d/1WWVkt9LdbGK0VeQh-g7xy1gUGBwzwVah/view?usp=drive_link">Link</a></td><td><a href="https://disk.pku.edu.cn:443/link/67D836DE504E96457554455A597DC57F">Link</a></td>
+    </tr>
+</table>
+</div>
+## VIDAL-10M
+### Text and Video
+Due to policy restrictions, we are unable to directly release the videos. However, we provide the YouTube IDs, which can be used to download the videos independently. All textual sources and YouTube IDs can be downloaded from [Google Drive](https://drive.google.com/file/d/1qgm3rO9JugazLJ6KRsAKZfLIagHu3PJ-/view?usp=sharing) or [Baidu Disk](https://pan.baidu.com/s/13gY-IcFSFIuDZ-q0hMTx0g?pwd=gum9).
+The organization format of `ANNOTATION` is as follows:
+```Bash
+{
+  "ImkVYKWqlDU": {
+    "folder": "coco_vat_9",
+    "mplug": "This video describes a group of scuba divers rolling backwards off a boat while playing an instrument. They are having fun and enjoying their time in the water.",
+    "polish_mplug": "scuba divers are seen rolling backwards off a boat while playing an instrument, displaying enjoyment and having a good time in the water.",
+    "ofa": [
+      " a man in a wet suit and a helmet on a boat",
+      " a man in a scuba suit on a boat",
+      " a person in a boat holding a diver helmet",
+      " a man in a wetsuit on a jet ski",
+      " a picture of a body of water with the words boats on it",
+      " a person in the water with the words if they rolled",
+      " a person in the water with a paddle",
+      " a person in the water with a scooter"
+    ],
+    "sound_mplug": "scuba divers rolling backwards off a boat while playing an instrument showcases exuberant laughter, splashing water, and cheery melodies blending with the gentle waves.",
+    "raw": "WHY SCUBA DIVERS ROLL BACKWARDS OFF BOAT #shorts"
+  },
+  "id": {
+    "folder": "video_folder",
+    "mplug": "mplug_caption",
+    "polish_mplug": "polish_mplug_caption",
+    "ofa": [
+      "ofa_caption_0",
+      "ofa_caption_1",
+      "ofa_caption_2",
+      "ofa_caption_3",
+      "ofa_caption_4",
+      "ofa_caption_5",
+      "ofa_caption_6",
+      "ofa_caption_7"
+    ],
+    "sound_mplug": "sound_mplug_caption",
+    "raw": "raw_caption#hashtags"
+  },
+  ...
+}
+```
+### Depth and Thermal (Infrared)
+We are uploading the data to [Hugging Face](https://huggingface.co/datasets/LanguageBind/VIDAL-Depth-Thermal). By a conservative estimate it is approximately **20 TB**, so please be patient while the upload is in progress.

DATASET_LICENSE ADDED Viewed

	@@ -0,0 +1,400 @@

+Attribution-NonCommercial 4.0 International
+=======================================================================
+Creative Commons Corporation ("Creative Commons") is not a law firm and
+does not provide legal services or legal advice. Distribution of
+Creative Commons public licenses does not create a lawyer-client or
+other relationship. Creative Commons makes its licenses and related
+information available on an "as-is" basis. Creative Commons gives no
+warranties regarding its licenses, any material licensed under their
+terms and conditions, or any related information. Creative Commons
+disclaims all liability for damages resulting from their use to the
+fullest extent possible.
+Using Creative Commons Public Licenses
+Creative Commons public licenses provide a standard set of terms and
+conditions that creators and other rights holders may use to share
+original works of authorship and other material subject to copyright
+and certain other rights specified in the public license below. The
+following considerations are for informational purposes only, are not
+exhaustive, and do not form part of our licenses.
+     Considerations for licensors: Our public licenses are
+     intended for use by those authorized to give the public
+     permission to use material in ways otherwise restricted by
+     copyright and certain other rights. Our licenses are
+     irrevocable. Licensors should read and understand the terms
+     and conditions of the license they choose before applying it.
+     Licensors should also secure all rights necessary before
+     applying our licenses so that the public can reuse the
+     material as expected. Licensors should clearly mark any
+     material not subject to the license. This includes other CC-
+     licensed material, or material used under an exception or
+     limitation to copyright. More considerations for licensors:
+	wiki.creativecommons.org/Considerations_for_licensors
+     Considerations for the public: By using one of our public
+     licenses, a licensor grants the public permission to use the
+     licensed material under specified terms and conditions. If
+     the licensor's permission is not necessary for any reason--for
+     example, because of any applicable exception or limitation to
+     copyright--then that use is not regulated by the license. Our
+     licenses grant only permissions under copyright and certain
+     other rights that a licensor has authority to grant. Use of
+     the licensed material may still be restricted for other
+     reasons, including because others have copyright or other
+     rights in the material. A licensor may make special requests,
+     such as asking that all changes be marked or described.
+     Although not required by our licenses, you are encouraged to
+     respect those requests where reasonable. More_considerations
+     for the public:
+	wiki.creativecommons.org/Considerations_for_licensees
+=======================================================================
+Creative Commons Attribution-NonCommercial 4.0 International Public
+License
+By exercising the Licensed Rights (defined below), You accept and agree
+to be bound by the terms and conditions of this Creative Commons
+Attribution-NonCommercial 4.0 International Public License ("Public
+License"). To the extent this Public License may be interpreted as a
+contract, You are granted the Licensed Rights in consideration of Your
+acceptance of these terms and conditions, and the Licensor grants You
+such rights in consideration of benefits the Licensor receives from
+making the Licensed Material available under these terms and
+conditions.
+Section 1 -- Definitions.
+  a. Adapted Material means material subject to Copyright and Similar
+     Rights that is derived from or based upon the Licensed Material
+     and in which the Licensed Material is translated, altered,
+     arranged, transformed, or otherwise modified in a manner requiring
+     permission under the Copyright and Similar Rights held by the
+     Licensor. For purposes of this Public License, where the Licensed
+     Material is a musical work, performance, or sound recording,
+     Adapted Material is always produced where the Licensed Material is
+     synched in timed relation with a moving image.
+  b. Adapter's License means the license You apply to Your Copyright
+     and Similar Rights in Your contributions to Adapted Material in
+     accordance with the terms and conditions of this Public License.
+  c. Copyright and Similar Rights means copyright and/or similar rights
+     closely related to copyright including, without limitation,
+     performance, broadcast, sound recording, and Sui Generis Database
+     Rights, without regard to how the rights are labeled or
+     categorized. For purposes of this Public License, the rights
+     specified in Section 2(b)(1)-(2) are not Copyright and Similar
+     Rights.
+  d. Effective Technological Measures means those measures that, in the
+     absence of proper authority, may not be circumvented under laws
+     fulfilling obligations under Article 11 of the WIPO Copyright
+     Treaty adopted on December 20, 1996, and/or similar international
+     agreements.
+  e. Exceptions and Limitations means fair use, fair dealing, and/or
+     any other exception or limitation to Copyright and Similar Rights
+     that applies to Your use of the Licensed Material.
+  f. Licensed Material means the artistic or literary work, database,
+     or other material to which the Licensor applied this Public
+     License.
+  g. Licensed Rights means the rights granted to You subject to the
+     terms and conditions of this Public License, which are limited to
+     all Copyright and Similar Rights that apply to Your use of the
+     Licensed Material and that the Licensor has authority to license.
+  h. Licensor means the individual(s) or entity(ies) granting rights
+     under this Public License.
+  i. NonCommercial means not primarily intended for or directed towards
+     commercial advantage or monetary compensation. For purposes of
+     this Public License, the exchange of the Licensed Material for
+     other material subject to Copyright and Similar Rights by digital
+     file-sharing or similar means is NonCommercial provided there is
+     no payment of monetary compensation in connection with the
+     exchange.
+  j. Share means to provide material to the public by any means or
+     process that requires permission under the Licensed Rights, such
+     as reproduction, public display, public performance, distribution,
+     dissemination, communication, or importation, and to make material
+     available to the public including in ways that members of the
+     public may access the material from a place and at a time
+     individually chosen by them.
+  k. Sui Generis Database Rights means rights other than copyright
+     resulting from Directive 96/9/EC of the European Parliament and of
+     the Council of 11 March 1996 on the legal protection of databases,
+     as amended and/or succeeded, as well as other essentially
+     equivalent rights anywhere in the world.
+  l. You means the individual or entity exercising the Licensed Rights
+     under this Public License. Your has a corresponding meaning.
+Section 2 -- Scope.
+  a. License grant.
+       1. Subject to the terms and conditions of this Public License,
+          the Licensor hereby grants You a worldwide, royalty-free,
+          non-sublicensable, non-exclusive, irrevocable license to
+          exercise the Licensed Rights in the Licensed Material to:
+            a. reproduce and Share the Licensed Material, in whole or
+               in part, for NonCommercial purposes only; and
+            b. produce, reproduce, and Share Adapted Material for
+               NonCommercial purposes only.
+       2. Exceptions and Limitations. For the avoidance of doubt, where
+          Exceptions and Limitations apply to Your use, this Public
+          License does not apply, and You do not need to comply with
+          its terms and conditions.
+       3. Term. The term of this Public License is specified in Section
+          6(a).
+       4. Media and formats; technical modifications allowed. The
+          Licensor authorizes You to exercise the Licensed Rights in
+          all media and formats whether now known or hereafter created,
+          and to make technical modifications necessary to do so. The
+          Licensor waives and/or agrees not to assert any right or
+          authority to forbid You from making technical modifications
+          necessary to exercise the Licensed Rights, including
+          technical modifications necessary to circumvent Effective
+          Technological Measures. For purposes of this Public License,
+          simply making modifications authorized by this Section 2(a)
+          (4) never produces Adapted Material.
+       5. Downstream recipients.
+            a. Offer from the Licensor -- Licensed Material. Every
+               recipient of the Licensed Material automatically
+               receives an offer from the Licensor to exercise the
+               Licensed Rights under the terms and conditions of this
+               Public License.
+            b. No downstream restrictions. You may not offer or impose
+               any additional or different terms or conditions on, or
+               apply any Effective Technological Measures to, the
+               Licensed Material if doing so restricts exercise of the
+               Licensed Rights by any recipient of the Licensed
+               Material.
+       6. No endorsement. Nothing in this Public License constitutes or
+          may be construed as permission to assert or imply that You
+          are, or that Your use of the Licensed Material is, connected
+          with, or sponsored, endorsed, or granted official status by,
+          the Licensor or others designated to receive attribution as
+          provided in Section 3(a)(1)(A)(i).
+  b. Other rights.
+       1. Moral rights, such as the right of integrity, are not
+          licensed under this Public License, nor are publicity,
+          privacy, and/or other similar personality rights; however, to
+          the extent possible, the Licensor waives and/or agrees not to
+          assert any such rights held by the Licensor to the limited
+          extent necessary to allow You to exercise the Licensed
+          Rights, but not otherwise.
+       2. Patent and trademark rights are not licensed under this
+          Public License.
+       3. To the extent possible, the Licensor waives any right to
+          collect royalties from You for the exercise of the Licensed
+          Rights, whether directly or through a collecting society
+          under any voluntary or waivable statutory or compulsory
+          licensing scheme. In all other cases the Licensor expressly
+          reserves any right to collect such royalties, including when
+          the Licensed Material is used other than for NonCommercial
+          purposes.
+Section 3 -- License Conditions.
+Your exercise of the Licensed Rights is expressly made subject to the
+following conditions.
+  a. Attribution.
+       1. If You Share the Licensed Material (including in modified
+          form), You must:
+            a. retain the following if it is supplied by the Licensor
+               with the Licensed Material:
+                 i. identification of the creator(s) of the Licensed
+                    Material and any others designated to receive
+                    attribution, in any reasonable manner requested by
+                    the Licensor (including by pseudonym if
+                    designated);
+                ii. a copyright notice;
+               iii. a notice that refers to this Public License;
+                iv. a notice that refers to the disclaimer of
+                    warranties;
+                 v. a URI or hyperlink to the Licensed Material to the
+                    extent reasonably practicable;
+            b. indicate if You modified the Licensed Material and
+               retain an indication of any previous modifications; and
+            c. indicate the Licensed Material is licensed under this
+               Public License, and include the text of, or the URI or
+               hyperlink to, this Public License.
+       2. You may satisfy the conditions in Section 3(a)(1) in any
+          reasonable manner based on the medium, means, and context in
+          which You Share the Licensed Material. For example, it may be
+          reasonable to satisfy the conditions by providing a URI or
+          hyperlink to a resource that includes the required
+          information.
+       3. If requested by the Licensor, You must remove any of the
+          information required by Section 3(a)(1)(A) to the extent
+          reasonably practicable.
+       4. If You Share Adapted Material You produce, the Adapter's
+          License You apply must not prevent recipients of the Adapted
+          Material from complying with this Public License.
+Section 4 -- Sui Generis Database Rights.
+Where the Licensed Rights include Sui Generis Database Rights that
+apply to Your use of the Licensed Material:
+  a. for the avoidance of doubt, Section 2(a)(1) grants You the right
+     to extract, reuse, reproduce, and Share all or a substantial
+     portion of the contents of the database for NonCommercial purposes
+     only;
+  b. if You include all or a substantial portion of the database
+     contents in a database in which You have Sui Generis Database
+     Rights, then the database in which You have Sui Generis Database
+     Rights (but not its individual contents) is Adapted Material; and
+  c. You must comply with the conditions in Section 3(a) if You Share
+     all or a substantial portion of the contents of the database.
+For the avoidance of doubt, this Section 4 supplements and does not
+replace Your obligations under this Public License where the Licensed
+Rights include other Copyright and Similar Rights.
+Section 5 -- Disclaimer of Warranties and Limitation of Liability.
+  a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
+     EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
+     AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
+     ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
+     IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
+     WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
+     PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
+     ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
+     KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
+     ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
+  b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
+     TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
+     NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
+     INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
+     COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
+     USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
+     ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
+     DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
+     IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
+  c. The disclaimer of warranties and limitation of liability provided
+     above shall be interpreted in a manner that, to the extent
+     possible, most closely approximates an absolute disclaimer and
+     waiver of all liability.
+Section 6 -- Term and Termination.
+  a. This Public License applies for the term of the Copyright and
+     Similar Rights licensed here. However, if You fail to comply with
+     this Public License, then Your rights under this Public License
+     terminate automatically.
+  b. Where Your right to use the Licensed Material has terminated under
+     Section 6(a), it reinstates:
+       1. automatically as of the date the violation is cured, provided
+          it is cured within 30 days of Your discovery of the
+          violation; or
+       2. upon express reinstatement by the Licensor.
+     For the avoidance of doubt, this Section 6(b) does not affect any
+     right the Licensor may have to seek remedies for Your violations
+     of this Public License.
+  c. For the avoidance of doubt, the Licensor may also offer the
+     Licensed Material under separate terms or conditions or stop
+     distributing the Licensed Material at any time; however, doing so
+     will not terminate this Public License.
+  d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
+     License.
+Section 7 -- Other Terms and Conditions.
+  a. The Licensor shall not be bound by any additional or different
+     terms or conditions communicated by You unless expressly agreed.
+  b. Any arrangements, understandings, or agreements regarding the
+     Licensed Material not stated herein are separate from and
+     independent of the terms and conditions of this Public License.
+Section 8 -- Interpretation.
+  a. For the avoidance of doubt, this Public License does not, and
+     shall not be interpreted to, reduce, limit, restrict, or impose
+     conditions on any use of the Licensed Material that could lawfully
+     be made without permission under this Public License.
+  b. To the extent possible, if any provision of this Public License is
+     deemed unenforceable, it shall be automatically reformed to the
+     minimum extent necessary to make it enforceable. If the provision
+     cannot be reformed, it shall be severed from this Public License
+     without affecting the enforceability of the remaining terms and
+     conditions.
+  c. No term or condition of this Public License will be waived and no
+     failure to comply consented to unless expressly agreed to by the
+     Licensor.
+  d. Nothing in this Public License constitutes or may be interpreted
+     as a limitation upon, or waiver of, any privileges and immunities
+     that apply to the Licensor or You, including from the legal
+     processes of any jurisdiction or authority.
+=======================================================================
+Creative Commons is not a party to its public
+licenses. Notwithstanding, Creative Commons may elect to apply one of
+its public licenses to material it publishes and in those instances
+will be considered the “Licensor.” The text of the Creative Commons
+public licenses is dedicated to the public domain under the CC0 Public
+Domain Dedication. Except for the limited purpose of indicating that
+material is shared under a Creative Commons public license or as
+otherwise permitted by the Creative Commons policies published at
+creativecommons.org/policies, Creative Commons does not authorize the
+use of the trademark "Creative Commons" or any other trademark or logo
+of Creative Commons without its prior written consent including,
+without limitation, in connection with any unauthorized modifications
+to any of its public licenses or any other arrangements,
+understandings, or agreements concerning use of licensed material. For
+the avoidance of doubt, this paragraph does not form part of the
+public licenses.
+Creative Commons may be contacted at creativecommons.org.

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2023 PKU-YUAN's Group (袁粒课题组-北大信工)
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,422 @@

+<p align="center">
+    <img src="assets/logo.jpg" width="350" style="margin-bottom: 0.2;"/><img src="assets/sota.jpg" width="450" style="margin-bottom: 0.2;"/>
+</p>
+<h2 align="center"> <a href="https://arxiv.org/pdf/2310.01852.pdf">【ICLR 2024 🔥】LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment</a></h2>
+<h5 align="center"> If you like our project, please give us a star ⭐ on GitHub for the latest updates.  </h5>
+<h5 align="center">
+[![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/LanguageBind)
+[![Dataset meta](https://img.shields.io/badge/%F0%9F%A4%97%20Dataset-VIDAL-blue)](https://huggingface.co/datasets/LanguageBind/VIDAL-Depth-Thermal)
+[![arXiv](https://img.shields.io/badge/Arxiv-2310.01852-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2310.01852)
+[![wechat](https://img.shields.io/badge/量子位%20-black)](https://mp.weixin.qq.com/s/EFqLv_Euf5VU024zOtzkkg)
+[![jiqizhixin](https://img.shields.io/badge/机器之心%20-black)](https://mp.weixin.qq.com/s/E5Tazm_vz1CADMwV0tdhnw)
+[![zhihu](https://img.shields.io/badge/知乎-0084FF)](https://zhuanlan.zhihu.com/p/660567767)
+[![License](https://img.shields.io/badge/Code%20License-MIT-yellow)](https://github.com/PKU-YuanGroup/LanguageBind/blob/main/LICENSE)
+[![Data License](https://img.shields.io/badge/Dataset%20license-CC--BY--NC%204.0-orange)](https://github.com/PKU-YuanGroup/LanguageBind/blob/main/DATASET_LICENSE)
+[![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FPKU-YuanGroup%2FLanguageBind&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=Visitor&edge_flat=false)](https://hits.seeyoufarm.com)
+[![GitHub issues](https://img.shields.io/github/issues/PKU-YuanGroup/LanguageBind?color=critical&label=Issues)](https://github.com/PKU-YuanGroup/LanguageBind/issues?q=is%3Aopen+is%3Aissue)
+[![GitHub closed issues](https://img.shields.io/github/issues-closed/PKU-YuanGroup/LanguageBind?color=success&label=Issues)](https://github.com/PKU-YuanGroup/LanguageBind/issues?q=is%3Aissue+is%3Aclosed)  <br>
+</h5>
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/languagebind-extending-video-language/zero-shot-audio-classification-on-audioset)](https://paperswithcode.com/sota/zero-shot-audio-classification-on-audioset?p=languagebind-extending-video-language) <br>
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/languagebind-extending-video-language/zero-shot-audio-classification-on-vgg-sound)](https://paperswithcode.com/sota/zero-shot-audio-classification-on-vgg-sound?p=languagebind-extending-video-language) <br>
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/languagebind-extending-video-language/zero-shot-text-to-audio-retrieval-on-clotho)](https://paperswithcode.com/sota/zero-shot-text-to-audio-retrieval-on-clotho?p=languagebind-extending-video-language) <br>
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/languagebind-extending-video-language/zero-shot-scene-classification-unified)](https://paperswithcode.com/sota/zero-shot-scene-classification-unified?p=languagebind-extending-video-language) <br>
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/languagebind-extending-video-language/zero-shot-classification-unified-classes-on)](https://paperswithcode.com/sota/zero-shot-classification-unified-classes-on?p=languagebind-extending-video-language) <br>
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/languagebind-extending-video-language/zero-shot-video-retrieval-on-msvd)](https://paperswithcode.com/sota/zero-shot-video-retrieval-on-msvd?p=languagebind-extending-video-language) <br>
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/languagebind-extending-video-language/zero-shot-environment-sound-classification-on-1)](https://paperswithcode.com/sota/zero-shot-environment-sound-classification-on-1?p=languagebind-extending-video-language) <br>
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/languagebind-extending-video-language/zero-shot-text-to-audio-retrieval-on)](https://paperswithcode.com/sota/zero-shot-text-to-audio-retrieval-on?p=languagebind-extending-video-language) <br>
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/languagebind-extending-video-language/zero-shot-video-retrieval-on-activitynet)](https://paperswithcode.com/sota/zero-shot-video-retrieval-on-activitynet?p=languagebind-extending-video-language) <br>
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/languagebind-extending-video-language/zero-shot-video-retrieval-on-msr-vtt)](https://paperswithcode.com/sota/zero-shot-video-retrieval-on-msr-vtt?p=languagebind-extending-video-language) <br>
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/languagebind-extending-video-language/zero-shot-video-retrieval-on-didemo)](https://paperswithcode.com/sota/zero-shot-video-retrieval-on-didemo?p=languagebind-extending-video-language) <br>
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/languagebind-extending-video-language/zero-shot-action-recognition-on-kinetics)](https://paperswithcode.com/sota/zero-shot-action-recognition-on-kinetics?p=languagebind-extending-video-language)
+<details open><summary>💡 I also have other vision-language projects that may interest you ✨. </summary><p>
+<!--  may -->
+> [**Video-LLaVA: Learning United Visual Representation by Alignment Before Projection**](https://arxiv.org/abs/2311.10122) <br>
+> Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, Li Yuan <br>
+[![github](https://img.shields.io/badge/-Github-black?logo=github)](https://github.com/PKU-YuanGroup/Video-LLaVA)  [![github](https://img.shields.io/github/stars/PKU-YuanGroup/Video-LLaVA.svg?style=social)](https://github.com/PKU-YuanGroup/Video-LLaVA) [![arXiv](https://img.shields.io/badge/Arxiv-2311.10122-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2311.10122) <br>
+> [**MoE-LLaVA: Mixture of Experts for Large Vision-Language Models**](https://github.com/PKU-YuanGroup/MoE-LLaVA/blob/main/MoE-LLaVA.pdf) <br>
+> Bin Lin, Zhenyu Tang, Yang Ye, Jiaxi Cui, Bin Zhu, Peng Jin, Junwu Zhang, Munan Ning, Li Yuan <br>
+[![github](https://img.shields.io/badge/-Github-black?logo=github)](https://github.com/PKU-YuanGroup/MoE-LLaVA)  [![github](https://img.shields.io/github/stars/PKU-YuanGroup/MoE-LLaVA.svg?style=social)](https://github.com/PKU-YuanGroup/MoE-LLaVA) [![arXiv](https://img.shields.io/badge/Arxiv-2401.15947-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2401.15947) <br>
+> [**Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating Video-based Large Language Models**](https://arxiv.org/abs/2311.08046) <br>
+> Munan Ning, Bin Zhu, Yujia Xie, Bin Lin, Jiaxi Cui, Lu Yuan, Dongdong Chen, Li Yuan <br>
+[![github](https://img.shields.io/badge/-Github-black?logo=github)](https://github.com/PKU-YuanGroup/Video-Bench) [![github](https://img.shields.io/github/stars/PKU-YuanGroup/Video-Bench.svg?style=social)](https://github.com/PKU-YuanGroup/Video-Bench) [![arXiv](https://img.shields.io/badge/Arxiv-2311.16103-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2311.16103) <br>
+</p></details>
+## 📰 News
+* **[2024.01.27]**  👀👀👀 Our [MoE-LLaVA](https://github.com/PKU-YuanGroup/MoE-LLaVA) is released! A sparse model with 3B parameters outperformed the dense model with 7B parameters.
+* **[2024.01.16]**  🔥🔥🔥 Our LanguageBind has been accepted at ICLR 2024! We earn the score of 6(3)8(6)6(6)6(6) [here](https://openreview.net/forum?id=QmZKc7UZCy&noteId=OgsxQxAleA).
+* **[2023.12.15]**  💪💪💪 We expand the 💥💥💥 VIDAL dataset and now have **10M video-text data**. We launch **LanguageBind_Video 1.5**, checking our [model zoo](#-model-zoo).
+* **[2023.12.10]**  We expand the 💥💥💥 VIDAL dataset and now have **10M depth and 10M thermal data**. We are in the process of uploading thermal and depth data on [Hugging Face](https://huggingface.co/datasets/LanguageBind/VIDAL-Depth-Thermal) and expect the whole process to last 1-2 months.
+* **[2023.11.27]**  🔥🔥🔥 We have updated our [paper](https://arxiv.org/abs/2310.01852) with emergency zero-shot results; check out our ✨ [results](#emergency-results).
+* **[2023.11.26]**  💥💥💥 We have open-sourced all textual sources and corresponding YouTube IDs [here](DATASETS.md).
+* **[2023.11.26]**  📣📣📣 We have open-sourced fully fine-tuned **Video & Audio**, achieving improved performance once again, checking our [model zoo](#-model-zoo).
+* **[2023.11.22]**  We are about to release a fully fine-tuned version, and the **HUGE** version is currently undergoing training.
+* **[2023.11.21]**  💥 We are releasing sample data in [DATASETS.md](DATASETS.md) so that individuals who are interested can further modify the code to train it on their own data.
+* **[2023.11.20]**  🚀🚀🚀 [Video-LLaVA](https://github.com/PKU-YuanGroup/Video-LLaVA) builds a large visual-language model to achieve 🎉SOTA performances based on LanguageBind encoders.
+* **[2023.10.23]**  🎶 LanguageBind-Audio achieves 🎉🎉🎉**state-of-the-art (SOTA) performance on 5 datasets**, checking our ✨ [results](#multiple-modalities)!
+* **[2023.10.14]**  😱 Released a stronger LanguageBind-Video; check out our ✨ [results](#video-language)! The video checkpoint **has been updated** on Huggingface Model Hub!
+* **[2023.10.10]**  We provide sample data, which can be found in [assets](assets), and [emergency zero-shot usage](#emergency-zero-shot) is described.
+* **[2023.10.07]**  The checkpoints are available on 🤗 [Huggingface Model](https://huggingface.co/LanguageBind).
+* **[2023.10.04]**  Code and [demo](https://huggingface.co/spaces/LanguageBind/LanguageBind) are available now! Welcome to **watch** 👀 this repository for the latest updates.
+## 😮 Highlights
+### 💡 High performance, but NO intermediate modality required
+LanguageBind is a **language-centric** multimodal pretraining approach, **taking the language as the bind across different modalities** because the language modality is well-explored and contains rich semantics.
+* The following first figure shows the architecture of LanguageBind. LanguageBind can be easily extended to segmentation, detection tasks, and potentially to unlimited modalities.
+### ⚡️ A multimodal, fully aligned and voluminous dataset
+We propose **VIDAL-10M**, **10 Million data** with **V**ideo, **I**nfrared, **D**epth, **A**udio and their corresponding **L**anguage, which greatly expands the data beyond visual modalities.
+* The second figure shows our proposed VIDAL-10M dataset, which includes five modalities: video, infrared, depth, audio, and language.
+### 🔥 Multi-view enhanced description for training
+We make multi-view enhancements to language. We produce multi-view descriptions that combine **meta-data**, **spatial**, and **temporal** information to greatly enhance the semantic information of the language. In addition, we further **enhance the language with ChatGPT** to create a good semantic space for each modality-aligned language.
+<p align="center">
+<img src="assets/languagebind.jpg" width=100%>
+</p>
+<p align="center">
+<img src="assets/iclr_dataset_sample.jpg" width=99%>
+</p>
+## 🤗 Demo
+* **Local demo.** Highly recommend trying out our web demo, which incorporates all features currently supported by LanguageBind.
+```bash
+python gradio_app.py
+```
+* **Online demo.** We provide the [online demo](https://huggingface.co/spaces/LanguageBind/LanguageBind) in Huggingface Spaces. In this demo, you can calculate the similarity of modalities to language, such as audio-to-language, video-to-language, and depth-to-image.
+<p align="center">
+<img src="assets/demo.png" width=100%>
+</p>
+## 🚀 Main Results
+### Video-Language
+LanguageBind achieves **state-of-the-art (SOTA) performance on four datasets**, where * denotes the results of full tuning.
+<p align="left">
+<img src="assets/result1.jpg" width=80%>
+</p>
+### Multiple Modalities
+Video-Language, Infrared-Language, Depth-Language, and Audio-Language zero-shot classification, where * denotes the results of full tuning.
+<p align="left">
+<img src="assets/res1.jpg" width=80%>
+</p>
+We report text-to-audio results for retrieval, where * denotes the results of full tuning.
+<p align="left">
+<img src="assets/res2.jpg" width=35%>
+</p>
+### Emergency results
+<p align="left">
+<img src="assets/emergency.jpg" width=60%>
+</p>
+## 🛠️ Requirements and Installation
+* Python >= 3.8
+* Pytorch >= 1.13.1
+* CUDA Version >= 11.6
+* Install required packages:
+```bash
+git clone https://github.com/PKU-YuanGroup/LanguageBind
+cd LanguageBind
+pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
+pip install -r requirements.txt
+```
+## 🐳 Model Zoo
+The names in the table represent different encoder models. For example, `LanguageBind/LanguageBind_Video_FT` represents the fully fine-tuned version, while `LanguageBind/LanguageBind_Video` represents the LoRA-tuned version.
+You can freely replace them in the recommended [API usage](#-api). We recommend using the fully fine-tuned version, as it offers stronger performance.
+<div align="center">
+<table border="1" width="100%">
+    <tr align="center">
+        <th>Modality</th><th>LoRA tuning</th><th>Fine-tuning</th>
+    </tr>
+    <tr align="center">
+        <td>Video</td><td><a href="https://huggingface.co/LanguageBind/LanguageBind_Video">LanguageBind_Video</a></td><td><a href="https://huggingface.co/LanguageBind/LanguageBind_Video_FT">LanguageBind_Video_FT</a></td>
+    </tr>
+    <tr align="center">
+        <td>Audio</td><td><a href="https://huggingface.co/LanguageBind/LanguageBind_Audio">LanguageBind_Audio</a></td><td><a href="https://huggingface.co/LanguageBind/LanguageBind_Audio_FT">LanguageBind_Audio_FT</a></td>
+    </tr>
+    <tr align="center">
+        <td>Depth</td><td><a href="https://huggingface.co/LanguageBind/LanguageBind_Depth">LanguageBind_Depth</a></td><td>-</td>
+    </tr>
+    <tr align="center">
+        <td>Thermal</td><td><a href="https://huggingface.co/LanguageBind/LanguageBind_Thermal">LanguageBind_Thermal</a></td><td>-</td>
+    </tr>
+</table>
+</div>
+<div align="center">
+<table border="1" width="100%">
+    <tr align="center">
+        <th>Version</th><th>Tuning</th><th>Model size</th><th>Num_frames</th><th>HF Link</th><th>MSR-VTT</th><th>DiDeMo</th><th>ActivityNet</th><th>MSVD</th>
+    </tr>
+    <tr align="center">
+        <td>LanguageBind_Video</td><td>LoRA</td><td>Large</td><td>8</td><td><a href="https://huggingface.co/LanguageBind/LanguageBind_Video">Link</a></td><td>42.6</td><td>37.8</td><td>35.1</td><td>52.2</td>
+    </tr>
+    <tr align="center">
+        <td>LanguageBind_Video_FT</td><td>Full-tuning</td><td>Large</td><td>8</td><td><a href="https://huggingface.co/LanguageBind/LanguageBind_Video_FT">Link</a></td><td>42.7</td><td>38.1</td><td>36.9</td><td>53.5</td>
+    </tr>
+    <tr align="center">
+        <td>LanguageBind_Video_V1.5_FT</td><td>Full-tuning</td><td>Large</td><td>8</td><td><a href="https://huggingface.co/LanguageBind/LanguageBind_Video_V1.5_FT">Link</a></td><td>42.8</td><td>39.7</td><td>38.4</td><td>54.1</td>
+    </tr>
+    <tr align="center">
+        <td>LanguageBind_Video_V1.5_FT</td><td>Full-tuning</td><td>Large</td><td>12</td><td>Coming soon</td>
+    </tr>
+    <tr align="center">
+        <td>LanguageBind_Video_Huge_V1.5_FT</td><td>Full-tuning</td><td>Huge</td><td>8</td><td><a href="https://huggingface.co/LanguageBind/LanguageBind_Video_Huge_V1.5_FT">Link</a></td><td>44.8</td><td>39.9</td><td>41.0</td><td>53.7</td>
+    </tr>
+    <tr align="center">
+        <td>LanguageBind_Video_Huge_V1.5_FT</td><td>Full-tuning</td><td>Huge</td><td>12</td><td>Coming soon</td>
+    </tr>
+</table>
+</div>
+## 🤖 API
+**We open source the preprocessing code for all modalities.** If you want to load the model (e.g. ```LanguageBind/LanguageBind_Thermal```) from the model hub on Huggingface or from a local path, you can use the following code snippets!
+### Inference for Multi-modal Binding
+We have provided some sample datasets in [assets](assets) to quickly see how languagebind works.
+```python
+import torch
+from languagebind import LanguageBind, to_device, transform_dict, LanguageBindImageTokenizer
+if __name__ == '__main__':
+    device = 'cuda:0'
+    device = torch.device(device)
+    clip_type = {
+        'video': 'LanguageBind_Video_FT',  # also LanguageBind_Video
+        'audio': 'LanguageBind_Audio_FT',  # also LanguageBind_Audio
+        'thermal': 'LanguageBind_Thermal',
+        'image': 'LanguageBind_Image',
+        'depth': 'LanguageBind_Depth',
+    }
+    model = LanguageBind(clip_type=clip_type, cache_dir='./cache_dir')
+    model = model.to(device)
+    model.eval()
+    pretrained_ckpt = f'lb203/LanguageBind_Image'
+    tokenizer = LanguageBindImageTokenizer.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir/tokenizer_cache_dir')
+    modality_transform = {c: transform_dict[c](model.modality_config[c]) for c in clip_type.keys()}
+    image = ['assets/image/0.jpg', 'assets/image/1.jpg']
+    audio = ['assets/audio/0.wav', 'assets/audio/1.wav']
+    video = ['assets/video/0.mp4', 'assets/video/1.mp4']
+    depth = ['assets/depth/0.png', 'assets/depth/1.png']
+    thermal = ['assets/thermal/0.jpg', 'assets/thermal/1.jpg']
+    language = ["Training a parakeet to climb up a ladder.", 'A lion climbing a tree to catch a monkey.']
+    inputs = {
+        'image': to_device(modality_transform['image'](image), device),
+        'video': to_device(modality_transform['video'](video), device),
+        'audio': to_device(modality_transform['audio'](audio), device),
+        'depth': to_device(modality_transform['depth'](depth), device),
+        'thermal': to_device(modality_transform['thermal'](thermal), device),
+    }
+    inputs['language'] = to_device(tokenizer(language, max_length=77, padding='max_length',
+                                             truncation=True, return_tensors='pt'), device)
+    with torch.no_grad():
+        embeddings = model(inputs)
+    print("Video x Text: \n",
+          torch.softmax(embeddings['video'] @ embeddings['language'].T, dim=-1).detach().cpu().numpy())
+    print("Image x Text: \n",
+          torch.softmax(embeddings['image'] @ embeddings['language'].T, dim=-1).detach().cpu().numpy())
+    print("Depth x Text: \n",
+          torch.softmax(embeddings['depth'] @ embeddings['language'].T, dim=-1).detach().cpu().numpy())
+    print("Audio x Text: \n",
+          torch.softmax(embeddings['audio'] @ embeddings['language'].T, dim=-1).detach().cpu().numpy())
+    print("Thermal x Text: \n",
+          torch.softmax(embeddings['thermal'] @ embeddings['language'].T, dim=-1).detach().cpu().numpy())
+```
+It then returns the following result.
+```bash
+Video x Text:
+ [[9.9989331e-01 1.0667283e-04]
+ [1.3255903e-03 9.9867439e-01]]
+Image x Text:
+ [[9.9990666e-01 9.3292067e-05]
+ [4.6132666e-08 1.0000000e+00]]
+Depth x Text:
+ [[0.9954276  0.00457235]
+ [0.12042473 0.8795753 ]]
+Audio x Text:
+ [[0.97634876 0.02365119]
+ [0.02917843 0.97082156]]
+Thermal x Text:
+ [[0.9482511  0.0517489 ]
+ [0.48746133 0.5125386 ]]
+```
+### Emergency zero-shot
+Since LanguageBind binds all modalities together, we also observe **emergency zero-shot** capabilities between non-language modalities. It's very simple to use.
+```python
+print("Video x Audio: \n", torch.softmax(embeddings['video'] @ embeddings['audio'].T, dim=-1).detach().cpu().numpy())
+print("Image x Depth: \n", torch.softmax(embeddings['image'] @ embeddings['depth'].T, dim=-1).detach().cpu().numpy())
+print("Image x Thermal: \n", torch.softmax(embeddings['image'] @ embeddings['thermal'].T, dim=-1).detach().cpu().numpy())
+```
+Then, you will get:
+```
+Video x Audio:
+ [[1.0000000e+00 0.0000000e+00]
+ [3.1150486e-32 1.0000000e+00]]
+Image x Depth:
+ [[1. 0.]
+ [0. 1.]]
+Image x Thermal:
+ [[1. 0.]
+ [0. 1.]]
+ ```
+### Different branches for X-Language task
+Additionally, LanguageBind can be **disassembled into different branches** to handle different tasks. Note that we do not train the Image branch; it is simply initialized from OpenCLIP.
+#### Thermal
+```python
+import torch
+from languagebind import LanguageBindThermal, LanguageBindThermalTokenizer, LanguageBindThermalProcessor
+pretrained_ckpt = 'LanguageBind/LanguageBind_Thermal'
+model = LanguageBindThermal.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir')
+tokenizer = LanguageBindThermalTokenizer.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir')
+thermal_process = LanguageBindThermalProcessor(model.config, tokenizer)
+model.eval()
+data = thermal_process([r"your/thermal.jpg"], ['your text'], return_tensors='pt')
+with torch.no_grad():
+    out = model(**data)
+print(out.text_embeds @ out.image_embeds.T)
+```
+#### Depth
+```python
+import torch
+from languagebind import LanguageBindDepth, LanguageBindDepthTokenizer, LanguageBindDepthProcessor
+pretrained_ckpt = 'LanguageBind/LanguageBind_Depth'
+model = LanguageBindDepth.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir')
+tokenizer = LanguageBindDepthTokenizer.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir')
+depth_process = LanguageBindDepthProcessor(model.config, tokenizer)
+model.eval()
+data = depth_process([r"your/depth.png"], ['your text.'], return_tensors='pt')
+with torch.no_grad():
+    out = model(**data)
+print(out.text_embeds @ out.image_embeds.T)
+```
+#### Video
+```python
+import torch
+from languagebind import LanguageBindVideo, LanguageBindVideoTokenizer, LanguageBindVideoProcessor
+pretrained_ckpt = 'LanguageBind/LanguageBind_Video_FT'  # also 'LanguageBind/LanguageBind_Video'
+model = LanguageBindVideo.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir')
+tokenizer = LanguageBindVideoTokenizer.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir')
+video_process = LanguageBindVideoProcessor(model.config, tokenizer)
+model.eval()
+data = video_process(["your/video.mp4"], ['your text.'], return_tensors='pt')
+with torch.no_grad():
+    out = model(**data)
+print(out.text_embeds @ out.image_embeds.T)
+```
+#### Audio
+```python
+import torch
+from languagebind import LanguageBindAudio, LanguageBindAudioTokenizer, LanguageBindAudioProcessor
+pretrained_ckpt = 'LanguageBind/LanguageBind_Audio_FT'  # also 'LanguageBind/LanguageBind_Audio'
+model = LanguageBindAudio.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir')
+tokenizer = LanguageBindAudioTokenizer.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir')
+audio_process = LanguageBindAudioProcessor(model.config, tokenizer)
+model.eval()
+data = audio_process([r"your/audio.wav"], ['your audio.'], return_tensors='pt')
+with torch.no_grad():
+    out = model(**data)
+print(out.text_embeds @ out.image_embeds.T)
+```
+#### Image
+Note that our image encoder is the same as OpenCLIP. It is **not** fine-tuned like the other modalities.
+```python
+import torch
+from languagebind import LanguageBindImage,  LanguageBindImageTokenizer,  LanguageBindImageProcessor
+pretrained_ckpt = 'LanguageBind/LanguageBind_Image'
+model = LanguageBindImage.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir')
+tokenizer = LanguageBindImageTokenizer.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir')
+image_process = LanguageBindImageProcessor(model.config, tokenizer)
+model.eval()
+data = image_process([r"your/image.jpg"], ['your text.'], return_tensors='pt')
+with torch.no_grad():
+    out = model(**data)
+print(out.text_embeds @ out.image_embeds.T)
+```
+## 💥 VIDAL-10M
+The dataset is described in [DATASETS.md](DATASETS.md).
+## 🗝️ Training & Validating
+The training & validating instructions are in [TRAIN_AND_VALIDATE.md](TRAIN_AND_VALIDATE.md).
+## 👍 Acknowledgement
+* [OpenCLIP](https://github.com/mlfoundations/open_clip) An open source pretraining framework.
+* [CLIP4Clip](https://github.com/ArrowLuo/CLIP4Clip) An open source Video-Text retrieval framework.
+* [sRGB-TIR](https://github.com/rpmsnu/sRGB-TIR) An open source framework to generate infrared (thermal) images.
+* [GLPN](https://github.com/vinvino02/GLPDepth) An open source framework to generate depth images.
+## 🔒 License
+* The majority of this project is released under the MIT license as found in the [LICENSE](https://github.com/PKU-YuanGroup/LanguageBind/blob/main/LICENSE) file.
+* The dataset of this project is released under the CC-BY-NC 4.0 license as found in the [DATASET_LICENSE](https://github.com/PKU-YuanGroup/LanguageBind/blob/main/DATASET_LICENSE) file.
+## ✏️ Citation
+If you find our paper and code useful in your research, please consider giving a star :star: and citation :pencil:.
+```BibTeX
+@misc{zhu2023languagebind,
+      title={LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment},
+      author={Bin Zhu and Bin Lin and Munan Ning and Yang Yan and Jiaxi Cui and Wang HongFa and Yatian Pang and Wenhao Jiang and Junwu Zhang and Zongwei Li and Cai Wan Zhang and Zhifeng Li and Wei Liu and Li Yuan},
+      year={2023},
+      eprint={2310.01852},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```
+## ✨ Star History
+[![Star History](https://api.star-history.com/svg?repos=PKU-YuanGroup/LanguageBind&type=Date)](https://star-history.com/#PKU-YuanGroup/LanguageBind&Date)
+## 🤝 Contributors
+<a href="https://github.com/PKU-YuanGroup/LanguageBind/graphs/contributors">
+  <img src="https://contrib.rocks/image?repo=PKU-YuanGroup/LanguageBind" />
+</a>

TRAIN_AND_VALIDATE.md ADDED Viewed

	@@ -0,0 +1,214 @@

+We provide the **off-the-shelf** scripts in the [scripts folder](scripts).
+## Training LanguageBind
+<div align="center">
+<table border="1" width="100%">
+    <tr align="center">
+        <th>Cache of pretrained weight</th><th>Baidu Yun</th><th>Google Cloud</th><th>Peking University Yun</th>
+    </tr>
+    <tr align="center">
+        <td>Large</td><td><a href="https://pan.baidu.com/s/1co46bkuUJXr8ePPKp1WWgA?pwd=ofm6">Link</a></td><td><a href="https://drive.google.com/drive/folders/1VQYZlqfKmCMuHffypf5F96odyMCEI87H?usp=drive_link">Link</a></td><td><a href="https://disk.pku.edu.cn:443/link/9CA764E6307790B01D2D4F7E314E8E43">Link</a></td>
+    </tr>
+    <tr align="center">
+        <td>Huge</td><td><a href="https://pan.baidu.com/s/1QLpyXEYunoXS-oqGsvzKKA?pwd=vgo2">Link</a></td><td>-</td><td><a href="https://disk.pku.edu.cn:443/link/720A77A7DB9EFD167C5AC8E3FC4B6068">Link</a></td>
+    </tr>
+</table>
+</div>
+For example, to **train** LanguageBind on **Depth-Language** with 8 GPUs (1 node x 8 GPUs):
+* First download the cache of pretrained weights above and specify `CACHE_DIR=path/to/LanguageBind`.
+* The second step is to set the paths for `ANNOTATION` and `DATA` [here](https://github.com/PKU-YuanGroup/LanguageBind/blob/main/data/base_datasets.py#L37) according to the [dataset preparation](https://github.com/PKU-YuanGroup/LanguageBind#-vidal-10m).
+* Then you can run
+```bash
+CACHE_DIR="/path/to/LanguageBind"
+ANNOTATION="path/to/data"
+cd /path/to/LanguageBind
+TORCH_DISTRIBUTED_DEBUG=DETAIL HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 torchrun --nnodes=1 --nproc_per_node 8 \
+    -m main  \
+    --train-data ${ANNOTATION} \
+    --train-num-samples 3020000 \
+    --clip-type "dl" --max-depth 10 \
+    --do_train \
+    --lock-text --lock-image --text-type "polish_mplug" \
+    --init-temp 0.07 --learn-temp \
+    --model "ViT-L-14" --cache-dir ${CACHE_DIR} \
+    --convert_to_lora --lora_r 2 \
+    --lr 5e-4 --coef-lr 1e-3 \
+    --beta1 0.9 --beta2 0.98 --wd 0.2 --eps 1e-6 \
+    --num-frames 1 --force-patch-dropout 0.5 \
+    --epochs 1 --batch-size 128 --accum-freq 1 --warmup 200 \
+    --precision "amp" --workers 10 --video-decode-backend "imgs" \
+    --save-frequency 1 --log-every-n-steps 20 --report-to "tensorboard" --resume "latest" \
+    --do_eval \
+    --val_d_cls_data "NYUV2"
+```
+## Validating LanguageBind
+For example, to **validate** LanguageBind on **Depth-Language** with 1 GPU:
+* First specify ```RESUME```.
+* The second step is to prepare the [downstream dataset](https://github.com/PKU-YuanGroup/LanguageBind/blob/main/TRAIN_AND_VALIDATE.md#downstream-datasets).
+* Then you can run
+```bash
+CACHE_DIR="/path/to/LanguageBind"
+RESUME="thermal_language.pt"
+ANNOTATION="path/to/data"
+cd /path/to/LanguageBind
+TORCH_DISTRIBUTED_DEBUG=DETAIL HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 torchrun --nproc_per_node 1 \
+    -m main  \
+    --train-data ${ANNOTATION} \
+    --train-num-samples 3020000 \
+    --clip-type "dl" --max-depth 10 \
+    --lock-text --lock-image --text-type "polish_mplug" \
+    --init-temp 0.07 --learn-temp \
+    --model "ViT-L-14" --cache-dir ${CACHE_DIR} \
+    --convert_to_lora --lora_r 2 \
+    --lr 5e-4 --coef-lr 1e-3 \
+    --beta1 0.9 --beta2 0.98 --wd 0.2 --eps 1e-6 \
+    --num-frames 1 --force-patch-dropout 0.5 \
+    --epochs 1 --batch-size 128 --accum-freq 1 --warmup 200 \
+    --precision "amp" --workers 10 --video-decode-backend "imgs" \
+    --save-frequency 1 --log-every-n-steps 20 --report-to "tensorboard" --resume ${RESUME} \
+    --do_eval \
+    --val_d_cls_data "NYUV2"
+```
+## Downstream datasets
+### Depth
+NYU V2 dataset is downloaded from [this repo](https://github.com/TUI-NICR/nicr-scene-analysis-datasets/tree/main/nicr_scene_analysis_datasets/datasets/nyuv2) and we reformat them to conform to the standard ImageNet format. We also provide data as follows. Change the ```data_root``` [here](https://github.com/PKU-YuanGroup/LanguageBind/blob/main/data/build_datasets.py#L221).
+<div align="center">
+<table border="1" width="100%">
+    <tr align="center">
+        <th>Datasets</th><th>Baidu Yun</th><th>Google Cloud</th><th>Peking University Yun</th>
+    </tr>
+    <tr align="center">
+        <td>NYU</td><td><a href="https://pan.baidu.com/s/1AGOG8U3F7W8AvJiEmuzs-A?pwd=1dsg">Link</a></td><td><a href="https://drive.google.com/file/d/1CltzrTBLFqLxJzpztSIN-5ZosZpXQQ6u/view?usp=sharing">Link</a></td><td><a href="https://disk.pku.edu.cn:443/link/7D7B164DEA64059793D3C3E3A65C0F64">Link</a></td>
+    </tr>
+</table>
+</div>
+### Video
+Video datasets are downloaded from [this repo](https://github.com/jpthu17/HBI) and we show the folder structure. Change the ```data_root``` [here](https://github.com/PKU-YuanGroup/LanguageBind/blob/main/data/build_datasets.py#L74).
+### Audio
+Audio datasets are downloaded from [this repo](https://github.com/OFA-Sys/ONE-PEACE/blob/main/datasets.md#audio) and Audioset from [here](https://github.com/qiuqiangkong/audioset_tagging_cnn#1-download-dataset). We reformat them to conform to the standard ImageNet format. Change the ```data_root``` [here1](https://github.com/PKU-YuanGroup/LanguageBind/blob/main/data/build_datasets.py#L144) and [here2](https://github.com/PKU-YuanGroup/LanguageBind/blob/main/data/build_datasets.py#L159).
+### Infrared (Thermal)
+We download LLVIP from [official website](https://bupt-ai-cz.github.io/LLVIP/), and FLIR from [here](https://www.flir.com/oem/adas/adas-dataset-form/). We reformat them to conform to the standard ImageNet format. Change the ```data_root``` [here](https://github.com/PKU-YuanGroup/LanguageBind/blob/main/data/build_datasets.py#L233). We also provide the processed data as follows.
+<div align="center">
+<table border="1" width="100%">
+    <tr align="center">
+        <th>Datasets</th><th>Baidu Yun</th><th>Google Cloud</th><th>Peking University Yun</th>
+    </tr>
+    <tr align="center">
+        <td>LLVIP</td><td><a href="https://pan.baidu.com/s/15HPVr016F7eO9005NDRJTg?pwd=46fh">Link</a></td><td><a href="https://drive.google.com/file/d/1RfKNR8q6dHiAHB4OlYecnkUSx-ghLuEO/view?usp=drive_link">Link</a></td><td><a href="https://disk.pku.edu.cn:443/link/30D592EA37AC7C411264801A74994376">Link</a></td>
+    </tr>
+    <tr align="center">
+        <td>FLIR V1</td><td><a href="https://pan.baidu.com/s/1ZDSo5VPxJ4SA7wS_rNk0uQ?pwd=l491">Link</a></td><td><a href="https://drive.google.com/file/d/1CezCLJ4GUfPMFimitPfK40OV2j2Kr8t8/view?usp=drive_link">Link</a></td><td><a href="https://disk.pku.edu.cn:443/link/AD89D6ADE2CAC2407B00650870CBBDEC">Link</a></td>
+    </tr>
+    <tr align="center">
+        <td>FLIR V2</td><td><a href="https://pan.baidu.com/s/16xdr2aQkHo3zJ4KbaTmO3Q?pwd=tj9f">Link</a></td><td><a href="https://drive.google.com/file/d/1Z2ThG5QH-9biFI2-Z8k2fBKSA6Nrees6/view?usp=drive_link">Link</a></td><td><a href="https://disk.pku.edu.cn:443/link/E06C010970B0ED51926700D2F7A21EA8">Link</a></td>
+    </tr>
+</table>
+</div>
+### Folder structure
+```bash
+downstream_datasets
+├── Audio
+│   ├── audiocaps
+│   │   └── audio
+│   │       ├── test
+│   │       ├── train
+│   │       └── val
+│   ├── audioset
+│   │   ├── balanced_train_segments
+│   │   ├── eval_segments
+│   │   └── unbalanced_train_segments
+│   │       ├── unbalanced_train_segments_part00
+│   │       ├── unbalanced_train_segments_part01
+│   │       ├── ...
+│   │       └── unbalanced_train_segments_part40
+│   ├── clotho
+│   │   ├── CLOTHO_retrieval_dataset
+│   │   └── evaluation
+│   ├── esc50
+│   │   └── test
+│   │       ├── airplane
+│   │       ├── breathing
+│   │       ├── ...
+│   │       └── wind
+│   ├── laionaudio
+│   │   ├── audios
+│   │   ├── freesound_no_overlap
+│   │   └── jsons
+│   └── vggsound
+│       └── test
+│           ├── air\ conditioning\ noise
+│           ├── air\ horn
+│           ├── ...
+│           └── zebra\ braying
+├── Depth
+│   ├── nyuv2
+│   │   ├── data
+│   │   │   └── val
+│   │   │       ├── bathroom
+│   │   │       ├── bedroom
+│   │   │       ├── bookstore
+│   │   │       ├── classroom
+│   │   │       ├── dining_room
+│   │   │       ├── home_office
+│   │   │       ├── kitchen
+│   │   │       ├── living_room
+│   │   │       ├── office
+│   │   │       └── others
+├── Thermal
+│   ├── flirv1
+│   │   └── val
+│   │       ├── bicycle
+│   │       ├── car
+│   │       ├── dog
+│   │       └── person
+│   ├── flirv2
+│   │   └── val
+│   │       ├── bike
+│   │       ├── bus
+│   │       ├── car
+│   │       ├── hydrant
+│   │       ├── light
+│   │       ├── motor
+│   │       ├── other\ vehicle
+│   │       ├── person
+│   │       ├── sign
+│   │       ├── skateboard
+│   │       ├── stroller
+│   │       └── truck
+│   ├── llvip
+│   │   ├── train
+│   │   │   ├── background
+│   │   │   └── person
+│   │   └── val
+│   │       ├── background
+│   │       └── person
+└── VideoTextRetrieval
+    ├── vtRetdata
+    │   ├── ActivityNet
+    │   │   └── Videos
+    │   │       └── Activity_Videos
+    │   ├── Didemo
+    │   │   └── videos
+    │   ├── MSRVTT
+    │   │   └── MSRVTT_Videos
+    │   └── MSVD
+    │       └── MSVD_Videos
+```

a_cls/class_labels_indices.csv ADDED Viewed

	@@ -0,0 +1,528 @@

+index,mid,display_name
+0,/m/09x0r,"Speech"
+1,/m/05zppz,"Male speech, man speaking"
+2,/m/02zsn,"Female speech, woman speaking"
+3,/m/0ytgt,"Child speech, kid speaking"
+4,/m/01h8n0,"Conversation"
+5,/m/02qldy,"Narration, monologue"
+6,/m/0261r1,"Babbling"
+7,/m/0brhx,"Speech synthesizer"
+8,/m/07p6fty,"Shout"
+9,/m/07q4ntr,"Bellow"
+10,/m/07rwj3x,"Whoop"
+11,/m/07sr1lc,"Yell"
+12,/m/04gy_2,"Battle cry"
+13,/t/dd00135,"Children shouting"
+14,/m/03qc9zr,"Screaming"
+15,/m/02rtxlg,"Whispering"
+16,/m/01j3sz,"Laughter"
+17,/t/dd00001,"Baby laughter"
+18,/m/07r660_,"Giggle"
+19,/m/07s04w4,"Snicker"
+20,/m/07sq110,"Belly laugh"
+21,/m/07rgt08,"Chuckle, chortle"
+22,/m/0463cq4,"Crying, sobbing"
+23,/t/dd00002,"Baby cry, infant cry"
+24,/m/07qz6j3,"Whimper"
+25,/m/07qw_06,"Wail, moan"
+26,/m/07plz5l,"Sigh"
+27,/m/015lz1,"Singing"
+28,/m/0l14jd,"Choir"
+29,/m/01swy6,"Yodeling"
+30,/m/02bk07,"Chant"
+31,/m/01c194,"Mantra"
+32,/t/dd00003,"Male singing"
+33,/t/dd00004,"Female singing"
+34,/t/dd00005,"Child singing"
+35,/t/dd00006,"Synthetic singing"
+36,/m/06bxc,"Rapping"
+37,/m/02fxyj,"Humming"
+38,/m/07s2xch,"Groan"
+39,/m/07r4k75,"Grunt"
+40,/m/01w250,"Whistling"
+41,/m/0lyf6,"Breathing"
+42,/m/07mzm6,"Wheeze"
+43,/m/01d3sd,"Snoring"
+44,/m/07s0dtb,"Gasp"
+45,/m/07pyy8b,"Pant"
+46,/m/07q0yl5,"Snort"
+47,/m/01b_21,"Cough"
+48,/m/0dl9sf8,"Throat clearing"
+49,/m/01hsr_,"Sneeze"
+50,/m/07ppn3j,"Sniff"
+51,/m/06h7j,"Run"
+52,/m/07qv_x_,"Shuffle"
+53,/m/07pbtc8,"Walk, footsteps"
+54,/m/03cczk,"Chewing, mastication"
+55,/m/07pdhp0,"Biting"
+56,/m/0939n_,"Gargling"
+57,/m/01g90h,"Stomach rumble"
+58,/m/03q5_w,"Burping, eructation"
+59,/m/02p3nc,"Hiccup"
+60,/m/02_nn,"Fart"
+61,/m/0k65p,"Hands"
+62,/m/025_jnm,"Finger snapping"
+63,/m/0l15bq,"Clapping"
+64,/m/01jg02,"Heart sounds, heartbeat"
+65,/m/01jg1z,"Heart murmur"
+66,/m/053hz1,"Cheering"
+67,/m/028ght,"Applause"
+68,/m/07rkbfh,"Chatter"
+69,/m/03qtwd,"Crowd"
+70,/m/07qfr4h,"Hubbub, speech noise, speech babble"
+71,/t/dd00013,"Children playing"
+72,/m/0jbk,"Animal"
+73,/m/068hy,"Domestic animals, pets"
+74,/m/0bt9lr,"Dog"
+75,/m/05tny_,"Bark"
+76,/m/07r_k2n,"Yip"
+77,/m/07qf0zm,"Howl"
+78,/m/07rc7d9,"Bow-wow"
+79,/m/0ghcn6,"Growling"
+80,/t/dd00136,"Whimper (dog)"
+81,/m/01yrx,"Cat"
+82,/m/02yds9,"Purr"
+83,/m/07qrkrw,"Meow"
+84,/m/07rjwbb,"Hiss"
+85,/m/07r81j2,"Caterwaul"
+86,/m/0ch8v,"Livestock, farm animals, working animals"
+87,/m/03k3r,"Horse"
+88,/m/07rv9rh,"Clip-clop"
+89,/m/07q5rw0,"Neigh, whinny"
+90,/m/01xq0k1,"Cattle, bovinae"
+91,/m/07rpkh9,"Moo"
+92,/m/0239kh,"Cowbell"
+93,/m/068zj,"Pig"
+94,/t/dd00018,"Oink"
+95,/m/03fwl,"Goat"
+96,/m/07q0h5t,"Bleat"
+97,/m/07bgp,"Sheep"
+98,/m/025rv6n,"Fowl"
+99,/m/09b5t,"Chicken, rooster"
+100,/m/07st89h,"Cluck"
+101,/m/07qn5dc,"Crowing, cock-a-doodle-doo"
+102,/m/01rd7k,"Turkey"
+103,/m/07svc2k,"Gobble"
+104,/m/09ddx,"Duck"
+105,/m/07qdb04,"Quack"
+106,/m/0dbvp,"Goose"
+107,/m/07qwf61,"Honk"
+108,/m/01280g,"Wild animals"
+109,/m/0cdnk,"Roaring cats (lions, tigers)"
+110,/m/04cvmfc,"Roar"
+111,/m/015p6,"Bird"
+112,/m/020bb7,"Bird vocalization, bird call, bird song"
+113,/m/07pggtn,"Chirp, tweet"
+114,/m/07sx8x_,"Squawk"
+115,/m/0h0rv,"Pigeon, dove"
+116,/m/07r_25d,"Coo"
+117,/m/04s8yn,"Crow"
+118,/m/07r5c2p,"Caw"
+119,/m/09d5_,"Owl"
+120,/m/07r_80w,"Hoot"
+121,/m/05_wcq,"Bird flight, flapping wings"
+122,/m/01z5f,"Canidae, dogs, wolves"
+123,/m/06hps,"Rodents, rats, mice"
+124,/m/04rmv,"Mouse"
+125,/m/07r4gkf,"Patter"
+126,/m/03vt0,"Insect"
+127,/m/09xqv,"Cricket"
+128,/m/09f96,"Mosquito"
+129,/m/0h2mp,"Fly, housefly"
+130,/m/07pjwq1,"Buzz"
+131,/m/01h3n,"Bee, wasp, etc."
+132,/m/09ld4,"Frog"
+133,/m/07st88b,"Croak"
+134,/m/078jl,"Snake"
+135,/m/07qn4z3,"Rattle"
+136,/m/032n05,"Whale vocalization"
+137,/m/04rlf,"Music"
+138,/m/04szw,"Musical instrument"
+139,/m/0fx80y,"Plucked string instrument"
+140,/m/0342h,"Guitar"
+141,/m/02sgy,"Electric guitar"
+142,/m/018vs,"Bass guitar"
+143,/m/042v_gx,"Acoustic guitar"
+144,/m/06w87,"Steel guitar, slide guitar"
+145,/m/01glhc,"Tapping (guitar technique)"
+146,/m/07s0s5r,"Strum"
+147,/m/018j2,"Banjo"
+148,/m/0jtg0,"Sitar"
+149,/m/04rzd,"Mandolin"
+150,/m/01bns_,"Zither"
+151,/m/07xzm,"Ukulele"
+152,/m/05148p4,"Keyboard (musical)"
+153,/m/05r5c,"Piano"
+154,/m/01s0ps,"Electric piano"
+155,/m/013y1f,"Organ"
+156,/m/03xq_f,"Electronic organ"
+157,/m/03gvt,"Hammond organ"
+158,/m/0l14qv,"Synthesizer"
+159,/m/01v1d8,"Sampler"
+160,/m/03q5t,"Harpsichord"
+161,/m/0l14md,"Percussion"
+162,/m/02hnl,"Drum kit"
+163,/m/0cfdd,"Drum machine"
+164,/m/026t6,"Drum"
+165,/m/06rvn,"Snare drum"
+166,/m/03t3fj,"Rimshot"
+167,/m/02k_mr,"Drum roll"
+168,/m/0bm02,"Bass drum"
+169,/m/011k_j,"Timpani"
+170,/m/01p970,"Tabla"
+171,/m/01qbl,"Cymbal"
+172,/m/03qtq,"Hi-hat"
+173,/m/01sm1g,"Wood block"
+174,/m/07brj,"Tambourine"
+175,/m/05r5wn,"Rattle (instrument)"
+176,/m/0xzly,"Maraca"
+177,/m/0mbct,"Gong"
+178,/m/016622,"Tubular bells"
+179,/m/0j45pbj,"Mallet percussion"
+180,/m/0dwsp,"Marimba, xylophone"
+181,/m/0dwtp,"Glockenspiel"
+182,/m/0dwt5,"Vibraphone"
+183,/m/0l156b,"Steelpan"
+184,/m/05pd6,"Orchestra"
+185,/m/01kcd,"Brass instrument"
+186,/m/0319l,"French horn"
+187,/m/07gql,"Trumpet"
+188,/m/07c6l,"Trombone"
+189,/m/0l14_3,"Bowed string instrument"
+190,/m/02qmj0d,"String section"
+191,/m/07y_7,"Violin, fiddle"
+192,/m/0d8_n,"Pizzicato"
+193,/m/01xqw,"Cello"
+194,/m/02fsn,"Double bass"
+195,/m/085jw,"Wind instrument, woodwind instrument"
+196,/m/0l14j_,"Flute"
+197,/m/06ncr,"Saxophone"
+198,/m/01wy6,"Clarinet"
+199,/m/03m5k,"Harp"
+200,/m/0395lw,"Bell"
+201,/m/03w41f,"Church bell"
+202,/m/027m70_,"Jingle bell"
+203,/m/0gy1t2s,"Bicycle bell"
+204,/m/07n_g,"Tuning fork"
+205,/m/0f8s22,"Chime"
+206,/m/026fgl,"Wind chime"
+207,/m/0150b9,"Change ringing (campanology)"
+208,/m/03qjg,"Harmonica"
+209,/m/0mkg,"Accordion"
+210,/m/0192l,"Bagpipes"
+211,/m/02bxd,"Didgeridoo"
+212,/m/0l14l2,"Shofar"
+213,/m/07kc_,"Theremin"
+214,/m/0l14t7,"Singing bowl"
+215,/m/01hgjl,"Scratching (performance technique)"
+216,/m/064t9,"Pop music"
+217,/m/0glt670,"Hip hop music"
+218,/m/02cz_7,"Beatboxing"
+219,/m/06by7,"Rock music"
+220,/m/03lty,"Heavy metal"
+221,/m/05r6t,"Punk rock"
+222,/m/0dls3,"Grunge"
+223,/m/0dl5d,"Progressive rock"
+224,/m/07sbbz2,"Rock and roll"
+225,/m/05w3f,"Psychedelic rock"
+226,/m/06j6l,"Rhythm and blues"
+227,/m/0gywn,"Soul music"
+228,/m/06cqb,"Reggae"
+229,/m/01lyv,"Country"
+230,/m/015y_n,"Swing music"
+231,/m/0gg8l,"Bluegrass"
+232,/m/02x8m,"Funk"
+233,/m/02w4v,"Folk music"
+234,/m/06j64v,"Middle Eastern music"
+235,/m/03_d0,"Jazz"
+236,/m/026z9,"Disco"
+237,/m/0ggq0m,"Classical music"
+238,/m/05lls,"Opera"
+239,/m/02lkt,"Electronic music"
+240,/m/03mb9,"House music"
+241,/m/07gxw,"Techno"
+242,/m/07s72n,"Dubstep"
+243,/m/0283d,"Drum and bass"
+244,/m/0m0jc,"Electronica"
+245,/m/08cyft,"Electronic dance music"
+246,/m/0fd3y,"Ambient music"
+247,/m/07lnk,"Trance music"
+248,/m/0g293,"Music of Latin America"
+249,/m/0ln16,"Salsa music"
+250,/m/0326g,"Flamenco"
+251,/m/0155w,"Blues"
+252,/m/05fw6t,"Music for children"
+253,/m/02v2lh,"New-age music"
+254,/m/0y4f8,"Vocal music"
+255,/m/0z9c,"A capella"
+256,/m/0164x2,"Music of Africa"
+257,/m/0145m,"Afrobeat"
+258,/m/02mscn,"Christian music"
+259,/m/016cjb,"Gospel music"
+260,/m/028sqc,"Music of Asia"
+261,/m/015vgc,"Carnatic music"
+262,/m/0dq0md,"Music of Bollywood"
+263,/m/06rqw,"Ska"
+264,/m/02p0sh1,"Traditional music"
+265,/m/05rwpb,"Independent music"
+266,/m/074ft,"Song"
+267,/m/025td0t,"Background music"
+268,/m/02cjck,"Theme music"
+269,/m/03r5q_,"Jingle (music)"
+270,/m/0l14gg,"Soundtrack music"
+271,/m/07pkxdp,"Lullaby"
+272,/m/01z7dr,"Video game music"
+273,/m/0140xf,"Christmas music"
+274,/m/0ggx5q,"Dance music"
+275,/m/04wptg,"Wedding music"
+276,/t/dd00031,"Happy music"
+277,/t/dd00032,"Funny music"
+278,/t/dd00033,"Sad music"
+279,/t/dd00034,"Tender music"
+280,/t/dd00035,"Exciting music"
+281,/t/dd00036,"Angry music"
+282,/t/dd00037,"Scary music"
+283,/m/03m9d0z,"Wind"
+284,/m/09t49,"Rustling leaves"
+285,/t/dd00092,"Wind noise (microphone)"
+286,/m/0jb2l,"Thunderstorm"
+287,/m/0ngt1,"Thunder"
+288,/m/0838f,"Water"
+289,/m/06mb1,"Rain"
+290,/m/07r10fb,"Raindrop"
+291,/t/dd00038,"Rain on surface"
+292,/m/0j6m2,"Stream"
+293,/m/0j2kx,"Waterfall"
+294,/m/05kq4,"Ocean"
+295,/m/034srq,"Waves, surf"
+296,/m/06wzb,"Steam"
+297,/m/07swgks,"Gurgling"
+298,/m/02_41,"Fire"
+299,/m/07pzfmf,"Crackle"
+300,/m/07yv9,"Vehicle"
+301,/m/019jd,"Boat, Water vehicle"
+302,/m/0hsrw,"Sailboat, sailing ship"
+303,/m/056ks2,"Rowboat, canoe, kayak"
+304,/m/02rlv9,"Motorboat, speedboat"
+305,/m/06q74,"Ship"
+306,/m/012f08,"Motor vehicle (road)"
+307,/m/0k4j,"Car"
+308,/m/0912c9,"Vehicle horn, car horn, honking"
+309,/m/07qv_d5,"Toot"
+310,/m/02mfyn,"Car alarm"
+311,/m/04gxbd,"Power windows, electric windows"
+312,/m/07rknqz,"Skidding"
+313,/m/0h9mv,"Tire squeal"
+314,/t/dd00134,"Car passing by"
+315,/m/0ltv,"Race car, auto racing"
+316,/m/07r04,"Truck"
+317,/m/0gvgw0,"Air brake"
+318,/m/05x_td,"Air horn, truck horn"
+319,/m/02rhddq,"Reversing beeps"
+320,/m/03cl9h,"Ice cream truck, ice cream van"
+321,/m/01bjv,"Bus"
+322,/m/03j1ly,"Emergency vehicle"
+323,/m/04qvtq,"Police car (siren)"
+324,/m/012n7d,"Ambulance (siren)"
+325,/m/012ndj,"Fire engine, fire truck (siren)"
+326,/m/04_sv,"Motorcycle"
+327,/m/0btp2,"Traffic noise, roadway noise"
+328,/m/06d_3,"Rail transport"
+329,/m/07jdr,"Train"
+330,/m/04zmvq,"Train whistle"
+331,/m/0284vy3,"Train horn"
+332,/m/01g50p,"Railroad car, train wagon"
+333,/t/dd00048,"Train wheels squealing"
+334,/m/0195fx,"Subway, metro, underground"
+335,/m/0k5j,"Aircraft"
+336,/m/014yck,"Aircraft engine"
+337,/m/04229,"Jet engine"
+338,/m/02l6bg,"Propeller, airscrew"
+339,/m/09ct_,"Helicopter"
+340,/m/0cmf2,"Fixed-wing aircraft, airplane"
+341,/m/0199g,"Bicycle"
+342,/m/06_fw,"Skateboard"
+343,/m/02mk9,"Engine"
+344,/t/dd00065,"Light engine (high frequency)"
+345,/m/08j51y,"Dental drill, dentist's drill"
+346,/m/01yg9g,"Lawn mower"
+347,/m/01j4z9,"Chainsaw"
+348,/t/dd00066,"Medium engine (mid frequency)"
+349,/t/dd00067,"Heavy engine (low frequency)"
+350,/m/01h82_,"Engine knocking"
+351,/t/dd00130,"Engine starting"
+352,/m/07pb8fc,"Idling"
+353,/m/07q2z82,"Accelerating, revving, vroom"
+354,/m/02dgv,"Door"
+355,/m/03wwcy,"Doorbell"
+356,/m/07r67yg,"Ding-dong"
+357,/m/02y_763,"Sliding door"
+358,/m/07rjzl8,"Slam"
+359,/m/07r4wb8,"Knock"
+360,/m/07qcpgn,"Tap"
+361,/m/07q6cd_,"Squeak"
+362,/m/0642b4,"Cupboard open or close"
+363,/m/0fqfqc,"Drawer open or close"
+364,/m/04brg2,"Dishes, pots, and pans"
+365,/m/023pjk,"Cutlery, silverware"
+366,/m/07pn_8q,"Chopping (food)"
+367,/m/0dxrf,"Frying (food)"
+368,/m/0fx9l,"Microwave oven"
+369,/m/02pjr4,"Blender"
+370,/m/02jz0l,"Water tap, faucet"
+371,/m/0130jx,"Sink (filling or washing)"
+372,/m/03dnzn,"Bathtub (filling or washing)"
+373,/m/03wvsk,"Hair dryer"
+374,/m/01jt3m,"Toilet flush"
+375,/m/012xff,"Toothbrush"
+376,/m/04fgwm,"Electric toothbrush"
+377,/m/0d31p,"Vacuum cleaner"
+378,/m/01s0vc,"Zipper (clothing)"
+379,/m/03v3yw,"Keys jangling"
+380,/m/0242l,"Coin (dropping)"
+381,/m/01lsmm,"Scissors"
+382,/m/02g901,"Electric shaver, electric razor"
+383,/m/05rj2,"Shuffling cards"
+384,/m/0316dw,"Typing"
+385,/m/0c2wf,"Typewriter"
+386,/m/01m2v,"Computer keyboard"
+387,/m/081rb,"Writing"
+388,/m/07pp_mv,"Alarm"
+389,/m/07cx4,"Telephone"
+390,/m/07pp8cl,"Telephone bell ringing"
+391,/m/01hnzm,"Ringtone"
+392,/m/02c8p,"Telephone dialing, DTMF"
+393,/m/015jpf,"Dial tone"
+394,/m/01z47d,"Busy signal"
+395,/m/046dlr,"Alarm clock"
+396,/m/03kmc9,"Siren"
+397,/m/0dgbq,"Civil defense siren"
+398,/m/030rvx,"Buzzer"
+399,/m/01y3hg,"Smoke detector, smoke alarm"
+400,/m/0c3f7m,"Fire alarm"
+401,/m/04fq5q,"Foghorn"
+402,/m/0l156k,"Whistle"
+403,/m/06hck5,"Steam whistle"
+404,/t/dd00077,"Mechanisms"
+405,/m/02bm9n,"Ratchet, pawl"
+406,/m/01x3z,"Clock"
+407,/m/07qjznt,"Tick"
+408,/m/07qjznl,"Tick-tock"
+409,/m/0l7xg,"Gears"
+410,/m/05zc1,"Pulleys"
+411,/m/0llzx,"Sewing machine"
+412,/m/02x984l,"Mechanical fan"
+413,/m/025wky1,"Air conditioning"
+414,/m/024dl,"Cash register"
+415,/m/01m4t,"Printer"
+416,/m/0dv5r,"Camera"
+417,/m/07bjf,"Single-lens reflex camera"
+418,/m/07k1x,"Tools"
+419,/m/03l9g,"Hammer"
+420,/m/03p19w,"Jackhammer"
+421,/m/01b82r,"Sawing"
+422,/m/02p01q,"Filing (rasp)"
+423,/m/023vsd,"Sanding"
+424,/m/0_ksk,"Power tool"
+425,/m/01d380,"Drill"
+426,/m/014zdl,"Explosion"
+427,/m/032s66,"Gunshot, gunfire"
+428,/m/04zjc,"Machine gun"
+429,/m/02z32qm,"Fusillade"
+430,/m/0_1c,"Artillery fire"
+431,/m/073cg4,"Cap gun"
+432,/m/0g6b5,"Fireworks"
+433,/g/122z_qxw,"Firecracker"
+434,/m/07qsvvw,"Burst, pop"
+435,/m/07pxg6y,"Eruption"
+436,/m/07qqyl4,"Boom"
+437,/m/083vt,"Wood"
+438,/m/07pczhz,"Chop"
+439,/m/07pl1bw,"Splinter"
+440,/m/07qs1cx,"Crack"
+441,/m/039jq,"Glass"
+442,/m/07q7njn,"Chink, clink"
+443,/m/07rn7sz,"Shatter"
+444,/m/04k94,"Liquid"
+445,/m/07rrlb6,"Splash, splatter"
+446,/m/07p6mqd,"Slosh"
+447,/m/07qlwh6,"Squish"
+448,/m/07r5v4s,"Drip"
+449,/m/07prgkl,"Pour"
+450,/m/07pqc89,"Trickle, dribble"
+451,/t/dd00088,"Gush"
+452,/m/07p7b8y,"Fill (with liquid)"
+453,/m/07qlf79,"Spray"
+454,/m/07ptzwd,"Pump (liquid)"
+455,/m/07ptfmf,"Stir"
+456,/m/0dv3j,"Boiling"
+457,/m/0790c,"Sonar"
+458,/m/0dl83,"Arrow"
+459,/m/07rqsjt,"Whoosh, swoosh, swish"
+460,/m/07qnq_y,"Thump, thud"
+461,/m/07rrh0c,"Thunk"
+462,/m/0b_fwt,"Electronic tuner"
+463,/m/02rr_,"Effects unit"
+464,/m/07m2kt,"Chorus effect"
+465,/m/018w8,"Basketball bounce"
+466,/m/07pws3f,"Bang"
+467,/m/07ryjzk,"Slap, smack"
+468,/m/07rdhzs,"Whack, thwack"
+469,/m/07pjjrj,"Smash, crash"
+470,/m/07pc8lb,"Breaking"
+471,/m/07pqn27,"Bouncing"
+472,/m/07rbp7_,"Whip"
+473,/m/07pyf11,"Flap"
+474,/m/07qb_dv,"Scratch"
+475,/m/07qv4k0,"Scrape"
+476,/m/07pdjhy,"Rub"
+477,/m/07s8j8t,"Roll"
+478,/m/07plct2,"Crushing"
+479,/t/dd00112,"Crumpling, crinkling"
+480,/m/07qcx4z,"Tearing"
+481,/m/02fs_r,"Beep, bleep"
+482,/m/07qwdck,"Ping"
+483,/m/07phxs1,"Ding"
+484,/m/07rv4dm,"Clang"
+485,/m/07s02z0,"Squeal"
+486,/m/07qh7jl,"Creak"
+487,/m/07qwyj0,"Rustle"
+488,/m/07s34ls,"Whir"
+489,/m/07qmpdm,"Clatter"
+490,/m/07p9k1k,"Sizzle"
+491,/m/07qc9xj,"Clicking"
+492,/m/07rwm0c,"Clickety-clack"
+493,/m/07phhsh,"Rumble"
+494,/m/07qyrcz,"Plop"
+495,/m/07qfgpx,"Jingle, tinkle"
+496,/m/07rcgpl,"Hum"
+497,/m/07p78v5,"Zing"
+498,/t/dd00121,"Boing"
+499,/m/07s12q4,"Crunch"
+500,/m/028v0c,"Silence"
+501,/m/01v_m0,"Sine wave"
+502,/m/0b9m1,"Harmonic"
+503,/m/0hdsk,"Chirp tone"
+504,/m/0c1dj,"Sound effect"
+505,/m/07pt_g0,"Pulse"
+506,/t/dd00125,"Inside, small room"
+507,/t/dd00126,"Inside, large room or hall"
+508,/t/dd00127,"Inside, public space"
+509,/t/dd00128,"Outside, urban or manmade"
+510,/t/dd00129,"Outside, rural or natural"
+511,/m/01b9nn,"Reverberation"
+512,/m/01jnbd,"Echo"
+513,/m/096m7z,"Noise"
+514,/m/06_y0by,"Environmental noise"
+515,/m/07rgkc5,"Static"
+516,/m/06xkwv,"Mains hum"
+517,/m/0g12c5,"Distortion"
+518,/m/08p9q4,"Sidetone"
+519,/m/07szfh9,"Cacophony"
+520,/m/0chx_,"White noise"
+521,/m/0cj0r,"Pink noise"
+522,/m/07p_0gm,"Throbbing"
+523,/m/01jwx6,"Vibration"
+524,/m/07c52,"Television"
+525,/m/06bz3,"Radio"
+526,/m/07hvw1,"Field recording"

a_cls/dataloader.py ADDED Viewed

	@@ -0,0 +1,100 @@

+# -*- coding: utf-8 -*-
+# @Time    : 6/19/21 12:23 AM
+# @Author  : Yuan Gong
+# @Affiliation  : Massachusetts Institute of Technology
+# @Email   : yuangong@mit.edu
+# @File    : dataloader.py
+# modified from:
+# Author: David Harwath
+# with some functions borrowed from https://github.com/SeanNaren/deepspeech.pytorch
+import csv
+import json
+import logging
+import torchaudio
+import numpy as np
+import torch
+import torch.nn.functional
+from torch.utils.data import Dataset
+import random
+def make_midname_dict(label_csv):
+    index_lookup = {}
+    with open(label_csv, 'r') as f:
+        csv_reader = csv.DictReader(f)
+        line_count = 0
+        for row in csv_reader:
+            index_lookup[row['mid']] = row['display_name']
+            line_count += 1
+    return index_lookup
+def make_index_dict(label_csv):
+    index_lookup = {}
+    with open(label_csv, 'r') as f:
+        csv_reader = csv.DictReader(f)
+        line_count = 0
+        for row in csv_reader:
+            index_lookup[row['mid']] = row['index']
+            line_count += 1
+    return index_lookup
+def make_name_dict(label_csv):
+    name_lookup = {}
+    with open(label_csv, 'r') as f:
+        csv_reader = csv.DictReader(f)
+        line_count = 0
+        for row in csv_reader:
+            name_lookup[row['index']] = row['display_name']
+            line_count += 1
+    return name_lookup
+def lookup_list(index_list, label_csv):
+    label_list = []
+    table = make_name_dict(label_csv)
+    for item in index_list:
+        label_list.append(table[item])
+    return label_list
+def preemphasis(signal,coeff=0.97):
+    """perform preemphasis on the input signal.
+    :param signal: The signal to filter.
+    :param coeff: The preemphasis coefficient. 0 is none, default 0.97.
+    :returns: the filtered signal.
+    """
+    return np.append(signal[0],signal[1:]-coeff*signal[:-1])
+class AudiosetDataset(Dataset):
+    def __init__(self, dataset_json_file, audio_conf, label_csv=None):
+        """
+        Dataset that manages audio recordings
+        :param audio_conf: Dictionary containing the audio loading and preprocessing settings
+        :param dataset_json_file
+        """
+        self.datapath = dataset_json_file
+        with open(dataset_json_file, 'r') as fp:
+            data_json = json.load(fp)
+        self.data = data_json['data']
+        self.index_dict = make_index_dict(label_csv)
+        self.label_num = len(self.index_dict)
+    def __getitem__(self, index):
+        datum = self.data[index]
+        label_indices = np.zeros(self.label_num)
+        try:
+            fbank, mix_lambda = self._wav2fbank(datum['wav'])
+        except Exception as e:
+            logging.warning(f"Error at {datum['wav']} with \"{e}\"")
+            return self.__getitem__(random.randint(0, self.__len__()-1))
+        for label_str in datum['labels'].split(','):
+            label_indices[int(self.index_dict[label_str])] = 1.0
+        label_indices = torch.FloatTensor(label_indices)
+        return fbank, label_indices
+    def __len__(self):
+        return len(self.data)

a_cls/datasets.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import os.path
+import torch
+from data.build_datasets import DataInfo
+from data.process_audio import get_audio_transform, torchaudio_loader
+from torchvision import datasets
+# -*- coding: utf-8 -*-
+# @Time    : 6/19/21 12:23 AM
+# @Author  : Yuan Gong
+# @Affiliation  : Massachusetts Institute of Technology
+# @Email   : yuangong@mit.edu
+# @File    : dataloader.py
+# modified from:
+# Author: David Harwath
+# with some functions borrowed from https://github.com/SeanNaren/deepspeech.pytorch
+import csv
+import json
+import logging
+import torchaudio
+import numpy as np
+import torch
+import torch.nn.functional
+from torch.utils.data import Dataset
+import random
+def make_index_dict(label_csv):
+    index_lookup = {}
+    with open(label_csv, 'r') as f:
+        csv_reader = csv.DictReader(f)
+        line_count = 0
+        for row in csv_reader:
+            index_lookup[row['mid']] = row['index']
+            line_count += 1
+    return index_lookup
+class AudiosetDataset(Dataset):
+    def __init__(self, args, transform, loader):
+        self.audio_root = '/apdcephfs_cq3/share_1311970/downstream_datasets/Audio/audioset/eval_segments'
+        dataset_json_file = '/apdcephfs_cq3/share_1311970/downstream_datasets/Audio/audioset/filter_eval.json'
+        label_csv = '/apdcephfs_cq3/share_1311970/downstream_datasets/Audio/audioset/class_labels_indices.csv'
+        with open(dataset_json_file, 'r') as fp:
+            data_json = json.load(fp)
+        self.data = data_json['data']
+        self.index_dict = make_index_dict(label_csv)
+        self.label_num = len(self.index_dict)
+        self.args = args
+        self.transform = transform
+        self.loader = loader
+    def __getitem__(self, index):
+        datum = self.data[index]
+        label_indices = np.zeros(self.label_num)
+        for label_str in datum['labels'].split(','):
+            label_indices[int(self.index_dict[label_str])] = 1.0
+        label_indices = torch.FloatTensor(label_indices)
+        audio = self.loader(os.path.join(self.audio_root, datum['wav']))
+        audio_data = self.transform(audio)
+        return audio_data, label_indices
+    def __len__(self):
+        return len(self.data)
+def is_valid_file(path):
+    """Accept every path unconditionally; ``path`` is not inspected."""
+    return True
+def get_audio_dataset(args):
+    data_path = args.audio_data_path
+    transform = get_audio_transform(args)
+    if args.val_a_cls_data.lower() == 'audioset':
+        dataset = AudiosetDataset(args, transform=transform, loader=torchaudio_loader)
+    else:
+        dataset = datasets.ImageFolder(data_path, transform=transform, loader=torchaudio_loader, is_valid_file=is_valid_file)
+    dataloader = torch.utils.data.DataLoader(
+        dataset,
+        batch_size=args.batch_size,
+        num_workers=args.workers,
+        sampler=None,
+    )
+    return DataInfo(dataloader=dataloader, sampler=None)

a_cls/filter_eval_audio.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import json
+import os.path
+from tqdm import tqdm
+with open(r"G:\audioset\audioset\zip_audios\16k\eval.json", 'r') as f:
+    data = json.load(f)['data']
+new_data = []
+total = 0
+success = 0
+for i in tqdm(data):
+    total += 1
+    video_id = os.path.basename(i['wav'])
+    new_video_id = 'Y' + video_id
+    i['wav'] = new_video_id
+    if os.path.exists(f"G:/audioset/audioset/zip_audios/eval_segments/{i['wav']}") and not video_id.startswith('mW3S0u8bj58'):
+        new_data.append(i)
+        success += 1
+print(total, success, total-success)
+with open(r"G:\audioset\audioset\zip_audios\16k\filter_eval.json", 'w') as f:
+    data = json.dump({'data': new_data}, f, indent=2)

a_cls/precision.py ADDED Viewed

	@@ -0,0 +1,12 @@

+import torch
+from contextlib import suppress
+def get_autocast(precision):
+    if precision == 'amp':
+        return torch.cuda.amp.autocast
+    elif precision == 'amp_bfloat16' or precision == 'amp_bf16':
+        # amp_bfloat16 is more stable than amp float16 for clip training
+        return lambda: torch.cuda.amp.autocast(dtype=torch.bfloat16)
+    else:
+        return suppress

a_cls/stats.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import numpy as np
+from scipy import stats
+from sklearn import metrics
+import torch
+def d_prime(auc):
+    standard_normal = stats.norm()
+    d_prime = standard_normal.ppf(auc) * np.sqrt(2.0)
+    return d_prime
+def calculate_stats(output, target):
+    """Calculate statistics including mAP, AUC, etc.
+    Args:
+      output: 2d array, (samples_num, classes_num)
+      target: 2d array, (samples_num, classes_num)
+    Returns:
+      stats: list of statistic of each class.
+    """
+    classes_num = target.shape[-1]
+    stats = []
+    # Accuracy, only used for single-label classification such as esc-50, not for multiple label one such as AudioSet
+    acc = metrics.accuracy_score(np.argmax(target, 1), np.argmax(output, 1))
+    # Class-wise statistics
+    for k in range(classes_num):
+        # Average precision
+        avg_precision = metrics.average_precision_score(
+            target[:, k], output[:, k], average=None)
+        # AUC
+        auc = metrics.roc_auc_score(target[:, k], output[:, k], average=None)
+        # Precisions, recalls
+        (precisions, recalls, thresholds) = metrics.precision_recall_curve(
+            target[:, k], output[:, k])
+        # FPR, TPR
+        (fpr, tpr, thresholds) = metrics.roc_curve(target[:, k], output[:, k])
+        save_every_steps = 1000     # Sample statistics to reduce size
+        dict = {'precisions': precisions[0::save_every_steps],
+                'recalls': recalls[0::save_every_steps],
+                'AP': avg_precision,
+                'fpr': fpr[0::save_every_steps],
+                'fnr': 1. - tpr[0::save_every_steps],
+                'auc': auc,
+                # note acc is not class-wise, this is just to keep consistent with other metrics
+                'acc': acc
+                }
+        stats.append(dict)
+    return stats

a_cls/util.py ADDED Viewed

	@@ -0,0 +1,306 @@

+import math
+import pickle
+import numpy as np
+import torch
+import torch.nn as nn
+import random
+from collections import namedtuple
+def calc_recalls(S):
+    """
+    Computes recall at 1, 5, and 10 given a similarity matrix S.
+    By convention, rows of S are assumed to correspond to images and columns are captions.
+    """
+    assert(S.dim() == 2)
+    assert(S.size(0) == S.size(1))
+    if isinstance(S, torch.autograd.Variable):
+        S = S.data
+    n = S.size(0)
+    A2I_scores, A2I_ind = S.topk(10, 0)
+    I2A_scores, I2A_ind = S.topk(10, 1)
+    A_r1 = AverageMeter()
+    A_r5 = AverageMeter()
+    A_r10 = AverageMeter()
+    I_r1 = AverageMeter()
+    I_r5 = AverageMeter()
+    I_r10 = AverageMeter()
+    for i in range(n):
+        A_foundind = -1
+        I_foundind = -1
+        for ind in range(10):
+            if A2I_ind[ind, i] == i:
+                I_foundind = ind
+            if I2A_ind[i, ind] == i:
+                A_foundind = ind
+        # do r1s
+        if A_foundind == 0:
+            A_r1.update(1)
+        else:
+            A_r1.update(0)
+        if I_foundind == 0:
+            I_r1.update(1)
+        else:
+            I_r1.update(0)
+        # do r5s
+        if A_foundind >= 0 and A_foundind < 5:
+            A_r5.update(1)
+        else:
+            A_r5.update(0)
+        if I_foundind >= 0 and I_foundind < 5:
+            I_r5.update(1)
+        else:
+            I_r5.update(0)
+        # do r10s
+        if A_foundind >= 0 and A_foundind < 10:
+            A_r10.update(1)
+        else:
+            A_r10.update(0)
+        if I_foundind >= 0 and I_foundind < 10:
+            I_r10.update(1)
+        else:
+            I_r10.update(0)
+    recalls = {'A_r1':A_r1.avg, 'A_r5':A_r5.avg, 'A_r10':A_r10.avg,
+                'I_r1':I_r1.avg, 'I_r5':I_r5.avg, 'I_r10':I_r10.avg}
+                #'A_meanR':A_meanR.avg, 'I_meanR':I_meanR.avg}
+    return recalls
+def computeMatchmap(I, A):
+    assert(I.dim() == 3)
+    assert(A.dim() == 2)
+    D = I.size(0)
+    H = I.size(1)
+    W = I.size(2)
+    T = A.size(1)
+    Ir = I.view(D, -1).t()
+    matchmap = torch.mm(Ir, A)
+    matchmap = matchmap.view(H, W, T)
+    return matchmap
+def matchmapSim(M, simtype):
+    assert(M.dim() == 3)
+    if simtype == 'SISA':
+        return M.mean()
+    elif simtype == 'MISA':
+        M_maxH, _ = M.max(0)
+        M_maxHW, _ = M_maxH.max(0)
+        return M_maxHW.mean()
+    elif simtype == 'SIMA':
+        M_maxT, _ = M.max(2)
+        return M_maxT.mean()
+    else:
+        raise ValueError
+def sampled_margin_rank_loss(image_outputs, audio_outputs, nframes, margin=1., simtype='MISA'):
+    """
+    Computes the triplet margin ranking loss for each anchor image/caption pair
+    The impostor image/caption is randomly sampled from the minibatch
+    """
+    assert(image_outputs.dim() == 4)
+    assert(audio_outputs.dim() == 3)
+    n = image_outputs.size(0)
+    loss = torch.zeros(1, device=image_outputs.device, requires_grad=True)
+    for i in range(n):
+        I_imp_ind = i
+        A_imp_ind = i
+        while I_imp_ind == i:
+            I_imp_ind = np.random.randint(0, n)
+        while A_imp_ind == i:
+            A_imp_ind = np.random.randint(0, n)
+        nF = nframes[i]
+        nFimp = nframes[A_imp_ind]
+        anchorsim = matchmapSim(computeMatchmap(image_outputs[i], audio_outputs[i][:, 0:nF]), simtype)
+        Iimpsim = matchmapSim(computeMatchmap(image_outputs[I_imp_ind], audio_outputs[i][:, 0:nF]), simtype)
+        Aimpsim = matchmapSim(computeMatchmap(image_outputs[i], audio_outputs[A_imp_ind][:, 0:nFimp]), simtype)
+        A2I_simdif = margin + Iimpsim - anchorsim
+        if (A2I_simdif.data > 0).all():
+            loss = loss + A2I_simdif
+        I2A_simdif = margin + Aimpsim - anchorsim
+        if (I2A_simdif.data > 0).all():
+            loss = loss + I2A_simdif
+    loss = loss / n
+    return loss
+def compute_matchmap_similarity_matrix(image_outputs, audio_outputs, nframes, simtype='MISA'):
+    """
+    Assumes image_outputs is a (batchsize, embedding_dim, rows, height) tensor
+    Assumes audio_outputs is a (batchsize, embedding_dim, 1, time) tensor
+    Returns similarity matrix S where images are rows and audios are along the columns
+    """
+    assert(image_outputs.dim() == 4)
+    assert(audio_outputs.dim() == 3)
+    n = image_outputs.size(0)
+    S = torch.zeros(n, n, device=image_outputs.device)
+    for image_idx in range(n):
+            for audio_idx in range(n):
+                nF = max(1, nframes[audio_idx])
+                S[image_idx, audio_idx] = matchmapSim(computeMatchmap(image_outputs[image_idx], audio_outputs[audio_idx][:, 0:nF]), simtype)
+    return S
+def compute_pooldot_similarity_matrix(image_outputs, audio_outputs, nframes):
+    """
+    Assumes image_outputs is a (batchsize, embedding_dim, rows, height) tensor
+    Assumes audio_outputs is a (batchsize, embedding_dim, 1, time) tensor
+    Returns similarity matrix S where images are rows and audios are along the columns
+    S[i][j] is computed as the dot product between the meanpooled embeddings of
+    the ith image output and jth audio output
+    """
+    assert(image_outputs.dim() == 4)
+    assert(audio_outputs.dim() == 4)
+    n = image_outputs.size(0)
+    imagePoolfunc = nn.AdaptiveAvgPool2d((1, 1))
+    pooled_image_outputs = imagePoolfunc(image_outputs).squeeze(3).squeeze(2)
+    audioPoolfunc = nn.AdaptiveAvgPool2d((1, 1))
+    pooled_audio_outputs_list = []
+    for idx in range(n):
+        nF = max(1, nframes[idx])
+        pooled_audio_outputs_list.append(audioPoolfunc(audio_outputs[idx][:, :, 0:nF]).unsqueeze(0))
+    pooled_audio_outputs = torch.cat(pooled_audio_outputs_list).squeeze(3).squeeze(2)
+    S = torch.mm(pooled_image_outputs, pooled_audio_outputs.t())
+    return S
+def one_imposter_index(i, N):
+    imp_ind = random.randint(0, N - 2)
+    if imp_ind == i:
+        imp_ind = N - 1
+    return imp_ind
+def basic_get_imposter_indices(N):
+    imposter_idc = []
+    for i in range(N):
+        # Select an imposter index for example i:
+        imp_ind = one_imposter_index(i, N)
+        imposter_idc.append(imp_ind)
+    return imposter_idc
+def semihardneg_triplet_loss_from_S(S, margin):
+    """
+    Input: Similarity matrix S as an autograd.Variable
+    Output: The one-way triplet loss from rows of S to columns of S. Impostors are taken
+    to be the most similar point to the anchor that is still less similar to the anchor
+    than the positive example.
+    You would need to run this function twice, once with S and once with S.t(),
+    in order to compute the triplet loss in both directions.
+    S must be a square 2-D tensor (asserted); the diagonal holds the positive pairs.
+    Rows without any less-similar column fall back to a randomly drawn impostor.
+    Returns a 1-element tensor: the summed margin violations divided by the number of rows.
+    """
+    assert(S.dim() == 2)
+    assert(S.size(0) == S.size(1))
+    N = S.size(0)
+    # Accumulator created with the same tensor type as S.
+    loss = torch.autograd.Variable(torch.zeros(1).type(S.data.type()), requires_grad=True)
+    # Imposter - ground truth
+    Sdiff = S - torch.diag(S).view(-1, 1)
+    # Tolerance so that entries numerically equal to the ground truth do not count as "less similar".
+    eps = 1e-12
+    # All examples less similar than ground truth
+    # NOTE(review): .type(torch.LongTensor) presumably yields a CPU tensor -- confirm behavior when S is on a GPU.
+    mask = (Sdiff < -eps).type(torch.LongTensor)
+    maskf = mask.type_as(S)
+    # Mask out all examples >= gt with minimum similarity
+    Sp = maskf * Sdiff + (1 - maskf) * torch.min(Sdiff).detach()
+    # Find the index maximum similar of the remaining
+    _, idc = Sp.max(dim=1)
+    idc = idc.data.cpu()
+    # Vector mask: 1 iff there exists an example < gt
+    has_neg = (mask.sum(dim=1) > 0).data.type(torch.LongTensor)
+    # Random imposter indices
+    random_imp_ind = torch.LongTensor(basic_get_imposter_indices(N))
+    # Use hardneg if there exists an example < gt, otherwise use random imposter
+    imp_idc = has_neg * idc + (1 - has_neg) * random_imp_ind
+    # This could probably be vectorized too, but I haven't.
+    for i, imp in enumerate(imp_idc):
+        local_loss = Sdiff[i, imp] + margin
+        # Only positive values (margin violations) are added to the loss.
+        if (local_loss.data > 0).all():
+            loss = loss + local_loss
+    loss = loss / N
+    return loss
+def sampled_triplet_loss_from_S(S, margin):
+    """
+    Input: Similarity matrix S as an autograd.Variable
+    Output: The one-way triplet loss from rows of S to columns of S. Imposters are
+    randomly sampled from the columns of S.
+    You would need to run this function twice, once with S and once with S.t(),
+    in order to compute the triplet loss in both directions.
+    """
+    assert(S.dim() == 2)
+    assert(S.size(0) == S.size(1))
+    N = S.size(0)
+    loss = torch.autograd.Variable(torch.zeros(1).type(S.data.type()), requires_grad=True)
+    # Imposter - ground truth
+    Sdiff = S - torch.diag(S).view(-1, 1)
+    imp_ind = torch.LongTensor(basic_get_imposter_indices(N))
+    # This could probably be vectorized too, but I haven't.
+    for i, imp in enumerate(imp_ind):
+        local_loss = Sdiff[i, imp] + margin
+        if (local_loss.data > 0).all():
+            loss = loss + local_loss
+    loss = loss / N
+    return loss
+class AverageMeter(object):
+    """Computes and stores the average and current value"""
+    def __init__(self):
+        self.reset()
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+def adjust_learning_rate(base_lr, lr_decay, optimizer, epoch):
+    """Sets the learning rate to the initial LR decayed by 10 every lr_decay epochs"""
+    lr = base_lr * (0.1 ** (epoch // lr_decay))
+    print('now learning rate changed to {:f}'.format(lr))
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
+def adjust_learning_rate2(base_lr, lr_decay, optimizer, epoch):
+    """Sets the learning rate to the initial LR decayed by 10 every lr_decay epochs"""
+    for param_group in optimizer.param_groups:
+        cur_lr = param_group['lr']
+        print('current learing rate is {:f}'.format(lr))
+    lr = cur_lr  * 0.1
+    print('now learning rate changed to {:f}'.format(lr))
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
+def load_progress(prog_pkl, quiet=False):
+    """
+    load progress pkl file
+    Args:
+        prog_pkl(str): path to progress pkl file
+    Return:
+        progress(list):
+        epoch(int):
+        global_step(int):
+        best_epoch(int):
+        best_avg_r10(float):
+    """
+    def _print(msg):
+        if not quiet:
+            print(msg)
+    with open(prog_pkl, "rb") as f:
+        prog = pickle.load(f)
+        epoch, global_step, best_epoch, best_avg_r10, _ = prog[-1]
+    _print("\nPrevious Progress:")
+    msg =  "[%5s %7s %5s %7s %6s]" % ("epoch", "step", "best_epoch", "best_avg_r10", "time")
+    _print(msg)
+    return prog, epoch, global_step, best_epoch, best_avg_r10
+def count_parameters(model):
+    return sum([p.numel() for p in model.parameters() if p.requires_grad])
+PrenetConfig = namedtuple(
+  'PrenetConfig', ['input_size', 'hidden_size', 'num_layers', 'dropout'])
+RNNConfig = namedtuple(
+  'RNNConfig',
+  ['input_size', 'hidden_size', 'num_layers', 'dropout', 'residual'])

a_cls/zero_shot.py ADDED Viewed

	@@ -0,0 +1,234 @@

+import logging
+import os
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from tqdm import tqdm
+from open_clip import get_input_dtype, get_tokenizer
+from open_clip.factory import HF_HUB_PREFIX
+from .precision import get_autocast
+from .stats import calculate_stats, d_prime
+from .zero_shot_classifier import build_zero_shot_classifier
+from .zero_shot_metadata import CLASSNAMES, OPENAI_IMAGENET_TEMPLATES
+def accuracy(output, target, topk=(1,)):
+    pred = output.topk(max(topk), 1, True, True)[1].t()
+    correct = pred.eq(target.view(1, -1).expand_as(pred))
+    return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk]
+def run(model, classifier, dataloader, args):
+    autocast = get_autocast(args.precision)
+    input_dtype = get_input_dtype(args.precision)
+    with torch.no_grad():
+        top1, top5, n = 0., 0., 0.
+        for images, target in tqdm(dataloader, unit_scale=args.batch_size):
+            images = images.to(device=args.device, dtype=input_dtype)
+            images = images.unsqueeze(2)
+            target = target.to(args.device)
+            with autocast():
+                # predict
+                output = model(image=images)
+                image_features = output['image_features'] if isinstance(output, dict) else output[0]
+                logits = 100. * image_features @ classifier
+            # measure accuracy
+            acc1, acc5 = accuracy(logits, target, topk=(1, 5))
+            top1 += acc1
+            top5 += acc5
+            n += images.size(0)
+    top1 = (top1 / n)
+    top5 = (top5 / n)
+    return top1, top5
+def validate(audio_model, classifier, val_loader, args, epoch):
+    """Evaluate the model on val_loader, dump predictions to CSV and return metrics.
+    Steps, in order:
+      1. Run every batch under no_grad/autocast; logits are
+         100 * features @ classifier.
+      2. Compute cross-entropy per batch against argmax(labels, dim=1)
+         (labels are presumably one-hot / multi-hot rows -- TODO confirm
+         against the dataset).
+      3. Pass the concatenated logits and labels to calculate_stats.
+      4. Write predictions_<epoch>.csv (and target.csv the first time) under
+         <args.log_base_path>/a_cls/<args.val_a_cls_data.lower()>/predictions.
+      5. When args.do_train is set, also compute ensemble metrics via
+         validate_ensemble.
+    Side effects: sets args.a_cls_output_dir, creates directories, writes CSVs.
+    Returns:
+        dict with keys 'mAP', 'mAUC', 'average_precision', 'average_recall',
+        'd_prime_mAUC', 'valid_loss', plus 'cum_mAP' and 'cum_mAUC' when
+        args.do_train is set.
+    """
+    # Work with the epoch number shifted down by one; file names and
+    # validate_ensemble below use this shifted value.
+    epoch = epoch - 1
+    # switch to evaluate mode
+    audio_model.eval()
+    autocast = get_autocast(args.precision)
+    input_dtype = get_input_dtype(args.precision)
+    A_predictions = []
+    A_targets = []
+    A_loss = []
+    with torch.no_grad():
+        for i, (audio_input, labels) in enumerate(tqdm(val_loader)):
+            audio_input = audio_input.to(device=args.device, dtype=input_dtype)
+            # compute output
+            with autocast():
+                # predict
+                output = audio_model(image=audio_input)
+                # dict outputs expose 'image_features'; otherwise the first element is used
+                image_features = output['image_features'] if isinstance(output, dict) else output[0]
+                logits = 100. * image_features @ classifier
+            audio_output = logits
+            # audio_output = torch.sigmoid(audio_output)
+            predictions = audio_output.to('cpu').detach()
+            A_predictions.append(predictions)
+            # labels are stored before being moved to args.device
+            A_targets.append(labels)
+            # compute the loss
+            labels = labels.to(args.device)
+            # the class index for the loss is the argmax over dim 1 of the label rows
+            loss = nn.CrossEntropyLoss()(audio_output, torch.argmax(labels.long(), dim=1))
+            A_loss.append(loss.to('cpu').detach())
+        audio_output = torch.cat(A_predictions)
+        target = torch.cat(A_targets)
+        # unweighted mean of the per-batch losses
+        loss = np.mean(A_loss)
+        stats = calculate_stats(audio_output, target)
+        # save the prediction here
+        args.a_cls_output_dir = os.path.join(args.log_base_path, f'a_cls/{args.val_a_cls_data.lower()}')
+        os.makedirs(args.a_cls_output_dir, exist_ok=True)
+        # target.csv is written only when the predictions directory is first created
+        if os.path.exists(args.a_cls_output_dir + '/predictions') == False:
+            os.mkdir(args.a_cls_output_dir + '/predictions')
+            np.savetxt(args.a_cls_output_dir + '/predictions/target.csv', target, delimiter=',')
+        np.savetxt(args.a_cls_output_dir + '/predictions/predictions_' + str(epoch) + '.csv', audio_output,
+                   delimiter=',')
+    valid_loss = loss
+    # hard-coded, so the 'acc' branches below are never taken
+    main_metrics = 'mAP'
+    metrics = {}
+    if args.do_train:
+        # ensemble results
+        cum_stats = validate_ensemble(args, epoch)
+        cum_mAP = np.mean([stat['AP'] for stat in cum_stats])
+        cum_mAUC = np.mean([stat['auc'] for stat in cum_stats])
+        # NOTE(review): cum_acc is computed but never used
+        cum_acc = cum_stats[0]['acc']
+    # stats is iterated as a sequence of dicts with 'AP', 'auc', 'precisions', 'recalls'
+    mAP = np.mean([stat['AP'] for stat in stats])
+    mAUC = np.mean([stat['auc'] for stat in stats])
+    acc = stats[0]['acc']
+    # take the middle element of each precision / recall list
+    middle_ps = [stat['precisions'][int(len(stat['precisions']) / 2)] for stat in stats]
+    middle_rs = [stat['recalls'][int(len(stat['recalls']) / 2)] for stat in stats]
+    average_precision = np.mean(middle_ps)
+    average_recall = np.mean(middle_rs)
+    if main_metrics == 'mAP':
+        logging.info("mAP: {:.6f}".format(mAP))
+    else:
+        logging.info("acc: {:.6f}".format(acc))
+    logging.info("AUC: {:.6f}".format(mAUC))
+    logging.info("Avg Precision: {:.6f}".format(average_precision))
+    logging.info("Avg Recall: {:.6f}".format(average_recall))
+    logging.info("d_prime: {:.6f}".format(d_prime(mAUC)))
+    logging.info("valid_loss: {:.6f}".format(valid_loss))
+    if args.do_train:
+        logging.info("cum_mAP: {:.6f}".format(cum_mAP))
+        logging.info("cum_mAUC: {:.6f}".format(cum_mAUC))
+    # values are cast to plain floats before being returned
+    if main_metrics == 'mAP':
+        metrics['mAP'] = float(mAP)
+    else:
+        metrics['acc'] = float(acc)
+    metrics['mAUC'] = float(mAUC)
+    metrics['average_precision'] = float(average_precision)
+    metrics['average_recall'] = float(average_recall)
+    metrics['d_prime_mAUC'] = float(d_prime(mAUC))
+    metrics['valid_loss'] = float(valid_loss)
+    if args.do_train:
+        metrics['cum_mAP'] = float(cum_mAP)
+        metrics['cum_mAUC'] = float(cum_mAUC)
+    return metrics
+def validate_ensemble(args, epoch):
+    exp_dir = args.a_cls_output_dir
+    target = np.loadtxt(exp_dir + '/predictions/target.csv', delimiter=',')
+    if epoch == 0:
+        cum_predictions = np.loadtxt(exp_dir + '/predictions/predictions_0.csv', delimiter=',')
+    else:
+        cum_predictions = np.loadtxt(exp_dir + '/predictions/cum_predictions.csv', delimiter=',') * (epoch - 1)
+        predictions = np.loadtxt(exp_dir + '/predictions/predictions_' + str(epoch) + '.csv', delimiter=',')
+        cum_predictions = cum_predictions + predictions
+        # remove the prediction file to save storage space
+        os.remove(exp_dir + '/predictions/predictions_' + str(epoch - 1) + '.csv')
+    cum_predictions = cum_predictions / (epoch + 1)
+    np.savetxt(exp_dir + '/predictions/cum_predictions.csv', cum_predictions, delimiter=',')
+    stats = calculate_stats(cum_predictions, target)
+    return stats
+def zero_shot_eval(model, data, epoch, args):
+    temp_val_a_cls_data = args.val_a_cls_data
+    args.val_a_cls_data = list(data.keys())
+    assert len(args.val_a_cls_data) == 1
+    args.val_a_cls_data = args.val_a_cls_data[0]
+    if args.val_a_cls_data not in data:
+        return {}
+    if args.zeroshot_frequency == 0:
+        return {}
+    if (epoch % args.zeroshot_frequency) != 0 and epoch != args.epochs:
+        return {}
+    if args.distributed and not args.horovod:
+        model = model.module
+    logging.info(f'Starting zero-shot {args.val_a_cls_data.upper()}.')
+    logging.info('Building zero-shot classifier')
+    autocast = get_autocast(args.precision)
+    with autocast():
+        tokenizer = get_tokenizer(HF_HUB_PREFIX+args.model, cache_dir=args.cache_dir)
+        # tokenizer = get_tokenizer("ViT-L-14")
+        classifier = build_zero_shot_classifier(
+            model,
+            tokenizer=tokenizer,
+            classnames=CLASSNAMES[args.val_a_cls_data],
+            templates=OPENAI_IMAGENET_TEMPLATES,
+            num_classes_per_batch=10,
+            device=args.device,
+            use_tqdm=True,
+        )
+    logging.info('Using classifier')
+    results = {}
+    if args.val_a_cls_data.lower() == 'audioset':
+        if args.val_a_cls_data in data:
+            stats = validate(model, classifier, data[args.val_a_cls_data].dataloader, args, epoch)
+            results.update(stats)
+    else:
+        if args.val_a_cls_data in data:
+            top1, top5 = run(model, classifier, data[args.val_a_cls_data].dataloader, args)
+            results[f'{args.val_a_cls_data}-zeroshot-val-top1'] = top1
+            results[f'{args.val_a_cls_data}-zeroshot-val-top5'] = top5
+    logging.info(f'Finished zero-shot {args.val_a_cls_data.upper()}.')
+    args.val_a_cls_data = temp_val_a_cls_data
+    return results

a_cls/zero_shot_classifier.py ADDED Viewed

	@@ -0,0 +1,111 @@

+from functools import partial
+from itertools import islice
+from typing import Callable, List, Optional, Sequence, Union
+import torch
+import torch.nn.functional as F
+def batched(iterable, n):
+    """Batch data into lists of length *n*. The last batch may be shorter.
+    NOTE based on more-itertools impl, to be replaced by python 3.12 itertools.batched impl
+    """
+    it = iter(iterable)
+    while True:
+        batch = list(islice(it, n))
+        if not batch:
+            break
+        yield batch
+def build_zero_shot_classifier(
+        model,
+        tokenizer,
+        classnames: Sequence[str],
+        templates: Sequence[Union[Callable, str]],
+        num_classes_per_batch: Optional[int] = 10,
+        device: Union[str, torch.device] = 'cpu',
+        use_tqdm: bool = False,
+):
+    """ Build zero-shot classifier weights by iterating over class names in batches
+    Args:
+        model: CLIP model instance
+        tokenizer: CLIP tokenizer instance
+        classnames: A sequence of class (label) names
+        templates: A sequence of callables or format() friendly strings to produce templates per class name
+        num_classes_per_batch: The number of classes to batch together in each forward, all if None
+        device: Device to use.
+        use_tqdm: Enable TQDM progress bar.
+    """
+    assert isinstance(templates, Sequence) and len(templates) > 0
+    assert isinstance(classnames, Sequence) and len(classnames) > 0
+    use_format = isinstance(templates[0], str)
+    num_templates = len(templates)
+    num_classes = len(classnames)
+    if use_tqdm:
+        import tqdm
+        num_iter = 1 if num_classes_per_batch is None else ((num_classes - 1) // num_classes_per_batch + 1)
+        iter_wrap = partial(tqdm.tqdm, total=num_iter, unit_scale=num_classes_per_batch)
+    else:
+        iter_wrap = iter
+    def _process_batch(batch_classnames):
+        num_batch_classes = len(batch_classnames)
+        texts = [template.format(c) if use_format else template(c) for c in batch_classnames for template in templates]
+        input_ids, attention_mask = tokenizer(texts)
+        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
+        class_embeddings = F.normalize(model.encode_text(input_ids, attention_mask), dim=-1)
+        class_embeddings = class_embeddings.reshape(num_batch_classes, num_templates, -1).mean(dim=1)
+        class_embeddings = class_embeddings / class_embeddings.norm(dim=1, keepdim=True)
+        class_embeddings = class_embeddings.T
+        return class_embeddings
+    with torch.no_grad():
+        if num_classes_per_batch:
+            batched_embeds = [_process_batch(batch) for batch in iter_wrap(batched(classnames, num_classes_per_batch))]
+            zeroshot_weights = torch.cat(batched_embeds, dim=1)
+        else:
+            zeroshot_weights = _process_batch(classnames)
+    return zeroshot_weights
+def build_zero_shot_classifier_legacy(
+        model,
+        tokenizer,
+        classnames: Sequence[str],
+        templates: Sequence[Union[Callable, str]],
+        device: Union[str, torch.device] = 'cpu',
+        use_tqdm: bool = False,
+):
+    """ Build zero-shot classifier weights by iterating over class names 1 by 1
+    Args:
+        model: CLIP model instance
+        tokenizer: CLIP tokenizer instance
+        classnames: A sequence of class (label) names
+        templates: A sequence of callables or format() friendly strings to produce templates per class name
+        device: Device to use.
+        use_tqdm: Enable TQDM progress bar.
+    """
+    assert isinstance(templates, Sequence) and len(templates) > 0
+    assert isinstance(classnames, Sequence) and len(classnames) > 0
+    if use_tqdm:
+        import tqdm
+        iter_wrap = tqdm.tqdm
+    else:
+        iter_wrap = iter
+    use_format = isinstance(templates[0], str)
+    with torch.no_grad():
+        zeroshot_weights = []
+        for classname in iter_wrap(classnames):
+            texts = [template.format(classname) if use_format else template(classname) for template in templates]
+            texts = tokenizer(texts).to(device)  # tokenize
+            class_embeddings = model.encode_text(texts)
+            class_embedding = F.normalize(class_embeddings, dim=-1).mean(dim=0)
+            class_embedding /= class_embedding.norm()
+            zeroshot_weights.append(class_embedding)
+        zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(device)
+    return zeroshot_weights

a_cls/zero_shot_metadata.py ADDED Viewed

	@@ -0,0 +1,184 @@

+import os
+import pandas as pd
+# Prompt templates applied to every class name when building the zero-shot
+# text classifier. Each entry is a callable taking the class name and
+# returning the prompt string. Despite the ImageNet-derived name, the single
+# active template is audio-specific.
+OPENAI_IMAGENET_TEMPLATES = (
+    # lambda c: f'This is a sound of {c}.',
+    lambda c: f'a sound of {c}.',
+)
+# OPENAI_IMAGENET_TEMPLATES = (
+#     lambda c: f'a bad sound of a {c}.',
+#     lambda c: f'a sound of many {c}.',
+#     lambda c: f'a sculpture of a {c}.',
+#     lambda c: f'a sound of the hard to see {c}.',
+#     lambda c: f'a low resolution sound of the {c}.',
+#     lambda c: f'a rendering of a {c}.',
+#     lambda c: f'graffiti of a {c}.',
+#     lambda c: f'a bad sound of the {c}.',
+#     lambda c: f'a cropped sound of the {c}.',
+#     lambda c: f'a tattoo of a {c}.',
+#     lambda c: f'the embroidered {c}.',
+#     lambda c: f'a sound of a hard to see {c}.',
+#     lambda c: f'a bright sound of a {c}.',
+#     lambda c: f'a sound of a clean {c}.',
+#     lambda c: f'a sound of a dirty {c}.',
+#     lambda c: f'a dark sound of the {c}.',
+#     lambda c: f'a drawing of a {c}.',
+#     lambda c: f'a sound of my {c}.',
+#     lambda c: f'the plastic {c}.',
+#     lambda c: f'a sound of the cool {c}.',
+#     lambda c: f'a close-up sound of a {c}.',
+#     lambda c: f'a black and white sound of the {c}.',
+#     lambda c: f'a painting of the {c}.',
+#     lambda c: f'a painting of a {c}.',
+#     lambda c: f'a pixelated sound of the {c}.',
+#     lambda c: f'a sculpture of the {c}.',
+#     lambda c: f'a bright sound of the {c}.',
+#     lambda c: f'a cropped sound of a {c}.',
+#     lambda c: f'a plastic {c}.',
+#     lambda c: f'a sound of the dirty {c}.',
+#     lambda c: f'a jpeg corrupted sound of a {c}.',
+#     lambda c: f'a blurry sound of the {c}.',
+#     lambda c: f'a sound of the {c}.',
+#     lambda c: f'a good sound of the {c}.',
+#     lambda c: f'a rendering of the {c}.',
+#     lambda c: f'a {c} in a video game.',
+#     lambda c: f'a sound of one {c}.',
+#     lambda c: f'a doodle of a {c}.',
+#     lambda c: f'a close-up sound of the {c}.',
+#     lambda c: f'a sound of a {c}.',
+#     lambda c: f'the origami {c}.',
+#     lambda c: f'the {c} in a video game.',
+#     lambda c: f'a sketch of a {c}.',
+#     lambda c: f'a doodle of the {c}.',
+#     lambda c: f'a origami {c}.',
+#     lambda c: f'a low resolution sound of a {c}.',
+#     lambda c: f'the toy {c}.',
+#     lambda c: f'a rendition of the {c}.',
+#     lambda c: f'a sound of the clean {c}.',
+#     lambda c: f'a sound of a large {c}.',
+#     lambda c: f'a rendition of a {c}.',
+#     lambda c: f'a sound of a nice {c}.',
+#     lambda c: f'a sound of a weird {c}.',
+#     lambda c: f'a blurry sound of a {c}.',
+#     lambda c: f'a cartoon {c}.',
+#     lambda c: f'art of a {c}.',
+#     lambda c: f'a sketch of the {c}.',
+#     lambda c: f'a embroidered {c}.',
+#     lambda c: f'a pixelated sound of a {c}.',
+#     lambda c: f'itap of the {c}.',
+#     lambda c: f'a jpeg corrupted sound of the {c}.',
+#     lambda c: f'a good sound of a {c}.',
+#     lambda c: f'a plushie {c}.',
+#     lambda c: f'a sound of the nice {c}.',
+#     lambda c: f'a sound of the small {c}.',
+#     lambda c: f'a sound of the weird {c}.',
+#     lambda c: f'the cartoon {c}.',
+#     lambda c: f'art of the {c}.',
+#     lambda c: f'a drawing of the {c}.',
+#     lambda c: f'a sound of the large {c}.',
+#     lambda c: f'a black and white sound of a {c}.',
+#     lambda c: f'the plushie {c}.',
+#     lambda c: f'a dark sound of a {c}.',
+#     lambda c: f'itap of a {c}.',
+#     lambda c: f'graffiti of the {c}.',
+#     lambda c: f'a toy {c}.',
+#     lambda c: f'itap of my {c}.',
+#     lambda c: f'a sound of a cool {c}.',
+#     lambda c: f'a sound of a small {c}.',
+#     lambda c: f'a tattoo of the {c}.',
+# )
+# a much smaller subset of above prompts
+# from https://github.com/openai/CLIP/blob/main/notebooks/Prompt_Engineering_for_ImageNet.ipynb
+# Seven-prompt template set; each entry is a callable mapping a class name to
+# a prompt string.
+# NOTE(review): the wording looks like image prompts with "photo" swapped for
+# "sound" (e.g. 'itap of a', 'a origami') -- confirm these are intended for audio.
+SIMPLE_IMAGENET_TEMPLATES = (
+    lambda c: f'itap of a {c}.',
+    lambda c: f'a bad sound of the {c}.',
+    lambda c: f'a origami {c}.',
+    lambda c: f'a sound of the large {c}.',
+    lambda c: f'a {c} in a video game.',
+    lambda c: f'art of the {c}.',
+    lambda c: f'a sound of the small {c}.',
+)
+# class_labels_indices.csv is expected to sit in the same directory as this module.
+PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "class_labels_indices.csv")
+# Dataset name -> tuple of class names. Keys are case-sensitive.
+CLASSNAMES = {
+    # Read at import time: the third column (index 2) of the CSV at PATH.
+    'Audioset': tuple(pd.read_csv(PATH).values[:, 2]),
+    # 50 class names.
+    'ESC50': (
+        'airplane', 'breathing', 'brushing teeth', 'can opening', 'car horn', 'cat', 'chainsaw', 'chirping birds',
+        'church bells', 'clapping', 'clock alarm', 'clock tick', 'coughing', 'cow', 'crackling fire', 'crickets',
+        'crow', 'crying baby', 'dog', 'door wood creaks', 'door wood knock', 'drinking sipping', 'engine', 'fireworks',
+        'footsteps', 'frog', 'glass breaking', 'hand saw', 'helicopter', 'hen', 'insects', 'keyboard typing',
+        'laughing', 'mouse click', 'pig', 'pouring water', 'rain', 'rooster', 'sea waves', 'sheep', 'siren',
+        'sneezing', 'snoring', 'thunderstorm', 'toilet flush', 'train', 'vacuum cleaner', 'washing machine',
+        'water drops', 'wind'
+    ),
+    # Label strings are kept verbatim (including spellings such as 'eletric');
+    # presumably they must match the dataset's own label text -- do not "fix" them.
+    'VGGSound': (
+        'air conditioning noise', 'air horn', 'airplane', 'airplane flyby', 'alarm clock ringing',
+        'alligators, crocodiles hissing', 'ambulance siren', 'arc welding', 'baby babbling', 'baby crying',
+        'baby laughter', 'baltimore oriole calling', 'barn swallow calling', 'basketball bounce',
+        'bathroom ventilation fan running', 'beat boxing', 'bee, wasp, etc. buzzing', 'bird chirping, tweeting',
+        'bird squawking', 'bird wings flapping', 'black capped chickadee calling', 'blowtorch igniting',
+        'bouncing on trampoline', 'bowling impact', 'bull bellowing', 'canary calling', 'cap gun shooting',
+        'car engine idling', 'car engine knocking', 'car engine starting', 'car passing by', 'cat caterwauling',
+        'cat growling', 'cat hissing', 'cat meowing', 'cat purring', 'cattle mooing', 'cattle, bovinae cowbell',
+        'cell phone buzzing', 'chainsawing trees', 'cheetah chirrup', 'chicken clucking', 'chicken crowing',
+        'child singing', 'child speech, kid speaking', 'children shouting', 'chimpanzee pant-hooting',
+        'chinchilla barking', 'chipmunk chirping', 'chopping food', 'chopping wood', 'church bell ringing',
+        'civil defense siren', 'cow lowing', 'coyote howling', 'cricket chirping', 'crow cawing', 'cuckoo bird calling',
+        'cupboard opening or closing', 'cutting hair with electric trimmers', 'dinosaurs bellowing', 'disc scratching',
+        'dog barking', 'dog baying', 'dog bow-wow', 'dog growling', 'dog howling', 'dog whimpering',
+        'donkey, ass braying', 'door slamming', 'driving buses', 'driving motorcycle', 'driving snowmobile',
+        'duck quacking', 'eagle screaming', 'eating with cutlery', 'electric grinder grinding',
+        'electric shaver, electric razor shaving', 'elephant trumpeting', 'eletric blender running', 'elk bugling',
+        'engine accelerating, revving, vroom', 'female singing', 'female speech, woman speaking', 'ferret dooking',
+        'fire crackling', 'fire truck siren', 'fireworks banging', 'firing cannon', 'firing muskets',
+        'fly, housefly buzzing', 'foghorn', 'footsteps on snow', 'forging swords', 'fox barking', 'francolin calling',
+        'frog croaking', 'gibbon howling', 'goat bleating', 'golf driving', 'goose honking', 'hail',
+        'hair dryer drying', 'hammering nails', 'heart sounds, heartbeat', 'hedge trimmer running', 'helicopter',
+        'horse clip-clop', 'horse neighing', 'ice cracking', 'ice cream truck, ice cream van', 'lathe spinning',
+        'lawn mowing', 'lighting firecrackers', 'lions growling', 'lions roaring', 'lip smacking',
+        'machine gun shooting', 'magpie calling', 'male singing', 'male speech, man speaking', 'metronome',
+        'missile launch', 'mosquito buzzing', 'motorboat, speedboat acceleration', 'mouse clicking', 'mouse pattering',
+        'mouse squeaking', 'mynah bird singing', 'ocean burbling', 'opening or closing car doors',
+        'opening or closing car electric windows', 'opening or closing drawers', 'orchestra', 'otter growling',
+        'owl hooting', 'parrot talking', 'penguins braying', 'people babbling', 'people battle cry',
+        'people belly laughing', 'people booing', 'people burping', 'people cheering', 'people clapping',
+        'people coughing', 'people crowd', 'people eating', 'people eating apple', 'people eating crisps',
+        'people eating noodle', 'people farting', 'people finger snapping', 'people gargling', 'people giggling',
+        'people hiccup', 'people humming', 'people marching', 'people nose blowing', 'people running',
+        'people screaming', 'people shuffling', 'people slapping', 'people slurping', 'people sneezing',
+        'people sniggering', 'people sobbing', 'people whispering', 'people whistling', 'pheasant crowing',
+        'pig oinking', 'pigeon, dove cooing', 'planing timber', 'plastic bottle crushing', 'playing accordion',
+        'playing acoustic guitar', 'playing badminton', 'playing bagpipes', 'playing banjo', 'playing bass drum',
+        'playing bass guitar', 'playing bassoon', 'playing bongo', 'playing bugle', 'playing castanets',
+        'playing cello', 'playing clarinet', 'playing congas', 'playing cornet', 'playing cymbal', 'playing darts',
+        'playing didgeridoo', 'playing djembe', 'playing double bass', 'playing drum kit', 'playing electric guitar',
+        'playing electronic organ', 'playing erhu', 'playing flute', 'playing french horn', 'playing glockenspiel',
+        'playing gong', 'playing guiro', 'playing hammond organ', 'playing harmonica', 'playing harp',
+        'playing harpsichord', 'playing hockey', 'playing lacrosse', 'playing mandolin', 'playing marimba, xylophone',
+        'playing oboe', 'playing piano', 'playing saxophone', 'playing shofar', 'playing sitar', 'playing snare drum',
+        'playing squash', 'playing steel guitar, slide guitar', 'playing steelpan', 'playing synthesizer',
+        'playing tabla', 'playing table tennis', 'playing tambourine', 'playing tennis', 'playing theremin',
+        'playing timbales', 'playing timpani', 'playing trombone', 'playing trumpet', 'playing tuning fork',
+        'playing tympani', 'playing ukulele', 'playing vibraphone', 'playing violin, fiddle', 'playing volleyball',
+        'playing washboard', 'playing zither', 'police car (siren)', 'police radio chatter', 'popping popcorn',
+        'printer printing', 'pumping water', 'race car, auto racing', 'railroad car, train wagon', 'raining', 'rapping',
+        'reversing beeps', 'ripping paper', 'roller coaster running', 'rope skipping', 'rowboat, canoe, kayak rowing',
+        'running electric fan', 'sailing', 'scuba diving', 'sea lion barking', 'sea waves', 'sharpen knife',
+        'sheep bleating', 'shot football', 'singing bowl', 'singing choir', 'skateboarding', 'skidding', 'skiing',
+        'sliding door', 'sloshing water', 'slot machine', 'smoke detector beeping', 'snake hissing', 'snake rattling',
+        'splashing water', 'spraying water', 'squishing water', 'stream burbling', 'strike lighter', 'striking bowling',
+        'striking pool', 'subway, metro, underground', 'swimming', 'tap dancing', 'tapping guitar',
+        'telephone bell ringing', 'thunder', 'toilet flushing', 'tornado roaring', 'tractor digging', 'train horning',
+        'train wheels squealing', 'train whistling', 'turkey gobbling', 'typing on computer keyboard',
+        'typing on typewriter', 'underwater bubbling', 'using sewing machines', 'vacuum cleaner cleaning floors',
+        'vehicle horn, car horn, honking', 'volcano explosion', 'warbler chirping', 'waterfall burbling',
+        'whale calling', 'wind chime', 'wind noise', 'wind rustling leaves', 'wood thrush calling',
+        'woodpecker pecking tree', 'writing on blackboard with chalk', 'yodelling', 'zebra braying'
+    )
+}

a_cls/zeroshot_cls.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import json
+import logging
+import os
+from training.distributed import is_master
+from .zero_shot import zero_shot_eval
+try:
+    import wandb
+except ImportError:
+    wandb = None
+def evaluate_a_cls(model, data, epoch, args, tb_writer=None):
+    metrics = {}
+    if not is_master(args):
+        return metrics
+    model.eval()
+    zero_shot_metrics = zero_shot_eval(model, data, epoch, args)
+    metrics.update(zero_shot_metrics)
+    if not metrics:
+        return metrics
+    logging.info(
+        f"Eval Epoch: {epoch} "
+        + "\t".join([f"{k}: {round(v, 4):.4f}" for k, v in metrics.items()])
+    )
+    if args.save_logs:
+        for name, val in metrics.items():
+            if tb_writer is not None:
+                tb_writer.add_scalar(f"val/a_cls/{args.val_a_cls_data[0].lower()}/{name}", val, epoch)
+        args.a_cls_output_dir = os.path.join(args.log_base_path, f'a_cls/{args.val_a_cls_data[0].lower()}')
+        os.makedirs(args.a_cls_output_dir, exist_ok=True)
+        with open(os.path.join(args.a_cls_output_dir, "results.jsonl"), "a+") as f:
+            f.write(json.dumps(metrics))
+            f.write("\n")
+    if args.wandb:
+        assert wandb is not None, 'Please install wandb.'
+        for name, val in metrics.items():
+            wandb.log({f"val/{name}": val, 'epoch': epoch})
+    return metrics

al_ret/data_dataloaders.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import argparse
+import torch
+from torch.utils.data import DataLoader
+from data.build_datasets import get_data
+from data.process_audio import get_audio_transform
+from .dataloader_msrvtt_retrieval import MSRVTT_DataLoader
+def dataloader_msrvtt_test(args, tokenizer, subset="test"):
+    msrvtt_testset = MSRVTT_DataLoader(
+        csv_path=args.val_csv,
+        features_path=args.features_path,
+        max_words=args.max_words,
+        tokenizer=tokenizer,
+        transform=get_audio_transform(args)
+    )
+    dataloader_msrvtt = DataLoader(
+        msrvtt_testset,
+        batch_size=args.batch_size_val,
+        num_workers=args.num_thread_reader,
+        shuffle=False,
+        drop_last=False,
+    )
+    return dataloader_msrvtt, len(msrvtt_testset)
+DATALOADER_DICT = {}
+DATALOADER_DICT["msrvtt"] = {"val":dataloader_msrvtt_test, "test":None}

al_ret/dataloader_msrvtt_retrieval.py ADDED Viewed

	@@ -0,0 +1,114 @@

+from __future__ import absolute_import
+from __future__ import division
+from __future__ import unicode_literals
+from __future__ import print_function
+import os
+import torchaudio
+from torch.utils.data import Dataset
+import numpy as np
+import pandas as pd
+from collections import defaultdict
+import json
+import random
+from torchvision.io import read_video
+class MSRVTT_DataLoader(Dataset):
+    """MSRVTT dataset loader."""
+    def __init__(
+            self,
+            csv_path,
+            features_path,
+            tokenizer,
+            transform=77,
+            max_words=30,
+    ):
+        self.data = pd.read_csv(csv_path)
+        self.features_path = features_path
+        self.max_words = max_words
+        self.tokenizer = tokenizer
+        # self.rawVideoExtractor = RawVideoExtractor(framerate=feature_framerate, size=image_resolution)
+        self.transform = transform
+        self.SPECIAL_TOKEN = {"CLS_TOKEN": "<|startoftext|>", "SEP_TOKEN": "<|endoftext|>",
+                              "MASK_TOKEN": "[MASK]", "UNK_TOKEN": "[UNK]", "PAD_TOKEN": "[PAD]"}
+    def __len__(self):
+        return len(self.data)
+    def _get_text(self, video_id, sentence):
+        """Tokenize one caption into fixed-length id, mask and segment arrays.
+        Args:
+            video_id: Identifier of the clip the caption belongs to; it is only
+                echoed back in the returned id list.
+            sentence: Caption text handed to ``self.tokenizer``.
+        Returns:
+            Tuple ``(pairs_text, pairs_mask, pairs_segment, choice_video_ids)``.
+            The three arrays are int64 with shape ``(1, self.max_words)``,
+            zero-padded on the right; ``pairs_segment`` is all zeros and
+            ``choice_video_ids`` is ``[video_id]``.
+        Raises:
+            AssertionError: If the tokenizer yields more than ``self.max_words``
+                ids, or ids and mask of different lengths.
+        """
+        choice_video_ids = [video_id]
+        k = len(choice_video_ids)
+        # np.long was removed in NumPy 1.24; np.int64 is the explicit spelling.
+        pairs_text = np.zeros((k, self.max_words), dtype=np.int64)
+        pairs_mask = np.zeros((k, self.max_words), dtype=np.int64)
+        # Segment ids are always zero, so the zero-initialised array is final.
+        pairs_segment = np.zeros((k, self.max_words), dtype=np.int64)
+        for i, _ in enumerate(choice_video_ids):
+            output = self.tokenizer(sentence)
+            # Flatten to 1-D arrays: the tokenizer output has no ``append``,
+            # so padding is done by writing into the zero-filled rows instead.
+            input_ids = np.asarray(output[0]).reshape(-1)
+            input_mask = np.asarray(output[1]).reshape(-1)
+            assert len(input_ids) <= self.max_words
+            assert len(input_mask) == len(input_ids)
+            pairs_text[i, :len(input_ids)] = input_ids
+            pairs_mask[i, :len(input_mask)] = input_mask
+        return pairs_text, pairs_mask, pairs_segment, choice_video_ids
+    def _get_rawvideo(self, choice_video_ids):
+        """Load and transform the audio that accompanies a single clip.
+        Args:
+            choice_video_ids: List holding exactly one video id.
+        Returns:
+            Whatever ``self.transform`` returns for the ``torchaudio.load``
+            result of the clip's audio file.
+        Raises:
+            AssertionError: If ``choice_video_ids`` does not hold exactly one id.
+        """
+        # The original pre-allocated an ``audio`` buffer with dtype=np.float
+        # that was never filled or returned; np.float was removed in NumPy 1.24,
+        # so that dead allocation is dropped rather than ported.
+        assert len(choice_video_ids) == 1
+        for video_id in choice_video_ids:
+            # Specific to the YouCookII dataset, due to its video format:
+            # fall back to .webm when no .mp4 exists.
+            video_path = os.path.join(self.features_path, "{}.mp4".format(video_id))
+            if os.path.exists(video_path) is False:
+                video_path = video_path.replace(".mp4", ".webm")
+            # NOTE(review): this replaces every 'mp4' substring in the path, not
+            # only the extension, and leaves a .webm path untouched - confirm
+            # this matches the on-disk layout of the .wav files.
+            audio_data = torchaudio.load(video_path.replace('mp4', 'wav'))
+            audio_data = self.transform(audio_data)
+        return audio_data
+    def __getitem__(self, idx):
+        """Return ``(audio_data, pairs_text, pairs_mask)`` for row ``idx`` of ``self.data``."""
+        row_video_id = self.data['video_id'].values[idx]
+        row_sentence = self.data['sentence'].values[idx]
+        text_fields = self._get_text(row_video_id, row_sentence)
+        # The segment ids are produced by _get_text but not part of a sample.
+        pairs_text, pairs_mask, _segment, clip_ids = text_fields
+        return self._get_rawvideo(clip_ids), pairs_text, pairs_mask

al_ret/datasets.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import logging
+import os.path
+import random
+import numpy as np
+import pandas as pd
+import torch
+from torch.utils.data import Dataset
+from data.build_datasets import DataInfo
+from open_clip import get_input_dtype, get_tokenizer
+from open_clip.factory import HF_HUB_PREFIX
+from data.process_audio import get_audio_transform, torchaudio_loader
+class Audiocaps_dataset(Dataset):
+    """AudioCaps test split exposed as one sample per (audio, caption) pair.
+    Reads ``<data_path>/audiocaps_test.tsv``; column 1 is used as the audio id
+    and column 2 as the caption. Captions of the same audio are stored
+    contiguously, and ``cut_off_points`` holds the cumulative caption count
+    after each audio.
+    Args:
+        data_path: Directory containing the TSV and the audio files.
+        transform: Callable applied to the loaded audio.
+        loader: Callable that loads audio from a path.
+        tokenizer: Callable returning ``(input_ids, attention_mask)`` for a caption.
+    """
+    def __init__(self, data_path, transform, loader, tokenizer):
+        super().__init__()
+        self.audio_root = data_path
+        raw_meta = pd.read_csv(f'{self.audio_root}/audiocaps_test.tsv', delimiter='\t').values
+        # Group captions per audio id. A dict keeps first-seen order, so the
+        # sample order is identical on every run; the previous list(set(...))
+        # ordering of string ids depended on hash randomisation.
+        captions = {}
+        for row in raw_meta:
+            captions.setdefault(row[1], []).append(row[2])
+        audio_ids = list(captions)
+        self.sample_len = 0
+        self.sentences_dict = {}
+        self.cut_off_points = []
+        for audio_id in audio_ids:
+            for cap_txt in captions[audio_id]:
+                # The first 10 characters of the id are dropped; presumably a
+                # fixed path prefix in the TSV - confirm against the data.
+                self.sentences_dict[len(self.sentences_dict)] = (audio_id[10:], cap_txt)
+            self.cut_off_points.append(len(self.sentences_dict))
+        self.multi_sentence_per_audio = True  # !!! important tag for eval
+        if self.multi_sentence_per_audio:
+            self.sentence_num = len(self.sentences_dict)
+            self.audio_num = len(audio_ids)
+            assert len(self.cut_off_points) == self.audio_num
+            print("Sentence number: {}".format(self.sentence_num))
+            print("Video number: {}".format(self.audio_num))
+        self.sample_len = len(self.sentences_dict)
+        self.transform = transform
+        self.torchaudio_loader = loader
+        self.tokenizer = tokenizer
+    def __len__(self):
+        """Return the number of (audio, caption) pairs."""
+        return self.sample_len
+    def __getitem__(self, idx):
+        """Return ``(audio_data, input_ids, attention_mask)`` for pair ``idx``."""
+        audiocap_id, caption = self.sentences_dict[idx]
+        audio_path = os.path.join(self.audio_root, audiocap_id)
+        audio = self.torchaudio_loader(audio_path)
+        audio_data = self.transform(audio)
+        input_ids, attention_mask = self.tokenizer(caption)
+        return audio_data, input_ids.squeeze(), attention_mask.squeeze()
+class Clotho_dataset(Dataset):
+    """Clotho evaluation captions exposed as one sample per (audio, caption) pair.
+    Reads ``<data_path>/CLOTHO_retrieval_dataset/clotho_captions_evaluation.csv``;
+    column 0 is the audio id and every remaining column is a caption for it.
+    ``cut_off_points`` holds the cumulative caption count after each audio.
+    """
+    def __init__(self, data_path, transform, loader, tokenizer):
+        super().__init__()
+        self.audio_root = data_path
+        csv_path = f'{self.audio_root}/CLOTHO_retrieval_dataset/clotho_captions_evaluation.csv'
+        raw_meta = pd.read_csv(csv_path).values
+        audio_ids = raw_meta[:, 0].tolist()
+        captions = {row[0]: row[1:].tolist() for row in raw_meta}
+        self.sample_len = 0
+        self.sentences_dict = {}
+        self.cut_off_points = []
+        for clip_id in audio_ids:
+            assert clip_id in captions
+            for text in captions[clip_id]:
+                next_index = len(self.sentences_dict)
+                self.sentences_dict[next_index] = (clip_id, text)
+            # Running total of captions seen once this clip is done.
+            self.cut_off_points.append(len(self.sentences_dict))
+        self.multi_sentence_per_audio = True    # !!! important tag for eval
+        if self.multi_sentence_per_audio:
+            self.sentence_num = len(self.sentences_dict)
+            self.audio_num = len(audio_ids)
+            assert len(self.cut_off_points) == self.audio_num
+            print("Sentence number: {}".format(self.sentence_num))
+            print("Video number: {}".format(self.audio_num))
+        self.sample_len = len(self.sentences_dict)
+        self.transform = transform
+        self.torchaudio_loader = loader
+        self.tokenizer = tokenizer
+    def __len__(self):
+        """Return the number of (audio, caption) pairs."""
+        return self.sample_len
+    def __getitem__(self, idx):
+        """Return ``(audio_data, input_ids, attention_mask)`` for pair ``idx``."""
+        clip_id, text = self.sentences_dict[idx]
+        clip_path = os.path.join(self.audio_root, f'evaluation/{clip_id}')
+        audio_data = self.transform(self.torchaudio_loader(clip_path))
+        input_ids, attention_mask = self.tokenizer(text)
+        return audio_data, input_ids.squeeze(), attention_mask.squeeze()
+def get_audio_dataset(args):
+    """Build the evaluation DataLoader for the dataset named by ``args.val_al_ret_data``.
+    The name is matched case-insensitively against 'audiocaps' and 'clotho';
+    any other value raises ``ValueError``. The loader is unshuffled and keeps
+    the last partial batch.
+    """
+    data_path = args.audio_data_path
+    transform = get_audio_transform(args)
+    tokenizer = get_tokenizer(HF_HUB_PREFIX+args.model, cache_dir=args.cache_dir)
+    dataset_classes = {'audiocaps': Audiocaps_dataset, 'clotho': Clotho_dataset}
+    dataset_cls = dataset_classes.get(args.val_al_ret_data.lower())
+    if dataset_cls is None:
+        raise ValueError(f'unsupport dataset {args.val_al_ret_data}')
+    dataset = dataset_cls(data_path, transform=transform, loader=torchaudio_loader, tokenizer=tokenizer)
+    return torch.utils.data.DataLoader(
+        dataset,
+        batch_size=args.batch_size,
+        num_workers=args.workers,
+        shuffle=False,
+        drop_last=False,
+    )

al_ret/metrics.py ADDED Viewed

	@@ -0,0 +1,70 @@

+from __future__ import absolute_import
+from __future__ import division
+from __future__ import unicode_literals
+from __future__ import print_function
+import numpy as np
+import torch
+def compute_metrics(x):
+    """Compute recall@K and rank statistics from a square similarity matrix.
+    The diagonal entry of each row is treated as the correct match. Rows are
+    sorted by descending similarity and every position whose value equals the
+    diagonal value is recorded as a 0-based rank, so a row with tied values
+    contributes more than one rank.
+    Returns:
+        Dict with 'R1', 'R5', 'R10' (percentages), 'MR' and 'MedianR'
+        (1-based median rank) and 'MeanR' (1-based mean rank).
+    """
+    negated = -x
+    ranked = np.sort(negated, axis=1)
+    target = np.diag(negated).reshape(-1, 1)
+    _, positions = np.nonzero((ranked - target) == 0)
+    total = len(positions)
+    median_rank = np.median(positions) + 1
+    return {
+        'R1': float(np.sum(positions == 0)) * 100 / total,
+        'R5': float(np.sum(positions < 5)) * 100 / total,
+        'R10': float(np.sum(positions < 10)) * 100 / total,
+        'MR': median_rank,
+        'MedianR': median_rank,
+        'MeanR': np.mean(positions) + 1,
+    }
+def print_computed_metrics(metrics):
+    """Print R@1, R@5, R@10 and the median rank from a metrics dict on one line."""
+    template = 'R@1: {:.4f} - R@5: {:.4f} - R@10: {:.4f} - Median R: {}'
+    values = [metrics[key] for key in ('R1', 'R5', 'R10', 'MR')]
+    print(template.format(*values))
+# The two functions below come directly from: https://github.com/Deferf/Experiments
+def tensor_text_to_video_metrics(sim_tensor, top_k=(1, 5, 10)):
+    """Compute text-to-video rank metrics from a padded 3-D similarity tensor.
+    Args:
+        sim_tensor: Tensor or array-like indexed as ``[video, caption_slot, video]``;
+            entry ``[i, j, c]`` is the similarity of caption ``j`` of video ``i``
+            to video ``c``. Slots whose correct-match score is inf or NaN are
+            treated as padding and ignored.
+        top_k: Iterable of K values for which recall@K is reported.
+    Returns:
+        Dict with ``R{k}`` for every k in ``top_k`` (percentages), 'MedianR',
+        'MeanR' and 'Std_Rank' (all on 1-based ranks) and 'MR' (same as 'MedianR').
+    """
+    if not torch.is_tensor(sim_tensor):
+        sim_tensor = torch.tensor(sim_tensor)
+    # Permute sim_tensor so it represents a sequence of text-video similarity matrices.
+    # Then obtain the double argsort to position the rank on the diagonal
+    stacked_sim_matrices = sim_tensor.permute(1, 0, 2)
+    first_argsort = torch.argsort(stacked_sim_matrices, dim=-1, descending=True)
+    second_argsort = torch.argsort(first_argsort, dim=-1, descending=False)
+    # Extracts ranks i.e diagonals
+    ranks = torch.flatten(torch.diagonal(second_argsort, dim1=1, dim2=2))
+    # Now we need to extract valid ranks, as some belong to inf padding values
+    permuted_original_data = torch.flatten(torch.diagonal(sim_tensor, dim1=0, dim2=2))
+    mask = ~torch.logical_or(torch.isinf(permuted_original_data), torch.isnan(permuted_original_data))
+    valid_ranks = ranks[mask]
+    # Tensor.numpy() only works on CPU tensors, so move the ranks off the
+    # accelerator before handing them to NumPy.
+    ranks_1based = valid_ranks.cpu().numpy() + 1
+    results = {f"R{k}": float(torch.sum(valid_ranks < k) * 100 / len(valid_ranks)) for k in top_k}
+    results["MedianR"] = float(torch.median(valid_ranks + 1))
+    results["MeanR"] = float(np.mean(ranks_1based))
+    results["Std_Rank"] = float(np.std(ranks_1based))
+    results['MR'] = results["MedianR"]
+    return results
+def tensor_video_to_text_sim(sim_tensor):
+    """Collapse dim 1 of a similarity tensor by taking its maximum.
+    NaN entries are overwritten with -inf first; when a tensor is passed in,
+    that overwrite happens in place on the caller's tensor. The maximum is
+    squeezed and transposed before being returned.
+    """
+    if not torch.is_tensor(sim_tensor):
+        sim_tensor = torch.tensor(sim_tensor)
+    # A value that differs from itself is NaN; replace those with -inf.
+    nan_positions = sim_tensor != sim_tensor
+    sim_tensor[nan_positions] = float('-inf')
+    # Forms a similarity matrix for use with rank at k
+    best = torch.max(sim_tensor, dim=1, keepdim=True)[0]
+    return torch.squeeze(best).T

al_ret/precision.py ADDED Viewed

	@@ -0,0 +1,12 @@

+import torch
+from contextlib import suppress
+def get_autocast(precision):
+    """Return a zero-argument context-manager factory for the given precision.
+    'amp' yields ``torch.cuda.amp.autocast``; 'amp_bfloat16' / 'amp_bf16' yield
+    a factory for bfloat16 autocast; anything else yields ``contextlib.suppress``.
+    """
+    if precision == 'amp':
+        return torch.cuda.amp.autocast
+    if precision in ('amp_bfloat16', 'amp_bf16'):
+        # amp_bfloat16 is more stable than amp float16 for clip training
+        return lambda: torch.cuda.amp.autocast(dtype=torch.bfloat16)
+    return suppress

al_ret/retrieval.py ADDED Viewed

	@@ -0,0 +1,266 @@

+import json
+import logging
+import os
+import numpy as np
+import torch
+from training.distributed import is_master
+from .zero_shot import zero_shot_eval
+from .util import parallel_apply
+from .metrics import compute_metrics, tensor_text_to_video_metrics, tensor_video_to_text_sim
+from torch.nn import functional as F
+try:
+    import wandb
+except ImportError:
+    wandb = None
+#
+# def evaluate_al_ret(model, data, epoch, args, tb_writer=None):
+#     metrics = {}
+#     if not is_master(args):
+#         return metrics
+#     model.eval()
+#
+#     zero_shot_metrics = zero_shot_eval(model, data, epoch, args)
+#     metrics.update(zero_shot_metrics)
+#
+#     if not metrics:
+#         return metrics
+#
+#     logging.info(
+#         f"Eval Epoch: {epoch} "
+#         + "\t".join([f"{k}: {round(v, 4):.4f}" for k, v in metrics.items()])
+#     )
+#
+#     if args.save_logs:
+#         for name, val in metrics.items():
+#             if tb_writer is not None:
+#                 tb_writer.add_scalar(f"val/al_ret/{name}", val, epoch)
+#         args.al_ret_output_dir = os.path.join(args.log_base_path, 'al_ret')
+#         os.makedirs(args.al_ret_output_dir, exist_ok=True)
+#         with open(os.path.join(args.al_ret_output_dir, "results.jsonl"), "a+") as f:
+#             f.write(json.dumps(metrics))
+#             f.write("\n")
+#
+#     if args.wandb:
+#         assert wandb is not None, 'Please install wandb.'
+#         for name, val in metrics.items():
+#             wandb.log({f"val/{name}": val, 'epoch': epoch})
+#
+#     return metrics
+def _run_on_single_gpu(model,
+                       batch_sequence_output_list, batch_visual_output_list):
+    """Build the text-by-audio logit blocks for cached feature batches.
+    For every text feature batch, its product with each transposed visual
+    batch (scaled by ``model.logit_scale``) is computed and the results are
+    concatenated along the last axis.
+    Returns:
+        List with one NumPy array per text batch.
+    """
+    # NOTE(review): logit_scale is used as-is here, while al_ret/zero_shot.py
+    # applies .exp() first - confirm which form this model stores.
+    sim_matrix = []
+    for sequence_output in batch_sequence_output_list:
+        row_blocks = [
+            (model.logit_scale * sequence_output @ visual_output.T).cpu().detach().numpy()
+            for visual_output in batch_visual_output_list
+        ]
+        sim_matrix.append(np.concatenate(tuple(row_blocks), axis=-1))
+    return sim_matrix
+def evaluate_al_ret(model, data, epoch, args, tb_writer=None):
+    """Run text-to-audio retrieval evaluation and log the resulting metrics.
+    The body only runs when ``is_master(args)`` is true and ``args.val_frequency``
+    is set with ``epoch`` either a multiple of it or equal to ``args.epochs``.
+    Text features come from ``model.encode_text`` and audio features from
+    ``model.encode_image``; a similarity matrix is built from them and R@1/5/10,
+    median and mean rank are logged. With ``args.save_logs`` the metrics are
+    also written to ``tb_writer`` (if given) and appended as a JSON line to
+    ``<args.log_base_path>/al_ret/<dataset>/results.jsonl``.
+    Args:
+        model: Model exposing ``device``, ``logit_scale``, ``encode_text`` and
+            ``encode_image``; a wrapper with a ``module`` attribute is unwrapped.
+        data: Mapping with exactly one entry, dataset name -> DataLoader.
+        epoch: Current epoch number.
+        args: Run configuration (see the attributes read below).
+        tb_writer: Optional writer with an ``add_scalar`` method.
+    Returns:
+        None; the metrics are only logged/written, not returned.
+    """
+    if is_master(args) and (args.val_frequency and ((epoch % args.val_frequency) == 0 or epoch == args.epochs)):
+        val_al_ret_data = list(data.keys())
+        assert len(val_al_ret_data) == 1
+        val_al_ret_data = val_al_ret_data[0]
+        test_dataloader = data[val_al_ret_data]
+        # Read the device before a possible DataParallel/DDP wrapper is removed.
+        device = model.device
+        n_gpu = torch.cuda.device_count()
+        logging.info(f"\nEval Epoch: {epoch}, eval Audio-Text Retrieval under {val_al_ret_data.upper()} test data")
+        if hasattr(model, 'module'):
+            model = model.module.to(device)
+        else:
+            model = model.to(device)
+        # #################################################################
+        ## below variables are used to multi-sentences retrieval
+        # multi_sentence_: important tag for eval
+        # cut_off_points: used to tag the label when calculate the metric
+        # sentence_num: used to cut the sentence representation
+        # video_num: used to cut the video representation
+        # #################################################################
+        multi_sentence_ = False
+        cut_off_points_, sentence_num_, video_num_ = [], -1, -1
+        if hasattr(test_dataloader.dataset, 'multi_sentence_per_audio') and test_dataloader.dataset.multi_sentence_per_audio:
+        # if False:
+            multi_sentence_ = True
+            cut_off_points_ = test_dataloader.dataset.cut_off_points
+            sentence_num_ = test_dataloader.dataset.sentence_num
+            video_num_ = test_dataloader.dataset.audio_num
+            # For the datasets in al_ret/datasets.py cut_off_points holds cumulative
+            # caption counts, so subtracting 1 gives the index of each clip's last caption.
+            cut_off_points_ = [itm - 1 for itm in cut_off_points_]
+        if multi_sentence_:
+            print("Eval under the multi-sentence per audio clip setting.")
+            print("sentence num: {}, video num: {}".format(sentence_num_, video_num_))
+            logging.info("Eval under the multi-sentence per audio clip setting.")
+            logging.info("sentence num: {}, video num: {}".format(sentence_num_, video_num_))
+        model.eval()
+        with torch.no_grad():
+            # batch_list_t = []
+            # batch_list_v = []
+            batch_sequence_output_list, batch_visual_output_list = [], []
+            total_video_num = 0
+            # ----------------------------
+            # 1. cache the features
+            # ----------------------------
+            for bid, batch in enumerate(test_dataloader):
+                # batch = tuple(t.to(device) for t in batch)
+                video, input_ids, attention_mask = batch
+                # NOTE(review): squeeze() with no dim also drops a batch dimension
+                # of size 1 (e.g. a final batch holding a single sample) - confirm
+                # encode_text tolerates that.
+                input_ids = input_ids.squeeze().to(device)
+                attention_mask = attention_mask.squeeze().to(device)
+                # video = video.squeeze().permute(0, 2, 1, 3, 4).float().to(device)
+                video = video.float().to(device)
+                if multi_sentence_:
+                    # multi-sentences retrieval means: one clip has two or more descriptions.
+                    b, *_t = video.shape
+                    sequence_output = model.encode_text(input_ids, attention_mask)
+                    # logging.info(f'multi: {sequence_output.shape}')
+                    # sequence_output = model.get_sequence_output(input_ids, segment_ids, input_mask)
+                    batch_sequence_output_list.append(sequence_output)
+                    # batch_list_t.append((input_mask, segment_ids,))
+                    # Global sample index range covered by this batch.
+                    s_, e_ = total_video_num, total_video_num + b
+                    # Keep only the samples that are the last caption of a clip, so
+                    # each audio clip is encoded exactly once.
+                    filter_inds = [itm - s_ for itm in cut_off_points_ if itm >= s_ and itm < e_] # cut_off_points_ [0 4 9 14]
+                    if len(filter_inds) > 0:
+                        # video, video_mask = video[filter_inds, ...], video_mask[filter_inds, ...]
+                        video = video[filter_inds, ...]
+                        # visual_output = model.get_visual_output(video, video_mask)
+                        visual_output = model.encode_image(video)
+                        batch_visual_output_list.append(visual_output)
+                        # batch_list_v.append((video_mask,))
+                    total_video_num += b
+                else:
+                    sequence_output = model.encode_text(input_ids, attention_mask)
+                    visual_output = model.encode_image(video)
+                    # sequence_output, visual_output = model.get_sequence_visual_output(input_ids, segment_ids, input_mask, video, video_mask)
+                    batch_sequence_output_list.append(sequence_output)
+                    # batch_list_t.append((input_mask, segment_ids,))
+                    batch_visual_output_list.append(visual_output)
+                    # batch_list_v.append((video_mask,))
+                print(f"Process {val_al_ret_data.upper()}: {bid}/{len(test_dataloader)}\r", end='')
+            # ----------------------------------
+            # 2. calculate the similarity
+            # ----------------------------------
+            n_gpu = torch.cuda.device_count()
+            if n_gpu > 1:
+                device_ids = list(range(n_gpu))
+                batch_t_output_splits = []
+                batch_v_output_splits = []
+                bacth_len = len(batch_sequence_output_list)
+                # Ceil division: text batches are split into one contiguous slice per GPU.
+                split_len = (bacth_len + n_gpu - 1) // n_gpu
+                for dev_id in device_ids:
+                    s_, e_ = dev_id * split_len, (dev_id + 1) * split_len
+                    if dev_id == 0:
+                        batch_t_output_splits.append(batch_sequence_output_list[s_:e_])
+                        batch_v_output_splits.append(batch_visual_output_list)
+                    else:
+                        # Every other GPU gets its text slice plus a full copy of the audio features.
+                        devc = torch.device('cuda:{}'.format(str(dev_id)))
+                        devc_batch_list = [b.to(devc) for b in batch_sequence_output_list[s_:e_]]
+                        batch_t_output_splits.append(devc_batch_list)
+                        devc_batch_list = [b.to(devc) for b in batch_visual_output_list]
+                        batch_v_output_splits.append(devc_batch_list)
+                parameters_tuple_list = [(
+                                          batch_t_output_splits[dev_id], batch_v_output_splits[dev_id]) for dev_id in device_ids]
+                parallel_outputs = parallel_apply(_run_on_single_gpu, model, parameters_tuple_list, device_ids)
+                sim_matrix = []
+                for idx in range(len(parallel_outputs)):
+                    sim_matrix += parallel_outputs[idx]
+                sim_matrix = np.concatenate(tuple(sim_matrix), axis=0)
+            else:
+                sim_matrix = _run_on_single_gpu(model,
+                                                # batch_list_t, batch_list_v,
+                                                batch_sequence_output_list, batch_visual_output_list)
+                sim_matrix = np.concatenate(tuple(sim_matrix), axis=0)
+        #####################################################################
+        if multi_sentence_:
+            logging.info(f"{val_al_ret_data.upper()} before reshape, sim matrix size: {sim_matrix.shape}")
+            # Back to cumulative counts, used as slice ends for each clip's captions.
+            cut_off_points2len_ = [itm + 1 for itm in cut_off_points_]
+            max_length = max([e_-s_ for s_, e_ in zip([0]+cut_off_points2len_[:-1], cut_off_points2len_)])
+            sim_matrix_new = []
+            # Pad every clip's caption rows with -inf up to max_length so they stack.
+            for s_, e_ in zip([0] + cut_off_points2len_[:-1], cut_off_points2len_):
+                sim_matrix_new.append(np.concatenate((sim_matrix[s_:e_],
+                                                      np.full((max_length-e_+s_, sim_matrix.shape[1]), -np.inf)), axis=0))
+            sim_matrix = np.stack(tuple(sim_matrix_new), axis=0)
+            logging.info(f"{val_al_ret_data.upper()} after reshape, sim matrix size: {sim_matrix.shape}")
+            tv_metrics = tensor_text_to_video_metrics(sim_matrix)
+            # vt_metrics = compute_metrics(tensor_video_to_text_sim(sim_matrix))
+        else:
+            logging.info(f"{val_al_ret_data.upper()} sim matrix size: {sim_matrix.shape[0]}, {sim_matrix.shape[1]}")
+            # NOTE(review): the matrix is moved to CUDA and straight back to CPU;
+            # this branch therefore requires a CUDA device.
+            t2v_sim_matrix = torch.from_numpy(sim_matrix).cuda()
+            # t2v_sim_matrix = t2v_sim_matrix * F.softmax(t2v_sim_matrix*10, dim=0) * len(t2v_sim_matrix)
+            tv_metrics = compute_metrics(t2v_sim_matrix.cpu().numpy())
+            # vt_metrics = compute_metrics(t2v_sim_matrix.T.cpu().numpy())
+            logging.info('\t Length-T: {}, Length-V:{}'.format(len(sim_matrix), len(sim_matrix[0])))
+        logging.info(f"{val_al_ret_data.upper()} Text-to-Audio:")
+        logging.info('\t>>>  R@1: {:.1f} - R@5: {:.1f} - R@10: {:.1f} - Median R: {:.1f} - Mean R: {:.1f}'.
+                    format(tv_metrics['R1'], tv_metrics['R5'], tv_metrics['R10'], tv_metrics['MR'], tv_metrics['MeanR']))
+        # logging.info(f"{val_al_ret_data.upper()} Text-to-Audio:")
+        # logging.info('\t>>>  V2T$R@1: {:.1f} - V2T$R@5: {:.1f} - V2T$R@10: {:.1f} - V2T$Median R: {:.1f} - V2T$Mean R: {:.1f}'.
+        #             format(vt_metrics['R1'], vt_metrics['R5'], vt_metrics['R10'], vt_metrics['MR'], vt_metrics['MeanR']))
+        if args.save_logs:
+            for name, val in tv_metrics.items():
+                if tb_writer is not None:
+                    tb_writer.add_scalar(f"val/al_ret/{val_al_ret_data}/t2a/{name}", val, epoch)
+            # for name, val in vt_metrics.items():
+            #     if tb_writer is not None:
+            #         tb_writer.add_scalar(f"val/al_ret/{val_al_ret_data}/v2t/{name}", val, epoch)
+            args.al_ret_output_dir = os.path.join(args.log_base_path, f'al_ret/{val_al_ret_data}')
+            os.makedirs(args.al_ret_output_dir, exist_ok=True)
+            # Append mode: one JSON line is added per evaluation run.
+            with open(os.path.join(args.al_ret_output_dir, "results.jsonl"), "a+") as f:
+                f.write(json.dumps({'t2a': tv_metrics}))
+                f.write("\n")
+                # f.write(json.dumps({'v2t': vt_metrics}))
+                # f.write("\n")

al_ret/util.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import torch
+import torch.nn as nn
+import threading
+from torch._utils import ExceptionWrapper
+import logging
+def get_a_var(obj):
+    """Return the first ``torch.Tensor`` found in ``obj``, searching depth-first.
+    Lists and tuples are searched element by element, dicts through their
+    ``(key, value)`` items. Returns ``None`` when no tensor is found.
+    """
+    if isinstance(obj, torch.Tensor):
+        return obj
+    if isinstance(obj, (list, tuple)):
+        children = obj
+    elif isinstance(obj, dict):
+        children = obj.items()
+    else:
+        return None
+    for child in children:
+        found = get_a_var(child)
+        if isinstance(found, torch.Tensor):
+            return found
+    return None
+def parallel_apply(fct, model, inputs, device_ids):
+    """Call ``fct(replica, *input)`` for each model replica, one thread per device.
+    Args:
+        fct: Callable taking a model replica followed by the unpacked input.
+        model: Module replicated onto ``device_ids`` via ``nn.parallel.replicate``.
+        inputs: One input (tensor, list or tuple) per replica; each must contain
+            at least one tensor, whose device selects where the call runs.
+        device_ids: Device ids to replicate ``model`` onto.
+    Returns:
+        List of the outputs of ``fct``, in the order of ``inputs``.
+    Raises:
+        AssertionError: If the number of replicas differs from ``len(inputs)``.
+        Exception: Any exception raised in a worker is re-raised here.
+    """
+    modules = nn.parallel.replicate(model, device_ids)
+    assert len(modules) == len(inputs)
+    # Guards `results`, which is written from several worker threads.
+    lock = threading.Lock()
+    results = {}
+    # Captured here so each worker thread can apply the caller's grad mode.
+    grad_enabled = torch.is_grad_enabled()
+    def _worker(i, module, input):
+        torch.set_grad_enabled(grad_enabled)
+        # The device of the first tensor found in the input decides where to run.
+        device = get_a_var(input).get_device()
+        try:
+            with torch.cuda.device(device):
+                # this also avoids accidental slicing of `input` if it is a Tensor
+                if not isinstance(input, (list, tuple)):
+                    input = (input,)
+                output = fct(module, *input)
+            with lock:
+                results[i] = output
+        except Exception:
+            # Store the failure so it can be re-raised in the calling thread.
+            with lock:
+                results[i] = ExceptionWrapper(where="in replica {} on device {}".format(i, device))
+    if len(modules) > 1:
+        threads = [threading.Thread(target=_worker, args=(i, module, input))
+                   for i, (module, input) in enumerate(zip(modules, inputs))]
+        for thread in threads:
+            thread.start()
+        for thread in threads:
+            thread.join()
+    else:
+        # Single replica: run inline, no thread needed.
+        _worker(0, modules[0], inputs[0])
+    outputs = []
+    for i in range(len(inputs)):
+        output = results[i]
+        if isinstance(output, ExceptionWrapper):
+            output.reraise()
+        outputs.append(output)
+    return outputs
+def get_logger(filename=None):
+    """Configure logging and return the logger named 'logger'.
+    The named logger is set to DEBUG and ``logging.basicConfig`` is called with
+    level INFO. When ``filename`` is given, a DEBUG-level ``FileHandler`` for it
+    is added to the root logger on every call.
+    """
+    named_logger = logging.getLogger('logger')
+    named_logger.setLevel(logging.DEBUG)
+    logging.basicConfig(
+        format='%(asctime)s - %(levelname)s -   %(message)s',
+        datefmt='%m/%d/%Y %H:%M:%S',
+        level=logging.INFO,
+    )
+    if filename is None:
+        return named_logger
+    file_handler = logging.FileHandler(filename)
+    file_handler.setLevel(logging.DEBUG)
+    file_handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
+    logging.getLogger().addHandler(file_handler)
+    return named_logger

al_ret/zero_shot.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import logging
+import numpy as np
+import torch
+import torch.nn.functional as F
+from tqdm import tqdm
+from open_clip import get_input_dtype, get_tokenizer
+from open_clip.factory import HF_HUB_PREFIX
+from .precision import get_autocast
+def compute_metrics(x):
+    """Compute recall@K and rank statistics from a square similarity matrix.
+    Each row's diagonal entry is the correct match. After sorting a row by
+    descending similarity, every position whose value equals the diagonal
+    value is recorded as a 0-based rank (tied values therefore add extra ranks).
+    Returns:
+        Dict with 'R1', 'R5', 'R10' (percentages), 'MR' and 'MedianR'
+        (1-based median rank) and 'MeanR' (1-based mean rank).
+    """
+    flipped = -x
+    offsets = np.sort(flipped, axis=1) - np.diag(flipped)[:, np.newaxis]
+    hit_cols = np.where(offsets == 0)[1]
+    count = len(hit_cols)
+    metrics = {}
+    for label, hits in (('R1', hit_cols == 0), ('R5', hit_cols < 5), ('R10', hit_cols < 10)):
+        metrics[label] = float(np.sum(hits)) * 100 / count
+    metrics['MR'] = np.median(hit_cols) + 1
+    metrics["MedianR"] = metrics['MR']
+    metrics["MeanR"] = np.mean(hit_cols) + 1
+    return metrics
+def _run_on_single_gpu(model, batch_sequence_output_list, batch_visual_output_list):
+    """Build the text-by-visual logit blocks for cached feature batches.
+    Each text batch is multiplied with every transposed visual batch, scaled
+    by ``model.logit_scale.exp()``, and the blocks are joined along the last axis.
+    Returns:
+        List with one NumPy array per text batch.
+    """
+    scale = model.logit_scale.exp()
+    rows = []
+    for text_feats in batch_sequence_output_list:
+        blocks = []
+        for visual_feats in batch_visual_output_list:
+            logits = scale * torch.matmul(text_feats, visual_feats.t())
+            blocks.append(logits.cpu().detach().numpy())
+        rows.append(np.concatenate(tuple(blocks), axis=-1))
+    return rows
+def run(model, dataloader, args):
+    """Encode every batch of ``dataloader`` and return the full logit matrix.
+    Text features come from ``model.encode_text`` and visual features from
+    ``model.encode_image`` (inputs get an extra dimension at index 2 first);
+    both are computed under the autocast context selected by ``args.precision``.
+    Returns:
+        NumPy array formed by concatenating the per-batch logit rows along axis 0.
+    """
+    autocast = get_autocast(args.precision)
+    input_dtype = get_input_dtype(args.precision)
+    text_batches, visual_batches = [], []
+    with torch.no_grad():
+        for images, input_ids, attention_mask in tqdm(dataloader, unit_scale=args.batch_size):
+            images = images.to(device=args.device, dtype=input_dtype).unsqueeze(2)
+            input_ids = input_ids.squeeze().to(args.device)
+            attention_mask = attention_mask.squeeze().to(args.device)
+            with autocast():
+                text_batches.append(model.encode_text(input_ids, attention_mask))
+                visual_batches.append(model.encode_image(images))
+    logit_rows = _run_on_single_gpu(model, text_batches, visual_batches)
+    return np.concatenate(tuple(logit_rows), axis=0)
+def zero_shot_eval(model, data, epoch, args):
+    """Run zero-shot retrieval on the single dataset in ``data`` and return its metrics.
+    ``args.val_al_ret_data`` is temporarily set to the dataset name while the
+    evaluation runs and is restored on every exit path, including the early
+    returns and exceptions (previously only the full run restored it).
+    Args:
+        model: Model to evaluate; unwrapped via ``.module`` when
+            ``args.distributed`` is set and ``args.horovod`` is not.
+        data: Mapping with exactly one entry; its value must expose ``.dataloader``.
+        epoch: Current epoch number.
+        args: Run configuration; reads ``zeroshot_frequency``, ``epochs``,
+            ``distributed`` and ``horovod``.
+    Returns:
+        Metrics dict from ``compute_metrics``, or ``{}`` when
+        ``args.zeroshot_frequency`` is 0 or ``epoch`` is neither a multiple of
+        it nor equal to ``args.epochs``.
+    Raises:
+        AssertionError: If ``data`` does not hold exactly one dataset.
+    """
+    saved_val_al_ret_data = args.val_al_ret_data
+    try:
+        dataset_names = list(data.keys())
+        assert len(dataset_names) == 1
+        # The name is taken from `data` itself, so no membership check is needed.
+        args.val_al_ret_data = dataset_names[0]
+        if args.zeroshot_frequency == 0:
+            return {}
+        if (epoch % args.zeroshot_frequency) != 0 and epoch != args.epochs:
+            return {}
+        if args.distributed and not args.horovod:
+            model = model.module
+        logging.info(f'Starting zero-shot {args.val_al_ret_data.upper()}.')
+        logit_matrix = run(model, data[args.val_al_ret_data].dataloader, args)
+        results = compute_metrics(logit_matrix)
+        logging.info(f'Finished zero-shot {args.val_al_ret_data.upper()}.')
+        return results
+    finally:
+        # Always hand `args` back with its original dataset setting.
+        args.val_al_ret_data = saved_val_al_ret_data

assets/audio/0.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38aff33c1d6e68dfa0bd310d1e4cff10df4ac3642b3cc96637fab2a0e74b64a9
+size 327788

assets/audio/1.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:25400620506fbc099ee78fe4d31379a218a264ddcfcfe658e4ba9c2255fc6c01
+size 327788

assets/demo.png ADDED Viewed

Git LFS Details

SHA256: 34d7015339c050253fd4044c26bb1b82b423e04106ae84e193bbc87640cbf2de
Pointer size: 131 Bytes
Size of remote file: 364 kB

assets/depth/0.png ADDED Viewed

Git LFS Details

SHA256: c0bb5fa3ffca3067c69ec6b1dfc600798491a1005f9dd2bdaf0c98c5b3a1d2ac
Pointer size: 131 Bytes
Size of remote file: 233 kB

assets/depth/1.png ADDED Viewed

Git LFS Details

SHA256: f642578b11c348dc12ccb0b3d19c146986e6abc17f8cfc02f1cf8e325cdaeaf0
Pointer size: 131 Bytes
Size of remote file: 234 kB

assets/emergency.jpg ADDED Viewed

Git LFS Details

SHA256: db64fd16c971bd704deb7290f347bc784772e52f8718d78724d3805081a57ef7
Pointer size: 131 Bytes
Size of remote file: 204 kB

assets/iclr_dataset_sample.jpg ADDED Viewed

Git LFS Details

SHA256: 81dee815642f74a217e20138a60f9fa6bc76c2a5f2ae5faed18741ef755f6a6e
Pointer size: 131 Bytes
Size of remote file: 169 kB

assets/image/0.jpg ADDED Viewed

assets/image/1.jpg ADDED Viewed

assets/languagebind.jpg ADDED Viewed

Git LFS Details

SHA256: df5faf91d750c28ce16a2ac919b2e277320274c8c1c3636aa572316adcb9c5c1
Pointer size: 131 Bytes
Size of remote file: 273 kB

assets/languagebind_frame.jpg ADDED Viewed

Git LFS Details

SHA256: 9a400701a13ffdc459a5edc933aeb5290aa7114034ef8594f435b7906f15f767
Pointer size: 132 Bytes
Size of remote file: 1.36 MB

assets/languagebind_result.jpg ADDED Viewed

Git LFS Details

SHA256: dac8188d8911a77ab9ecaeeb45303d39422af073c55b7a6785dff664ed4ce544
Pointer size: 131 Bytes
Size of remote file: 441 kB

assets/languge_result.jpg ADDED Viewed

Git LFS Details

SHA256: dac8188d8911a77ab9ecaeeb45303d39422af073c55b7a6785dff664ed4ce544
Pointer size: 131 Bytes
Size of remote file: 441 kB

assets/logo.jpg ADDED Viewed

Git LFS Details

SHA256: 8cdf04f5629c0ffbdcb6dd0fd3ef9df91665361c8da7fa55aaf050ad33408c4c
Pointer size: 131 Bytes
Size of remote file: 915 kB

assets/logo_languagebind.png ADDED Viewed

Git LFS Details

SHA256: f4b53c886ec8a771db8de8812f681aeb8e80a2457fb174aa12753ebf3a835507
Pointer size: 131 Bytes
Size of remote file: 908 kB

assets/res1.jpg ADDED Viewed

assets/res2.jpg ADDED Viewed

assets/result1.jpg ADDED Viewed

Git LFS Details

SHA256: 322aef993d3c5cec718cda144e9b7eb55751dcd5704d526dd1170a0cb04ff697
Pointer size: 131 Bytes
Size of remote file: 142 kB

assets/sota.jpg ADDED Viewed

Git LFS Details

SHA256: 166389a5f6e92f21bbb5cc7b57d50df05b5fbee3fc7da6c5bb9dbb5d9a90666f
Pointer size: 131 Bytes
Size of remote file: 199 kB

assets/thermal/0.jpg ADDED Viewed

assets/thermal/1.jpg ADDED Viewed

assets/video/0.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8d92bdf4ad672f6bc82a72c886c3c8bc7e799866bbe41b184d640a6c5f21a075
+size 661405

assets/video/1.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5a6dcc0228ffcadaaac4441476f02d3109c3f005af56aeb609a0ee1f66128b80
+size 590954

d_cls/cp_zero_shot_metadata.py ADDED Viewed

	@@ -0,0 +1,117 @@

+import os
+import pandas as pd
+OPENAI_IMAGENET_TEMPLATES = (  # 80 prompt builders: each lambda takes `c` and returns a caption string with `c` interpolated
+    lambda c: f'a bad photo of a {c}.',  # `c` is presumably a class name such as those in CLASSNAMES; call sites are not in this file
+    lambda c: f'a photo of many {c}.',
+    lambda c: f'a sculpture of a {c}.',
+    lambda c: f'a photo of the hard to see {c}.',
+    lambda c: f'a low resolution photo of the {c}.',
+    lambda c: f'a rendering of a {c}.',
+    lambda c: f'graffiti of a {c}.',
+    lambda c: f'a bad photo of the {c}.',
+    lambda c: f'a cropped photo of the {c}.',
+    lambda c: f'a tattoo of a {c}.',
+    lambda c: f'the embroidered {c}.',
+    lambda c: f'a photo of a hard to see {c}.',
+    lambda c: f'a bright photo of a {c}.',
+    lambda c: f'a photo of a clean {c}.',
+    lambda c: f'a photo of a dirty {c}.',
+    lambda c: f'a dark photo of the {c}.',
+    lambda c: f'a drawing of a {c}.',
+    lambda c: f'a photo of my {c}.',
+    lambda c: f'the plastic {c}.',
+    lambda c: f'a photo of the cool {c}.',
+    lambda c: f'a close-up photo of a {c}.',
+    lambda c: f'a black and white photo of the {c}.',
+    lambda c: f'a painting of the {c}.',
+    lambda c: f'a painting of a {c}.',
+    lambda c: f'a pixelated photo of the {c}.',
+    lambda c: f'a sculpture of the {c}.',
+    lambda c: f'a bright photo of the {c}.',
+    lambda c: f'a cropped photo of a {c}.',
+    lambda c: f'a plastic {c}.',
+    lambda c: f'a photo of the dirty {c}.',
+    lambda c: f'a jpeg corrupted photo of a {c}.',
+    lambda c: f'a blurry photo of the {c}.',
+    lambda c: f'a photo of the {c}.',
+    lambda c: f'a good photo of the {c}.',
+    lambda c: f'a rendering of the {c}.',
+    lambda c: f'a {c} in a video game.',
+    lambda c: f'a photo of one {c}.',
+    lambda c: f'a doodle of a {c}.',
+    lambda c: f'a close-up photo of the {c}.',
+    lambda c: f'a photo of a {c}.',
+    lambda c: f'the origami {c}.',
+    lambda c: f'the {c} in a video game.',
+    lambda c: f'a sketch of a {c}.',
+    lambda c: f'a doodle of the {c}.',
+    lambda c: f'a origami {c}.',  # wording (incl. "a origami") is runtime prompt text; do not "fix" the grammar without re-validating results
+    lambda c: f'a low resolution photo of a {c}.',
+    lambda c: f'the toy {c}.',
+    lambda c: f'a rendition of the {c}.',
+    lambda c: f'a photo of the clean {c}.',
+    lambda c: f'a photo of a large {c}.',
+    lambda c: f'a rendition of a {c}.',
+    lambda c: f'a photo of a nice {c}.',
+    lambda c: f'a photo of a weird {c}.',
+    lambda c: f'a blurry photo of a {c}.',
+    lambda c: f'a cartoon {c}.',
+    lambda c: f'art of a {c}.',
+    lambda c: f'a sketch of the {c}.',
+    lambda c: f'a embroidered {c}.',
+    lambda c: f'a pixelated photo of a {c}.',
+    lambda c: f'itap of the {c}.',
+    lambda c: f'a jpeg corrupted photo of the {c}.',
+    lambda c: f'a good photo of a {c}.',
+    lambda c: f'a plushie {c}.',
+    lambda c: f'a photo of the nice {c}.',
+    lambda c: f'a photo of the small {c}.',
+    lambda c: f'a photo of the weird {c}.',
+    lambda c: f'the cartoon {c}.',
+    lambda c: f'art of the {c}.',
+    lambda c: f'a drawing of the {c}.',
+    lambda c: f'a photo of the large {c}.',
+    lambda c: f'a black and white photo of a {c}.',
+    lambda c: f'the plushie {c}.',
+    lambda c: f'a dark photo of a {c}.',
+    lambda c: f'itap of a {c}.',
+    lambda c: f'graffiti of the {c}.',
+    lambda c: f'a toy {c}.',
+    lambda c: f'itap of my {c}.',
+    lambda c: f'a photo of a cool {c}.',
+    lambda c: f'a photo of a small {c}.',
+    lambda c: f'a tattoo of the {c}.',
+)  # end of OPENAI_IMAGENET_TEMPLATES
+# A much smaller subset (7 entries) of the prompt templates above; every entry also appears in OPENAI_IMAGENET_TEMPLATES.
+# Taken from https://github.com/openai/CLIP/blob/main/notebooks/Prompt_Engineering_for_ImageNet.ipynb
+SIMPLE_IMAGENET_TEMPLATES = (  # same callable shape as above: `c` in, formatted caption string out
+    lambda c: f'itap of a {c}.',
+    lambda c: f'a bad photo of the {c}.',
+    lambda c: f'a origami {c}.',
+    lambda c: f'a photo of the large {c}.',
+    lambda c: f'a {c} in a video game.',
+    lambda c: f'art of the {c}.',
+    lambda c: f'a photo of the small {c}.',
+)
+IMAGENET_CLASSNAMES = (  # empty tuple in this file; NOTE(review): presumably a placeholder kept for name compatibility - confirm no caller expects labels here
+)
+CLASSNAMES = {  # maps a dataset key to its tuple of label strings; presumably scene categories fed to the templates above - verify against caller
+    'NYUV2': (  # 10 labels, including a catch-all "others"
+        "bathroom", "bedroom", "bookstore", "classroom", "dining room",
+        "home office", "kitchen", "living room", "office", "others"
+    ),
+    'SUNRGBD': (  # 19 labels
+        "bathroom", "bedroom", "classroom", "computer room", "conference room", "corridor", "dining area",
+        "dining room", "discussion area", "furniture store", "home office", "kitchen", "lab", "lecture theatre",
+        "library", "living room", "office", "rest space", "study space"
+    ),
+}