Luigi Piccinelli committed
Commit 1ea89dd · 1 Parent(s): 6b96309
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete list.
Files changed (50)
  1. .gitignore +16 -0
  2. LICENSE +407 -0
  3. README.md +1 -0
  4. app.py +800 -0
  5. assets/demo/bears.jpg +0 -0
  6. assets/demo/berzirk.jpg +0 -0
  7. assets/demo/dl3dv.json +4 -0
  8. assets/demo/dl3dv.png +0 -0
  9. assets/demo/equirectangular.jpg +0 -0
  10. assets/demo/kitti360.json +14 -0
  11. assets/demo/kitti360.png +0 -0
  12. assets/demo/luke.webp +0 -0
  13. assets/demo/naruto.jpg +0 -0
  14. assets/demo/poorthings.jpg +0 -0
  15. assets/demo/scannet.jpg +0 -0
  16. assets/demo/scannet.json +21 -0
  17. assets/demo/venice.jpg +0 -0
  18. assets/docs/unik3d-banner.png +0 -0
  19. assets/docs/unik3d-teaser.png +0 -0
  20. configs/config_vitb.json +159 -0
  21. configs/config_vitl.json +159 -0
  22. configs/config_vits.json +159 -0
  23. gradio_demo.py +796 -0
  24. hubconf.py +29 -0
  25. pyproject.toml +25 -0
  26. requirements.txt +84 -0
  27. requirements_demo.txt +84 -0
  28. scripts/README.md +55 -0
  29. scripts/demo.py +150 -0
  30. scripts/train.py +630 -0
  31. unik3d/__init__.py +1 -0
  32. unik3d/datasets/_2d3ds.py +67 -0
  33. unik3d/datasets/_4dor.py +52 -0
  34. unik3d/datasets/__init__.py +161 -0
  35. unik3d/datasets/a2d2.py +78 -0
  36. unik3d/datasets/adt.py +68 -0
  37. unik3d/datasets/aimotive.py +51 -0
  38. unik3d/datasets/argoverse.py +73 -0
  39. unik3d/datasets/argoverse2.py +49 -0
  40. unik3d/datasets/arkit.py +49 -0
  41. unik3d/datasets/ase.py +66 -0
  42. unik3d/datasets/base_dataset.py +344 -0
  43. unik3d/datasets/bdd.py +82 -0
  44. unik3d/datasets/bedlam.py +50 -0
  45. unik3d/datasets/behave.py +52 -0
  46. unik3d/datasets/blendedmvg.py +50 -0
  47. unik3d/datasets/cityscape.py +78 -0
  48. unik3d/datasets/ddad.py +84 -0
  49. unik3d/datasets/deep360.py +56 -0
  50. unik3d/datasets/dense.py +91 -0
.gitignore ADDED
@@ -0,0 +1,16 @@
+ **/__pycache__/
+ **/build/
+ **/dist/
+ **/*egg-info
+ .gradio/
+
+ # ignore scripts
+ _*.sh
+ __*.png
+ __*.jpg
+ __*.webp
+ ___*.py
+ **/___*.py
+
+ # ignore pcds
+ *.ply
LICENSE ADDED
@@ -0,0 +1,407 @@
1
+ Attribution-NonCommercial 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial 4.0 International Public
58
+ License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial 4.0 International Public License ("Public
63
+ License"). To the extent this Public License may be interpreted as a
64
+ contract, You are granted the Licensed Rights in consideration of Your
65
+ acceptance of these terms and conditions, and the Licensor grants You
66
+ such rights in consideration of benefits the Licensor receives from
67
+ making the Licensed Material available under these terms and
68
+ conditions.
69
+
70
+
71
+ Section 1 -- Definitions.
72
+
73
+ a. Adapted Material means material subject to Copyright and Similar
74
+ Rights that is derived from or based upon the Licensed Material
75
+ and in which the Licensed Material is translated, altered,
76
+ arranged, transformed, or otherwise modified in a manner requiring
77
+ permission under the Copyright and Similar Rights held by the
78
+ Licensor. For purposes of this Public License, where the Licensed
79
+ Material is a musical work, performance, or sound recording,
80
+ Adapted Material is always produced where the Licensed Material is
81
+ synched in timed relation with a moving image.
82
+
83
+ b. Adapter's License means the license You apply to Your Copyright
84
+ and Similar Rights in Your contributions to Adapted Material in
85
+ accordance with the terms and conditions of this Public License.
86
+
87
+ c. Copyright and Similar Rights means copyright and/or similar rights
88
+ closely related to copyright including, without limitation,
89
+ performance, broadcast, sound recording, and Sui Generis Database
90
+ Rights, without regard to how the rights are labeled or
91
+ categorized. For purposes of this Public License, the rights
92
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
93
+ Rights.
94
+ d. Effective Technological Measures means those measures that, in the
95
+ absence of proper authority, may not be circumvented under laws
96
+ fulfilling obligations under Article 11 of the WIPO Copyright
97
+ Treaty adopted on December 20, 1996, and/or similar international
98
+ agreements.
99
+
100
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
101
+ any other exception or limitation to Copyright and Similar Rights
102
+ that applies to Your use of the Licensed Material.
103
+
104
+ f. Licensed Material means the artistic or literary work, database,
105
+ or other material to which the Licensor applied this Public
106
+ License.
107
+
108
+ g. Licensed Rights means the rights granted to You subject to the
109
+ terms and conditions of this Public License, which are limited to
110
+ all Copyright and Similar Rights that apply to Your use of the
111
+ Licensed Material and that the Licensor has authority to license.
112
+
113
+ h. Licensor means the individual(s) or entity(ies) granting rights
114
+ under this Public License.
115
+
116
+ i. NonCommercial means not primarily intended for or directed towards
117
+ commercial advantage or monetary compensation. For purposes of
118
+ this Public License, the exchange of the Licensed Material for
119
+ other material subject to Copyright and Similar Rights by digital
120
+ file-sharing or similar means is NonCommercial provided there is
121
+ no payment of monetary compensation in connection with the
122
+ exchange.
123
+
124
+ j. Share means to provide material to the public by any means or
125
+ process that requires permission under the Licensed Rights, such
126
+ as reproduction, public display, public performance, distribution,
127
+ dissemination, communication, or importation, and to make material
128
+ available to the public including in ways that members of the
129
+ public may access the material from a place and at a time
130
+ individually chosen by them.
131
+
132
+ k. Sui Generis Database Rights means rights other than copyright
133
+ resulting from Directive 96/9/EC of the European Parliament and of
134
+ the Council of 11 March 1996 on the legal protection of databases,
135
+ as amended and/or succeeded, as well as other essentially
136
+ equivalent rights anywhere in the world.
137
+
138
+ l. You means the individual or entity exercising the Licensed Rights
139
+ under this Public License. Your has a corresponding meaning.
140
+
141
+
142
+ Section 2 -- Scope.
143
+
144
+ a. License grant.
145
+
146
+ 1. Subject to the terms and conditions of this Public License,
147
+ the Licensor hereby grants You a worldwide, royalty-free,
148
+ non-sublicensable, non-exclusive, irrevocable license to
149
+ exercise the Licensed Rights in the Licensed Material to:
150
+
151
+ a. reproduce and Share the Licensed Material, in whole or
152
+ in part, for NonCommercial purposes only; and
153
+
154
+ b. produce, reproduce, and Share Adapted Material for
155
+ NonCommercial purposes only.
156
+
157
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
158
+ Exceptions and Limitations apply to Your use, this Public
159
+ License does not apply, and You do not need to comply with
160
+ its terms and conditions.
161
+
162
+ 3. Term. The term of this Public License is specified in Section
163
+ 6(a).
164
+
165
+ 4. Media and formats; technical modifications allowed. The
166
+ Licensor authorizes You to exercise the Licensed Rights in
167
+ all media and formats whether now known or hereafter created,
168
+ and to make technical modifications necessary to do so. The
169
+ Licensor waives and/or agrees not to assert any right or
170
+ authority to forbid You from making technical modifications
171
+ necessary to exercise the Licensed Rights, including
172
+ technical modifications necessary to circumvent Effective
173
+ Technological Measures. For purposes of this Public License,
174
+ simply making modifications authorized by this Section 2(a)
175
+ (4) never produces Adapted Material.
176
+
177
+ 5. Downstream recipients.
178
+
179
+ a. Offer from the Licensor -- Licensed Material. Every
180
+ recipient of the Licensed Material automatically
181
+ receives an offer from the Licensor to exercise the
182
+ Licensed Rights under the terms and conditions of this
183
+ Public License.
184
+
185
+ b. No downstream restrictions. You may not offer or impose
186
+ any additional or different terms or conditions on, or
187
+ apply any Effective Technological Measures to, the
188
+ Licensed Material if doing so restricts exercise of the
189
+ Licensed Rights by any recipient of the Licensed
190
+ Material.
191
+
192
+ 6. No endorsement. Nothing in this Public License constitutes or
193
+ may be construed as permission to assert or imply that You
194
+ are, or that Your use of the Licensed Material is, connected
195
+ with, or sponsored, endorsed, or granted official status by,
196
+ the Licensor or others designated to receive attribution as
197
+ provided in Section 3(a)(1)(A)(i).
198
+
199
+ b. Other rights.
200
+
201
+ 1. Moral rights, such as the right of integrity, are not
202
+ licensed under this Public License, nor are publicity,
203
+ privacy, and/or other similar personality rights; however, to
204
+ the extent possible, the Licensor waives and/or agrees not to
205
+ assert any such rights held by the Licensor to the limited
206
+ extent necessary to allow You to exercise the Licensed
207
+ Rights, but not otherwise.
208
+
209
+ 2. Patent and trademark rights are not licensed under this
210
+ Public License.
211
+
212
+ 3. To the extent possible, the Licensor waives any right to
213
+ collect royalties from You for the exercise of the Licensed
214
+ Rights, whether directly or through a collecting society
215
+ under any voluntary or waivable statutory or compulsory
216
+ licensing scheme. In all other cases the Licensor expressly
217
+ reserves any right to collect such royalties, including when
218
+ the Licensed Material is used other than for NonCommercial
219
+ purposes.
220
+
221
+
222
+ Section 3 -- License Conditions.
223
+
224
+ Your exercise of the Licensed Rights is expressly made subject to the
225
+ following conditions.
226
+
227
+ a. Attribution.
228
+
229
+ 1. If You Share the Licensed Material (including in modified
230
+ form), You must:
231
+
232
+ a. retain the following if it is supplied by the Licensor
233
+ with the Licensed Material:
234
+
235
+ i. identification of the creator(s) of the Licensed
236
+ Material and any others designated to receive
237
+ attribution, in any reasonable manner requested by
238
+ the Licensor (including by pseudonym if
239
+ designated);
240
+
241
+ ii. a copyright notice;
242
+
243
+ iii. a notice that refers to this Public License;
244
+
245
+ iv. a notice that refers to the disclaimer of
246
+ warranties;
247
+
248
+ v. a URI or hyperlink to the Licensed Material to the
249
+ extent reasonably practicable;
250
+
251
+ b. indicate if You modified the Licensed Material and
252
+ retain an indication of any previous modifications; and
253
+
254
+ c. indicate the Licensed Material is licensed under this
255
+ Public License, and include the text of, or the URI or
256
+ hyperlink to, this Public License.
257
+
258
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
259
+ reasonable manner based on the medium, means, and context in
260
+ which You Share the Licensed Material. For example, it may be
261
+ reasonable to satisfy the conditions by providing a URI or
262
+ hyperlink to a resource that includes the required
263
+ information.
264
+
265
+ 3. If requested by the Licensor, You must remove any of the
266
+ information required by Section 3(a)(1)(A) to the extent
267
+ reasonably practicable.
268
+
269
+ 4. If You Share Adapted Material You produce, the Adapter's
270
+ License You apply must not prevent recipients of the Adapted
271
+ Material from complying with this Public License.
272
+
273
+
274
+ Section 4 -- Sui Generis Database Rights.
275
+
276
+ Where the Licensed Rights include Sui Generis Database Rights that
277
+ apply to Your use of the Licensed Material:
278
+
279
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
280
+ to extract, reuse, reproduce, and Share all or a substantial
281
+ portion of the contents of the database for NonCommercial purposes
282
+ only;
283
+
284
+ b. if You include all or a substantial portion of the database
285
+ contents in a database in which You have Sui Generis Database
286
+ Rights, then the database in which You have Sui Generis Database
287
+ Rights (but not its individual contents) is Adapted Material; and
288
+
289
+ c. You must comply with the conditions in Section 3(a) if You Share
290
+ all or a substantial portion of the contents of the database.
291
+
292
+ For the avoidance of doubt, this Section 4 supplements and does not
293
+ replace Your obligations under this Public License where the Licensed
294
+ Rights include other Copyright and Similar Rights.
295
+
296
+
297
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
298
+
299
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
300
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
301
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
302
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
303
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
304
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
305
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
306
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
307
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
308
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
309
+
310
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
311
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
312
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
313
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
314
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
315
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
316
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
317
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
318
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
319
+
320
+ c. The disclaimer of warranties and limitation of liability provided
321
+ above shall be interpreted in a manner that, to the extent
322
+ possible, most closely approximates an absolute disclaimer and
323
+ waiver of all liability.
324
+
325
+
326
+ Section 6 -- Term and Termination.
327
+
328
+ a. This Public License applies for the term of the Copyright and
329
+ Similar Rights licensed here. However, if You fail to comply with
330
+ this Public License, then Your rights under this Public License
331
+ terminate automatically.
332
+
333
+ b. Where Your right to use the Licensed Material has terminated under
334
+ Section 6(a), it reinstates:
335
+
336
+ 1. automatically as of the date the violation is cured, provided
337
+ it is cured within 30 days of Your discovery of the
338
+ violation; or
339
+
340
+ 2. upon express reinstatement by the Licensor.
341
+
342
+ For the avoidance of doubt, this Section 6(b) does not affect any
343
+ right the Licensor may have to seek remedies for Your violations
344
+ of this Public License.
345
+
346
+ c. For the avoidance of doubt, the Licensor may also offer the
347
+ Licensed Material under separate terms or conditions or stop
348
+ distributing the Licensed Material at any time; however, doing so
349
+ will not terminate this Public License.
350
+
351
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
352
+ License.
353
+
354
+
355
+ Section 7 -- Other Terms and Conditions.
356
+
357
+ a. The Licensor shall not be bound by any additional or different
358
+ terms or conditions communicated by You unless expressly agreed.
359
+
360
+ b. Any arrangements, understandings, or agreements regarding the
361
+ Licensed Material not stated herein are separate from and
362
+ independent of the terms and conditions of this Public License.
363
+
364
+
365
+ Section 8 -- Interpretation.
366
+
367
+ a. For the avoidance of doubt, this Public License does not, and
368
+ shall not be interpreted to, reduce, limit, restrict, or impose
369
+ conditions on any use of the Licensed Material that could lawfully
370
+ be made without permission under this Public License.
371
+
372
+ b. To the extent possible, if any provision of this Public License is
373
+ deemed unenforceable, it shall be automatically reformed to the
374
+ minimum extent necessary to make it enforceable. If the provision
375
+ cannot be reformed, it shall be severed from this Public License
376
+ without affecting the enforceability of the remaining terms and
377
+ conditions.
378
+
379
+ c. No term or condition of this Public License will be waived and no
380
+ failure to comply consented to unless expressly agreed to by the
381
+ Licensor.
382
+
383
+ d. Nothing in this Public License constitutes or may be interpreted
384
+ as a limitation upon, or waiver of, any privileges and immunities
385
+ that apply to the Licensor or You, including from the legal
386
+ processes of any jurisdiction or authority.
387
+
388
+ =======================================================================
389
+
390
+ Creative Commons is not a party to its public
391
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
392
+ its public licenses to material it publishes and in those instances
393
+ will be considered the “Licensor.” The text of the Creative Commons
394
+ public licenses is dedicated to the public domain under the CC0 Public
395
+ Domain Dedication. Except for the limited purpose of indicating that
396
+ material is shared under a Creative Commons public license or as
397
+ otherwise permitted by the Creative Commons policies published at
398
+ creativecommons.org/policies, Creative Commons does not authorize the
399
+ use of the trademark "Creative Commons" or any other trademark or logo
400
+ of Creative Commons without its prior written consent including,
401
+ without limitation, in connection with any unauthorized modifications
402
+ to any of its public licenses or any other arrangements,
403
+ understandings, or agreements concerning use of licensed material. For
404
+ the avoidance of doubt, this paragraph does not form part of the
405
+ public licenses.
406
+
407
+ Creative Commons may be contacted at creativecommons.org.
README.md CHANGED
@@ -8,6 +8,7 @@ sdk_version: 5.22.0
  app_file: app.py
  pinned: false
  license: cc-by-nc-4.0
+ short_description: UniK3D (CVPR 2025)
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,800 @@
1
+ import gc
2
+ import os
3
+ import shutil
4
+ import time
5
+ from datetime import datetime
6
+ from math import pi
7
+ import sys
8
+
9
+ import gradio as gr
10
+ import numpy as np
11
+ import torch
12
+ import trimesh
13
+ from PIL import Image
14
+
15
+
16
+ sys.path.append("unik3d/")
17
+
18
+ from unik3d.models import UniK3D
19
+ from unik3d.utils.camera import OPENCV, Fisheye624, Pinhole, Spherical
20
+ from unik3d.utils.visualization import colorize
21
+
22
+
23
+ def predictions_to_glb(
24
+ predictions,
25
+ mask_black_bg=False,
26
+ mask_far_points=False,
27
+ ) -> trimesh.Scene:
28
+ print("Building GLB scene")
29
+ images = predictions["image"].squeeze().permute(1, 2, 0).cpu().numpy()
30
+ world_points = predictions["points"].squeeze().permute(1, 2, 0).cpu().numpy()
31
+
32
+ vertices_3d = world_points.reshape(-1, 3)
33
+ # flip x and y
34
+ vertices_3d[:, 1] *= -1
35
+ vertices_3d[:, 0] *= -1
36
+ colors_rgb = (images.reshape(-1, 3)).astype(np.uint8)
37
+
38
+ if mask_black_bg:
39
+ black_bg_mask = colors_rgb.sum(axis=1) >= 16
40
+ vertices_3d = vertices_3d[black_bg_mask]
41
+ colors_rgb = colors_rgb[black_bg_mask]
42
+
43
+ if mask_far_points:
44
+ far_points_mask = np.linalg.norm(vertices_3d, axis=-1) < 100.0
45
+ vertices_3d = vertices_3d[far_points_mask]
46
+ colors_rgb = colors_rgb[far_points_mask]
47
+
48
+ scene_3d = trimesh.Scene()
49
+ point_cloud_data = trimesh.PointCloud(vertices=vertices_3d, colors=colors_rgb)
50
+ scene_3d.add_geometry(point_cloud_data)
51
+
52
+ return scene_3d
53
+
54
+
55
+ def instantiate_model(model_name):
56
+ type_ = model_name[0].lower()
57
+
58
+ name = f"unik3d-vit{type_}"
59
+ model = UniK3D.from_pretrained(f"lpiccinelli/{name}")
60
+
61
+ # Set resolution level and interpolation mode as specified.
62
+ model.resolution_level = 9
63
+ model.interpolation_mode = "bilinear"
64
+
65
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
66
+ model = model.to(device).eval()
67
+ return model
68
+
69
+
70
+ def instantiate_camera(camera_name, params, device):
71
+ if camera_name == "Predicted":
72
+ return None
73
+ fx, fy, cx, cy, k1, k2, k3, k4, k5, k6, t1, t2, hfov, H, W = params
74
+ if camera_name == "Pinhole":
75
+ params = [fx, fy, cx, cy]
76
+ elif camera_name == "Fisheye624":
77
+ params = [fx, fy, cx, cy, k1, k2, k3, k4, k5, k6, t1, t2]
78
+ elif camera_name == "OPENCV":
79
+ params = [fx, fy, cx, cy, k1, k2, k3, k4, k5, k6, t1, t2]
80
+ elif camera_name == "Equirectangular":
81
+ # dummy intrinsics for spherical camera, assume hfov -> vfov based on input shapes
82
+ hfov2 = hfov * pi / 180.0 / 2
83
+ params = [fx, fy, cx, cy, W, H, hfov2, H / W * hfov2]
84
+ camera_name = "Spherical"
85
+
86
+ return eval(camera_name)(params=torch.tensor(params).float()).to(device)
87
+
88
+
89
+ def run_model(target_dir, model_name, camera_name, params):
90
+
91
+ print("Instantiating model and camera...")
92
+ model = instantiate_model(model_name)
93
+
94
+ image_names = [x for x in os.listdir(target_dir) if x.endswith(".png")]
95
+ input_image = np.array(Image.open(os.path.join(target_dir, image_names[-1])))
96
+ image_tensor = torch.from_numpy(input_image).permute(2, 0, 1).unsqueeze(0).float()
97
+ device = next(model.parameters()).device
98
+ image_tensor = image_tensor.to(device)
99
+ H, W = image_tensor.shape[-2:]
100
+ params = params + [H, W]
101
+ camera = instantiate_camera(camera_name, params=params, device=device)
102
+
103
+ # Perform inference with the model.
104
+ print("Running inference...")
105
+ outputs = model.infer(image_tensor, camera=camera, normalize=True)
106
+ outputs["image"] = image_tensor
107
+
108
+ return outputs
109
+
110
+
111
+ def gradio_demo(
112
+ target_dir,
113
+ model_name,
114
+ camera_name,
115
+ fx,
116
+ fy,
117
+ cx,
118
+ cy,
119
+ k1,
120
+ k2,
121
+ k3,
122
+ k4,
123
+ k5,
124
+ k6,
125
+ t1,
126
+ t2,
127
+ hfov,
128
+ mask_black_bg,
129
+ mask_far_points,
130
+ ):
131
+ print(target_dir)
132
+ if not os.path.isdir(target_dir) or target_dir == "None":
133
+ return None, "No valid target directory found. Please upload first.", None
134
+
135
+ start_time = time.time()
136
+ gc.collect()
137
+
138
+ print("Running run_model...")
139
+ params = [fx, fy, cx, cy, k1, k2, k3, k4, k5, k6, t1, t2, hfov]
140
+ with torch.no_grad():
141
+ outputs = run_model(target_dir, model_name, camera_name, params)
142
+
143
+ # Save predictions
144
+ points = outputs["points"].squeeze().permute(1, 2, 0).cpu().numpy()
145
+ rgb = outputs["image"].squeeze().permute(1, 2, 0).cpu().numpy()
146
+
147
+ prediction_save_path = os.path.join(target_dir, "predictions.npz")
148
+ np.savez(prediction_save_path, {"points": points, "image": rgb})
149
+
150
+ # Build a GLB file name
151
+ glbfile = os.path.join(
152
+ target_dir,
153
+ f"glbscene.glb",
154
+ )
155
+
156
+ # Convert predictions to GLB
157
+ glbscene = predictions_to_glb(
158
+ outputs,
159
+ mask_black_bg=mask_black_bg,
160
+ mask_far_points=mask_far_points,
161
+ )
162
+ glbscene.export(file_obj=glbfile)
163
+
164
+ # Cleanup
165
+ del outputs
166
+ gc.collect()
167
+
168
+ end_time = time.time()
169
+ print(f"Total time: {end_time - start_time:.2f} seconds")
170
+ log_msg = "Success. Waiting for visualization."
171
+
172
+ return glbfile, log_msg, prediction_save_path
173
+
174
+
175
+ def handle_uploads(input_image):
176
+ gc.collect()
177
+
178
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
179
+ tmpdir = os.environ.get("TMPDIR", "/tmp")
180
+ target_dir = os.path.join(tmpdir, f"input_images_{timestamp}")
181
+
182
+ if os.path.exists(target_dir):
183
+ shutil.rmtree(target_dir)
184
+ os.makedirs(target_dir)
185
+
186
+ dst_path = os.path.join(target_dir, "image.png")
187
+ Image.fromarray(input_image).save(dst_path)
188
+ image_paths = [dst_path]
189
+
190
+ print(f"Files uploaded.")
191
+ return target_dir, image_paths
192
+
193
+
194
+ def update_gallery_on_upload(input_images):
195
+ if input_images is None:
196
+ return None, None
197
+ target_dir, image_path = handle_uploads(input_images)
198
+ return target_dir, "Upload complete. Click 'Run UniK3D' to get the 3D pointcloud."
199
+
200
+
201
+ def update_parameters(camera):
202
+ if camera == "Pinhole":
203
+ return (
204
+ gr.update(visible=True), # fx
205
+ gr.update(visible=True), # fy
206
+ gr.update(visible=True), # cx
207
+ gr.update(visible=True), # cy
208
+ gr.update(visible=False), # k1
209
+ gr.update(visible=False), # k2
210
+ gr.update(visible=False), # k3
211
+ gr.update(visible=False), # k4
212
+ gr.update(visible=False), # k5
213
+ gr.update(visible=False), # k6
214
+ gr.update(visible=False), # t1
215
+ gr.update(visible=False), # t2
216
+ gr.update(visible=False), # hfov
217
+ )
218
+ elif camera == "OPENCV":
219
+ return (
220
+ gr.update(visible=True), # fx
221
+ gr.update(visible=True), # fy
222
+ gr.update(visible=True), # cx
223
+ gr.update(visible=True), # cy
224
+ gr.update(visible=True), # k1
225
+ gr.update(visible=True), # k2
226
+ gr.update(visible=True), # k3
227
+ gr.update(visible=False), # k4
228
+ gr.update(visible=False), # k5
229
+ gr.update(visible=False), # k6
230
+ gr.update(visible=True), # t1
231
+ gr.update(visible=True), # t2
232
+ gr.update(visible=False), # hfov
233
+ )
234
+ elif camera == "Fisheye624":
235
+ return (
236
+ gr.update(visible=True), # fx
237
+ gr.update(visible=True), # fy
238
+ gr.update(visible=True), # cx
239
+ gr.update(visible=True), # cy
240
+ gr.update(visible=True), # k1
241
+ gr.update(visible=True), # k2
242
+ gr.update(visible=True), # k3
243
+ gr.update(visible=True), # k4
244
+ gr.update(visible=True), # k5
245
+ gr.update(visible=True), # k6
246
+ gr.update(visible=True), # t1
247
+ gr.update(visible=True), # t2
248
+ gr.update(visible=False), # hfov
249
+ )
250
+ elif camera == "Equirectangular":
251
+ return (
252
+ gr.update(visible=False), # fx
253
+ gr.update(visible=False), # fy
254
+ gr.update(visible=False), # cx
255
+ gr.update(visible=False), # cy
256
+ gr.update(visible=False), # k1
257
+ gr.update(visible=False), # k2
258
+ gr.update(visible=False), # k3
259
+ gr.update(visible=False), # k4
260
+ gr.update(visible=False), # k5
261
+ gr.update(visible=False), # k6
262
+ gr.update(visible=False), # t1
263
+ gr.update(visible=False), # t2
264
+ gr.update(visible=True), # hfov
265
+ )
266
+ elif camera == "Predicted":
267
+ return (
268
+ gr.update(visible=False), # fx
269
+ gr.update(visible=False), # fy
270
+ gr.update(visible=False), # cx
271
+ gr.update(visible=False), # cy
272
+ gr.update(visible=False), # k1
273
+ gr.update(visible=False), # k2
274
+ gr.update(visible=False), # k3
275
+ gr.update(visible=False), # k4
276
+ gr.update(visible=False), # k5
277
+ gr.update(visible=False), # k6
278
+ gr.update(visible=False), # t1
279
+ gr.update(visible=False), # t2
280
+ gr.update(visible=False), # hfov
281
+ )
282
+ else:
283
+ raise ValueError(f"Invalid camera type: {camera}")
284
+
285
+
286
+ def clear_fields():
287
+ return None
288
+
289
+
290
+ def update_log():
291
+ return "Loading Model and Running Inference..."
292
+
293
+
294
+ def update_visualization(target_dir, mask_black_bg, mask_far_points, is_example):
295
+
296
+ if is_example == "True":
297
+ return (
298
+ None,
299
+ "No reconstruction available. Please click the Reconstruct button first.",
300
+ )
301
+
302
+ if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
303
+ return (
304
+ None,
305
+ "No reconstruction available. Please click the Reconstruct button first.",
306
+ )
307
+
308
+ predictions_path = os.path.join(target_dir, "predictions.npz")
309
+ if not os.path.exists(predictions_path):
310
+ return (
311
+ None,
312
+ f"No reconstruction available at {predictions_path}. Please run 'Reconstruct' first.",
313
+ )
314
+
315
+ loaded = np.load(predictions_path, allow_pickle=True)
316
+ predictions = {key: loaded[key] for key in loaded.keys()}
317
+
318
+ glbfile = os.path.join(
319
+ target_dir,
320
+ f"glbscene.glb",
321
+ )
322
+
323
+ if not os.path.exists(glbfile):
324
+ glbscene = predictions_to_glb(
325
+ predictions,
326
+ mask_black_bg=mask_black_bg,
327
+ mask_far_points=mask_far_points,
328
+ )
329
+ glbscene.export(file_obj=glbfile)
330
+
331
+ return glbfile, "Updating Visualization"
332
+
333
+
334
+ if __name__ == "__main__":
335
+ theme = gr.themes.Citrus()
336
+ theme.set(
337
+ checkbox_label_background_fill_selected="*button_primary_background_fill",
338
+ checkbox_label_text_color_selected="*button_primary_text_color",
339
+ )
340
+
341
+ with gr.Blocks(
342
+ theme=theme,
343
+ css="""
344
+ .custom-log * {
345
+ font-style: italic;
346
+ font-size: 22px !important;
347
+ background-image: linear-gradient(120deg, #ff7e26 0%, #ff9c59 60%, #fff4d6 100%);
348
+ -webkit-background-clip: text;
349
+ background-clip: text;
350
+ font-weight: bold !important;
351
+ color: transparent !important;
352
+ text-align: center !important;
353
+ }
354
+
355
+ .example-log * {
356
+ font-style: italic;
357
+ font-size: 16px !important;
358
+ background-image: linear-gradient(120deg, #ff7e26 0%, #ff9c59 60%, #fff4d6 100%);
359
+ -webkit-background-clip: text;
360
+ background-clip: text;
361
+ color: transparent !important;
362
+ }
363
+
364
+ #my_radio .wrap {
365
+ display: flex;
366
+ flex-wrap: nowrap;
367
+ justify-content: center;
368
+ align-items: center;
369
+ }
370
+
371
+ #my_radio .wrap label {
372
+ display: flex;
373
+ width: 50%;
374
+ justify-content: center;
375
+ align-items: center;
376
+ margin: 0;
377
+ padding: 10px 0;
378
+ box-sizing: border-box;
379
+ }
380
+ """,
381
+ ) as demo:
382
+
383
+ # Instead of gr.State, we use a hidden Textbox:
384
+ is_example = gr.Textbox(label="is_example", visible=False, value="None")
385
+
386
+ gr.HTML(
387
+ """
388
+ <h1>UniK3D: Universal Camera Monocular 3D Estimation</h1>
389
+ <p>
390
+ <a href="https://github.com/lpiccinelli-eth/UniK3D">🌟 GitHub Repository</a> |
391
+ <a href="">🚀 Project Page</a>
392
+ </p>
393
+
394
+ <div style="font-size: 16px; line-height: 1.5;">
395
+ <p>Upload one image to create a 3D estimation of a scene or object. UniK3D allows to predict directly 3D of any camera and scene.</p>
396
+
397
+ <h3>Getting Started:</h3>
398
+ <ol>
399
+ <li><strong>Upload Your Image:</strong> Use the "Upload Images" panel to provide your input.</li>
400
+ <li><strong>Run:</strong> Click the "Run UniK3D" button to start the 3D estimation process.</li>
401
+ <li><strong>Visualize:</strong> The 3D reconstruction will appear in the viewer on the right. You can rotate, pan, and zoom to explore the model, and download the GLB file.</li>
402
+ </ol>
403
+ <p><strong style="color: #ff7e26;">Please note:</strong> <span style="color: #ff7e26; font-weight: bold;">Our model runs on CPU on HuggingFace Space. Actual inference is less than 100ms second per image on consumer-level GPUs. Web-based 3D pointcloud visualization may be slow due to Gradio's rendering. For faster visualization, use a local machine to run our demo from our <a href="https://github.com/lpiccinelli-eth/UniK3D">GitHub repository</a>. </span></p>
404
+ </div>
405
+ """
406
+ )
407
+
408
+ target_dir_output = gr.Textbox(label="Target Dir", visible=False, value="None")
409
+
410
+ with gr.Row():
411
+ with gr.Column():
412
+ camera_dropdown = gr.Dropdown(
413
+ choices=[
414
+ "Predicted",
415
+ "Pinhole",
416
+ "Fisheye624",
417
+ "OPENCV",
418
+ "Equirectangular",
419
+ ],
420
+ label="Input Camera",
421
+ )
422
+ model_dropdown = gr.Dropdown(
423
+ choices=["Large", "Base", "Small"], label="Utilized Model"
424
+ )
425
+ mask_black_bg = gr.Checkbox(
426
+ label="Filter Black Background", value=False
427
+ )
428
+ mask_far_points = gr.Checkbox(label="Filter Far Points", value=False)
429
+
430
+ with gr.Column():
431
+ fx = gr.Number(label="Focal length x", value=500.0, visible=False)
432
+ fy = gr.Number(label="Focal length y", value=500.0, visible=False)
433
+ cx = gr.Number(label="Center projection x", value=320.0, visible=False)
434
+ cy = gr.Number(label="Center projection y", value=240.0, visible=False)
435
+ hfov = gr.Number(
436
+ label="Horizontal FoV (degree)", value=0.0, visible=False
437
+ )
438
+
439
+ with gr.Column():
440
+ k1 = gr.Number(label="Radial 1", value=0.0, visible=False)
441
+ k2 = gr.Number(label="Radial 2", value=0.0, visible=False)
442
+ k3 = gr.Number(label="Radial 3", value=0.0, visible=False)
443
+ k4 = gr.Number(label="Radial 4", value=0.0, visible=False)
444
+
445
+ with gr.Column():
446
+ k5 = gr.Number(label="Radial 5", value=0.0, visible=False)
447
+ k6 = gr.Number(label="Radial 6", value=0.0, visible=False)
448
+ t1 = gr.Number(label="Tangential 1", value=0.0, visible=False)
449
+ t2 = gr.Number(label="Tangential 2", value=0.0, visible=False)
450
+
451
+ with gr.Row():
452
+ with gr.Column(scale=1):
453
+ input_image = gr.Image(label="Upload Images")
454
+ gr.Markdown("**3D Estimation**")
455
+ with gr.Row():
456
+ log_output = gr.Markdown(
457
+ "Please upload one image at a time, then click `Run UniK3D`.",
458
+ elem_classes=["custom-log"],
459
+ )
460
+ reconstruction_npy = gr.File(
461
+ label="Download 3D Pointcloud", type="filepath"
462
+ )
463
+
464
+ with gr.Column(scale=2):
465
+ reconstruction_output = gr.Model3D(
466
+ height=520, zoom_speed=0.5, pan_speed=0.5
467
+ )
468
+ with gr.Row():
469
+ submit_btn = gr.Button("Run UniK3D", scale=1, variant="primary")
470
+ clear_btn = gr.ClearButton(
471
+ [
472
+ input_image,
473
+ reconstruction_output,
474
+ log_output,
475
+ target_dir_output,
476
+ reconstruction_npy,
477
+ ],
478
+ scale=1,
479
+ )
480
+
481
+ examples = [
482
+ [
483
+ "assets/demo/poorthings.jpg",
484
+ "Large",
485
+ "Predicted",
486
+ 0.0,
487
+ 0.0,
488
+ 0.0,
489
+ 0.0,
490
+ 0.0,
491
+ 0.0,
492
+ 0.0,
493
+ 0.0,
494
+ 0.0,
495
+ 0.0,
496
+ 0.0,
497
+ 0.0,
498
+ 0.0,
499
+ True,
500
+ False,
501
+ ],
502
+ [
503
+ "assets/demo/naruto.jpg",
504
+ "Large",
505
+ "Predicted",
506
+ 0.0,
507
+ 0.0,
508
+ 0.0,
509
+ 0.0,
510
+ 0.0,
511
+ 0.0,
512
+ 0.0,
513
+ 0.0,
514
+ 0.0,
515
+ 0.0,
516
+ 0.0,
517
+ 0.0,
518
+ 0.0,
519
+ False,
520
+ False,
521
+ ],
522
+ [
523
+ "assets/demo/bears.jpg",
524
+ "Large",
525
+ "Predicted",
526
+ 0.0,
527
+ 0.0,
528
+ 0.0,
529
+ 0.0,
530
+ 0.0,
531
+ 0.0,
532
+ 0.0,
533
+ 0.0,
534
+ 0.0,
535
+ 0.0,
536
+ 0.0,
537
+ 0.0,
538
+ 0.0,
539
+ True,
540
+ False,
541
+ ],
542
+ [
543
+ "assets/demo/berzirk.jpg",
544
+ "Large",
545
+ "Predicted",
546
+ 0.0,
547
+ 0.0,
548
+ 0.0,
549
+ 0.0,
550
+ 0.0,
551
+ 0.0,
552
+ 0.0,
553
+ 0.0,
554
+ 0.0,
555
+ 0.0,
556
+ 0.0,
557
+ 0.0,
558
+ 0.0,
559
+ True,
560
+ False,
561
+ ],
562
+ [
563
+ "assets/demo/luke.webp",
564
+ "Large",
565
+ "Predicted",
566
+ 0.0,
567
+ 0.0,
568
+ 0.0,
569
+ 0.0,
570
+ 0.0,
571
+ 0.0,
572
+ 0.0,
573
+ 0.0,
574
+ 0.0,
575
+ 0.0,
576
+ 0.0,
577
+ 0.0,
578
+ 0.0,
579
+ False,
580
+ False,
581
+ ],
582
+ [
583
+ "assets/demo/equirectangular.jpg",
584
+ "Large",
585
+ "Equirectangular",
586
+ 0.0,
587
+ 0.0,
588
+ 0.0,
589
+ 0.0,
590
+ 0.0,
591
+ 0.0,
592
+ 0.0,
593
+ 0.0,
594
+ 0.0,
595
+ 0.0,
596
+ 0.0,
597
+ 0.0,
598
+ 360.0,
599
+ False,
600
+ False,
601
+ ],
602
+ [
603
+ "assets/demo/venice.jpg",
604
+ "Large",
605
+ "Equirectangular",
606
+ 0.0,
607
+ 0.0,
608
+ 0.0,
609
+ 0.0,
610
+ 0.0,
611
+ 0.0,
612
+ 0.0,
613
+ 0.0,
614
+ 0.0,
615
+ 0.0,
616
+ 0.0,
617
+ 0.0,
618
+ 360.0,
619
+ False,
620
+ True,
621
+ ],
622
+ [
623
+ "assets/demo/dl3dv.png",
624
+ "Large",
625
+ "OPENCV",
626
+ 429.57611083984375,
627
+ 429.6898193359375,
628
+ 479.5,
629
+ 269.5,
630
+ -0.0014844092074781656,
631
+ 0.0007422995404340327,
632
+ 0.0,
633
+ 0.0,
634
+ 0.0,
635
+ 0.0,
636
+ 0.00012013866944471374,
637
+ 0.001125041046179831,
638
+ 0.0,
639
+ False,
640
+ False,
641
+ ],
642
+ [
643
+ "assets/demo/scannet.jpg",
644
+ "Large",
645
+ "Fisheye624",
646
+ 791.90869140625,
647
+ 792.7230834960938,
648
+ 878.16796875,
649
+ 585.045166015625,
650
+ -0.029167557135224342,
651
+ -0.006803446915000677,
652
+ -0.0012682401575148106,
653
+ -4.6094228309812024e-05,
654
+ 0.0,
655
+ 0.0,
656
+ 0.0,
657
+ 0.0,
658
+ 0.0,
659
+ False,
660
+ False,
661
+ ],
662
+ ]
663
+
664
+ def example_pipeline(
665
+ input_image,
666
+ model_name,
667
+ camera_name,
668
+ fx,
669
+ fy,
670
+ cx,
671
+ cy,
672
+ k1,
673
+ k2,
674
+ k3,
675
+ k4,
676
+ k5,
677
+ k6,
678
+ t1,
679
+ t2,
680
+ hfov,
681
+ mask_black_bg,
682
+ mask_far_points,
683
+ ):
684
+ target_dir, image_path = handle_uploads(input_image)
685
+ glbfile, log_msg, prediction_save_path = gradio_demo(
686
+ target_dir,
687
+ model_name,
688
+ camera_name,
689
+ fx,
690
+ fy,
691
+ cx,
692
+ cy,
693
+ k1,
694
+ k2,
695
+ k3,
696
+ k4,
697
+ k5,
698
+ k6,
699
+ t1,
700
+ t2,
701
+ hfov,
702
+ mask_black_bg,
703
+ mask_far_points,
704
+ )
705
+ return (
706
+ glbfile,
707
+ log_msg,
708
+ prediction_save_path,
709
+ target_dir,
710
+ image_path,
711
+ )
712
+
713
+ gr.Markdown("Click any row to load an example.", elem_classes=["example-log"])
714
+
715
+ gr.Examples(
716
+ examples=examples,
717
+ inputs=[
718
+ input_image,
719
+ model_dropdown,
720
+ camera_dropdown,
721
+ fx,
722
+ fy,
723
+ cx,
724
+ cy,
725
+ k1,
726
+ k2,
727
+ k3,
728
+ k4,
729
+ k5,
730
+ k6,
731
+ t1,
732
+ t2,
733
+ hfov,
734
+ mask_black_bg,
735
+ mask_far_points,
736
+ ],
737
+ outputs=[reconstruction_output, log_output, reconstruction_npy],
738
+ fn=example_pipeline,
739
+ cache_examples=False,
740
+ examples_per_page=50,
741
+ )
742
+
743
+ submit_btn.click(
744
+ fn=clear_fields, inputs=[], outputs=[reconstruction_output]
745
+ ).then(fn=update_log, inputs=[], outputs=[log_output]).then(
746
+ fn=gradio_demo,
747
+ inputs=[
748
+ target_dir_output,
749
+ model_dropdown,
750
+ camera_dropdown,
751
+ fx,
752
+ fy,
753
+ cx,
754
+ cy,
755
+ k1,
756
+ k2,
757
+ k3,
758
+ k4,
759
+ k5,
760
+ k6,
761
+ t1,
762
+ t2,
763
+ hfov,
764
+ mask_black_bg,
765
+ mask_far_points,
766
+ ],
767
+ outputs=[reconstruction_output, log_output, reconstruction_npy],
768
+ ).then(
769
+ fn=lambda: "False", inputs=[], outputs=[is_example]
770
+ )
771
+
772
+ mask_black_bg.change(
773
+ update_visualization,
774
+ [target_dir_output, mask_black_bg, mask_far_points, is_example],
775
+ [reconstruction_output, log_output],
776
+ )
777
+
778
+ mask_far_points.change(
779
+ update_visualization,
780
+ [target_dir_output, mask_black_bg, mask_far_points, is_example],
781
+ [reconstruction_output, log_output],
782
+ )
783
+
784
+ input_image.change(
785
+ fn=update_gallery_on_upload,
786
+ inputs=[input_image],
787
+ outputs=[target_dir_output, log_output],
788
+ )
789
+
790
+ # Dynamically update intrinsic parameter visibility when camera selection changes.
791
+ camera_dropdown.change(
792
+ fn=update_parameters,
793
+ inputs=camera_dropdown,
794
+ outputs=[fx, fy, cx, cy, k1, k2, k3, k4, k5, k6, t1, t2, hfov],
795
+ )
796
+
797
+ # demo.queue(max_size=20).launch(show_error=True, share=False, ssr_mode=False)
798
+ demo.launch(
799
+ show_error=True,
800
+ )
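
For reference, the core inference path above (`instantiate_model` and `run_model`) can also be exercised without the Gradio UI. The following is a minimal sketch assembled from the code in this diff; the checkpoint name `lpiccinelli/unik3d-vitl` mirrors what `instantiate_model` builds for the "Large" option, and the demo image path is one of the assets added in this commit.

```python
# Minimal sketch of the inference path used by app.py above (not an official entry point).
import numpy as np
import torch
from PIL import Image

from unik3d.models import UniK3D

# "lpiccinelli/unik3d-vitl" mirrors instantiate_model() for the "Large" choice.
model = UniK3D.from_pretrained("lpiccinelli/unik3d-vitl")
model.resolution_level = 9
model.interpolation_mode = "bilinear"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device).eval()

# Load an RGB image as a (1, 3, H, W) float tensor, as run_model() does.
image = np.array(Image.open("assets/demo/bears.jpg"))
image_tensor = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0).float().to(device)

with torch.no_grad():
    # camera=None lets the model predict the camera, matching the "Predicted" option.
    outputs = model.infer(image_tensor, camera=None, normalize=True)

# outputs["points"] holds the per-pixel 3D points that predictions_to_glb
# flattens into the GLB point cloud.
points = outputs["points"].squeeze().permute(1, 2, 0).cpu().numpy()
print(points.shape)
```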
assets/demo/bears.jpg ADDED
assets/demo/berzirk.jpg ADDED
assets/demo/dl3dv.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "name": "OPENCV",
+ "params": [429.57611083984375, 429.6898193359375, 479.5, 269.5, -0.0014844092074781656, 0.0007422995404340327, 0.0, 0.0, 0.0, 0.0, 0.00012013866944471374, 0.001125041046179831, 0.0, 0.0, 0.0, 0.0]
+ }
assets/demo/dl3dv.png ADDED
assets/demo/equirectangular.jpg ADDED
assets/demo/kitti360.json ADDED
@@ -0,0 +1,14 @@
+ {
+ "params": [
+ 890.8814086914062,
+ 890.5255737304688,
+ 477.7955017089844,
+ 470.34332275390625,
+ 0.016798235476017,
+ 1.6548773050308228,
+ 0.000422239420004189,
+ 0.000424621335696429,
+ 2.213404655456543
+ ],
+ "name": "MEI"
+ }
assets/demo/kitti360.png ADDED
assets/demo/luke.webp ADDED
assets/demo/naruto.jpg ADDED
assets/demo/poorthings.jpg ADDED
assets/demo/scannet.jpg ADDED
assets/demo/scannet.json ADDED
@@ -0,0 +1,21 @@
+ {
+ "params": [
+ 791.90869140625,
+ 792.7230834960938,
+ 878.16796875,
+ 585.045166015625,
+ -0.029167557135224342,
+ -0.006803446915000677,
+ -0.0012682401575148106,
+ -4.6094228309812024e-05,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "name": "Fisheye624"
+ }
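
The demo JSONs under assets/demo pair a camera class name with a flat parameter vector. Below is a hedged sketch of how such a file could be turned into a camera object using the module imported by app.py; note that `instantiate_camera` above feeds only 12 values (fx, fy, cx, cy, k1–k6, t1, t2) to Fisheye624, so whether a class accepts the full 16-entry vector stored here is an assumption about the unik3d camera API.

```python
# Sketch: build a camera object from a demo camera JSON (API details are assumptions).
import json

import torch

import unik3d.utils.camera as cameras  # module imported by app.py (Pinhole, OPENCV, Fisheye624, ...)

with open("assets/demo/scannet.json") as f:
    spec = json.load(f)

camera_cls = getattr(cameras, spec["name"])  # e.g. Fisheye624
params = torch.tensor(spec["params"]).float()
# Assumption: the class accepts the full vector as stored in the JSON;
# app.py's instantiate_camera passes only the first 12 entries for its UI fields.
camera = camera_cls(params=params)
print(type(camera).__name__, tuple(params.shape))
```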
assets/demo/venice.jpg ADDED
assets/docs/unik3d-banner.png ADDED
assets/docs/unik3d-teaser.png ADDED
configs/config_vitb.json ADDED
@@ -0,0 +1,159 @@
1
+ {
2
+ "generic": {
3
+ "seed": 42,
4
+ "deterministic": true,
5
+ "name_page": "ufish"
6
+ },
7
+ "training": {
8
+ "n_iters": 250000,
9
+ "batch_size": 8,
10
+ "validation_interval": 2500,
11
+ "nsteps_accumulation_gradient": 4,
12
+ "lr": 5e-05,
13
+ "lr_final": 1e-06,
14
+ "lr_warmup": 1.0,
15
+ "cycle_beta": true,
16
+ "wd": 0.1,
17
+ "wd_final": 0.1,
18
+ "warmup_iters": 75000,
19
+ "ld": 1.0,
20
+ "drop_path": 0.0,
21
+ "ema": 0.9995,
22
+ "f16": "f16",
23
+ "clipping": 1.0,
24
+ "losses": {
25
+ "depth": {
26
+ "name": "Scale",
27
+ "weight": 1.0,
28
+ "fn": "l1",
29
+ "gamma": 1.0,
30
+ "alpha": 1.0,
31
+ "output_fn": "sqrt",
32
+ "input_fn": "log"
33
+ },
34
+ "camera": {
35
+ "name": "PolarRegression",
36
+ "weight": 1.0,
37
+ "gamma": 1.0,
38
+ "alpha": 1.0,
39
+ "fn": "l1",
40
+ "output_fn": "sqrt",
41
+ "input_fn": "linear",
42
+ "dims": [
43
+ 1,
44
+ 2
45
+ ],
46
+ "polar_weight": 3.0,
47
+ "polar_asym": 0.7
48
+ },
49
+ "confidence": {
50
+ "name": "Confidence",
51
+ "weight": 0.1,
52
+ "input_fn": "log",
53
+ "output_fn": "sqrt"
54
+ }
55
+ }
56
+ },
57
+ "data": {
58
+ "image_shape": [
59
+ 518,
60
+ 518
61
+ ],
62
+ "resize_method": "contextcrop",
63
+ "normalization": "imagenet",
64
+ "pair": 1,
65
+ "mini": 1.0,
66
+ "num_frames": 1,
67
+ "sampling": {
68
+ "KITTI": 1.0
69
+ },
70
+ "train_datasets": [
71
+ "KITTI"
72
+ ],
73
+ "val_datasets": [
74
+ "KITTI"
75
+ ],
76
+ "data_root": "datasets",
77
+ "crop": "garg",
78
+ "augmentations": {
79
+ "random_scale": 4.0,
80
+ "random_translate_x": 0.04,
81
+ "random_translate_y": 0.01,
82
+ "scale_p": 0.0,
83
+ "translate_p": 0.0,
84
+ "random_rotation": 0.0,
85
+ "rotation_p": 0.0,
86
+ "random_shear": 0.0,
87
+ "affine_p": 0.0,
88
+ "random_jitter": 0.5,
89
+ "jitter_p": 1.0,
90
+ "random_blur": 2.0,
91
+ "blur_p": 0.5,
92
+ "random_gamma": 0.5,
93
+ "gamma_p": 1.0,
94
+ "grayscale_p": 0.2,
95
+ "flip_p": 0.5,
96
+ "cut_p": 0.0,
97
+ "invert_p": 0.0,
98
+ "shape_mult": 14,
99
+ "noise_pad": 1.0,
100
+ "test_context": 1.0
101
+ },
102
+ "shape_constraints": {
103
+ "ratio_bounds": [
104
+ 0.5,
105
+ 2.5
106
+ ],
107
+ "pixels_max": 600000.0,
108
+ "pixels_min": 200000.0,
109
+ "height_min": 15,
110
+ "width_min": 15,
111
+ "shape_mult": 14,
112
+ "sample": true
113
+ }
114
+ },
115
+ "model": {
116
+ "name": "UniK3D",
117
+ "num_heads": 8,
118
+ "expansion": 4,
119
+ "num_steps": 100000,
120
+ "layer_scale": 1e-4,
121
+ "camera": {
122
+ "augment": true,
123
+ "weak_ratio": 0.9,
124
+ "tau": 50000
125
+ },
126
+ "pixel_decoder": {
127
+ "name": "Decoder",
128
+ "hidden_dim": 384,
129
+ "dropout": 0.0,
130
+ "depths": [
131
+ 2,
132
+ 2,
133
+ 2
134
+ ],
135
+ "detach": 0.1,
136
+ "out_dim": 48,
137
+ "kernel_size": 3,
138
+ "num_prompt_blocks": 1,
139
+ "use_norm": false
140
+ },
141
+ "pixel_encoder": {
142
+ "lr": 3e-06,
143
+ "wd": 0.1,
144
+ "name": "dinov2_vitb14",
145
+ "frozen_stages": 0,
146
+ "num_register_tokens": 0,
147
+ "use_norm": true,
148
+ "freeze_norm": true,
149
+ "pretrained": null,
150
+ "stacking_fn": "last",
151
+ "output_idx": [
152
+ 3,
153
+ 6,
154
+ 9,
155
+ 12
156
+ ]
157
+ }
158
+ }
159
+ }
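
All three config files share the same schema. As a quick orientation, the sketch below (standard-library JSON only, no UniK3D-specific API assumed) shows how the nested sections map to the values above; the actual consumer of these files is presumably scripts/train.py, which is listed in this commit but not shown here.

```python
# Sketch: inspect the shared training/data/model config schema.
import json

with open("configs/config_vitb.json") as f:
    cfg = json.load(f)

print(cfg["training"]["lr"], cfg["training"]["batch_size"])  # 5e-05 8
print(cfg["data"]["image_shape"])                            # [518, 518]
print(cfg["model"]["pixel_encoder"]["name"])                 # dinov2_vitb14
print(cfg["model"]["pixel_decoder"]["hidden_dim"])           # 384
```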
configs/config_vitl.json ADDED
@@ -0,0 +1,159 @@
1
+ {
2
+ "generic": {
3
+ "seed": 42,
4
+ "deterministic": true,
5
+ "name_page": "ufish"
6
+ },
7
+ "training": {
8
+ "n_iters": 250000,
9
+ "batch_size": 8,
10
+ "validation_interval": 2500,
11
+ "nsteps_accumulation_gradient": 4,
12
+ "lr": 5e-05,
13
+ "lr_final": 1e-06,
14
+ "lr_warmup": 1.0,
15
+ "cycle_beta": true,
16
+ "wd": 0.1,
17
+ "wd_final": 0.1,
18
+ "warmup_iters": 75000,
19
+ "ld": 1.0,
20
+ "drop_path": 0.0,
21
+ "ema": 0.9995,
22
+ "f16": "f16",
23
+ "clipping": 1.0,
24
+ "losses": {
25
+ "depth": {
26
+ "name": "Scale",
27
+ "weight": 1.0,
28
+ "fn": "l1",
29
+ "gamma": 1.0,
30
+ "alpha": 1.0,
31
+ "output_fn": "sqrt",
32
+ "input_fn": "log"
33
+ },
34
+ "camera": {
35
+ "name": "PolarRegression",
36
+ "weight": 1.0,
37
+ "gamma": 1.0,
38
+ "alpha": 1.0,
39
+ "fn": "l1",
40
+ "output_fn": "sqrt",
41
+ "input_fn": "linear",
42
+ "dims": [
43
+ 1,
44
+ 2
45
+ ],
46
+ "polar_weight": 3.0,
47
+ "polar_asym": 0.7
48
+ },
49
+ "confidence": {
50
+ "name": "Confidence",
51
+ "weight": 0.1,
52
+ "input_fn": "log",
53
+ "output_fn": "sqrt"
54
+ }
55
+ }
56
+ },
57
+ "data": {
58
+ "image_shape": [
59
+ 518,
60
+ 518
61
+ ],
62
+ "resize_method": "contextcrop",
63
+ "normalization": "imagenet",
64
+ "pair": 1,
65
+ "mini": 1.0,
66
+ "num_frames": 1,
67
+ "sampling": {
68
+ "KITTI": 1.0
69
+ },
70
+ "train_datasets": [
71
+ "KITTI"
72
+ ],
73
+ "val_datasets": [
74
+ "KITTI"
75
+ ],
76
+ "data_root": "datasets",
77
+ "crop": "garg",
78
+ "augmentations": {
79
+ "random_scale": 4.0,
80
+ "random_translate_x": 0.04,
81
+ "random_translate_y": 0.01,
82
+ "scale_p": 0.0,
83
+ "translate_p": 0.0,
84
+ "random_rotation": 0.0,
85
+ "rotation_p": 0.0,
86
+ "random_shear": 0.0,
87
+ "affine_p": 0.0,
88
+ "random_jitter": 0.5,
89
+ "jitter_p": 1.0,
90
+ "random_blur": 2.0,
91
+ "blur_p": 0.5,
92
+ "random_gamma": 0.5,
93
+ "gamma_p": 1.0,
94
+ "grayscale_p": 0.2,
95
+ "flip_p": 0.5,
96
+ "cut_p": 0.0,
97
+ "invert_p": 0.0,
98
+ "shape_mult": 14,
99
+ "noise_pad": 1.0,
100
+ "test_context": 1.0
101
+ },
102
+ "shape_constraints": {
103
+ "ratio_bounds": [
104
+ 0.5,
105
+ 2.5
106
+ ],
107
+ "pixels_max": 600000.0,
108
+ "pixels_min": 200000.0,
109
+ "height_min": 15,
110
+ "width_min": 15,
111
+ "shape_mult": 14,
112
+ "sample": true
113
+ }
114
+ },
115
+ "model": {
116
+ "name": "UniK3D",
117
+ "num_heads": 8,
118
+ "expansion": 4,
119
+ "num_steps": 100000,
120
+ "layer_scale": 1e-4,
121
+ "camera": {
122
+ "augment": true,
123
+ "weak_ratio": 0.9,
124
+ "tau": 50000
125
+ },
126
+ "pixel_decoder": {
127
+ "name": "Decoder",
128
+ "hidden_dim": 512,
129
+ "dropout": 0.0,
130
+ "depths": [
131
+ 2,
132
+ 2,
133
+ 2
134
+ ],
135
+ "detach": 0.1,
136
+ "out_dim": 64,
137
+ "kernel_size": 3,
138
+ "num_prompt_blocks": 1,
139
+ "use_norm": false
140
+ },
141
+ "pixel_encoder": {
142
+ "lr": 3e-06,
143
+ "wd": 0.1,
144
+ "name": "dinov2_vitl14",
145
+ "frozen_stages": 0,
146
+ "num_register_tokens": 0,
147
+ "use_norm": true,
148
+ "freeze_norm": true,
149
+ "pretrained": null,
150
+ "stacking_fn": "last",
151
+ "output_idx": [
152
+ 6,
153
+ 12,
154
+ 18,
155
+ 24
156
+ ]
157
+ }
158
+ }
159
+ }
configs/config_vits.json ADDED
@@ -0,0 +1,159 @@
1
+ {
2
+ "generic": {
3
+ "seed": 42,
4
+ "deterministic": true,
5
+ "name_page": "ufish"
6
+ },
7
+ "training": {
8
+ "n_iters": 250000,
9
+ "batch_size": 8,
10
+ "validation_interval": 2500,
11
+ "nsteps_accumulation_gradient": 4,
12
+ "lr": 5e-05,
13
+ "lr_final": 1e-06,
14
+ "lr_warmup": 1.0,
15
+ "cycle_beta": true,
16
+ "wd": 0.1,
17
+ "wd_final": 0.1,
18
+ "warmup_iters": 75000,
19
+ "ld": 1.0,
20
+ "drop_path": 0.0,
21
+ "ema": 0.9995,
22
+ "f16": "f16",
23
+ "clipping": 1.0,
24
+ "losses": {
25
+ "depth": {
26
+ "name": "Scale",
27
+ "weight": 1.0,
28
+ "fn": "l1",
29
+ "gamma": 1.0,
30
+ "alpha": 1.0,
31
+ "output_fn": "sqrt",
32
+ "input_fn": "log"
33
+ },
34
+ "camera": {
35
+ "name": "PolarRegression",
36
+ "weight": 1.0,
37
+ "gamma": 1.0,
38
+ "alpha": 1.0,
39
+ "fn": "l1",
40
+ "output_fn": "sqrt",
41
+ "input_fn": "linear",
42
+ "dims": [
43
+ 1,
44
+ 2
45
+ ],
46
+ "polar_weight": 3.0,
47
+ "polar_asym": 0.7
48
+ },
49
+ "confidence": {
50
+ "name": "Confidence",
51
+ "weight": 0.1,
52
+ "input_fn": "log",
53
+ "output_fn": "sqrt"
54
+ }
55
+ }
56
+ },
57
+ "data": {
58
+ "image_shape": [
59
+ 518,
60
+ 518
61
+ ],
62
+ "resize_method": "contextcrop",
63
+ "normalization": "imagenet",
64
+ "pair": 1,
65
+ "mini": 1.0,
66
+ "num_frames": 1,
67
+ "sampling": {
68
+ "KITTI": 1.0
69
+ },
70
+ "train_datasets": [
71
+ "KITTI"
72
+ ],
73
+ "val_datasets": [
74
+ "KITTI"
75
+ ],
76
+ "data_root": "datasets",
77
+ "crop": "garg",
78
+ "augmentations": {
79
+ "random_scale": 4.0,
80
+ "random_translate_x": 0.04,
81
+ "random_translate_y": 0.01,
82
+ "scale_p": 0.0,
83
+ "translate_p": 0.0,
84
+ "random_rotation": 0.0,
85
+ "rotation_p": 0.0,
86
+ "random_shear": 0.0,
87
+ "affine_p": 0.0,
88
+ "random_jitter": 0.5,
89
+ "jitter_p": 1.0,
90
+ "random_blur": 2.0,
91
+ "blur_p": 0.5,
92
+ "random_gamma": 0.5,
93
+ "gamma_p": 1.0,
94
+ "grayscale_p": 0.2,
95
+ "flip_p": 0.5,
96
+ "cut_p": 0.0,
97
+ "invert_p": 0.0,
98
+ "shape_mult": 14,
99
+ "noise_pad": 1.0,
100
+ "test_context": 1.0
101
+ },
102
+ "shape_constraints": {
103
+ "ratio_bounds": [
104
+ 0.5,
105
+ 2.5
106
+ ],
107
+ "pixels_max": 600000.0,
108
+ "pixels_min": 200000.0,
109
+ "height_min": 15,
110
+ "width_min": 15,
111
+ "shape_mult": 14,
112
+ "sample": true
113
+ }
114
+ },
115
+ "model": {
116
+ "name": "UniK3D",
117
+ "num_heads": 8,
118
+ "expansion": 4,
119
+ "num_steps": 100000,
120
+ "layer_scale": 1e-4,
121
+ "camera": {
122
+ "augment": true,
123
+ "weak_ratio": 0.9,
124
+ "tau": 50000
125
+ },
126
+ "pixel_decoder": {
127
+ "name": "Decoder",
128
+ "hidden_dim": 256,
129
+ "dropout": 0.0,
130
+ "depths": [
131
+ 2,
132
+ 2,
133
+ 2
134
+ ],
135
+ "detach": 0.1,
136
+ "out_dim": 32,
137
+ "kernel_size": 3,
138
+ "num_prompt_blocks": 1,
139
+ "use_norm": false
140
+ },
141
+ "pixel_encoder": {
142
+ "lr": 3e-06,
143
+ "wd": 0.1,
144
+ "name": "dinov2_vits14",
145
+ "frozen_stages": 0,
146
+ "num_register_tokens": 0,
147
+ "use_norm": true,
148
+ "freeze_norm": true,
149
+ "pretrained": null,
150
+ "stacking_fn": "last",
151
+ "output_idx": [
152
+ 3,
153
+ 6,
154
+ 9,
155
+ 12
156
+ ]
157
+ }
158
+ }
159
+ }
gradio_demo.py ADDED
@@ -0,0 +1,796 @@
1
+ import gc
2
+ import os
3
+ import shutil
4
+ import time
5
+ from datetime import datetime
6
+ from math import pi
7
+
8
+ import gradio as gr
9
+ import numpy as np
10
+ import torch
11
+ import trimesh
12
+ from PIL import Image
13
+
14
+ from unik3d.models import UniK3D
15
+ from unik3d.utils.camera import OPENCV, Fisheye624, Pinhole, Spherical
16
+ from unik3d.utils.visualization import colorize
17
+
18
+
19
+ def predictions_to_glb(
20
+ predictions,
21
+ mask_black_bg=False,
22
+ mask_far_points=False,
23
+ ) -> trimesh.Scene:
24
+ print("Building GLB scene")
25
+ images = predictions["image"].squeeze().permute(1, 2, 0).cpu().numpy()
26
+ world_points = predictions["points"].squeeze().permute(1, 2, 0).cpu().numpy()
27
+
28
+ vertices_3d = world_points.reshape(-1, 3)
29
+ # flip x and y
30
+ vertices_3d[:, 1] *= -1
31
+ vertices_3d[:, 0] *= -1
32
+ colors_rgb = (images.reshape(-1, 3)).astype(np.uint8)
33
+
34
+ if mask_black_bg:
35
+ black_bg_mask = colors_rgb.sum(axis=1) >= 16
36
+ vertices_3d = vertices_3d[black_bg_mask]
37
+ colors_rgb = colors_rgb[black_bg_mask]
38
+
39
+ if mask_far_points:
40
+ far_points_mask = np.linalg.norm(vertices_3d, axis=-1) < 100.0
41
+ vertices_3d = vertices_3d[far_points_mask]
42
+ colors_rgb = colors_rgb[far_points_mask]
43
+
44
+ scene_3d = trimesh.Scene()
45
+ point_cloud_data = trimesh.PointCloud(vertices=vertices_3d, colors=colors_rgb)
46
+ scene_3d.add_geometry(point_cloud_data)
47
+
48
+ return scene_3d
49
+
50
+
51
+ def instantiate_model(model_name):
52
+ type_ = model_name[0].lower()
53
+
54
+ name = f"unik3d-vit{type_}"
55
+ model = UniK3D.from_pretrained(f"lpiccinelli/{name}")
56
+
57
+ # Set resolution level and interpolation mode as specified.
58
+ model.resolution_level = 9
59
+ model.interpolation_mode = "bilinear"
60
+
61
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
62
+ model = model.to(device).eval()
63
+ return model
64
+
65
+
66
+ def instantiate_camera(camera_name, params, device):
67
+ if camera_name == "Predicted":
68
+ return None
69
+ fx, fy, cx, cy, k1, k2, k3, k4, k5, k6, t1, t2, hfov, H, W = params
70
+ if camera_name == "Pinhole":
71
+ params = [fx, fy, cx, cy]
72
+ elif camera_name == "Fisheye624":
73
+ params = [fx, fy, cx, cy, k1, k2, k3, k4, k5, k6, t1, t2]
74
+ elif camera_name == "OPENCV":
75
+ params = [fx, fy, cx, cy, k1, k2, k3, k4, k5, k6, t1, t2]
76
+ elif camera_name == "Equirectangular":
77
+ # dummy intrinsics for spherical camera, assume hfov -> vfov based on input shapes
78
+ hfov2 = hfov * pi / 180.0 / 2
79
+ params = [fx, fy, cx, cy, W, H, hfov2, H / W * hfov2]
80
+ camera_name = "Spherical"
81
+
82
+ return eval(camera_name)(params=torch.tensor(params).float()).to(device)
83
+
84
+
85
+ def run_model(target_dir, model_name, camera_name, params):
86
+
87
+ print("Instantiating model and camera...")
88
+ model = instantiate_model(model_name)
89
+
90
+ image_names = [x for x in os.listdir(target_dir) if x.endswith(".png")]
91
+ input_image = np.array(Image.open(os.path.join(target_dir, image_names[-1])))
92
+ image_tensor = torch.from_numpy(input_image).permute(2, 0, 1).unsqueeze(0).float()
93
+ device = next(model.parameters()).device
94
+ image_tensor = image_tensor.to(device)
95
+ H, W = image_tensor.shape[-2:]
96
+ params = params + [H, W]
97
+ camera = instantiate_camera(camera_name, params=params, device=device)
98
+
99
+ # Perform inference with the model.
100
+ print("Running inference...")
101
+ outputs = model.infer(image_tensor, camera=camera, normalize=True)
102
+ outputs["image"] = image_tensor
103
+
104
+ return outputs
105
+
106
+
107
+ def gradio_demo(
108
+ target_dir,
109
+ model_name,
110
+ camera_name,
111
+ fx,
112
+ fy,
113
+ cx,
114
+ cy,
115
+ k1,
116
+ k2,
117
+ k3,
118
+ k4,
119
+ k5,
120
+ k6,
121
+ t1,
122
+ t2,
123
+ hfov,
124
+ mask_black_bg,
125
+ mask_far_points,
126
+ ):
127
+ print(target_dir)
128
+ if not os.path.isdir(target_dir) or target_dir == "None":
129
+ return None, "No valid target directory found. Please upload first.", None
130
+
131
+ start_time = time.time()
132
+ gc.collect()
133
+
134
+ print("Running run_model...")
135
+ params = [fx, fy, cx, cy, k1, k2, k3, k4, k5, k6, t1, t2, hfov]
136
+ with torch.no_grad():
137
+ outputs = run_model(target_dir, model_name, camera_name, params)
138
+
139
+ # Save predictions
140
+ points = outputs["points"].squeeze().permute(1, 2, 0).cpu().numpy()
141
+ rgb = outputs["image"].squeeze().permute(1, 2, 0).cpu().numpy()
142
+
143
+ prediction_save_path = os.path.join(target_dir, "predictions.npz")
144
+ np.savez(prediction_save_path, points=points, image=rgb)
145
+
146
+ # Build a GLB file name
147
+ glbfile = os.path.join(
148
+ target_dir,
149
+ f"glbscene.glb",
150
+ )
151
+
152
+ # Convert predictions to GLB
153
+ glbscene = predictions_to_glb(
154
+ outputs,
155
+ mask_black_bg=mask_black_bg,
156
+ mask_far_points=mask_far_points,
157
+ )
158
+ glbscene.export(file_obj=glbfile)
159
+
160
+ # Cleanup
161
+ del outputs
162
+ gc.collect()
163
+
164
+ end_time = time.time()
165
+ print(f"Total time: {end_time - start_time:.2f} seconds")
166
+ log_msg = f"Success. Waiting for visualization."
167
+
168
+ return glbfile, log_msg, prediction_save_path
169
+
170
+
171
+ def handle_uploads(input_image):
172
+ gc.collect()
173
+
174
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
175
+ tmpdir = os.environ.get("TMPDIR", "/tmp")
176
+ target_dir = os.path.join(tmpdir, f"input_images_{timestamp}")
177
+
178
+ if os.path.exists(target_dir):
179
+ shutil.rmtree(target_dir)
180
+ os.makedirs(target_dir)
181
+
182
+ dst_path = os.path.join(target_dir, "image.png")
183
+ Image.fromarray(input_image).save(dst_path)
184
+ image_paths = [dst_path]
185
+
186
+ print(f"Files uploaded.")
187
+ return target_dir, image_paths
188
+
189
+
190
+ def update_gallery_on_upload(input_images):
191
+ if input_images is None:
192
+ return None, None
193
+ target_dir, image_path = handle_uploads(input_images)
194
+ return target_dir, "Upload complete. Click 'Run UniK3D' to get 3D pointcloud."
195
+
196
+
197
+ def update_parameters(camera):
198
+ if camera == "Pinhole":
199
+ return (
200
+ gr.update(visible=True), # fx
201
+ gr.update(visible=True), # fy
202
+ gr.update(visible=True), # cx
203
+ gr.update(visible=True), # cy
204
+ gr.update(visible=False), # k1
205
+ gr.update(visible=False), # k2
206
+ gr.update(visible=False), # k3
207
+ gr.update(visible=False), # k4
208
+ gr.update(visible=False), # k5
209
+ gr.update(visible=False), # k6
210
+ gr.update(visible=False), # t1
211
+ gr.update(visible=False), # t2
212
+ gr.update(visible=False), # hfov
213
+ )
214
+ elif camera == "OPENCV":
215
+ return (
216
+ gr.update(visible=True), # fx
217
+ gr.update(visible=True), # fy
218
+ gr.update(visible=True), # cx
219
+ gr.update(visible=True), # cy
220
+ gr.update(visible=True), # k1
221
+ gr.update(visible=True), # k2
222
+ gr.update(visible=True), # k3
223
+ gr.update(visible=False), # k4
224
+ gr.update(visible=False), # k5
225
+ gr.update(visible=False), # k6
226
+ gr.update(visible=True), # t1
227
+ gr.update(visible=True), # t2
228
+ gr.update(visible=False), # hfov
229
+ )
230
+ elif camera == "Fisheye624":
231
+ return (
232
+ gr.update(visible=True), # fx
233
+ gr.update(visible=True), # fy
234
+ gr.update(visible=True), # cx
235
+ gr.update(visible=True), # cy
236
+ gr.update(visible=True), # k1
237
+ gr.update(visible=True), # k2
238
+ gr.update(visible=True), # k3
239
+ gr.update(visible=True), # k4
240
+ gr.update(visible=True), # k5
241
+ gr.update(visible=True), # k6
242
+ gr.update(visible=True), # t1
243
+ gr.update(visible=True), # t2
244
+ gr.update(visible=False), # hfov
245
+ )
246
+ elif camera == "Equirectangular":
247
+ return (
248
+ gr.update(visible=False), # fx
249
+ gr.update(visible=False), # fy
250
+ gr.update(visible=False), # cx
251
+ gr.update(visible=False), # cy
252
+ gr.update(visible=False), # k1
253
+ gr.update(visible=False), # k2
254
+ gr.update(visible=False), # k3
255
+ gr.update(visible=False), # k4
256
+ gr.update(visible=False), # k5
257
+ gr.update(visible=False), # k6
258
+ gr.update(visible=False), # t1
259
+ gr.update(visible=False), # t2
260
+ gr.update(visible=True), # hfov
261
+ )
262
+ elif camera == "Predicted":
263
+ return (
264
+ gr.update(visible=False), # fx
265
+ gr.update(visible=False), # fy
266
+ gr.update(visible=False), # cx
267
+ gr.update(visible=False), # cy
268
+ gr.update(visible=False), # k1
269
+ gr.update(visible=False), # k2
270
+ gr.update(visible=False), # k3
271
+ gr.update(visible=False), # k4
272
+ gr.update(visible=False), # k5
273
+ gr.update(visible=False), # k6
274
+ gr.update(visible=False), # t1
275
+ gr.update(visible=False), # t2
276
+ gr.update(visible=False), # hfov
277
+ )
278
+ else:
279
+ raise ValueError(f"Invalid camera type: {camera}")
280
+
281
+
282
+ def clear_fields():
283
+ return None
284
+
285
+
286
+ def update_log():
287
+ return "Loading Model and Running Inference..."
288
+
289
+
290
+ def update_visualization(target_dir, mask_black_bg, mask_far_points, is_example):
291
+
292
+ if is_example == "True":
293
+ return (
294
+ None,
295
+ "No reconstruction available. Please click the Reconstruct button first.",
296
+ )
297
+
298
+ if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
299
+ return (
300
+ None,
301
+ "No reconstruction available. Please click the Reconstruct button first.",
302
+ )
303
+
304
+ predictions_path = os.path.join(target_dir, "predictions.npz")
305
+ if not os.path.exists(predictions_path):
306
+ return (
307
+ None,
308
+ f"No reconstruction available at {predictions_path}. Please run 'Reconstruct' first.",
309
+ )
310
+
311
+ loaded = np.load(predictions_path, allow_pickle=True)
312
+ predictions = {key: loaded[key] for key in loaded.keys()}
313
+
314
+ glbfile = os.path.join(
315
+ target_dir,
316
+ f"glbscene.glb",
317
+ )
318
+
319
+ if not os.path.exists(glbfile):
320
+ glbscene = predictions_to_glb(
321
+ predictions,
322
+ mask_black_bg=mask_black_bg,
323
+ mask_far_points=mask_far_points,
324
+ )
325
+ glbscene.export(file_obj=glbfile)
326
+
327
+ return glbfile, "Updating Visualization"
328
+
329
+
330
+ if __name__ == "__main__":
331
+ theme = gr.themes.Citrus()
332
+ theme.set(
333
+ checkbox_label_background_fill_selected="*button_primary_background_fill",
334
+ checkbox_label_text_color_selected="*button_primary_text_color",
335
+ )
336
+
337
+ with gr.Blocks(
338
+ theme=theme,
339
+ css="""
340
+ .custom-log * {
341
+ font-style: italic;
342
+ font-size: 22px !important;
343
+ background-image: linear-gradient(120deg, #ff7e26 0%, #ff9c59 60%, #fff4d6 100%);
344
+ -webkit-background-clip: text;
345
+ background-clip: text;
346
+ font-weight: bold !important;
347
+ color: transparent !important;
348
+ text-align: center !important;
349
+ }
350
+
351
+ .example-log * {
352
+ font-style: italic;
353
+ font-size: 16px !important;
354
+ background-image: linear-gradient(120deg, #ff7e26 0%, #ff9c59 60%, #fff4d6 100%);
355
+ -webkit-background-clip: text;
356
+ background-clip: text;
357
+ color: transparent !important;
358
+ }
359
+
360
+ #my_radio .wrap {
361
+ display: flex;
362
+ flex-wrap: nowrap;
363
+ justify-content: center;
364
+ align-items: center;
365
+ }
366
+
367
+ #my_radio .wrap label {
368
+ display: flex;
369
+ width: 50%;
370
+ justify-content: center;
371
+ align-items: center;
372
+ margin: 0;
373
+ padding: 10px 0;
374
+ box-sizing: border-box;
375
+ }
376
+ """,
377
+ ) as demo:
378
+
379
+ # Instead of gr.State, we use a hidden Textbox:
380
+ is_example = gr.Textbox(label="is_example", visible=False, value="None")
381
+
382
+ gr.HTML(
383
+ """
384
+ <h1>UniK3D: Universal Camera Monocular 3D Estimation</h1>
385
+ <p>
386
+ <a href="https://github.com/lpiccinelli-eth/UniK3D">🌟 GitHub Repository</a> |
387
+ <a href="">🚀 Project Page</a>
388
+ </p>
389
+
390
+ <div style="font-size: 16px; line-height: 1.5;">
391
+ <p>Upload one image to create a 3D estimation of a scene or object. UniK3D directly predicts the 3D of any scene from any camera.</p>
392
+
393
+ <h3>Getting Started:</h3>
394
+ <ol>
395
+ <li><strong>Upload Your Image:</strong> Use the "Upload Images" panel to provide your input.</li>
396
+ <li><strong>Run:</strong> Click the "Run UniK3D" button to start the 3D estimation process.</li>
397
+ <li><strong>Visualize:</strong> The 3D reconstruction will appear in the viewer on the right. You can rotate, pan, and zoom to explore the model, and download the GLB file.</li>
398
+ </ol>
399
+ <p><strong style="color: #ff7e26;">Please note:</strong> <span style="color: #ff7e26; font-weight: bold;">Our model runs on CPU on HuggingFace Space. Actual inference takes less than 100 ms per image on consumer-level GPUs. Web-based 3D pointcloud visualization may be slow due to Gradio's rendering. For faster visualization, use a local machine to run our demo from our <a href="https://github.com/lpiccinelli-eth/UniK3D">GitHub repository</a>. </span></p>
400
+ </div>
401
+ """
402
+ )
403
+
404
+ target_dir_output = gr.Textbox(label="Target Dir", visible=False, value="None")
405
+
406
+ with gr.Row():
407
+ with gr.Column():
408
+ camera_dropdown = gr.Dropdown(
409
+ choices=[
410
+ "Predicted",
411
+ "Pinhole",
412
+ "Fisheye624",
413
+ "OPENCV",
414
+ "Equirectangular",
415
+ ],
416
+ label="Input Camera",
417
+ )
418
+ model_dropdown = gr.Dropdown(
419
+ choices=["Large", "Base", "Small"], label="Utilized Model"
420
+ )
421
+ mask_black_bg = gr.Checkbox(
422
+ label="Filter Black Background", value=False
423
+ )
424
+ mask_far_points = gr.Checkbox(label="Filter Far Points", value=False)
425
+
426
+ with gr.Column():
427
+ fx = gr.Number(label="Focal length x", value=500.0, visible=False)
428
+ fy = gr.Number(label="Focal length y", value=500.0, visible=False)
429
+ cx = gr.Number(label="Center projection x", value=320.0, visible=False)
430
+ cy = gr.Number(label="Center projection y", value=240.0, visible=False)
431
+ hfov = gr.Number(
432
+ label="Horizontal FoV (degree)", value=0.0, visible=False
433
+ )
434
+
435
+ with gr.Column():
436
+ k1 = gr.Number(label="Radial 1", value=0.0, visible=False)
437
+ k2 = gr.Number(label="Radial 2", value=0.0, visible=False)
438
+ k3 = gr.Number(label="Radial 3", value=0.0, visible=False)
439
+ k4 = gr.Number(label="Radial 4", value=0.0, visible=False)
440
+
441
+ with gr.Column():
442
+ k5 = gr.Number(label="Radial 5", value=0.0, visible=False)
443
+ k6 = gr.Number(label="Radial 6", value=0.0, visible=False)
444
+ t1 = gr.Number(label="Tangential 1", value=0.0, visible=False)
445
+ t2 = gr.Number(label="Tangential 2", value=0.0, visible=False)
446
+
447
+ with gr.Row():
448
+ with gr.Column(scale=1):
449
+ input_image = gr.Image(label="Upload Images")
450
+ gr.Markdown("**3D Estimation**")
451
+ with gr.Row():
452
+ log_output = gr.Markdown(
453
+ "Please upload one image at a time, then click `Run UniK3D`.",
454
+ elem_classes=["custom-log"],
455
+ )
456
+ reconstruction_npy = gr.File(
457
+ label="Download 3D Pointcloud", type="filepath"
458
+ )
459
+
460
+ with gr.Column(scale=2):
461
+ reconstruction_output = gr.Model3D(
462
+ height=520, zoom_speed=0.5, pan_speed=0.5
463
+ )
464
+ with gr.Row():
465
+ submit_btn = gr.Button("Run UniK3D", scale=1, variant="primary")
466
+ clear_btn = gr.ClearButton(
467
+ [
468
+ input_image,
469
+ reconstruction_output,
470
+ log_output,
471
+ target_dir_output,
472
+ reconstruction_npy,
473
+ ],
474
+ scale=1,
475
+ )
476
+
477
+ examples = [
478
+ [
479
+ "assets/demo/poorthings.jpg",
480
+ "Large",
481
+ "Predicted",
482
+ 0.0,
483
+ 0.0,
484
+ 0.0,
485
+ 0.0,
486
+ 0.0,
487
+ 0.0,
488
+ 0.0,
489
+ 0.0,
490
+ 0.0,
491
+ 0.0,
492
+ 0.0,
493
+ 0.0,
494
+ 0.0,
495
+ True,
496
+ False,
497
+ ],
498
+ [
499
+ "assets/demo/naruto.jpg",
500
+ "Large",
501
+ "Predicted",
502
+ 0.0,
503
+ 0.0,
504
+ 0.0,
505
+ 0.0,
506
+ 0.0,
507
+ 0.0,
508
+ 0.0,
509
+ 0.0,
510
+ 0.0,
511
+ 0.0,
512
+ 0.0,
513
+ 0.0,
514
+ 0.0,
515
+ False,
516
+ False,
517
+ ],
518
+ [
519
+ "assets/demo/bears.png",
520
+ "Large",
521
+ "Predicted",
522
+ 0.0,
523
+ 0.0,
524
+ 0.0,
525
+ 0.0,
526
+ 0.0,
527
+ 0.0,
528
+ 0.0,
529
+ 0.0,
530
+ 0.0,
531
+ 0.0,
532
+ 0.0,
533
+ 0.0,
534
+ 0.0,
535
+ True,
536
+ False,
537
+ ],
538
+ [
539
+ "assets/demo/berzirk.jpg",
540
+ "Large",
541
+ "Predicted",
542
+ 0.0,
543
+ 0.0,
544
+ 0.0,
545
+ 0.0,
546
+ 0.0,
547
+ 0.0,
548
+ 0.0,
549
+ 0.0,
550
+ 0.0,
551
+ 0.0,
552
+ 0.0,
553
+ 0.0,
554
+ 0.0,
555
+ True,
556
+ False,
557
+ ],
558
+ [
559
+ "assets/demo/luke.webp",
560
+ "Large",
561
+ "Predicted",
562
+ 0.0,
563
+ 0.0,
564
+ 0.0,
565
+ 0.0,
566
+ 0.0,
567
+ 0.0,
568
+ 0.0,
569
+ 0.0,
570
+ 0.0,
571
+ 0.0,
572
+ 0.0,
573
+ 0.0,
574
+ 0.0,
575
+ False,
576
+ False,
577
+ ],
578
+ [
579
+ "assets/demo/equirectangular.jpg",
580
+ "Large",
581
+ "Equirectangular",
582
+ 0.0,
583
+ 0.0,
584
+ 0.0,
585
+ 0.0,
586
+ 0.0,
587
+ 0.0,
588
+ 0.0,
589
+ 0.0,
590
+ 0.0,
591
+ 0.0,
592
+ 0.0,
593
+ 0.0,
594
+ 360.0,
595
+ False,
596
+ False,
597
+ ],
598
+ [
599
+ "assets/demo/venice.jpg",
600
+ "Large",
601
+ "Equirectangular",
602
+ 0.0,
603
+ 0.0,
604
+ 0.0,
605
+ 0.0,
606
+ 0.0,
607
+ 0.0,
608
+ 0.0,
609
+ 0.0,
610
+ 0.0,
611
+ 0.0,
612
+ 0.0,
613
+ 0.0,
614
+ 360.0,
615
+ False,
616
+ True,
617
+ ],
618
+ [
619
+ "assets/demo/dl3dv.png",
620
+ "Large",
621
+ "OPENCV",
622
+ 429.57611083984375,
623
+ 429.6898193359375,
624
+ 479.5,
625
+ 269.5,
626
+ -0.0014844092074781656,
627
+ 0.0007422995404340327,
628
+ 0.0,
629
+ 0.0,
630
+ 0.0,
631
+ 0.0,
632
+ 0.00012013866944471374,
633
+ 0.001125041046179831,
634
+ 0.0,
635
+ False,
636
+ False,
637
+ ],
638
+ [
639
+ "assets/demo/scannet.png",
640
+ "Large",
641
+ "Fisheye624",
642
+ 791.90869140625,
643
+ 792.7230834960938,
644
+ 878.16796875,
645
+ 585.045166015625,
646
+ -0.029167557135224342,
647
+ -0.006803446915000677,
648
+ -0.0012682401575148106,
649
+ -4.6094228309812024e-05,
650
+ 0.0,
651
+ 0.0,
652
+ 0.0,
653
+ 0.0,
654
+ 0.0,
655
+ False,
656
+ False,
657
+ ],
658
+ ]
659
+
660
+ def example_pipeline(
661
+ input_image,
662
+ model_name,
663
+ camera_name,
664
+ fx,
665
+ fy,
666
+ cx,
667
+ cy,
668
+ k1,
669
+ k2,
670
+ k3,
671
+ k4,
672
+ k5,
673
+ k6,
674
+ t1,
675
+ t2,
676
+ hfov,
677
+ mask_black_bg,
678
+ mask_far_points,
679
+ ):
680
+ target_dir, image_path = handle_uploads(input_image)
681
+ glbfile, log_msg, prediction_save_path = gradio_demo(
682
+ target_dir,
683
+ model_name,
684
+ camera_name,
685
+ fx,
686
+ fy,
687
+ cx,
688
+ cy,
689
+ k1,
690
+ k2,
691
+ k3,
692
+ k4,
693
+ k5,
694
+ k6,
695
+ t1,
696
+ t2,
697
+ hfov,
698
+ mask_black_bg,
699
+ mask_far_points,
700
+ )
701
+ return (
702
+ glbfile,
703
+ log_msg,
704
+ prediction_save_path,
705
+ target_dir,
706
+ image_path,
707
+ )
708
+
709
+ gr.Markdown("Click any row to load an example.", elem_classes=["example-log"])
710
+
711
+ gr.Examples(
712
+ examples=examples,
713
+ inputs=[
714
+ input_image,
715
+ model_dropdown,
716
+ camera_dropdown,
717
+ fx,
718
+ fy,
719
+ cx,
720
+ cy,
721
+ k1,
722
+ k2,
723
+ k3,
724
+ k4,
725
+ k5,
726
+ k6,
727
+ t1,
728
+ t2,
729
+ hfov,
730
+ mask_black_bg,
731
+ mask_far_points,
732
+ ],
733
+ outputs=[reconstruction_output, log_output, reconstruction_npy],
734
+ fn=example_pipeline,
735
+ cache_examples=False,
736
+ examples_per_page=50,
737
+ )
738
+
739
+ submit_btn.click(
740
+ fn=clear_fields, inputs=[], outputs=[reconstruction_output]
741
+ ).then(fn=update_log, inputs=[], outputs=[log_output]).then(
742
+ fn=gradio_demo,
743
+ inputs=[
744
+ target_dir_output,
745
+ model_dropdown,
746
+ camera_dropdown,
747
+ fx,
748
+ fy,
749
+ cx,
750
+ cy,
751
+ k1,
752
+ k2,
753
+ k3,
754
+ k4,
755
+ k5,
756
+ k6,
757
+ t1,
758
+ t2,
759
+ hfov,
760
+ mask_black_bg,
761
+ mask_far_points,
762
+ ],
763
+ outputs=[reconstruction_output, log_output, reconstruction_npy],
764
+ ).then(
765
+ fn=lambda: "False", inputs=[], outputs=[is_example]
766
+ )
767
+
768
+ mask_black_bg.change(
769
+ update_visualization,
770
+ [target_dir_output, mask_black_bg, mask_far_points, is_example],
771
+ [reconstruction_output, log_output],
772
+ )
773
+
774
+ mask_far_points.change(
775
+ update_visualization,
776
+ [target_dir_output, mask_black_bg, mask_far_points, is_example],
777
+ [reconstruction_output, log_output],
778
+ )
779
+
780
+ input_image.change(
781
+ fn=update_gallery_on_upload,
782
+ inputs=[input_image],
783
+ outputs=[target_dir_output, log_output],
784
+ )
785
+
786
+ # Dynamically update intrinsic parameter visibility when camera selection changes.
787
+ camera_dropdown.change(
788
+ fn=update_parameters,
789
+ inputs=camera_dropdown,
790
+ outputs=[fx, fy, cx, cy, k1, k2, k3, k4, k5, k6, t1, t2, hfov],
791
+ )
792
+
793
+ # demo.queue(max_size=20).launch(show_error=True, share=False, ssr_mode=False)
794
+ demo.launch(
795
+ show_error=True,
796
+ )
hubconf.py ADDED
@@ -0,0 +1,29 @@
1
+ dependencies = ["torch", "huggingface_hub"]
2
+
3
+ import os
4
+ import json
5
+
6
+ import torch
7
+ import huggingface_hub
8
+
9
+ from unik3d.models import UniK3D as UniK3D_
10
+
11
+ BACKBONES = ["vitl", "vitb", "vits"]
12
+
13
+
14
+ def UniK3D(backbone="vitl", pretrained=True):
15
+ assert backbone in BACKBONES, f"backbone must be one of {BACKBONES}"
16
+ repo_dir = os.path.dirname(os.path.realpath(__file__))
17
+ with open(os.path.join(repo_dir, "configs", f"config_{backbone}.json")) as f:
18
+ config = json.load(f)
19
+
20
+ model = UniK3D_(config)
21
+ if pretrained:
22
+ path = huggingface_hub.hf_hub_download(repo_id=f"lpiccinelli/unik3d-{backbone}", filename=f"pytorch_model.bin", repo_type="model")
23
+ info = model.load_state_dict(torch.load(path), strict=False)
24
+ print(f"UniK3D-{backbone} is loaded with:")
25
+ print(f"\t missing keys: {info.missing_keys}")
26
+ print(f"\t additional keys: {info.unexpected_keys}")
27
+
28
+ return model
29
+
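
The `hubconf.py` above registers `UniK3D` as a torch.hub entrypoint. A minimal loading sketch, assuming the GitHub repository name `lpiccinelli-eth/UniK3D` linked from the demo page:

```python
import torch

# Load UniK3D through torch.hub; keyword arguments are forwarded to the
# UniK3D() entrypoint defined in hubconf.py (backbone in {"vitl", "vitb", "vits"}).
model = torch.hub.load(
    "lpiccinelli-eth/UniK3D", "UniK3D", backbone="vitl", pretrained=True
)
model = model.to("cuda" if torch.cuda.is_available() else "cpu").eval()
```
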
pyproject.toml ADDED
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [tool.pyright]
6
+ include = ["unik3d"]
7
+
8
+ [project]
9
+ name = "unik3d"
10
+ version = "0.1"
11
+ authors = [{name = "Luigi Piccinelli", email = "lpiccinelli@ethz.ch"}]
12
+ description = "UniK3D: Universal Monocular Metric Depth Estimation"
13
+ readme = "README.md"
14
+ license = { text="Creative Commons BY-NC 4.0 license"}
15
+ requires-python = ">=3.11.0"
16
+ dynamic = ["dependencies"]
17
+
18
+ [tool.setuptools.dynamic]
19
+ dependencies = {file = ["requirements.txt"]}
20
+
21
+ [tool.setuptools.package-data]
22
+ "*" = ["py.typed"]
23
+
24
+ [tool.setuptools.packages.find]
25
+ include = ["unik3d*"]
requirements.txt ADDED
@@ -0,0 +1,84 @@
1
+ appdirs
2
+ attrs
3
+ black
4
+ blosc2
5
+ botocore>=1.34.54
6
+ certifi>=2022.12.7
7
+ charset-normalizer
8
+ click
9
+ contourpy
10
+ cycler
11
+ docker-pycreds
12
+ einops>=0.7.0
13
+ filelock
14
+ flake8>=7.0.0
15
+ flake8-bugbear>=24.2.6
16
+ flake8-comprehensions>=3.14.0
17
+ fonttools
18
+ fsspec
19
+ fvcore>=0.1.5.post20221221
20
+ gitdb
21
+ GitPython
22
+ gradio
23
+ h5py>=3.10.0
24
+ huggingface-hub>=0.22.0
25
+ idna
26
+ imageio
27
+ imath
28
+ iopath
29
+ isort
30
+ Jinja2
31
+ jmespath
32
+ kiwisolver
33
+ MarkupSafe
34
+ matplotlib
35
+ mccabe
36
+ mpmath
37
+ msgpack
38
+ mypy-extensions
39
+ ndindex
40
+ networkx
41
+ ninja
42
+ numexpr
43
+ numpy<2.0.0
44
+ opencv-python
45
+ OpenEXR
46
+ packaging
47
+ pandas
48
+ pathspec
49
+ pillow>=10.2.0
50
+ platformdirs
51
+ portalocker
52
+ protobuf>=4.25.3
53
+ psutil
54
+ py-cpuinfo
55
+ pycodestyle
56
+ pyflakes
57
+ pyparsing
58
+ python-dateutil
59
+ pytz
60
+ PyYAML
61
+ requests
62
+ safetensors
63
+ scipy
64
+ sentry-sdk
65
+ setproctitle
66
+ six
67
+ smmap
68
+ sympy
69
+ tables
70
+ tabulate
71
+ termcolor
72
+ timm
73
+ tqdm
74
+ trimesh
75
+ triton>=2.4.0
76
+ typing_extensions
77
+ tzdata==2024.1
78
+ urllib3==1.26.13
79
+ wandb
80
+ yacs
81
+ torch>=2.4.0
82
+ torchvision>=0.19.0
83
+ torchaudio>=2.4.0
84
+ xformers>=0.0.26
requirements_demo.txt ADDED
@@ -0,0 +1,84 @@
1
+ appdirs
2
+ attrs
3
+ black
4
+ blosc2
5
+ botocore>=1.34.54
6
+ certifi>=2022.12.7
7
+ charset-normalizer
8
+ click
9
+ contourpy
10
+ cycler
11
+ docker-pycreds
12
+ einops>=0.7.0
13
+ filelock
14
+ flake8>=7.0.0
15
+ flake8-bugbear>=24.2.6
16
+ flake8-comprehensions>=3.14.0
17
+ fonttools
18
+ fsspec
19
+ fvcore>=0.1.5.post20221221
20
+ gitdb
21
+ GitPython
22
+ gradio
23
+ h5py>=3.10.0
24
+ huggingface-hub>=0.22.0
25
+ idna
26
+ imageio
27
+ imath
28
+ iopath
29
+ isort
30
+ Jinja2
31
+ jmespath
32
+ kiwisolver
33
+ MarkupSafe
34
+ matplotlib
35
+ mccabe
36
+ mpmath
37
+ msgpack
38
+ mypy-extensions
39
+ ndindex
40
+ networkx
41
+ ninja
42
+ numexpr
43
+ numpy<2.0.0
44
+ opencv-python
45
+ OpenEXR
46
+ packaging
47
+ pandas
48
+ pathspec
49
+ pillow>=10.2.0
50
+ platformdirs
51
+ portalocker
52
+ protobuf>=4.25.3
53
+ psutil
54
+ py-cpuinfo
55
+ pycodestyle
56
+ pyflakes
57
+ pyparsing
58
+ python-dateutil
59
+ pytz
60
+ PyYAML
61
+ requests
62
+ safetensors
63
+ scipy
64
+ sentry-sdk
65
+ setproctitle
66
+ six
67
+ smmap
68
+ sympy
69
+ tables
70
+ tabulate
71
+ termcolor
72
+ timm
73
+ tqdm
74
+ trimesh
75
+ triton>=2.4.0
76
+ typing_extensions
77
+ tzdata==2024.1
78
+ urllib3==1.26.13
79
+ wandb
80
+ yacs
81
+ torch>=2.4.0
82
+ torchvision>=0.19.0
83
+ torchaudio>=2.4.0
84
+ xformers>=0.0.26
scripts/README.md ADDED
@@ -0,0 +1,55 @@
1
+ ## Training
2
+
3
+ We provide the `train.py` script that lets you load the datasets, initialize the model, and start training. From the root of the repo:
4
+
5
+ ```bash
6
+ export REPO=`pwd`
7
+ export PYTHONPATH=${REPO}:${PYTHONPATH}
8
+
9
+ # Adapt all this to your setup
10
+ export TMPDIR="/tmp"
11
+ export TORCH_HOME=${TMPDIR}
12
+ export HUGGINGFACE_HUB_CACHE=${TMPDIR}
13
+ export WANDB_HOME=${TMPDIR}
14
+ export DATAROOT=<where-you-stored-the-hdf5>
15
+
16
+
17
+ export MASTER_PORT=$((( RANDOM % 600 ) + 29400 ))
18
+ if [ $NNODES -gt 1 ]; then
19
+ export MASTER_PORT=29400
20
+ fi
21
+
22
+ # this is the config that will be used
23
+ export CFG="config_vitl.json"
24
+ ```
25
+
26
+ If you are on a machine without SLURM, you can run the following:
27
+ ```bash
28
+ # make the following input-dependent for multi-node
29
+ export NNODES=1
30
+ export RANK=0
31
+ export MASTER_ADDR=127.0.0.1
32
+ export CUDA_VISIBLE_DEVICES="0" # set yours
33
+
34
+ export GPUS=$(echo ${CUDA_VISIBLE_DEVICES} | tr ',' '\n' | wc -l)
35
+ echo "Start script with python from: `which python`"
36
+ torchrun --rdzv-backend=c10d --nnodes=${NNODES} --nproc_per_node=${GPUS} --rdzv-endpoint ${MASTER_ADDR}:${MASTER_PORT} ${REPO}/scripts/train.py --config-file ${REPO}/configs/${CFG} --distributed
37
+ ```
38
+
39
+ If your system has SLURM, all this information will be set by the scheduler and you just have to run:
40
+ ```bash
41
+ srun -c ${SLURM_CPUS_PER_TASK} --kill-on-bad-exit=1 python -u ${REPO}/scripts/train.py --config-file ${REPO}/configs/${CFG} --master-port ${MASTER_PORT} --distributed
42
+ ```
43
+
44
+
45
+ ### Datasets
46
+
47
+ We use both image-based and sequence-based datasets. The `ImageDataset` class is kept for legacy reasons only, as image-based datasets have been moved to "dummy" single-frame sequences.<br>
48
+ We [provide two example datasets to get familiar with the pipeline and structure, namely iBims-1 and Sintel](https://drive.google.com/drive/folders/1FKsa5-b3EX0ukZq7bxord5fC5OfUiy16?usp=sharing), image- and sequence-based, respectively.<br>
49
+ You can adapt the data loading and processing to your own data; however, you will need to keep the same interface so that the model stays consistent and trains "out-of-the-box" (a minimal subclass sketch is shown after this file).<br>
50
+
51
+
52
+ ### Additional dependencies
53
+
54
+ We require the chamfer distance for evaluation; you can compile the KNN operation under `ops/knn` by running `bash compile.sh` from the directory `$REPO/unik3d/ops/knn`. Set the correct `export TORCH_CUDA_ARCH_LIST` according to the hardware you are working on.
55
+ For training and to perform camera augmentation, you can use `camera_augmenter.py`; however, the splatting operation requires you to clone and install `github.com/hperrot/splatting`.
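
As referenced in the dataset note above, here is a minimal sketch of a custom sequence-based dataset, modeled on the bundled `unik3d/datasets/_2d3ds.py` example; the depth range, decoding scale, and split file names below are hypothetical placeholders to adapt to your data:

```python
from unik3d.datasets.sequence_dataset import SequenceDataset


class MyDataset(SequenceDataset):
    # Hypothetical per-dataset attributes: valid depth range, the factor used
    # to decode the stored depth maps, and the split/sequence files of your data.
    min_depth = 0.01
    max_depth = 80.0
    depth_scale = 256.0
    train_split = "train.txt"
    test_split = "val.txt"
    sequences_file = "sequences.json"
```

Since `scripts/train.py` looks datasets up by name with `getattr(unik3d.datasets, name)`, a new class like this also needs to be exported from `unik3d/datasets/__init__.py` and listed under `train_datasets`/`val_datasets` in the config.
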
scripts/demo.py ADDED
@@ -0,0 +1,150 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn as nn
8
+ from PIL import Image
9
+
10
+ from unik3d.models import UniK3D
11
+ from unik3d.utils.camera import (MEI, OPENCV, BatchCamera, Fisheye624, Pinhole,
12
+ Spherical)
13
+ from unik3d.utils.visualization import colorize, save_file_ply
14
+
15
+ SAVE = False
16
+ BASE_PATH = os.path.join(
17
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "assets", "demo"
18
+ )
19
+
20
+
21
+ def infer(model, rgb_path, camera_path, rays=None):
22
+ rgb = np.array(Image.open(rgb_path))
23
+ rgb_torch = torch.from_numpy(rgb).permute(2, 0, 1)
24
+
25
+ camera = None
26
+ if camera_path is not None:
27
+ with open(camera_path, "r") as f:
28
+ camera_dict = json.load(f)
29
+
30
+ params = torch.tensor(camera_dict["params"])
31
+ name = camera_dict["name"]
32
+ assert name in ["Fisheye624", "Spherical", "OPENCV", "Pinhole", "MEI"]
33
+ camera = eval(name)(params=params)
34
+
35
+ outputs = model.infer(rgb=rgb_torch, camera=camera, normalize=True, rays=rays)
36
+
37
+ return rgb_torch, outputs
38
+
39
+
40
+ def infer_equirectangular(model, rgb_path):
41
+ rgb = np.array(Image.open(rgb_path))
42
+ rgb_torch = torch.from_numpy(rgb).permute(2, 0, 1)
43
+
44
+ # assuming full equirectangular image horizontally
45
+ H, W = rgb.shape[:2]
46
+ hfov_half = np.pi
47
+ vfov_half = np.pi * H / W
48
+ assert vfov_half <= np.pi / 2
49
+
50
+ params = [W, H, hfov_half, vfov_half]
51
+ camera = Spherical(params=torch.tensor([1.0] * 4 + params))
52
+
53
+ outputs = model.infer(rgb=rgb_torch, camera=camera, normalize=True)
54
+ return rgb_torch, outputs
55
+
56
+
57
+ def save(rgb, outputs, name, base_path, save_pointcloud=False):
58
+ depth = outputs["depth"]
59
+ rays = outputs["rays"]
60
+ points = outputs["points"]
61
+
62
+ depth = depth.cpu().numpy()
63
+ rays = ((rays + 1) * 127.5).clip(0, 255)
64
+
65
+ Image.fromarray(colorize(depth.squeeze())).save(
66
+ os.path.join(base_path, f"{name}_depth.png")
67
+ )
68
+ Image.fromarray(rgb.squeeze().permute(1, 2, 0).cpu().numpy()).save(
69
+ os.path.join(base_path, f"{name}_rgb.png")
70
+ )
71
+ Image.fromarray(rays.squeeze().permute(1, 2, 0).byte().cpu().numpy()).save(
72
+ os.path.join(base_path, f"{name}_rays.png")
73
+ )
74
+
75
+ if save_pointcloud:
76
+ predictions_3d = points.permute(0, 2, 3, 1).reshape(-1, 3).cpu().numpy()
77
+ rgb = rgb.permute(1, 2, 0).reshape(-1, 3).cpu().numpy()
78
+ save_file_ply(predictions_3d, rgb, os.path.join(base_path, f"{name}.ply"))
79
+
80
+
81
+ def demo(model):
82
+ # RGB + CAMERA
83
+ rgb, outputs = infer(
84
+ model,
85
+ os.path.join(BASE_PATH, f"scannet.png"),
86
+ os.path.join(BASE_PATH, "scannet.json"),
87
+ )
88
+ if SAVE:
89
+ save(rgb, outputs, name="scannet", base_path=BASE_PATH)
90
+
91
+ # get GT and pred
92
+ pts_pred = outputs["points"].squeeze().cpu().permute(1, 2, 0).numpy()
93
+ pts_gt = np.load("./assets/demo/scannet.npy").astype(float)
94
+ mask = np.linalg.norm(pts_gt, axis=-1) > 0
95
+ error = np.linalg.norm(pts_pred - pts_gt, axis=-1)
96
+ error = np.mean(error[mask] ** 2) ** 0.5
97
+
98
+ # Trade-off between speed and resolution
99
+ model.resolution_level = 1
100
+ rgb, outputs = infer(
101
+ model,
102
+ os.path.join(BASE_PATH, f"scannet.png"),
103
+ os.path.join(BASE_PATH, "scannet.json"),
104
+ )
105
+ if SAVE:
106
+ save(rgb, outputs, name="scannet_lowres", base_path=BASE_PATH)
107
+
108
+ # RGB
109
+ rgb, outputs = infer(model, os.path.join(BASE_PATH, f"poorthings.jpg"), None)
110
+ if SAVE:
111
+ save(rgb, outputs, name="poorthings", base_path=BASE_PATH)
112
+
113
+ # RGB + CAMERA
114
+ rgb, outputs = infer(
115
+ model,
116
+ os.path.join(BASE_PATH, f"dl3dv.png"),
117
+ os.path.join(BASE_PATH, "dl3dv.json"),
118
+ )
119
+ if SAVE:
120
+ save(rgb, outputs, name="dl3dv", base_path=BASE_PATH)
121
+
122
+ # EQUIRECTANGULAR
123
+ rgb, outputs = infer_equirectangular(
124
+ model, os.path.join(BASE_PATH, f"equirectangular.jpg")
125
+ )
126
+ if SAVE:
127
+ save(rgb, outputs, name="equirectangular", base_path=BASE_PATH)
128
+
129
+ print("Output keys are", outputs.keys())
130
+
131
+ if SAVE:
132
+ print("Done! Results saved in", BASE_PATH)
133
+
134
+ print(f"RMSE on 3D clouds for ScanNet++ sample: {100*error:.1f}cm")
135
+
136
+
137
+ if __name__ == "__main__":
138
+ print("Torch version:", torch.__version__)
139
+ type_ = "l" # available types: s, b, l
140
+ name = f"unik3d-vit{type_}"
141
+ model = UniK3D.from_pretrained(f"lpiccinelli/{name}")
142
+
143
+ # set resolution level in [0,10) and output interpolation
144
+ model.resolution_level = 9
145
+ model.interpolation_mode = "bilinear"
146
+
147
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
148
+ model = model.to(device).eval()
149
+
150
+ demo(model)
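
The camera files consumed by `infer` above (e.g. `scannet.json`, `dl3dv.json`) are JSON dictionaries with a `name` matching one of the imported camera classes and a `params` list. As a sketch with hypothetical intrinsics, a pinhole camera can also be built directly in code and passed to `model.infer`, mirroring the flow of `demo.py`; the `[fx, fy, cx, cy]` ordering follows `instantiate_camera` in `gradio_demo.py`, and `image.png` is a placeholder path:

```python
import numpy as np
import torch
from PIL import Image

from unik3d.models import UniK3D
from unik3d.utils.camera import Pinhole

# Hypothetical pinhole intrinsics; replace with your own calibration.
camera = Pinhole(params=torch.tensor([500.0, 500.0, 320.0, 240.0]))  # fx, fy, cx, cy

model = UniK3D.from_pretrained("lpiccinelli/unik3d-vitl")
model = model.to("cuda" if torch.cuda.is_available() else "cpu").eval()

rgb = torch.from_numpy(np.array(Image.open("image.png"))).permute(2, 0, 1)
outputs = model.infer(rgb=rgb, camera=camera, normalize=True)
print(outputs["depth"].shape, outputs["points"].shape, outputs["rays"].shape)
```
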
scripts/train.py ADDED
@@ -0,0 +1,630 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ import random
5
+ import uuid
6
+ from contextlib import nullcontext
7
+ from copy import deepcopy
8
+ from datetime import datetime as dt
9
+ from functools import partial
10
+ from math import log2
11
+ from time import sleep, time
12
+ from typing import Any, Dict
13
+
14
+ import git
15
+ import numpy as np
16
+ import psutil
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.utils.data.distributed
20
+ import wandb
21
+ from PIL import Image
22
+ from torch import distributed as dist
23
+ from torch import optim
24
+ from torch.nn.parallel.distributed import DistributedDataParallel
25
+ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
26
+ from tqdm import tqdm
27
+
28
+ import unik3d.datasets as datasets
29
+ from unik3d.datasets import (ConcatDataset, DistributedSamplerNoDuplicate,
30
+ collate_fn, get_weights)
31
+ from unik3d.models import UniK3D
32
+ from unik3d.ops.scheduler import CosineScheduler
33
+ from unik3d.utils import (barrier, format_seconds, is_main_process,
34
+ log_train_artifacts, validate)
35
+ from unik3d.utils.distributed import (create_local_process_group,
36
+ local_broadcast_process_authkey,
37
+ setup_multi_processes, setup_slurm,
38
+ sync_string_across_gpus,
39
+ sync_tensor_across_gpus)
40
+ from unik3d.utils.ema_torch import (DummyExponentialMovingAverage,
41
+ ExponentialMovingAverage)
42
+ from unik3d.utils.misc import calculate_mean_values
43
+
44
+ EMA_INTERVAL = 10
45
+ EMA_TAU = 10000
46
+ EMA_START = 50000
47
+
48
+
49
+ MAP_DTYPE = {
50
+ "f16": torch.float16,
51
+ "bf16": torch.bfloat16,
52
+ "f32": torch.float32,
53
+ }
54
+
55
+
56
+ def aggregate_sync_losses(dict_: dict[str, torch.Tensor], device):
57
+ keys = list(dict_.keys())
58
+ values = torch.tensor(list(dict_.values()), device=device)
59
+ keys = sync_string_across_gpus(keys, device)
60
+ values = sync_tensor_across_gpus(values, dim=0).cpu().tolist()
61
+ dict_ = calculate_mean_values(keys, values)
62
+ return dict_
63
+
64
+
65
+ def main_worker(config: Dict[str, Any], args: argparse.Namespace):
66
+
67
+ current_process = psutil.Process(os.getpid())
68
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
69
+ seed = config["generic"]["seed"]
70
+
71
+ if not args.distributed:
72
+ args.rank = 0
73
+ args.local_rank = 0
74
+ args.world_size = 1
75
+ else:
76
+ # initializes the distributed backend which will take care of synchronizing nodes/GPUs
77
+ setup_multi_processes(config)
78
+ is_slurm = "SLURM_PROCID" in os.environ
79
+ if is_slurm:
80
+ setup_slurm("nccl", port=args.master_port)
81
+ args.rank = int(os.environ["RANK"])
82
+ args.world_size = int(os.environ["WORLD_SIZE"])
83
+ args.local_rank = device = int(os.environ["LOCAL_RANK"])
84
+ if not is_slurm:
85
+ import datetime
86
+
87
+ dist.init_process_group(
88
+ "nccl",
89
+ rank=args.rank,
90
+ world_size=args.world_size,
91
+ timeout=datetime.timedelta(seconds=30 * 60),
92
+ )
93
+ torch.cuda.set_device(device)
94
+ create_local_process_group()
95
+ local_broadcast_process_authkey()
96
+ print(
97
+ f"Start running DDP on: {args.rank} (local: {args.local_rank}) with seed {seed + args.rank}."
98
+ )
99
+ config["training"]["batch_size"] = int(
100
+ config["training"]["batch_size"] / args.world_size
101
+ )
102
+ dist.barrier()
103
+
104
+ # Fix seed
105
+ # Different for every machine to avoid sampling
106
+ # the same element across machines
107
+ seed = seed + args.rank
108
+ random.seed(seed)
109
+ np.random.seed(seed)
110
+ torch.manual_seed(seed)
111
+ torch.cuda.manual_seed(seed)
112
+ torch.cuda.manual_seed_all(seed)
113
+ os.environ["PYTHONHASHSEED"] = str(seed)
114
+
115
+ batch_size = config["training"]["batch_size"]
116
+ if is_main_process():
117
+ print("Config: ", args.config_file)
118
+ print(
119
+ f"Torch version:{torch.__version__}, cuda:{torch.version.cuda}, cudnn:{torch.backends.cudnn.version()}, threads:{torch.get_num_threads()}"
120
+ )
121
+ print("BatchSize per GPU: ", batch_size)
122
+ print(
123
+ f"Divided into {config['training']['nsteps_accumulation_gradient']} accumulation step"
124
+ )
125
+
126
+ ##############################
127
+ ########### MODEL ############
128
+ ##############################
129
+ # Build model
130
+ model = UniK3D(config).to(device)
131
+ model.eval()
132
+ print(f"MODEL: {model.__class__.__name__} at {model.device}")
133
+ torch.cuda.empty_cache()
134
+
135
+ if args.distributed:
136
+ model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
137
+ model = DistributedDataParallel(
138
+ model,
139
+ find_unused_parameters=False,
140
+ device_ids=[device],
141
+ output_device=device,
142
+ )
143
+
144
+ ##############################
145
+ ######### OPTIMIZER ##########
146
+ ##############################
147
+ dtype_16bit = config["training"]["f16"]
148
+ is_16bit = dtype_16bit != "f32"
149
+ clipping = config["training"].get("clipping", None)
150
+
151
+ # Optimize
152
+ ddp_model = model.module if args.distributed else model
153
+ params = ddp_model.get_params(config)
154
+ optimizer = optim.AdamW(
155
+ params,
156
+ eps=6e-8 if is_16bit else 1e-8, # smallest subnormal fp16 number is 5.96e-8
157
+ # amsgrad=is_16bit, # use max instead of avg v_hat, avoid small number divisions?
158
+ )
159
+
160
+ # Load Model:
161
+ step = 0
162
+ if config["training"].get("pretrained", None) is not None:
163
+ ddp_model.load_pretrained(config["training"]["pretrained"])
164
+ pretrained = torch.load(
165
+ config["training"]["pretrained"], map_location="cpu", weights_only=False
166
+ )
167
+ try:
168
+ optimizer.load_state_dict(pretrained["optimizer"])
169
+ except Exception as e:
170
+ if is_main_process():
171
+ print("Could not load optimizer state dict:", e)
172
+ step = pretrained.get("step", 0)
173
+ ddp_model.pixel_decoder.steps = step
174
+
175
+ # EMA
176
+ ema_class = (
177
+ ExponentialMovingAverage
178
+ if config["training"]["ema"] > 0.0
179
+ else DummyExponentialMovingAverage
180
+ )
181
+ ema_handle = ema_class(
182
+ ddp_model.parameters_grad(),
183
+ 1 - (1 - config["training"]["ema"]) * EMA_INTERVAL,
184
+ update_after_step=config["training"]["warmup_iters"] / EMA_INTERVAL,
185
+ switch=True,
186
+ tau=EMA_TAU // EMA_INTERVAL,
187
+ )
188
+ setattr(ema_handle, "num_updates", step // EMA_INTERVAL)
189
+
190
+ ##############################
191
+ ######### GENERICS ###########
192
+ ##############################
193
+ resize_method = config["data"].get("resize_method", "hard")
194
+ crop = config["data"].get("crop", "garg")
195
+ augmentations_db = config["data"].get("augmentations", {})
196
+ shape_constraints = config["data"].get("shape_constraints", {})
197
+ image_shape = config["data"]["image_shape"]
198
+ mini = config["data"]["mini"]
199
+ nsteps_accumulation_gradient = config["training"]["nsteps_accumulation_gradient"]
200
+ batch_size = config["training"]["batch_size"]
201
+ clipping_fn = torch.nn.utils.clip_grad_norm_
202
+
203
+ is_shell = int(os.environ.get("SHELL_JOB", 0))
204
+ run_id = sync_string_across_gpus(
205
+ [f"{dt.now().strftime('%d-%h_%H-%M')}-{uuid.uuid4()}"], device
206
+ )[0]
207
+
208
+ if not is_shell and is_main_process():
209
+ repo_folder = os.path.dirname(os.path.realpath(__file__))
210
+ try:
211
+ repo = git.Repo(repo_folder)
212
+ current_head = repo.head if repo.head.is_detached else repo.active_branch
213
+ notes = f"MESSAGE: {current_head.commit.message} HASH:{current_head.commit.hexsha} BRANCH:{current_head.name}"
214
+ except:
215
+ print(f"problem with {repo_folder}, does it exist?")
216
+ notes = ""
217
+
218
+ # restore the original batchsize, not acquired by other calls from now on
219
+ if args.distributed:
220
+ config["training"]["batch_size"] = (
221
+ config["training"]["batch_size"] * args.world_size
222
+ )
223
+ wandb.init(
224
+ project="UniK3D",
225
+ name=run_id,
226
+ config=config,
227
+ tags=None,
228
+ notes=notes,
229
+ dir=os.environ.get("WANDB_HOME", os.environ.get("TMPDIR", "/tmp")),
230
+ )
231
+ wandb.watch(model)
232
+
233
+ ##############################
234
+ ########## DATASET ###########
235
+ ##############################
236
+ # Datasets loading
237
+ train_datasets, val_datasets = {}, {}
238
+ if is_main_process():
239
+ print("Loading training datasets...")
240
+ dims = 0
241
+
242
+ for dataset in config["data"]["train_datasets"]:
243
+ assert hasattr(datasets, dataset), f"{dataset} not a custom dataset"
244
+ train_dataset: datasets.BaseDataset = getattr(datasets, dataset)
245
+ train_datasets[dataset] = train_dataset(
246
+ image_shape=image_shape,
247
+ split_file=train_dataset.train_split,
248
+ test_mode=False,
249
+ crop=crop,
250
+ augmentations_db=augmentations_db,
251
+ shape_constraints=shape_constraints,
252
+ normalize=config["data"].get("normalization", "imagenet"),
253
+ resize_method=resize_method,
254
+ mini=mini,
255
+ num_frames=config["data"].get("num_frames", 1),
256
+ fps_range=[1, 5],
257
+ num_copies=config["data"]["pair"],
258
+ )
259
+ dim = (
260
+ train_datasets[dataset].dataset._addr.numel() * 8
261
+ + train_datasets[dataset].dataset._lst.numel()
262
+ ) / (2**20)
263
+ if hasattr(train_datasets[dataset], "sequences"):
264
+ dim += (
265
+ train_datasets[dataset].sequences._addr.numel() * 8
266
+ + train_datasets[dataset].sequences._lst.numel()
267
+ ) / (2**20)
268
+ dims = dims + dim
269
+ if is_main_process():
270
+ print(f"{dataset}: {dim:.1f}MB")
271
+
272
+ print(f"All training datasets loaded, with total size: {dims:.1f}MB")
273
+
274
+ barrier()
275
+
276
+ assert batch_size % config["data"]["pair"] == 0
277
+ batch_size = batch_size // config["data"]["pair"]
278
+ assert batch_size % nsteps_accumulation_gradient == 0
279
+ batch_chunk = batch_size // nsteps_accumulation_gradient
280
+
281
+ train_dataset = ConcatDataset(
282
+ list(train_datasets.values()),
283
+ shape_constraints=shape_constraints,
284
+ )
285
+
286
+ if is_main_process():
287
+ print("Loading validation datasets...")
288
+ for dataset in config["data"]["val_datasets"]:
289
+ val_dataset: datasets.BaseDataset = getattr(datasets, dataset)
290
+ val_datasets[dataset] = val_dataset(
291
+ image_shape=image_shape,
292
+ split_file=val_dataset.test_split,
293
+ test_mode=True,
294
+ crop=crop,
295
+ shape_constraints=shape_constraints,
296
+ augmentations_db=augmentations_db,
297
+ normalize=config["data"].get("normalization", "imagenet"),
298
+ resize_method=resize_method,
299
+ num_frames=1,
300
+ mini=1.0,
301
+ num_copies=1,
302
+ )
303
+
304
+ # Dataset samplers, create distributed sampler pinned to rank
305
+ if args.distributed:
306
+ sampling = deepcopy(config["data"]["sampling"])
307
+ weights, num_samples = get_weights(train_datasets, sampling)
308
+ train_sampler = torch.utils.data.WeightedRandomSampler(
309
+ weights, num_samples, replacement=True
310
+ )
311
+ valid_samplers = {
312
+ k: DistributedSamplerNoDuplicate(
313
+ v,
314
+ num_replicas=args.world_size,
315
+ rank=args.rank,
316
+ shuffle=False,
317
+ drop_last=False,
318
+ )
319
+ for k, v in val_datasets.items()
320
+ }
321
+ else:
322
+ train_sampler = RandomSampler(train_dataset)
323
+ valid_samplers = {k: SequentialSampler(v) for k, v in val_datasets.items()}
324
+
325
+ train_sampler = torch.utils.data.BatchSampler(
326
+ train_sampler, batch_size=batch_size, drop_last=True
327
+ )
328
+
329
+ # Dataset loader
330
+ val_batch_size = 1
331
+ num_workers = int(os.environ.get("SLURM_CPUS_PER_TASK", 4))
332
+ train_loader = DataLoader(
333
+ train_dataset,
334
+ num_workers=num_workers,
335
+ sampler=train_sampler,
336
+ pin_memory=True,
337
+ collate_fn=partial(collate_fn, is_batched=True),
338
+ persistent_workers=True if num_workers else None,
339
+ )
340
+ val_loaders = {
341
+ name_dataset: DataLoader(
342
+ dataset,
343
+ batch_size=val_batch_size,
344
+ shuffle=False,
345
+ num_workers=num_workers,
346
+ sampler=valid_samplers[name_dataset],
347
+ pin_memory=True,
348
+ drop_last=False,
349
+ collate_fn=partial(collate_fn, is_batched=False),
350
+ )
351
+ for name_dataset, dataset in val_datasets.items()
352
+ }
353
+
354
+ # SCHEDULERS!
355
+ scheduler_wd = CosineScheduler(
356
+ optimizer,
357
+ key="weight_decay",
358
+ init_value=config["training"]["wd"],
359
+ base_value=config["training"]["wd"],
360
+ final_value=config["training"]["wd_final"],
361
+ warmup_iters=0,
362
+ total_iters=config["training"]["n_iters"],
363
+ flat_iters=config["training"]["warmup_iters"],
364
+ step_init=step - 1,
365
+ )
366
+ scheduler_lr = CosineScheduler(
367
+ optimizer,
368
+ key="lr",
369
+ init_value=config["training"]["lr"] * config["training"].get("lr_warmup", 1.0),
370
+ final_value=config["training"]["lr_final"],
371
+ warmup_iters=5000,
372
+ flat_iters=config["training"]["warmup_iters"],
373
+ total_iters=config["training"]["n_iters"],
374
+ step_init=step - 1,
375
+ )
376
+ scheduler_betas = CosineScheduler(
377
+ optimizer,
378
+ key="betas",
379
+ init_value=0.95 if config["training"].get("cycle_betas", True) else 0.9,
380
+ base_value=0.85 if config["training"].get("cycle_betas", True) else 0.9,
381
+ final_value=0.95 if config["training"].get("cycle_betas", True) else 0.9,
382
+ warmup_iters=config["training"]["warmup_iters"],
383
+ total_iters=config["training"]["n_iters"],
384
+ step_init=step - 1,
385
+ )
386
+
387
+ # Set loss scaler for half precision training + sanity zeroing grads
388
+ dtype = MAP_DTYPE[dtype_16bit]
389
+ if not torch.cuda.is_bf16_supported() and is_16bit:
390
+ dtype = torch.float16
391
+
392
+ context = torch.autocast(device_type="cuda", dtype=dtype, enabled=is_16bit)
393
+ # use float16 to check for instability at inference and avoid bfloat16 due to its coarseness
394
+ context_val = torch.autocast(
395
+ device_type="cuda", dtype=torch.float16, enabled=is_16bit
396
+ )
397
+ optimizer.zero_grad(set_to_none=True)
398
+
399
+ ##############################
400
+ ########## TRAINING ##########
401
+ ##############################
402
+ # Remember that if i-th layer is frozen, this will break gradient checkpointing
403
+ # in layer i+1-th. This is because CheckpointFunction treats the i+1-th input as
404
+ # without gradient, thus the i+1-th layer does not have grads (?). To solve it,
405
+ # just add requires_grad_() to the inputs coming from the frozen layer
406
+ ddp_model.train()
407
+
408
+ start = time()
409
+ n_steps = config["training"]["n_iters"]
410
+ init_steps = int(step)
411
+ track_pbar = is_shell
412
+
413
+ if is_main_process():
414
+ print("Is a shell job?", is_shell)
415
+ print("Use dtype:", dtype if is_16bit else torch.float32)
416
+ print(
417
+ f'Train for {config["training"]["n_iters"]} steps, validate every {config["training"]["validation_interval"]} steps'
418
+ )
419
+ print(f"START with {num_workers} workers")
420
+ if track_pbar:
421
+ pbar = tqdm(total=n_steps - init_steps)
422
+
423
+ scaler = torch.amp.GradScaler(
424
+ "cuda",
425
+ init_scale=2**14 if dtype_16bit == "f16" else 2**40,
426
+ enabled=is_16bit,
427
+ growth_factor=1.2,
428
+ backoff_factor=0.8,
429
+ growth_interval=500,
430
+ )
431
+ track_losses, track_grad = {}, {}
432
+ system_memory = dict(psutil.virtual_memory()._asdict())["available"] / 2**30
433
+ cpid_memory = current_process.memory_info()[0] / 2.0**30
434
+ gpu_mem = (torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]) / 2**30
435
+ while True:
436
+ for j, batches in enumerate(train_loader):
437
+ system_memory = (
438
+ 0.99 * system_memory
439
+ + 0.01 * dict(psutil.virtual_memory()._asdict())["available"] / 2**30
440
+ )
441
+ cpid_memory = (
442
+ 0.99 * cpid_memory + 0.01 * current_process.memory_info()[0] / 2.0**30
443
+ )
444
+ gpu_mem = (
445
+ 0.99 * gpu_mem
446
+ + 0.01
447
+ * (torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0])
448
+ / 2**30
449
+ )
450
+ if j % 1000 == 0 and is_main_process():
451
+ print(f"System information at step {j}")
452
+ print(f"System-wide RAM available: {system_memory:.2f}GB")
453
+ print(f"CPU utilization: {psutil.cpu_percent(interval=None)}%")
454
+ print(f"GPU memory utilized: {gpu_mem:.2f}GB")
455
+
456
+ batches["data"] = {
457
+ k: v.to(model.device, non_blocking=True)
458
+ for k, v in batches["data"].items()
459
+ }
460
+ for idx in range(nsteps_accumulation_gradient):
461
+ batch = {}
462
+ batch_slice = slice(idx * batch_chunk, (idx + 1) * batch_chunk)
463
+ batch["data"] = {k: v[batch_slice] for k, v in batches["data"].items()}
464
+ batch["img_metas"] = batches["img_metas"][batch_slice]
465
+ with (
466
+ model.no_sync()
467
+ if idx < nsteps_accumulation_gradient - 1
468
+ else nullcontext()
469
+ ):
470
+ with context:
471
+ preds, losses = model(batch["data"], batch["img_metas"])
472
+ loss = sum(losses["opt"].values())
473
+ scaler.scale(loss).backward()
474
+
475
+ losses_dict = {
476
+ k: v.detach() for loss in losses.values() for k, v in loss.items()
477
+ }
478
+ track_losses.update(
479
+ {
480
+ k: track_losses.get(k, 0.0)
481
+ + torch.nan_to_num(v, nan=1e5, posinf=1e5, neginf=1e5)
482
+ for k, v in losses_dict.items()
483
+ }
484
+ )
485
+ ddp_model.loss_history = track_losses
486
+
487
+ if clipping is not None:
488
+ scaler.unscale_(optimizer)
489
+ grad_norm = clipping_fn(ddp_model.parameters_grad(), clipping)
490
+ if torch.isfinite(grad_norm):
491
+ track_losses.update(
492
+ {"Grad_Norm": track_losses.get("Grad_Norm", 0.0) + grad_norm}
493
+ )
494
+
495
+ # there is a deeper issue, either log/sqrt of negative loss
496
+ # or the inputs create large values and destroy model weights
497
+ if is_16bit and scaler.get_scale() < 1:
498
+ raise ValueError("Scale went less than 1, ISSUE!!!")
499
+
500
+ scaler.step(optimizer)
501
+ scaler.update()
502
+
503
+ scheduler_wd.step()
504
+ scheduler_lr.step()
505
+ scheduler_betas.step()
506
+ model.module.step()
507
+ optimizer.zero_grad(set_to_none=True)
508
+ if step % EMA_INTERVAL == 0:
509
+ ema_handle.update()
510
+
511
+ if is_main_process() and track_pbar:
512
+ pbar.update(1)
513
+
514
+ step += 1
515
+
516
+ # LOGGING
517
+ if step % 100 == 0 and is_main_process():
518
+ log_num = min(10, preds["depth"].shape[0])
519
+ log_train_artifacts(
520
+ batch["data"]["image"][-log_num:, 0].float(),
521
+ (
522
+ batch["data"]["depth"][-log_num:, 0].float()
523
+ if "depth" in batch["data"]
524
+ else []
525
+ ),
526
+ preds["depth"][-log_num:, 0].detach().float(),
527
+ infos={
528
+ k: v[-log_num:, 0] for k, v in preds.get("infos", {}).items()
529
+ },
530
+ step=step,
531
+ )
532
+
533
+ if step % 50 == 0:
534
+ track_losses = {
535
+ k: v / (50 * nsteps_accumulation_gradient)
536
+ for k, v in track_losses.items()
537
+ }
538
+ # grad norm is for every step!
539
+ track_losses["Grad_Norm"] = (
540
+ track_losses["Grad_Norm"] * nsteps_accumulation_gradient
541
+ )
542
+ track_losses = aggregate_sync_losses(track_losses, device=model.device)
543
+ if is_main_process():
544
+ elapsed = int(time() - start)
545
+ eta = int(elapsed * (n_steps - step) / max(1, step - init_steps))
546
+ print(
547
+ f"Step {step}/{n_steps} [{format_seconds(elapsed)}<{format_seconds(eta)}]"
548
+ )
549
+ try:
550
+ wandb.log(
551
+ {
552
+ **{f"Train/{k}": v for k, v in track_losses.items()},
553
+ **{f"Train/lr": scheduler_lr.get()[-1]},
554
+ **{f"Train/wd": scheduler_wd.get()[-2]},
555
+ **{f"Train/scale_f16": log2(scaler.get_scale())},
556
+ },
557
+ step=step,
558
+ )
559
+ except Exception as e:
560
+ print("Not logging loss because of:", e)
561
+ if step % 100 == 0:
562
+ log_loss_dict = {
563
+ f"Train/{k}": v for k, v in track_losses.items()
564
+ }
565
+ print(
566
+ ", ".join(
567
+ [f"{k}: {v:.5f}" for k, v in log_loss_dict.items()]
568
+ )
569
+ )
570
+ track_losses = {} # reinit every 50 steps, average the current 50 steps
571
+
572
+ # Validation
573
+ is_last_step = step >= config["training"]["n_iters"]
574
+ is_validation = step % config["training"]["validation_interval"] == 0
575
+ if is_last_step or is_validation:
576
+ torch.cuda.empty_cache()
577
+ barrier()
578
+ if is_main_process():
579
+ print(f"Validation at {step}th step...")
580
+ ddp_model.eval()
581
+ start_validation = time()
582
+ with torch.no_grad(), ema_handle.average_parameters():
583
+ validate(
584
+ model,
585
+ test_loaders=val_loaders,
586
+ step=step,
587
+ run_id=run_id,
588
+ idxs=(64, 96, 224, 256), # random
589
+ context=context_val,
590
+ )
591
+
592
+ if is_main_process():
593
+ print(f"Elapsed: {format_seconds(int(time() - start_validation))}")
594
+ ddp_model.train()
595
+ torch.cuda.empty_cache()
596
+
597
+ if step >= config["training"]["n_iters"]:
598
+ if is_main_process() and track_pbar:
599
+ pbar.close()
600
+ wandb.finish(0)
601
+ dist.destroy_process_group()
602
+ return 0
603
+
604
+
605
+ if __name__ == "__main__":
606
+ if "SLURM_PROCID" in os.environ:
607
+ os.environ["TRITON_CACHE_DIR"] = "/tmp"
608
+ # Arguments
609
+ parser = argparse.ArgumentParser(
610
+ description="Training script", conflict_handler="resolve"
611
+ )
612
+ parser.add_argument("--config-file", type=str, required=True)
613
+ parser.add_argument("--master-port", type=str)
614
+ parser.add_argument("--distributed", action="store_true")
615
+ parser.add_argument("--local_rank", type=int, default=0)
616
+
617
+ args = parser.parse_args()
618
+ with open(args.config_file, "r") as f:
619
+ config = json.load(f)
620
+
621
+ deterministic = config["generic"].get("deterministic", True)
622
+ torch.backends.cudnn.deterministic = deterministic
623
+ torch.backends.cudnn.benchmark = not deterministic
624
+
625
+ torch.backends.cudnn.allow_tf32 = True
626
+ torch.backends.cuda.matmul.allow_tf32 = True
627
+ torch.set_float32_matmul_precision("high")
628
+ torch.backends.cuda.enable_mem_efficient_sdp(False)
629
+ torch.set_num_threads(1)
630
+ main_worker(config, args)
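Note: the inner loop above splits each loaded batch into `nsteps_accumulation_gradient` micro-batches and wraps every micro-batch except the last in `model.no_sync()`, so DDP all-reduces gradients only once per optimizer step. A minimal, self-contained sketch of that accumulation pattern under mixed precision (generic model, data and loss; an illustration, not the UniK3D trainer):

from contextlib import nullcontext

import torch

def accumulation_step(ddp_model, optimizer, scaler, micro_batches, loss_fn):
    # micro_batches: list of (inputs, targets) chunks of one logical batch
    n = len(micro_batches)
    for i, (x, y) in enumerate(micro_batches):
        # skip the gradient all-reduce for all but the last micro-batch
        sync_ctx = ddp_model.no_sync() if i < n - 1 else nullcontext()
        with sync_ctx:
            with torch.autocast(device_type="cuda", dtype=torch.float16):
                loss = loss_fn(ddp_model(x), y) / n  # the script above accumulates the sum instead
            scaler.scale(loss).backward()
    scaler.step(optimizer)   # unscales internally and skips the step on inf/NaN grads
    scaler.update()
    optimizer.zero_grad(set_to_none=True)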
unik3d/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .models import UniK3D
unik3d/datasets/_2d3ds.py ADDED
@@ -0,0 +1,67 @@
1
+ from typing import Any
2
+
3
+ import torch
4
+
5
+ from unik3d.datasets.pipelines import Compose, PanoCrop, PanoRoll
6
+ from unik3d.datasets.sequence_dataset import SequenceDataset
7
+
8
+
9
+ class d2D3DS(SequenceDataset):
10
+ min_depth = 0.01
11
+ max_depth = 10.0
12
+ depth_scale = 512.0
13
+ test_split = "train.txt"
14
+ train_split = "train.txt"
15
+ sequences_file = "sequences.json"
16
+ hdf5_paths = [f"2D3DS.hdf5"]
17
+
18
+ def __init__(
19
+ self,
20
+ image_shape: tuple[int, int],
21
+ split_file: str,
22
+ test_mode: bool,
23
+ normalize: bool,
24
+ augmentations_db: dict[str, Any],
25
+ resize_method: str,
26
+ mini: float = 1.0,
27
+ num_frames: int = 1,
28
+ benchmark: bool = False,
29
+ decode_fields: list[str] = ["image", "depth"],
30
+ inplace_fields: list[str] = ["cam2w", "camera_params"],
31
+ **kwargs,
32
+ ) -> None:
33
+ super().__init__(
34
+ image_shape=image_shape,
35
+ split_file=split_file,
36
+ test_mode=test_mode,
37
+ benchmark=benchmark,
38
+ normalize=normalize,
39
+ augmentations_db=augmentations_db,
40
+ resize_method=resize_method,
41
+ mini=mini,
42
+ num_frames=num_frames,
43
+ decode_fields=decode_fields,
44
+ inplace_fields=inplace_fields,
45
+ **kwargs,
46
+ )
47
+ self.resizer = Compose(
48
+ [PanoCrop(), PanoRoll(test_mode=test_mode), self.resizer]
49
+ )
50
+
51
+ def preprocess(self, results):
52
+ self.resizer.ctx = None
53
+ if self.test_mode:
54
+ for i, seq in enumerate(results["sequence_fields"]):
55
+ results[seq]["points"] = results[seq]["camera"].reconstruct(
56
+ results[seq]["depth"]
57
+ )
58
+ results[seq]["depth"] = results[seq]["points"][:, -1:]
59
+ results[seq]["gt_fields"].add("points")
60
+ return super().preprocess(results)
61
+
62
+ def pre_pipeline(self, results):
63
+ results = super().pre_pipeline(results)
64
+ results["dense"] = [True] * self.num_frames * self.num_copies
65
+ results["synthetic"] = [False] * self.num_frames * self.num_copies
66
+ results["quality"] = [1] * self.num_frames * self.num_copies
67
+ return results
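Note: at test time, `d2D3DS.preprocess` reconstructs 3D points from the stored depth with the (spherical) camera and keeps only the last channel of those points as `depth`, i.e. evaluation presumably runs on z-depth rather than on the radial distance stored in the panorama. A hedged sketch of such a distance-to-z conversion for an equirectangular image (axis conventions here are assumptions, not the repo's camera code):

import torch

def equirect_distance_to_zdepth(distance: torch.Tensor) -> torch.Tensor:
    # distance: (1, 1, H, W) radial distance along each viewing ray
    _, _, H, W = distance.shape
    lon = (torch.arange(W) + 0.5) / W * 2 * torch.pi - torch.pi   # longitude in [-pi, pi)
    lat = torch.pi / 2 - (torch.arange(H) + 0.5) / H * torch.pi   # latitude in [pi/2, -pi/2]
    lat, lon = torch.meshgrid(lat, lon, indexing="ij")
    dz = torch.cos(lat) * torch.cos(lon)   # forward (z) component of the unit ray
    return distance * dz.reshape(1, 1, H, W)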
unik3d/datasets/_4dor.py ADDED
@@ -0,0 +1,52 @@
1
+ from typing import Any
2
+
3
+ from unik3d.datasets.sequence_dataset import SequenceDataset
4
+
5
+
6
+ class d4DOR(SequenceDataset):
7
+ min_depth = 0.01
8
+ max_depth = 10.0
9
+ depth_scale = 1000.0
10
+ default_fps = 10
11
+ test_split = "train.txt"
12
+ train_split = "train.txt"
13
+ sequences_file = "sequences.json"
14
+ hdf5_paths = ["4DOR.hdf5"]
15
+
16
+ def __init__(
17
+ self,
18
+ image_shape: tuple[int, int],
19
+ split_file: str,
20
+ test_mode: bool,
21
+ normalize: bool,
22
+ augmentations_db: dict[str, Any],
23
+ resize_method: str,
24
+ mini: float = 1.0,
25
+ num_frames: int = 1,
26
+ benchmark: bool = False,
27
+ decode_fields: list[str] = ["image", "depth"],
28
+ inplace_fields: list[str] = ["camera_params", "cam2w"],
29
+ **kwargs,
30
+ ) -> None:
31
+ super().__init__(
32
+ image_shape=image_shape,
33
+ split_file=split_file,
34
+ test_mode=test_mode,
35
+ benchmark=benchmark,
36
+ normalize=normalize,
37
+ augmentations_db=augmentations_db,
38
+ resize_method=resize_method,
39
+ mini=mini,
40
+ num_frames=num_frames,
41
+ decode_fields=decode_fields,
42
+ inplace_fields=inplace_fields,
43
+ **kwargs,
44
+ )
45
+
46
+ def pre_pipeline(self, results):
47
+ results = super().pre_pipeline(results)
48
+ results["dense"] = [True] * self.num_frames * self.num_copies
49
+ results["synthetic"] = [False] * self.num_frames * self.num_copies
50
+ results["si"] = [False] * self.num_frames * self.num_copies
51
+ results["quality"] = [2] * self.num_frames * self.num_copies
52
+ return results
unik3d/datasets/__init__.py ADDED
@@ -0,0 +1,161 @@
1
+ from ._2d3ds import d2D3DS
2
+ from ._4dor import d4DOR
3
+ from .a2d2 import A2D2
4
+ from .adt import ADT
5
+ from .aimotive import aiMotive
6
+ from .argoverse import Argoverse
7
+ from .argoverse2 import Argoverse2
8
+ from .arkit import ARKit
9
+ from .ase import ASE
10
+ from .base_dataset import BaseDataset
11
+ from .bdd import BDD
12
+ from .bedlam import BEDLAM
13
+ from .behave import Behave
14
+ from .blendedmvg import BlendedMVG
15
+ from .cityscape import Cityscape
16
+ from .ddad import DDAD
17
+ from .deep360 import Deep360
18
+ from .dense import DENSE
19
+ from .diml import DIML
20
+ from .diode import DiodeIndoor, DiodeIndoor_F
21
+ from .dl3dv import DL3DV
22
+ from .driving_stereo import DrivingStereo
23
+ from .dtu_rmvd import DTURMVD
24
+ from .dummy import Dummy
25
+ from .dynamic_replica import DynReplica
26
+ from .eden import EDEN
27
+ from .eth3d import ETH3D, ETH3D_F, ETH3DRMVD
28
+ from .facedepth import FaceDepth
29
+ from .flsea import FLSea
30
+ from .futurehouse import FutureHouse
31
+ from .gibson import Gibson
32
+ from .hammer import HAMMER
33
+ from .hm3d import HM3D
34
+ from .hoi4d import HOI4D
35
+ from .hypersim import HyperSim
36
+ from .ibims import IBims, IBims_F
37
+ from .ken_burns import KenBurns
38
+ from .kitti import KITTI, KITTIRMVD, KITTIBenchmark
39
+ from .kitti360 import KITTI360
40
+ from .lyft import Lyft
41
+ from .mapillary import Mapillary
42
+ from .matrix_city import MatrixCity
43
+ from .matterport3d import Matterport3D
44
+ from .megadepth import MegaDepth
45
+ from .megadepth_s import MegaDepthS
46
+ from .midair import MidAir
47
+ from .mip import MIP
48
+ from .ms2 import MS2
49
+ from .mvimgnet import MVImgNet
50
+ from .mvsynth import MVSynth
51
+ from .nerds360 import NeRDS360
52
+ from .niantic_mapfree import NianticMapFree
53
+ from .nuscenes import Nuscenes
54
+ from .nyuv2 import NYUv2Depth
55
+ from .point_odyssey import PointOdyssey
56
+ from .proteus import Proteus
57
+ from .samplers import (DistributedSamplerNoDuplicate,
58
+ DistributedSamplerWrapper, ShardedInfiniteSampler)
59
+ from .scannet import ScanNet
60
+ from .scannetpp import ScanNetpp, ScanNetpp_F
61
+ from .sintel import Sintel
62
+ from .sunrgbd import SUNRGBD
63
+ from .synscapes import Synscapes
64
+ from .tartanair import TartanAir
65
+ from .taskonomy import Taskonomy
66
+ from .tat_rmvd import TATRMVD
67
+ from .theo import Theo
68
+ from .unrealstereo4k import UnrealStereo4K
69
+ from .urbansyn import UrbanSyn
70
+ from .utils import ConcatDataset, collate_fn, get_weights
71
+ from .vkitti import VKITTI
72
+ from .void import VOID
73
+ from .waymo import Waymo
74
+ from .wildrgbd import WildRGBD
75
+
76
+ __all__ = [
77
+ "Dummy",
78
+ "BaseDataset",
79
+ "get_weights", "DistributedSamplerNoDuplicate",
80
+ "ShardedInfiniteSampler",
81
+ "DistributedSamplerWrapper",
82
+ "ConcatDataset",
83
+ "PairDataset",
84
+ "collate_fn",
85
+ # additional, do not count
86
+ "WaymoImage",
87
+ "MegaDepth",
88
+ "COCO2017",
89
+ "ImageNet",
90
+ "OASISv2",
91
+ # image based
92
+ "Argoverse",
93
+ "DDAD",
94
+ "IBims",
95
+ "NYUv2Depth",
96
+ "DrivingStereo",
97
+ "VOID",
98
+ "Mapillary",
99
+ "ScanNet",
100
+ "Taskonomy",
101
+ "BDD",
102
+ "A2D2",
103
+ "Nuscenes",
104
+ "SUNRGBD",
105
+ "ETH3D",
106
+ "HAMMER",
107
+ "Cityscape",
108
+ "KITTI",
109
+ "DENSE",
110
+ "DIML",
111
+ "DiodeIndoor",
112
+ "FLSea",
113
+ "ARKitScenes",
114
+ "Lyft",
115
+ "HyperSim",
116
+ "KenBurns",
117
+ "HRWSI",
118
+ "UrbanSyn",
119
+ "Synscapes",
120
+ "Gibson",
121
+ "Matterport3D",
122
+ "_2D3DS",
123
+ # sequence based
124
+ "TartanAir",
125
+ "WildRGBD",
126
+ "ScanNetS",
127
+ "ScanNetpp",
128
+ "MVImgNet",
129
+ "NianticMapFree",
130
+ "DL3DV",
131
+ "PointOdyssey",
132
+ "KITTIMulti",
133
+ "Waymo",
134
+ "Argoverse2",
135
+ "UnrealStereo4K",
136
+ "MatrixCity",
137
+ "HM3D",
138
+ "MVSynth",
139
+ "EDEN",
140
+ # sequence based, but not usable for seq, only image
141
+ "BEDLAM",
142
+ "NeRDS360",
143
+ "BlendedMVG",
144
+ "DynReplica",
145
+ "ARKitS",
146
+ "Sintel",
147
+ "VKITTI",
148
+ "MegaDepthS",
149
+ # benchmarks
150
+ "KITTIBenchmark",
151
+ "ETH3DRMVD",
152
+ "DTURMVD",
153
+ "KITTIRMVD",
154
+ "TATRMVD",
155
+ "DiodeIndoor_F",
156
+ "IBims_F",
157
+ "ETH3D_F",
158
+ "KITTI360",
159
+ "ScanNetpp_F",
160
+ "ADT",
161
+ ]
unik3d/datasets/a2d2.py ADDED
@@ -0,0 +1,78 @@
1
+ import json
2
+ import os
3
+
4
+ import h5py
5
+ import numpy as np
6
+ import torch
7
+
8
+ from unik3d.datasets.image_dataset import ImageDataset
9
+ from unik3d.datasets.utils import DatasetFromList
10
+
11
+
12
+ class A2D2(ImageDataset):
13
+ min_depth = 0.01
14
+ max_depth = 120.0
15
+ depth_scale = 256.0
16
+ train_split = "train_clean.txt"
17
+ intrisics_file = "intrinsics.json"
18
+ hdf5_paths = ["a2d2.hdf5"]
19
+
20
+ def __init__(
21
+ self,
22
+ image_shape,
23
+ split_file,
24
+ test_mode,
25
+ crop=None,
26
+ benchmark=False,
27
+ augmentations_db={},
28
+ normalize=True,
29
+ resize_method="hard",
30
+ mini=1.0,
31
+ **kwargs,
32
+ ):
33
+ super().__init__(
34
+ image_shape=image_shape,
35
+ split_file=split_file,
36
+ test_mode=test_mode,
37
+ benchmark=benchmark,
38
+ normalize=normalize,
39
+ augmentations_db=augmentations_db,
40
+ resize_method=resize_method,
41
+ mini=mini,
42
+ **kwargs,
43
+ )
44
+ self.test_mode = test_mode
45
+ self.load_dataset()
46
+
47
+ def load_dataset(self):
48
+ h5file = h5py.File(
49
+ os.path.join(self.data_root, self.hdf5_paths[0]),
50
+ "r",
51
+ libver="latest",
52
+ swmr=True,
53
+ )
54
+ txt_file = np.array(h5file[self.split_file])
55
+ txt_string = txt_file.tostring().decode("ascii")[:-1] # correct the -1
56
+ intrinsics = np.array(h5file[self.intrisics_file]).tostring().decode("ascii")
57
+ intrinsics = json.loads(intrinsics)
58
+ h5file.close()
59
+ dataset = []
60
+ for line in txt_string.split("\n"):
61
+ image_filename, depth_filename = line.strip().split(" ")
62
+ intrinsics_val = torch.tensor(
63
+ intrinsics[os.path.join(*image_filename.split("/")[:2])]
64
+ ).squeeze()[:, :3]
65
+ sample = [image_filename, depth_filename, intrinsics_val]
66
+ dataset.append(sample)
67
+
68
+ if not self.test_mode:
69
+ dataset = self.chunk(dataset, chunk_dim=1, pct=self.mini)
70
+
71
+ self.dataset = DatasetFromList(dataset)
72
+ self.log_load_dataset()
73
+
74
+ def pre_pipeline(self, results):
75
+ results = super().pre_pipeline(results)
76
+ results["dense"] = [False] * self.num_copies
77
+ results["quality"] = [1] * self.num_copies
78
+ return results
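Note: A2D2 (like Argoverse, BDD, Cityscape and DDAD below) follows the same loading pattern: the split list and the intrinsics dictionary are stored as byte blobs inside the HDF5 archive and decoded at load time. A minimal sketch of that pattern, with placeholder file and dataset names:

import json
import os

import h5py
import numpy as np

def read_split_and_intrinsics(data_root, hdf5_name, split_key, intrinsics_key):
    # the split is a newline-separated "image depth" list, the intrinsics a JSON blob
    path = os.path.join(data_root, hdf5_name)
    with h5py.File(path, "r", libver="latest", swmr=True) as f:
        split_txt = np.array(f[split_key]).tobytes().decode("ascii").strip("\n")
        intrinsics = json.loads(np.array(f[intrinsics_key]).tobytes().decode("ascii"))
    samples = [line.strip().split(" ") for line in split_txt.split("\n") if line.strip()]
    return samples, intrinsics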
unik3d/datasets/adt.py ADDED
@@ -0,0 +1,68 @@
1
+ from typing import Any
2
+
3
+ import torch
4
+
5
+ from unik3d.datasets.sequence_dataset import SequenceDataset
6
+
7
+
8
+ class ADT(SequenceDataset):
9
+ min_depth = 0.01
10
+ max_depth = 20.0
11
+ depth_scale = 1000.0
12
+ test_split = "val.txt"
13
+ train_split = "train.txt"
14
+ sequences_file = "sequences.json"
15
+ hdf5_paths = [f"ADT.hdf5"]
16
+
17
+ def __init__(
18
+ self,
19
+ image_shape: tuple[int, int],
20
+ split_file: str,
21
+ test_mode: bool,
22
+ normalize: bool,
23
+ augmentations_db: dict[str, Any],
24
+ resize_method: str,
25
+ mini: float = 1.0,
26
+ num_frames: int = 1,
27
+ benchmark: bool = False,
28
+ decode_fields: list[str] = ["image", "depth"],
29
+ inplace_fields: list[str] = ["camera_params", "cam2w"],
30
+ **kwargs,
31
+ ) -> None:
32
+ super().__init__(
33
+ image_shape=image_shape,
34
+ split_file=split_file,
35
+ test_mode=test_mode,
36
+ benchmark=benchmark,
37
+ normalize=normalize,
38
+ augmentations_db=augmentations_db,
39
+ resize_method=resize_method,
40
+ mini=mini,
41
+ num_frames=num_frames,
42
+ decode_fields=decode_fields, # if not test_mode else [*decode_fields, "points"],
43
+ inplace_fields=inplace_fields,
44
+ **kwargs,
45
+ )
46
+
47
+ def preprocess(self, results):
48
+ self.resizer.ctx = None
49
+ for i, seq in enumerate(results["sequence_fields"]):
50
+ # Create a mask where the distance from the center is less than H/2
51
+ H, W = results[seq]["image"].shape[-2:]
52
+ x = torch.linspace(-W / 2 - 0.5, W / 2 + 0.5, W)
53
+ y = torch.linspace(-H / 2 - 0.5, H / 2 + 0.5, H)
54
+ xv, yv = torch.meshgrid(x, y, indexing="xy")
55
+ distance_from_center = torch.sqrt(xv**2 + yv**2).reshape(1, 1, H, W)
56
+ results[seq]["validity_mask"] = distance_from_center < (H / 2) + 20
57
+ results[seq]["depth_mask"] = results[seq]["validity_mask"].clone()
58
+ results[seq]["mask_fields"].add("depth_mask")
59
+ results[seq]["mask_fields"].add("validity_mask")
60
+
61
+ return super().preprocess(results)
62
+
63
+ def pre_pipeline(self, results):
64
+ results = super().pre_pipeline(results)
65
+ results["dense"] = [True] * self.num_frames * self.num_copies
66
+ results["synthetic"] = [True] * self.num_frames * self.num_copies
67
+ results["quality"] = [0] * self.num_frames * self.num_copies
68
+ return results
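Note: ADT (and ASE below) keeps only pixels within a circle of radius about H/2 around the image center, which suggests frames whose corners carry no valid signal. A standalone sketch of that radial mask, slightly simplified with respect to the linspace bounds used above:

import torch

def radial_validity_mask(H: int, W: int, margin: float = 20.0) -> torch.Tensor:
    # boolean (1, 1, H, W) mask, True where the pixel lies inside the circle
    x = torch.linspace(-W / 2, W / 2, W)
    y = torch.linspace(-H / 2, H / 2, H)
    xv, yv = torch.meshgrid(x, y, indexing="xy")
    radius = torch.sqrt(xv**2 + yv**2)
    return (radius < H / 2 + margin).reshape(1, 1, H, W)

mask = radial_validity_mask(480, 640)   # e.g. masks the corners of a 480x640 frame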
unik3d/datasets/aimotive.py ADDED
@@ -0,0 +1,51 @@
1
+ from typing import Any
2
+
3
+ from unik3d.datasets.sequence_dataset import SequenceDataset
4
+
5
+
6
+ class aiMotive(SequenceDataset):
7
+ min_depth = 0.01
8
+ max_depth = 100.0
9
+ depth_scale = 256.0
10
+ default_fps = 10
11
+ test_split = "train.txt"
12
+ train_split = "train.txt"
13
+ sequences_file = "sequences.json"
14
+ hdf5_paths = ["aiMotive.hdf5"]
15
+
16
+ def __init__(
17
+ self,
18
+ image_shape: tuple[int, int],
19
+ split_file: str,
20
+ test_mode: bool,
21
+ normalize: bool,
22
+ augmentations_db: dict[str, Any],
23
+ resize_method: str,
24
+ mini: float = 1.0,
25
+ num_frames: int = 1,
26
+ benchmark: bool = False,
27
+ decode_fields: list[str] = ["image", "depth"],
28
+ inplace_fields: list[str] = ["camera_params", "cam2w"],
29
+ **kwargs,
30
+ ) -> None:
31
+ super().__init__(
32
+ image_shape=image_shape,
33
+ split_file=split_file,
34
+ test_mode=test_mode,
35
+ benchmark=benchmark,
36
+ normalize=normalize,
37
+ augmentations_db=augmentations_db,
38
+ resize_method=resize_method,
39
+ mini=mini,
40
+ num_frames=num_frames,
41
+ decode_fields=decode_fields,
42
+ inplace_fields=inplace_fields,
43
+ **kwargs,
44
+ )
45
+
46
+ def pre_pipeline(self, results):
47
+ results = super().pre_pipeline(results)
48
+ results["dense"] = [False] * self.num_frames * self.num_copies
49
+ results["synthetic"] = [False] * self.num_frames * self.num_copies
50
+ results["quality"] = [2] * self.num_frames * self.num_copies
51
+ return results
unik3d/datasets/argoverse.py ADDED
@@ -0,0 +1,73 @@
1
+ import json
2
+ import os
3
+
4
+ import h5py
5
+ import numpy as np
6
+ import torch
7
+
8
+ from unik3d.datasets.image_dataset import ImageDataset
9
+ from unik3d.datasets.utils import DatasetFromList
10
+
11
+
12
+ class Argoverse(ImageDataset):
13
+ min_depth = 0.05
14
+ max_depth = 120.0
15
+ depth_scale = 256.0
16
+ test_split = "argo_val.txt"
17
+ train_split = "argo_train.txt"
18
+ intrisics_file = "argo_intrinsics.json"
19
+ hdf5_paths = ["argoverse11.hdf5"]
20
+
21
+ def __init__(
22
+ self,
23
+ image_shape,
24
+ split_file,
25
+ test_mode,
26
+ crop=None,
27
+ benchmark=False,
28
+ augmentations_db={},
29
+ normalize=True,
30
+ resize_method="hard",
31
+ mini=1.0,
32
+ **kwargs,
33
+ ):
34
+ super().__init__(
35
+ image_shape=image_shape,
36
+ split_file=split_file,
37
+ test_mode=test_mode,
38
+ benchmark=benchmark,
39
+ normalize=normalize,
40
+ augmentations_db=augmentations_db,
41
+ resize_method=resize_method,
42
+ mini=mini,
43
+ **kwargs,
44
+ )
45
+ self.test_mode = test_mode
46
+
47
+ self.crop = crop
48
+ self.load_dataset()
49
+
50
+ def load_dataset(self):
51
+ h5file = h5py.File(
52
+ os.path.join(self.data_root, self.hdf5_paths[0]),
53
+ "r",
54
+ libver="latest",
55
+ swmr=True,
56
+ )
57
+ txt_file = np.array(h5file[self.split_file])
58
+ txt_string = txt_file.tostring().decode("ascii")[:-1] # correct the -1
59
+ intrinsics = np.array(h5file[self.intrisics_file]).tostring().decode("ascii")
60
+ intrinsics = json.loads(intrinsics)
61
+ h5file.close()
62
+ dataset = []
63
+ for line in txt_string.split("\n"):
64
+ image_filename, depth_filename = line.strip().split(" ")
65
+ intrinsics_val = torch.tensor(intrinsics[image_filename]).squeeze()[:, :3]
66
+ sample = [image_filename, depth_filename, intrinsics_val]
67
+ dataset.append(sample)
68
+
69
+ if not self.test_mode:
70
+ dataset = self.chunk(dataset, chunk_dim=1, pct=self.mini)
71
+
72
+ self.dataset = DatasetFromList(dataset)
73
+ self.log_load_dataset()
unik3d/datasets/argoverse2.py ADDED
@@ -0,0 +1,49 @@
1
+ from typing import Any
2
+
3
+ from unik3d.datasets.sequence_dataset import SequenceDataset
4
+
5
+
6
+ class Argoverse2(SequenceDataset):
7
+ min_depth = 0.05
8
+ max_depth = 120.0
9
+ depth_scale = 256.0
10
+ test_split = "val.txt"
11
+ train_split = "train.txt"
12
+ sequences_file = "sequences_clean.json"
13
+ hdf5_paths = [f"AV2_viz.hdf5"]
14
+
15
+ def __init__(
16
+ self,
17
+ image_shape: tuple[int, int],
18
+ split_file: str,
19
+ test_mode: bool,
20
+ normalize: bool,
21
+ augmentations_db: dict[str, Any],
22
+ resize_method: str,
23
+ mini: float = 1.0,
24
+ num_frames: int = 1,
25
+ benchmark: bool = False,
26
+ decode_fields: list[str] = ["image", "depth"],
27
+ inplace_fields: list[str] = ["K", "cam2w"],
28
+ **kwargs,
29
+ ) -> None:
30
+ super().__init__(
31
+ image_shape=image_shape,
32
+ split_file=split_file,
33
+ test_mode=test_mode,
34
+ benchmark=benchmark,
35
+ normalize=normalize,
36
+ augmentations_db=augmentations_db,
37
+ resize_method=resize_method,
38
+ mini=mini,
39
+ num_frames=num_frames,
40
+ decode_fields=decode_fields,
41
+ inplace_fields=inplace_fields,
42
+ **kwargs,
43
+ )
44
+
45
+ def pre_pipeline(self, results):
46
+ results = super().pre_pipeline(results)
47
+ results["dense"] = [False] * self.num_frames * self.num_copies
48
+ results["quality"] = [1] * self.num_frames * self.num_copies
49
+ return results
unik3d/datasets/arkit.py ADDED
@@ -0,0 +1,49 @@
1
+ from typing import Any
2
+
3
+ from unik3d.datasets.sequence_dataset import SequenceDataset
4
+
5
+
6
+ class ARKit(SequenceDataset):
7
+ min_depth = 0.01
8
+ max_depth = 10.0
9
+ depth_scale = 1000.0
10
+ test_split = "Training.txt"
11
+ train_split = "Training.txt"
12
+ sequences_file = "sequences.json"
13
+ hdf5_paths = ["ARKitS.hdf5"]
14
+
15
+ def __init__(
16
+ self,
17
+ image_shape: tuple[int, int],
18
+ split_file: str,
19
+ test_mode: bool,
20
+ normalize: bool,
21
+ augmentations_db: dict[str, Any],
22
+ resize_method: str,
23
+ mini: float = 1.0,
24
+ num_frames: int = 1,
25
+ benchmark: bool = False,
26
+ decode_fields: list[str] = ["image", "depth"],
27
+ inplace_fields: list[str] = ["K", "cam2w"],
28
+ **kwargs,
29
+ ) -> None:
30
+ super().__init__(
31
+ image_shape=image_shape,
32
+ split_file=split_file,
33
+ test_mode=test_mode,
34
+ benchmark=benchmark,
35
+ normalize=normalize,
36
+ augmentations_db=augmentations_db,
37
+ resize_method=resize_method,
38
+ mini=mini,
39
+ num_frames=num_frames,
40
+ decode_fields=decode_fields,
41
+ inplace_fields=inplace_fields,
42
+ **kwargs,
43
+ )
44
+
45
+ def pre_pipeline(self, results):
46
+ results = super().pre_pipeline(results)
47
+ results["dense"] = [True] * self.num_frames * self.num_copies
48
+ results["quality"] = [2] * self.num_frames * self.num_copies
49
+ return results
unik3d/datasets/ase.py ADDED
@@ -0,0 +1,66 @@
1
+ from typing import Any
2
+
3
+ import torch
4
+
5
+ from unik3d.datasets.sequence_dataset import SequenceDataset
6
+
7
+
8
+ class ASE(SequenceDataset):
9
+ min_depth = 0.01
10
+ max_depth = 20.0
11
+ depth_scale = 1000.0
12
+ test_split = "val.txt"
13
+ train_split = "train.txt"
14
+ sequences_file = "sequences.json"
15
+ hdf5_paths = [f"ASE.hdf5"]
16
+
17
+ def __init__(
18
+ self,
19
+ image_shape: tuple[int, int],
20
+ split_file: str,
21
+ test_mode: bool,
22
+ normalize: bool,
23
+ augmentations_db: dict[str, Any],
24
+ resize_method: str,
25
+ mini: float = 1.0,
26
+ num_frames: int = 1,
27
+ benchmark: bool = False,
28
+ decode_fields: list[str] = ["image", "depth"],
29
+ inplace_fields: list[str] = ["camera_params", "cam2w"],
30
+ **kwargs,
31
+ ) -> None:
32
+ super().__init__(
33
+ image_shape=image_shape,
34
+ split_file=split_file,
35
+ test_mode=test_mode,
36
+ benchmark=benchmark,
37
+ normalize=normalize,
38
+ augmentations_db=augmentations_db,
39
+ resize_method=resize_method,
40
+ mini=mini,
41
+ num_frames=num_frames,
42
+ decode_fields=decode_fields,
43
+ inplace_fields=inplace_fields,
44
+ **kwargs,
45
+ )
46
+
47
+ def preprocess(self, results):
48
+ self.resizer.ctx = None
49
+ for i, seq in enumerate(results["sequence_fields"]):
50
+ # Create a mask where the distance from the center is less than H/2
51
+ H, W = results[seq]["image"].shape[-2:]
52
+ x = torch.linspace(-W / 2 - 0.5, W / 2 + 0.5, W)
53
+ y = torch.linspace(-H / 2 - 0.5, H / 2 + 0.5, H)
54
+ xv, yv = torch.meshgrid(x, y, indexing="xy")
55
+ distance_from_center = torch.sqrt(xv**2 + yv**2).reshape(1, 1, H, W)
56
+ results[seq]["validity_mask"] = distance_from_center < (H / 2) + 20
57
+ results[seq]["mask_fields"].add("validity_mask")
58
+
59
+ return super().preprocess(results)
60
+
61
+ def pre_pipeline(self, results):
62
+ results = super().pre_pipeline(results)
63
+ results["dense"] = [True] * self.num_frames * self.num_copies
64
+ results["synthetic"] = [True] * self.num_frames * self.num_copies
65
+ results["quality"] = [0] * self.num_frames * self.num_copies
66
+ return results
unik3d/datasets/base_dataset.py ADDED
@@ -0,0 +1,344 @@
1
+ import os
2
+ from abc import abstractmethod
3
+ from copy import deepcopy
4
+ from math import ceil, log
5
+ from typing import Any, Dict, Tuple
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch.utils.data import Dataset
10
+
11
+ import unik3d.datasets.pipelines as pipelines
12
+ from unik3d.utils import (eval_3d, eval_depth, identity, is_main_process,
13
+ recursive_index, sync_tensor_across_gpus)
14
+ from unik3d.utils.constants import (IMAGENET_DATASET_MEAN,
15
+ IMAGENET_DATASET_STD, OPENAI_DATASET_MEAN,
16
+ OPENAI_DATASET_STD)
17
+
18
+
19
+ class BaseDataset(Dataset):
20
+ min_depth = 0.01
21
+ max_depth = 1000.0
22
+
23
+ def __init__(
24
+ self,
25
+ image_shape: Tuple[int, int],
26
+ split_file: str,
27
+ test_mode: bool,
28
+ normalize: bool,
29
+ augmentations_db: Dict[str, Any],
30
+ shape_constraints: Dict[str, Any],
31
+ resize_method: str,
32
+ mini: float,
33
+ num_copies: int = 1,
34
+ **kwargs,
35
+ ) -> None:
36
+ super().__init__()
37
+ assert normalize in [None, "imagenet", "openai"]
38
+
39
+ self.split_file = split_file
40
+ self.test_mode = test_mode
41
+ self.data_root = os.environ["DATAROOT"]
42
+ self.image_shape = image_shape
43
+ self.resize_method = resize_method
44
+ self.mini = mini
45
+ self.num_frames = 1
46
+ self.num_copies = num_copies
47
+ self.metrics_store = {}
48
+ self.metrics_count = {}
49
+
50
+ if normalize == "imagenet":
51
+ self.normalization_stats = {
52
+ "mean": torch.tensor(IMAGENET_DATASET_MEAN),
53
+ "std": torch.tensor(IMAGENET_DATASET_STD),
54
+ }
55
+ elif normalize == "openai":
56
+ self.normalization_stats = {
57
+ "mean": torch.tensor(OPENAI_DATASET_MEAN),
58
+ "std": torch.tensor(OPENAI_DATASET_STD),
59
+ }
60
+ else:
61
+ self.normalization_stats = {
62
+ "mean": torch.tensor([0.0, 0.0, 0.0]),
63
+ "std": torch.tensor([1.0, 1.0, 1.0]),
64
+ }
65
+
66
+ for k, v in augmentations_db.items():
67
+ setattr(self, k, v)
68
+ self.shape_constraints = shape_constraints
69
+ if not self.test_mode:
70
+ self._augmentation_space()
71
+
72
+ self.masker = pipelines.AnnotationMask(
73
+ min_value=0.0,
74
+ max_value=self.max_depth if test_mode else None,
75
+ custom_fn=identity,
76
+ )
77
+ self.filler = pipelines.RandomFiller(test_mode=test_mode)
78
+
79
+ shape_mult = self.shape_constraints["shape_mult"]
80
+ self.image_shape = [
81
+ ceil(self.image_shape[0] / shape_mult) * shape_mult,
82
+ ceil(self.image_shape[1] / shape_mult) * shape_mult,
83
+ ]
84
+ self.resizer = pipelines.ContextCrop(
85
+ image_shape=self.image_shape,
86
+ train_ctx_range=(1.0 / self.random_scale, 1.0 * self.random_scale),
87
+ test_min_ctx=self.test_context,
88
+ keep_original=test_mode,
89
+ shape_constraints=self.shape_constraints,
90
+ )
91
+
92
+ self.collecter = pipelines.Collect(
93
+ keys=["image_fields", "mask_fields", "gt_fields", "camera_fields"]
94
+ )
95
+
96
+ def __len__(self):
97
+ return len(self.dataset)
98
+
99
+ def pack_batch(self, results):
100
+ results["paddings"] = [
101
+ results[x]["paddings"][0] for x in results["sequence_fields"]
102
+ ]
103
+ for fields_name in [
104
+ "image_fields",
105
+ "gt_fields",
106
+ "mask_fields",
107
+ "camera_fields",
108
+ ]:
109
+ fields = results.get(fields_name)
110
+ packed = {
111
+ field: torch.cat(
112
+ [results[seq][field] for seq in results["sequence_fields"]]
113
+ )
114
+ for field in fields
115
+ }
116
+ results.update(packed)
117
+ return results
118
+
119
+ def unpack_batch(self, results):
120
+ for fields_name in [
121
+ "image_fields",
122
+ "gt_fields",
123
+ "mask_fields",
124
+ "camera_fields",
125
+ ]:
126
+ fields = results.get(fields_name)
127
+ unpacked = {
128
+ field: {
129
+ seq: results[field][idx : idx + 1]
130
+ for idx, seq in enumerate(results["sequence_fields"])
131
+ }
132
+ for field in fields
133
+ }
134
+ results.update(unpacked)
135
+ return results
136
+
137
+ def _augmentation_space(self):
138
+ self.augmentations_dict = {
139
+ "Flip": pipelines.RandomFlip(prob=self.flip_p),
140
+ "Jitter": pipelines.RandomColorJitter(
141
+ (-self.random_jitter, self.random_jitter), prob=self.jitter_p
142
+ ),
143
+ "Gamma": pipelines.RandomGamma(
144
+ (-self.random_gamma, self.random_gamma), prob=self.gamma_p
145
+ ),
146
+ "Blur": pipelines.GaussianBlur(
147
+ kernel_size=13, sigma=(0.1, self.random_blur), prob=self.blur_p
148
+ ),
149
+ "Grayscale": pipelines.RandomGrayscale(prob=self.grayscale_p),
150
+ }
151
+
152
+ def augment(self, results):
153
+ for name, aug in self.augmentations_dict.items():
154
+ results = aug(results)
155
+ return results
156
+
157
+ def prepare_depth_eval(self, inputs, preds):
158
+ new_preds = {}
159
+ keyframe_idx = getattr(self, "keyframe_idx", None)
160
+ slice_idx = slice(
161
+ keyframe_idx, keyframe_idx + 1 if keyframe_idx is not None else None
162
+ )
163
+ new_gts = inputs["depth"][slice_idx]
164
+ new_masks = inputs["depth_mask"][slice_idx].bool()
165
+ for key, val in preds.items():
166
+ if "depth" in key:
167
+ new_preds[key] = val[slice_idx]
168
+ return new_gts, new_preds, new_masks
169
+
170
+ def prepare_points_eval(self, inputs, preds):
171
+ new_preds = {}
172
+ new_gts = inputs["points"]
173
+ new_masks = inputs["depth_mask"].bool()
174
+ if "points_mask" in inputs:
175
+ new_masks = inputs["points_mask"].bool()
176
+ for key, val in preds.items():
177
+ if "points" in key:
178
+ new_preds[key] = val
179
+ return new_gts, new_preds, new_masks
180
+
181
+ def add_points(self, inputs):
182
+ inputs["points"] = inputs.get("camera_original", inputs["camera"]).reconstruct(
183
+ inputs["depth"]
184
+ )
185
+ return inputs
186
+
187
+ @torch.autocast(device_type="cuda", enabled=False, dtype=torch.float32)
188
+ def accumulate_metrics(
189
+ self,
190
+ inputs,
191
+ preds,
192
+ keyframe_idx=None,
193
+ metrics=["depth", "points", "flow_fwd", "pairwise"],
194
+ ):
195
+ if "depth" in inputs and "points" not in inputs:
196
+ inputs = self.add_points(inputs)
197
+
198
+ available_metrics = []
199
+ for metric in metrics:
200
+ metric_in_gt = any((metric in k for k in inputs.keys()))
201
+ metric_in_pred = any((metric in k for k in preds.keys()))
202
+ if metric_in_gt and metric_in_pred:
203
+ available_metrics.append(metric)
204
+
205
+ if keyframe_idx is not None:
206
+ inputs = recursive_index(inputs, slice(keyframe_idx, keyframe_idx + 1))
207
+ preds = recursive_index(preds, slice(keyframe_idx, keyframe_idx + 1))
208
+
209
+ if "depth" in available_metrics:
210
+ depth_gt, depth_pred, depth_masks = self.prepare_depth_eval(inputs, preds)
211
+ self.accumulate_metrics_depth(depth_gt, depth_pred, depth_masks)
212
+
213
+ if "points" in available_metrics:
214
+ points_gt, points_pred, points_masks = self.prepare_points_eval(
215
+ inputs, preds
216
+ )
217
+ self.accumulate_metrics_3d(points_gt, points_pred, points_masks)
218
+
219
+ @torch.autocast(device_type="cuda", enabled=False, dtype=torch.float32)
220
+ def accumulate_metrics_depth(self, gts, preds, masks):
221
+ for eval_type, pred in preds.items():
222
+ log_name = eval_type.replace("depth", "").strip("-").strip("_")
223
+ if log_name not in self.metrics_store:
224
+ self.metrics_store[log_name] = {}
225
+ current_count = self.metrics_count.get(
226
+ log_name, torch.tensor([], device=gts.device)
227
+ )
228
+ new_count = masks.view(gts.shape[0], -1).sum(dim=-1)
229
+ self.metrics_count[log_name] = torch.cat([current_count, new_count])
230
+ for k, v in eval_depth(gts, pred, masks, max_depth=self.max_depth).items():
231
+ current_metric = self.metrics_store[log_name].get(
232
+ k, torch.tensor([], device=gts.device)
233
+ )
234
+ self.metrics_store[log_name][k] = torch.cat([current_metric, v])
235
+
236
+ @torch.autocast(device_type="cuda", enabled=False, dtype=torch.float32)
237
+ def accumulate_metrics_3d(self, gts, preds, masks):
238
+ thresholds = torch.linspace(
239
+ log(self.min_depth),
240
+ log(self.max_depth / 20),
241
+ steps=100,
242
+ device=gts.device,
243
+ ).exp()
244
+ for eval_type, pred in preds.items():
245
+ log_name = eval_type.replace("points", "").strip("-").strip("_")
246
+ if log_name not in self.metrics_store:
247
+ self.metrics_store[log_name] = {}
248
+ current_count = self.metrics_count.get(
249
+ log_name, torch.tensor([], device=gts.device)
250
+ )
251
+ new_count = masks.view(gts.shape[0], -1).sum(dim=-1)
252
+ self.metrics_count[log_name] = torch.cat([current_count, new_count])
253
+ for k, v in eval_3d(gts, pred, masks, thresholds=thresholds).items():
254
+ current_metric = self.metrics_store[log_name].get(
255
+ k, torch.tensor([], device=gts.device)
256
+ )
257
+ self.metrics_store[log_name][k] = torch.cat([current_metric, v])
258
+
259
+ def get_evaluation(self, metrics=None):
260
+ metric_vals = {}
261
+ for eval_type in metrics if metrics is not None else self.metrics_store.keys():
262
+ assert self.metrics_store[eval_type]
263
+ cnts = sync_tensor_across_gpus(self.metrics_count[eval_type])
264
+ for name, val in self.metrics_store[eval_type].items():
265
+ # vals_r = (sync_tensor_across_gpus(val) * cnts / cnts.sum()).sum()
266
+ vals_r = sync_tensor_across_gpus(val).mean()
267
+ metric_vals[f"{eval_type}_{name}".strip("_")] = np.round(
268
+ vals_r.cpu().item(), 5
269
+ )
270
+ self.metrics_store[eval_type] = {}
271
+ self.metrics_count = {}
272
+ return metric_vals
273
+
274
+ def replicate(self, results):
275
+ for i in range(1, self.num_copies):
276
+ results[(0, i)] = {k: deepcopy(v) for k, v in results[(0, 0)].items()}
277
+ results["sequence_fields"].append((0, i))
278
+ return results
279
+
280
+ def log_load_dataset(self):
281
+ if is_main_process():
282
+ info = f"Loaded {self.__class__.__name__} with {len(self)} images."
283
+ print(info)
284
+
285
+ def pre_pipeline(self, results):
286
+ results["image_fields"] = results.get("image_fields", set())
287
+ results["gt_fields"] = results.get("gt_fields", set())
288
+ results["mask_fields"] = results.get("mask_fields", set())
289
+ results["sequence_fields"] = results.get("sequence_fields", set())
290
+ results["camera_fields"] = results.get("camera_fields", set())
291
+ results["dataset_name"] = (
292
+ [self.__class__.__name__] * self.num_frames * self.num_copies
293
+ )
294
+ results["depth_scale"] = [self.depth_scale] * self.num_frames * self.num_copies
295
+ results["si"] = [False] * self.num_frames * self.num_copies
296
+ results["dense"] = [False] * self.num_frames * self.num_copies
297
+ results["synthetic"] = [False] * self.num_frames * self.num_copies
298
+ results["quality"] = [0] * self.num_frames * self.num_copies
299
+ results["valid_camera"] = [True] * self.num_frames * self.num_copies
300
+ results["valid_pose"] = [True] * self.num_frames * self.num_copies
301
+ return results
302
+
303
+ def eval_mask(self, valid_mask):
304
+ return valid_mask
305
+
306
+ def chunk(self, dataset, chunk_dim=1, pct=1.0):
307
+ subsampled_datasets = [
308
+ x
309
+ for i in range(0, len(dataset), int(1 / pct * chunk_dim))
310
+ for x in dataset[i : i + chunk_dim]
311
+ ]
312
+ return subsampled_datasets
313
+
314
+ @abstractmethod
315
+ def preprocess(self, results):
316
+ raise NotImplementedError
317
+
318
+ @abstractmethod
319
+ def postprocess(self, results):
320
+ raise NotImplementedError
321
+
322
+ @abstractmethod
323
+ def get_mapper(self):
324
+ raise NotImplementedError
325
+
326
+ @abstractmethod
327
+ def get_intrinsics(self, idx, image_name):
328
+ raise NotImplementedError
329
+
330
+ @abstractmethod
331
+ def get_extrinsics(self, idx, image_name):
332
+ raise NotImplementedError
333
+
334
+ @abstractmethod
335
+ def load_dataset(self):
336
+ raise NotImplementedError
337
+
338
+ @abstractmethod
339
+ def get_single_item(self, idx, sample=None, mapper=None):
340
+ raise NotImplementedError
341
+
342
+ @abstractmethod
343
+ def __getitem__(self, idx):
344
+ raise NotImplementedError
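Note: `BaseDataset.chunk` subsamples the sample list by keeping one run of `chunk_dim` consecutive items out of every `1 / pct * chunk_dim` items, so `pct` is the retained fraction while consecutive samples stay grouped. A small worked example of that behavior:

def chunk(dataset, chunk_dim=1, pct=1.0):
    return [
        x
        for i in range(0, len(dataset), int(1 / pct * chunk_dim))
        for x in dataset[i : i + chunk_dim]
    ]

samples = list(range(12))
print(chunk(samples, chunk_dim=1, pct=0.25))  # [0, 4, 8] -> every 4th sample
print(chunk(samples, chunk_dim=2, pct=0.5))   # [0, 1, 4, 5, 8, 9] -> pairs, half kept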
unik3d/datasets/bdd.py ADDED
@@ -0,0 +1,82 @@
1
+ import json
2
+ import os
3
+
4
+ import h5py
5
+ import numpy as np
6
+ import torch
7
+
8
+ from unik3d.datasets.image_dataset import ImageDataset
9
+ from unik3d.datasets.utils import DatasetFromList
10
+
11
+
12
+ class BDD(ImageDataset):
13
+ min_depth = 0.01
14
+ max_depth = 70.0
15
+ depth_scale = 256.0
16
+ test_split = "val.txt"
17
+ train_split = "train_clean.txt"
18
+ intrisics_file = "intrinsics.json"
19
+ hdf5_paths = ["BDD.hdf5"]
20
+
21
+ def __init__(
22
+ self,
23
+ image_shape,
24
+ split_file,
25
+ test_mode,
26
+ benchmark=False,
27
+ augmentations_db={},
28
+ normalize=True,
29
+ resize_method="hard",
30
+ mini=1.0,
31
+ **kwargs,
32
+ ):
33
+ super().__init__(
34
+ image_shape=image_shape,
35
+ split_file=split_file,
36
+ test_mode=test_mode,
37
+ benchmark=benchmark,
38
+ normalize=normalize,
39
+ augmentations_db=augmentations_db,
40
+ resize_method=resize_method,
41
+ mini=mini,
42
+ **kwargs,
43
+ )
44
+ self.test_mode = test_mode
45
+ self.load_dataset()
46
+
47
+ def load_dataset(self):
48
+ h5file = h5py.File(
49
+ os.path.join(self.data_root, self.hdf5_paths[0]),
50
+ "r",
51
+ libver="latest",
52
+ swmr=True,
53
+ )
54
+ txt_file = np.array(h5file[self.split_file])
55
+ txt_string = txt_file.tostring().decode("ascii") # [:-1] # correct the -1
56
+ intrinsics = np.array(h5file[self.intrisics_file]).tostring().decode("ascii")
57
+ intrinsics = json.loads(intrinsics)
58
+
59
+ dataset = []
60
+ for line in txt_string.split("\n"):
61
+ image_filename, depth_filename = line.strip().split(" ")
62
+ intrinsics_val = torch.tensor(
63
+ intrinsics[os.path.join(*image_filename.split("/")[:2])]
64
+ ).squeeze()[:, :3]
65
+ sample = [image_filename, depth_filename, intrinsics_val]
66
+ dataset.append(sample)
67
+ h5file.close()
68
+ if not self.test_mode:
69
+ dataset = self.chunk(dataset, chunk_dim=1, pct=self.mini)
70
+ if self.test_mode:
71
+ dataset = self.chunk(dataset, chunk_dim=1, pct=0.1)
72
+
73
+ self.dataset = DatasetFromList(dataset)
74
+ self.log_load_dataset()
75
+
76
+ def pre_pipeline(self, results):
77
+ results = super().pre_pipeline(results)
78
+ results["si"] = [True] * self.num_copies
79
+ results["valid_camera"] = [False] * self.num_copies
80
+ results["dense"] = [False] * self.num_copies
81
+ results["quality"] = [2] * self.num_copies
82
+ return results
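Note: BDD is the only image dataset here flagged with `si=True` and `valid_camera=False`: its depth can supervise only up to an unknown scale and without trusted intrinsics. A typical way to use such data is to align the prediction to the ground truth, for example by the ratio of medians, before computing the loss; a generic sketch, not the UniK3D loss:

import torch

def scale_invariant_l1(pred: torch.Tensor, gt: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    # align prediction and ground truth by the ratio of medians over valid pixels, then L1
    scale = gt[mask].median() / pred[mask].median().clamp(min=1e-6)
    return (pred[mask] * scale - gt[mask]).abs().mean()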
unik3d/datasets/bedlam.py ADDED
@@ -0,0 +1,50 @@
1
+ from typing import Any
2
+
3
+ from unik3d.datasets.sequence_dataset import SequenceDataset
4
+
5
+
6
+ class BEDLAM(SequenceDataset):
7
+ min_depth = 0.01
8
+ max_depth = 256.0
9
+ depth_scale = 1000.0
10
+ test_split = "train.txt"
11
+ train_split = "val.txt"
12
+ sequences_file = "sequences.json"
13
+ hdf5_paths = ["BEDLAM.hdf5"]
14
+
15
+ def __init__(
16
+ self,
17
+ image_shape: tuple[int, int],
18
+ split_file: str,
19
+ test_mode: bool,
20
+ normalize: bool,
21
+ augmentations_db: dict[str, Any],
22
+ resize_method: str,
23
+ mini: float = 1.0,
24
+ num_frames: int = 1,
25
+ benchmark: bool = False,
26
+ decode_fields: list[str] = ["image", "depth"],
27
+ inplace_fields: list[str] = ["K", "cam2w"],
28
+ **kwargs,
29
+ ) -> None:
30
+ super().__init__(
31
+ image_shape=image_shape,
32
+ split_file=split_file,
33
+ test_mode=test_mode,
34
+ benchmark=benchmark,
35
+ normalize=normalize,
36
+ augmentations_db=augmentations_db,
37
+ resize_method=resize_method,
38
+ mini=mini,
39
+ num_frames=num_frames,
40
+ decode_fields=decode_fields,
41
+ inplace_fields=inplace_fields,
42
+ **kwargs,
43
+ )
44
+
45
+ def pre_pipeline(self, results):
46
+ results = super().pre_pipeline(results)
47
+ results["dense"] = [True] * self.num_frames * self.num_copies
48
+ results["synthetic"] = [True] * self.num_frames * self.num_copies
49
+ results["quality"] = [0] * self.num_frames * self.num_copies
50
+ return results
unik3d/datasets/behave.py ADDED
@@ -0,0 +1,52 @@
1
+ from typing import Any
2
+
3
+ from unik3d.datasets.sequence_dataset import SequenceDataset
4
+
5
+
6
+ class Behave(SequenceDataset):
7
+ min_depth = 0.01
8
+ max_depth = 10.0
9
+ depth_scale = 1000.0
10
+ default_fps = 10
11
+ test_split = "train.txt"
12
+ train_split = "train.txt"
13
+ sequences_file = "sequences.json"
14
+ hdf5_paths = ["Behave.hdf5"]
15
+
16
+ def __init__(
17
+ self,
18
+ image_shape: tuple[int, int],
19
+ split_file: str,
20
+ test_mode: bool,
21
+ normalize: bool,
22
+ augmentations_db: dict[str, Any],
23
+ resize_method: str,
24
+ mini: float = 1.0,
25
+ num_frames: int = 1,
26
+ benchmark: bool = False,
27
+ decode_fields: list[str] = ["image", "depth"],
28
+ inplace_fields: list[str] = ["camera_params", "cam2w"],
29
+ **kwargs,
30
+ ) -> None:
31
+ super().__init__(
32
+ image_shape=image_shape,
33
+ split_file=split_file,
34
+ test_mode=test_mode,
35
+ benchmark=benchmark,
36
+ normalize=normalize,
37
+ augmentations_db=augmentations_db,
38
+ resize_method=resize_method,
39
+ mini=mini,
40
+ num_frames=num_frames,
41
+ decode_fields=decode_fields,
42
+ inplace_fields=inplace_fields,
43
+ **kwargs,
44
+ )
45
+
46
+ def pre_pipeline(self, results):
47
+ results = super().pre_pipeline(results)
48
+ results["dense"] = [True] * self.num_frames * self.num_copies
49
+ results["synthetic"] = [False] * self.num_frames * self.num_copies
50
+ results["si"] = [False] * self.num_frames * self.num_copies
51
+ results["quality"] = [1] * self.num_frames * self.num_copies
52
+ return results
unik3d/datasets/blendedmvg.py ADDED
1
+ from typing import Any
2
+
3
+ from unik3d.datasets.sequence_dataset import SequenceDataset
4
+
5
+
6
+ class BlendedMVG(SequenceDataset):
7
+ min_depth = 0.01
8
+ max_depth = 5000.0
9
+ depth_scale = 1000.0
10
+ test_split = "train.txt"
11
+ train_split = "train.txt"
12
+ sequences_file = "sequences_clean.json"
13
+ hdf5_paths = ["BlendedMVG_.hdf5"]
14
+
15
+ def __init__(
16
+ self,
17
+ image_shape: tuple[int, int],
18
+ split_file: str,
19
+ test_mode: bool,
20
+ normalize: bool,
21
+ augmentations_db: dict[str, Any],
22
+ resize_method: str,
23
+ mini: float = 1.0,
24
+ num_frames: int = 1,
25
+ benchmark: bool = False,
26
+ decode_fields: list[str] = ["image", "depth"],
27
+ inplace_fields: list[str] = ["K", "cam2w"],
28
+ **kwargs,
29
+ ) -> None:
30
+ super().__init__(
31
+ image_shape=image_shape,
32
+ split_file=split_file,
33
+ test_mode=test_mode,
34
+ benchmark=benchmark,
35
+ normalize=normalize,
36
+ augmentations_db=augmentations_db,
37
+ resize_method=resize_method,
38
+ mini=mini,
39
+ num_frames=num_frames,
40
+ decode_fields=decode_fields,
41
+ inplace_fields=inplace_fields,
42
+ **kwargs,
43
+ )
44
+
45
+ def pre_pipeline(self, results):
46
+ results = super().pre_pipeline(results)
47
+ results["dense"] = [True] * self.num_frames * self.num_copies
48
+ results["si"] = [False] * self.num_frames * self.num_copies
49
+ results["quality"] = [2] * self.num_frames * self.num_copies
50
+ return results
unik3d/datasets/cityscape.py ADDED
1
+ import json
2
+ import os
3
+
4
+ import h5py
5
+ import numpy as np
6
+ import torch
7
+
8
+ from unik3d.datasets.image_dataset import ImageDataset
9
+ from unik3d.datasets.utils import DatasetFromList
10
+
11
+
12
+ class Cityscape(ImageDataset):
13
+ min_depth = 0.05
14
+ max_depth = 80.0
15
+ depth_scale = 256.0
16
+ test_split = "val.txt"
17
+ train_split = "train.txt"
18
+ intrisics_file = "intrinsics.json"
19
+ hdf5_paths = ["cityscape.hdf5"]
20
+
21
+ def __init__(
22
+ self,
23
+ image_shape,
24
+ split_file,
25
+ test_mode,
26
+ crop=None,
27
+ benchmark=False,
28
+ augmentations_db={},
29
+ normalize=True,
30
+ resize_method="hard",
31
+ mini=1.0,
32
+ **kwargs,
33
+ ):
34
+ super().__init__(
35
+ image_shape=image_shape,
36
+ split_file=split_file,
37
+ test_mode=test_mode,
38
+ benchmark=benchmark,
39
+ normalize=normalize,
40
+ augmentations_db=augmentations_db,
41
+ resize_method=resize_method,
42
+ mini=mini,
43
+ **kwargs,
44
+ )
45
+ self.test_mode = test_mode
46
+
47
+ self.crop = crop
48
+ self.load_dataset()
49
+
50
+ def load_dataset(self):
51
+ h5file = h5py.File(
52
+ os.path.join(self.data_root, self.hdf5_paths[0]),
53
+ "r",
54
+ libver="latest",
55
+ swmr=True,
56
+ )
57
+ txt_file = np.array(h5file[self.split_file])
58
+ txt_string = txt_file.tostring().decode("ascii")[:-1] # correct the -1
59
+ intrinsics = np.array(h5file[self.intrisics_file]).tostring().decode("ascii")
60
+ intrinsics = json.loads(intrinsics)
61
+ h5file.close()
62
+ dataset = []
63
+ for line in txt_string.split("\n"):
64
+ image_filename, depth_filename = line.strip().split(" ")
65
+ intrinsics_val = torch.tensor(intrinsics[image_filename]).squeeze()[:, :3]
66
+ sample = [image_filename, depth_filename, intrinsics_val]
67
+ dataset.append(sample)
68
+
69
+ if not self.test_mode:
70
+ dataset = self.chunk(dataset, chunk_dim=1, pct=self.mini)
71
+
72
+ self.dataset = DatasetFromList(dataset)
73
+ self.log_load_dataset()
74
+
75
+ def pre_pipeline(self, results):
76
+ results = super().pre_pipeline(results)
77
+ results["quality"] = [2] * self.num_copies
78
+ return results
unik3d/datasets/ddad.py ADDED
@@ -0,0 +1,84 @@
1
+ import json
2
+ import os
3
+
4
+ import h5py
5
+ import numpy as np
6
+ import torch
7
+
8
+ from unik3d.datasets.image_dataset import ImageDataset
9
+ from unik3d.datasets.utils import DatasetFromList
10
+
11
+
12
+ class DDAD(ImageDataset):
13
+ min_depth = 0.05
14
+ max_depth = 120.0
15
+ depth_scale = 256.0
16
+ test_split = "val.txt"
17
+ train_split = "train.txt"
18
+ intrisics_file = "intrinsics.json"
19
+ hdf5_paths = [f"ddad/ddad_{i}.hdf5" for i in range(8)]
20
+
21
+ def __init__(
22
+ self,
23
+ image_shape,
24
+ split_file,
25
+ test_mode,
26
+ benchmark=False,
27
+ augmentations_db={},
28
+ normalize=True,
29
+ resize_method="hard",
30
+ mini=1.0,
31
+ **kwargs,
32
+ ):
33
+ super().__init__(
34
+ image_shape=image_shape,
35
+ split_file=split_file,
36
+ test_mode=test_mode,
37
+ benchmark=benchmark,
38
+ normalize=normalize,
39
+ augmentations_db=augmentations_db,
40
+ resize_method=resize_method,
41
+ mini=mini,
42
+ **kwargs,
43
+ )
44
+ self.test_mode = test_mode
45
+ self.load_dataset()
46
+
47
+ def load_dataset(self):
48
+ h5file = h5py.File(
49
+ os.path.join(self.data_root, self.hdf5_paths[0]),
50
+ "r",
51
+ libver="latest",
52
+ swmr=True,
53
+ )
54
+ txt_file = np.array(h5file[self.split_file])
55
+ txt_string = txt_file.tostring().decode("ascii").strip("\n")
56
+ intrinsics = np.array(h5file[self.intrisics_file]).tostring().decode("ascii")
57
+ intrinsics = json.loads(intrinsics)
58
+ h5file.close()
59
+ dataset = []
60
+ for line in txt_string.split("\n"):
61
+ image_filename, depth_filename, chunk_idx = line.strip().split(" ")
62
+ intrinsics_val = torch.tensor(intrinsics[image_filename]).squeeze()[:, :3]
63
+ sample = [image_filename, depth_filename, intrinsics_val, chunk_idx]
64
+ dataset.append(sample)
65
+
66
+ if not self.test_mode:
67
+ dataset = self.chunk(dataset, chunk_dim=1, pct=self.mini)
68
+
69
+ self.dataset = DatasetFromList(dataset)
70
+ self.log_load_dataset()
71
+
72
+ def get_mapper(self):
73
+ return {
74
+ "image_filename": 0,
75
+ "depth_filename": 1,
76
+ "K": 2,
77
+ "chunk_idx": 3,
78
+ }
79
+
80
+ def pre_pipeline(self, results):
81
+ results = super().pre_pipeline(results)
82
+ results["dense"] = [False] * self.num_copies
83
+ results["quality"] = [1] * self.num_copies
84
+ return results
unik3d/datasets/deep360.py ADDED
@@ -0,0 +1,56 @@
1
+ from typing import Any
2
+
3
+ import torch
4
+
5
+ from unik3d.datasets.pipelines import Compose, PanoCrop, PanoRoll
6
+ from unik3d.datasets.sequence_dataset import SequenceDataset
7
+
8
+
9
+ class Deep360(SequenceDataset):
10
+ min_depth = 0.1
11
+ max_depth = 1000.0
12
+ depth_scale = 1000.0
13
+ test_split = "train.txt"
14
+ train_split = "train.txt"
15
+ sequences_file = "sequences.json"
16
+ hdf5_paths = [f"Deep360.hdf5"]
17
+
18
+ def __init__(
19
+ self,
20
+ image_shape: tuple[int, int],
21
+ split_file: str,
22
+ test_mode: bool,
23
+ normalize: bool,
24
+ augmentations_db: dict[str, Any],
25
+ resize_method: str,
26
+ mini: float = 1.0,
27
+ num_frames: int = 1,
28
+ benchmark: bool = False,
29
+ decode_fields: list[str] = ["image", "depth"],
30
+ inplace_fields: list[str] = ["cam2w", "camera_params"],
31
+ **kwargs,
32
+ ) -> None:
33
+ super().__init__(
34
+ image_shape=image_shape,
35
+ split_file=split_file,
36
+ test_mode=test_mode,
37
+ benchmark=benchmark,
38
+ normalize=normalize,
39
+ augmentations_db=augmentations_db,
40
+ resize_method=resize_method,
41
+ mini=mini,
42
+ num_frames=num_frames,
43
+ decode_fields=decode_fields,
44
+ inplace_fields=inplace_fields,
45
+ **kwargs,
46
+ )
47
+ self.resizer = Compose(
48
+ [PanoCrop(), PanoRoll(test_mode=test_mode), self.resizer]
49
+ )
50
+
51
+ def pre_pipeline(self, results):
52
+ results = super().pre_pipeline(results)
53
+ results["dense"] = [True] * self.num_frames * self.num_copies
54
+ results["synthetic"] = [True] * self.num_frames * self.num_copies
55
+ results["quality"] = [0] * self.num_frames * self.num_copies
56
+ return results
unik3d/datasets/dense.py ADDED
@@ -0,0 +1,91 @@
1
+ import os
2
+
3
+ import h5py
4
+ import numpy as np
5
+ import torch
6
+
7
+ from unik3d.datasets.image_dataset import ImageDataset
8
+ from unik3d.datasets.utils import DatasetFromList
9
+
10
+
11
+ class DENSE(ImageDataset):
12
+ CAM_INTRINSIC = {
13
+ "ALL": torch.tensor(
14
+ [
15
+ [1177.8614, 0.0, 474.319027],
16
+ [0.0, 1177.8614, 224.275919],
17
+ [0.0, 0.0, 1.0],
18
+ ]
19
+ )
20
+ }
21
+ min_depth = 0.05
22
+ max_depth = 80.0
23
+ depth_scale = 255.0
24
+ test_split = "train.txt"
25
+ train_split = "train.txt"
26
+ hdf5_paths = ["DENSE.hdf5"]
27
+
28
+ def __init__(
29
+ self,
30
+ image_shape,
31
+ split_file,
32
+ test_mode,
33
+ benchmark=False,
34
+ augmentations_db={},
35
+ normalize=True,
36
+ resize_method="hard",
37
+ mini=1.0,
38
+ **kwargs,
39
+ ):
40
+ super().__init__(
41
+ image_shape=image_shape,
42
+ split_file=split_file,
43
+ test_mode=test_mode,
44
+ benchmark=benchmark,
45
+ normalize=normalize,
46
+ augmentations_db=augmentations_db,
47
+ resize_method=resize_method,
48
+ mini=mini,
49
+ **kwargs,
50
+ )
51
+ self.test_mode = test_mode
52
+
53
+ self.intrisics = {}
54
+ self.load_dataset()
55
+
56
+ def load_dataset(self):
57
+ h5file = h5py.File(
58
+ os.path.join(self.data_root, self.hdf5_paths[0]),
59
+ "r",
60
+ libver="latest",
61
+ swmr=True,
62
+ )
63
+ txt_file = np.array(h5file[self.split_file])
64
+ txt_string = txt_file.tostring().decode("ascii")[:-1] # correct the -1
65
+ h5file.close()
66
+ dataset = []
67
+ for line in txt_string.split("\n"):
68
+ image_filename, depth_filename = line.strip().split(" ")
69
+ sample = [image_filename, depth_filename]
70
+ dataset.append(sample)
71
+
72
+ if not self.test_mode:
73
+ dataset = self.chunk(dataset, chunk_dim=1, pct=self.mini)
74
+
75
+ self.dataset = DatasetFromList(dataset)
76
+ self.log_load_dataset()
77
+
78
+ def get_intrinsics(self, idx, image_name):
79
+ return self.CAM_INTRINSIC["ALL"].clone()
80
+
81
+ def get_mapper(self):
82
+ return {
83
+ "image_filename": 0,
84
+ "depth_filename": 1,
85
+ }
86
+
87
+ def pre_pipeline(self, results):
88
+ results = super().pre_pipeline(results)
89
+ results["dense"] = [False] * self.num_copies
90
+ results["quality"] = [1] * self.num_copies
91
+ return results
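Note: DENSE uses a single hard-coded pinhole intrinsics matrix for every frame. As a reminder of what those entries encode, a pixel back-projects to a viewing ray through the inverse intrinsics; at the principal point the ray is the optical axis:

import torch

K = torch.tensor([
    [1177.8614, 0.0, 474.319027],
    [0.0, 1177.8614, 224.275919],
    [0.0, 0.0, 1.0],
])

def pixel_to_ray(u: float, v: float, K: torch.Tensor) -> torch.Tensor:
    # r = K^-1 [u, v, 1]^T, an (unnormalized) direction in camera coordinates
    return torch.linalg.inv(K) @ torch.tensor([u, v, 1.0])

print(pixel_to_ray(474.319027, 224.275919, K))  # principal point -> ray along +z: [0., 0., 1.]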