omnipart committed
Commit 491eded · 0 parent(s)

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .gitattributes +44 -0
  2. .gitignore +6 -0
  3. LICENSE +22 -0
  4. NOTICE +15 -0
  5. README.md +13 -0
  6. app.py +184 -0
  7. app_utils.py +412 -0
  8. assets/example_data/Batman.png +3 -0
  9. assets/example_data/astronaut.png +3 -0
  10. assets/example_data/car.png +3 -0
  11. assets/example_data/crossbow.jpg +0 -0
  12. assets/example_data/knight.png +3 -0
  13. assets/example_data/robot.jpg +0 -0
  14. assets/example_data/robot1.jpeg +3 -0
  15. assets/example_data/robot_dog.jpg +0 -0
  16. assets/example_data/ship.jpg +0 -0
  17. assets/example_data/snake.png +3 -0
  18. assets/example_data/warhammer.png +3 -0
  19. configs/bbox_gen.yaml +34 -0
  20. modules/PartField/configs/final/correspondence_demo.yaml +44 -0
  21. modules/PartField/configs/final/demo.yaml +28 -0
  22. modules/PartField/partfield/config/__init__.py +26 -0
  23. modules/PartField/partfield/config/defaults.py +92 -0
  24. modules/PartField/partfield/model/PVCNN/conv_pointnet.py +251 -0
  25. modules/PartField/partfield/model/PVCNN/dnnlib_util.py +1074 -0
  26. modules/PartField/partfield/model/PVCNN/encoder_pc.py +243 -0
  27. modules/PartField/partfield/model/PVCNN/pc_encoder.py +90 -0
  28. modules/PartField/partfield/model/PVCNN/pv_module/__init__.py +2 -0
  29. modules/PartField/partfield/model/PVCNN/pv_module/ball_query.py +34 -0
  30. modules/PartField/partfield/model/PVCNN/pv_module/frustum.py +141 -0
  31. modules/PartField/partfield/model/PVCNN/pv_module/functional/__init__.py +1 -0
  32. modules/PartField/partfield/model/PVCNN/pv_module/functional/devoxelization.py +12 -0
  33. modules/PartField/partfield/model/PVCNN/pv_module/loss.py +10 -0
  34. modules/PartField/partfield/model/PVCNN/pv_module/pointnet.py +113 -0
  35. modules/PartField/partfield/model/PVCNN/pv_module/pvconv.py +38 -0
  36. modules/PartField/partfield/model/PVCNN/pv_module/shared_mlp.py +35 -0
  37. modules/PartField/partfield/model/PVCNN/pv_module/voxelization.py +80 -0
  38. modules/PartField/partfield/model/PVCNN/unet_3daware.py +427 -0
  39. modules/PartField/partfield/model/UNet/buildingblocks.py +546 -0
  40. modules/PartField/partfield/model/UNet/model.py +170 -0
  41. modules/PartField/partfield/model/model_utils.py +54 -0
  42. modules/PartField/partfield/model/triplane.py +331 -0
  43. modules/PartField/partfield/model_trainer_pvcnn_only_demo.py +283 -0
  44. modules/PartField/partfield/partfield_encoder.py +103 -0
  45. modules/PartField/partfield/utils.py +5 -0
  46. modules/bbox_gen/config.py +57 -0
  47. modules/bbox_gen/models/autogressive_bbox_gen.py +305 -0
  48. modules/bbox_gen/models/bbox_gen_models.py +215 -0
  49. modules/bbox_gen/models/bboxopt.py +221 -0
  50. modules/bbox_gen/models/image_encoder.py +41 -0
.gitattributes ADDED
@@ -0,0 +1,44 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/example_data/Batman.png filter=lfs diff=lfs merge=lfs -text
+ assets/example_data/astronaut.png filter=lfs diff=lfs merge=lfs -text
+ assets/example_data/car.png filter=lfs diff=lfs merge=lfs -text
+ assets/example_data/knight.png filter=lfs diff=lfs merge=lfs -text
+ assets/example_data/robot1.jpeg filter=lfs diff=lfs merge=lfs -text
+ assets/example_data/snake.png filter=lfs diff=lfs merge=lfs -text
+ assets/example_data/warhammer.png filter=lfs diff=lfs merge=lfs -text
+ modules/part_synthesis/representations/mesh/flexicubes/images/block_init.png filter=lfs diff=lfs merge=lfs -text
+ modules/part_synthesis/representations/mesh/flexicubes/images/teaser_top.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
+ __pycache__/
+ output/
+ ckpt/
+ .DS_Store
+ tmp/
+ debug_images/
LICENSE ADDED
@@ -0,0 +1,22 @@
+ # MIT License
+
+ # Copyright (c) 2025 VAST-AI-Research and contributors.
+
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE
+
NOTICE ADDED
@@ -0,0 +1,15 @@
+ OmniPart
+ Copyright (c) 2025 VAST-AI-Research and contributors
+
+ This project includes code from the following open source projects:
+
+ RMBG
+ Copyright (c) BRIA AI
+ License: bria-rmbg-2.0
+ Source: https://huggingface.co/briaai/RMBG-2.0
+
+ This software contains code derived from 🤗 Diffusers (https://github.com/huggingface/diffusers), available under the Apache License 2.0.
+
+ This software contains code derived from TRELLIS (https://github.com/Microsoft/TRELLIS), available under the MIT License.
+
+ This software contains code derived from PartPacker (https://github.com/NVlabs/PartPacker), available under the NVIDIA Source Code License.
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: OmniPart
+ emoji: 📚
+ colorFrom: green
+ colorTo: green
+ sdk: gradio
+ sdk_version: 5.35.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,184 @@
1
+ import gradio as gr
2
+ import spaces
3
+ import os
4
+ import shutil
5
+ os.environ['SPCONV_ALGO'] = 'native'
6
+ from huggingface_hub import hf_hub_download
7
+
8
+ from app_utils import (
9
+ generate_parts,
10
+ prepare_models,
11
+ process_image,
12
+ apply_merge,
13
+ DEFAULT_SIZE_TH,
14
+ TMP_ROOT,
15
+ )
16
+
17
+ EXAMPLES = [
18
+ ["assets/example_data/knight.png", 1800, "6,0,26,20,7;13,1,22,11,12,2,21,27,3,24,23;5,18;4,17;19,16,14,25,28", 42],
19
+ ["assets/example_data/car.png", 2000, "12,10,2,11;1,7", 42],
20
+ ["assets/example_data/warhammer.png", 1800, "7,1,0,8", 0],
21
+ ["assets/example_data/snake.png", 3000, "2,3;0,1;4,5,6,7", 42],
22
+ ["assets/example_data/Batman.png", 1800, "4,5", 42],
23
+ ["assets/example_data/robot1.jpeg", 1600, "0,5;10,14,3;1,12,2;13,11,4;7,15", 42],
24
+ ["assets/example_data/astronaut.png", 2000, "0,4,6;1,8,9,7;2,5", 42],
25
+ ["assets/example_data/crossbow.jpg", 2000, "2,9;10,12,0,7,11,8,13;4,3", 42],
26
+ ["assets/example_data/robot.jpg", 1600, "7,19;15,0;6,18", 42],
27
+ ["assets/example_data/robot_dog.jpg", 1000, "21,9;2,12,10,15,17;11,7;1,0;13,19;4,16", 0],
28
+ ["assets/example_data/crossbow.jpg", 1600, "9,2;10,15,13;7,14,8,11;0,12,16;5,3,1", 42],
29
+ ["assets/example_data/robot.jpg", 1800, "1,2,3,5,4,16,17;11,7,19;10,14;18,6,0,15;13,9;12,8", 0],
30
+ ["assets/example_data/robot_dog.jpg", 1000, "2,12,10,15,17,8,3,5,13,19,6,14;11,7;1,0,21,9,11;4,16", 0],
31
+ ]
32
+
33
+ HEADER = """
34
+
35
+ # OmniPart: Part-Aware 3D Generation with Semantic Decoupling and Structural Cohesion
36
+
37
+ 🔮 Generate **part-aware 3D content** from a single 2D image with **2D mask control**.
38
+
39
+ ## How to Use
40
+
41
+ **🚀 Quick Start**: Select an example below and click **"▶️ Run Example"**
42
+
43
+
44
+ **📋 Custom Image Processing**:
45
+ 1. **Upload Image** - Select your image file
46
+ 2. **Click "Segment Image"** - Get initial 2D segmentation
47
+ 3. **Merge Segments** - Enter merge groups like `0,1;3,4` and click **"Apply Merge"** (Recommend keeping **2-15 parts**)
48
+ 4. **Click "Generate 3D Model"** - Create the final 3D results
49
+ """
50
+
51
+
52
+ def start_session(req: gr.Request):
53
+ user_dir = os.path.join(TMP_ROOT, str(req.session_hash))
54
+ os.makedirs(user_dir, exist_ok=True)
55
+
56
+
57
+ def end_session(req: gr.Request):
58
+ user_dir = os.path.join(TMP_ROOT, str(req.session_hash))
59
+ shutil.rmtree(user_dir)
60
+
61
+
62
+ with gr.Blocks(title="OmniPart") as demo:
63
+ gr.Markdown(HEADER)
64
+
65
+ state = gr.State({})
66
+
67
+ with gr.Row():
68
+ with gr.Column(scale=1):
69
+ gr.Markdown("<div style='text-align: center'>\n\n## Input\n\n</div>")
70
+
71
+ input_image = gr.Image(label="Upload Image", type="filepath", height=250, width=250)
72
+
73
+ with gr.Row():
74
+ segment_btn = gr.Button("Segment Image", variant="primary", size="lg")
75
+ run_example_btn = gr.Button("▶️ Run Example", variant="secondary", size="lg")
76
+
77
+ size_threshold = gr.Slider(
78
+ minimum=600,
79
+ maximum=4000,
80
+ value=DEFAULT_SIZE_TH,
81
+ step=200,
82
+ label="Minimum Segment Size (pixels)",
83
+ info="Segments smaller than this will be ignored"
84
+ )
85
+
86
+ gr.Markdown("### Merge Controls")
87
+ merge_input = gr.Textbox(
88
+ label="Merge Groups",
89
+ placeholder="0,1;3,4",
90
+ lines=2,
91
+ info="Specify which segments to merge (e.g., '0,1;3,4' merges segments 0&1 together and 3&4 together)"
92
+ )
93
+ merge_btn = gr.Button("Apply Merge", variant="primary", size="lg")
94
+
95
+ gr.Markdown("### 3D Generation Controls")
96
+
97
+ seed_slider = gr.Slider(
98
+ minimum=0,
99
+ maximum=10000,
100
+ value=42,
101
+ step=1,
102
+ label="Generation Seed",
103
+ info="Random seed for 3D model generation"
104
+ )
105
+
106
+ cfg_slider = gr.Slider(
107
+ minimum=0.0,
108
+ maximum=15.0,
109
+ value=7.5,
110
+ step=0.5,
111
+ label="CFG Strength",
112
+ info="Classifier-Free Guidance strength"
113
+ )
114
+
115
+ generate_mesh_btn = gr.Button("Generate 3D Model", variant="secondary", size="lg")
116
+
117
+ with gr.Column(scale=2):
118
+ gr.Markdown("<div style='text-align: center'>\n\n## Results Display\n\n</div>")
119
+
120
+ with gr.Row():
121
+ initial_seg = gr.Image(label="Init Seg", height=220, width=220)
122
+ pre_merge_vis = gr.Image(label="Pre-merge", height=220, width=220)
123
+ merged_seg = gr.Image(label="Merged Seg", height=220, width=220)
124
+
125
+ with gr.Row():
126
+ bbox_mesh = gr.Model3D(label="Bounding Boxes", height=350)
127
+ whole_mesh = gr.Model3D(label="Combined Parts", height=350)
128
+ exploded_mesh = gr.Model3D(label="Exploded Parts", height=350)
129
+
130
+ with gr.Row():
131
+ combined_gs = gr.Model3D(label="Combined 3D Gaussians", clear_color=(0.0, 0.0, 0.0, 0.0), height=350)
132
+ exploded_gs = gr.Model3D(label="Exploded 3D Gaussians", clear_color=(0.0, 0.0, 0.0, 0.0), height=350)
133
+
134
+ with gr.Row():
135
+ examples = gr.Examples(
136
+ examples=EXAMPLES,
137
+ inputs=[input_image, size_threshold, merge_input, seed_slider],
138
+ cache_examples=False,
139
+ )
140
+
141
+ demo.load(start_session)
142
+ demo.unload(end_session)
143
+
144
+ segment_btn.click(
145
+ process_image,
146
+ inputs=[input_image, size_threshold],
147
+ outputs=[initial_seg, pre_merge_vis, state]
148
+ )
149
+
150
+ merge_btn.click(
151
+ apply_merge,
152
+ inputs=[merge_input, state],
153
+ outputs=[merged_seg, state]
154
+ )
155
+
156
+ generate_mesh_btn.click(
157
+ generate_parts,
158
+ inputs=[state, seed_slider, cfg_slider],
159
+ outputs=[bbox_mesh, whole_mesh, exploded_mesh, combined_gs, exploded_gs]
160
+ )
161
+
162
+ run_example_btn.click(
163
+ fn=process_image,
164
+ inputs=[input_image, size_threshold],
165
+ outputs=[initial_seg, pre_merge_vis, state]
166
+ ).then(
167
+ fn=apply_merge,
168
+ inputs=[merge_input, state],
169
+ outputs=[merged_seg, state]
170
+ ).then(
171
+ fn=generate_parts,
172
+ inputs=[state, seed_slider, cfg_slider],
173
+ outputs=[bbox_mesh, whole_mesh, exploded_mesh, combined_gs, exploded_gs]
174
+ )
175
+
176
+ if __name__ == "__main__":
177
+ os.makedirs("ckpt", exist_ok=True)
178
+ sam_ckpt_path = hf_hub_download(repo_id="omnipart/OmniPart_modules", filename="sam_vit_h_4b8939.pth", local_dir="ckpt")
179
+ partfield_ckpt_path = hf_hub_download(repo_id="omnipart/OmniPart_modules", filename="partfield_encoder.ckpt", local_dir="ckpt")
180
+ bbox_gen_ckpt_path = hf_hub_download(repo_id="omnipart/OmniPart_modules", filename="bbox_gen.ckpt", local_dir="ckpt")
181
+
182
+ prepare_models(sam_ckpt_path, partfield_ckpt_path, bbox_gen_ckpt_path)
183
+
184
+ demo.launch()
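For reference, the merge string used in the "Merge Groups" textbox and in the third column of `EXAMPLES` (e.g. `"0,1;3,4"`) separates groups with `;` and segment IDs inside a group with `,`. A minimal sketch of that mapping, mirroring the parsing loop in `apply_merge` from `app_utils.py` (the helper name is illustrative and not part of the repo):

```python
def parse_merge_groups(merge_input: str) -> list:
    """Illustrative helper: '0,1;3,4' -> [[0, 1], [3, 4]]."""
    groups = []
    for group_set in merge_input.split(";"):
        ids = [int(x) for x in group_set.split(",") if x.strip()]
        if ids:
            groups.append(ids)
    return groups


assert parse_merge_groups("0,1;3,4") == [[0, 1], [3, 4]]
```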
app_utils.py ADDED
@@ -0,0 +1,412 @@
1
+ import gradio as gr
2
+ import spaces
3
+ import os
4
+ import numpy as np
5
+ import trimesh
6
+ import time
7
+ import traceback
8
+ import torch
9
+ from PIL import Image
10
+ import cv2
11
+ import shutil
12
+ from segment_anything import SamAutomaticMaskGenerator, build_sam
13
+ from omegaconf import OmegaConf
14
+
15
+ from modules.bbox_gen.models.autogressive_bbox_gen import BboxGen
16
+ from modules.part_synthesis.process_utils import save_parts_outputs
17
+ from modules.inference_utils import load_img_mask, prepare_bbox_gen_input, prepare_part_synthesis_input, gen_mesh_from_bounds, vis_voxel_coords, merge_parts
18
+ from modules.part_synthesis.pipelines import OmniPartImageTo3DPipeline
19
+ from modules.label_2d_mask.visualizer import Visualizer
20
+ from transformers import AutoModelForImageSegmentation
21
+
22
+ from modules.label_2d_mask.label_parts import (
23
+ prepare_image,
24
+ get_sam_mask,
25
+ get_mask,
26
+ clean_segment_edges,
27
+ resize_and_pad_to_square,
28
+ size_th as DEFAULT_SIZE_TH
29
+ )
30
+
31
+ # Constants
32
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
33
+ DTYPE = torch.float16
34
+ MAX_SEED = np.iinfo(np.int32).max
35
+ TMP_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp")
36
+ os.makedirs(TMP_ROOT, exist_ok=True)
37
+
38
+ sam_mask_generator = None
39
+ rmbg_model = None
40
+ bbox_gen_model = None
41
+ part_synthesis_pipeline = None
42
+
43
+ size_th = DEFAULT_SIZE_TH
44
+
45
+
46
+ def prepare_models(sam_ckpt_path, partfield_ckpt_path, bbox_gen_ckpt_path):
47
+ global sam_mask_generator, rmbg_model, bbox_gen_model, part_synthesis_pipeline
48
+ if sam_mask_generator is None:
49
+ print("Loading SAM model...")
50
+ sam_model = build_sam(checkpoint=sam_ckpt_path).to(device=DEVICE)
51
+ sam_mask_generator = SamAutomaticMaskGenerator(sam_model)
52
+
53
+ if rmbg_model is None:
54
+ print("Loading BriaRMBG 2.0 model...")
55
+ rmbg_model = AutoModelForImageSegmentation.from_pretrained('briaai/RMBG-2.0', trust_remote_code=True)
56
+ rmbg_model.to(DEVICE)
57
+ rmbg_model.eval()
58
+
59
+ if part_synthesis_pipeline is None:
60
+ print("Loading PartSynthesis model...")
61
+ part_synthesis_pipeline = OmniPartImageTo3DPipeline.from_pretrained('omnipart/OmniPart')
62
+ part_synthesis_pipeline.to(DEVICE)
63
+
64
+ if bbox_gen_model is None:
65
+ print("Loading BboxGen model...")
66
+ bbox_gen_config = OmegaConf.load("configs/bbox_gen.yaml").model.args
67
+ bbox_gen_config.partfield_encoder_path = partfield_ckpt_path
68
+ bbox_gen_model = BboxGen(bbox_gen_config)
69
+ bbox_gen_model.load_state_dict(torch.load(bbox_gen_ckpt_path), strict=False)
70
+ bbox_gen_model.to(DEVICE)
71
+ bbox_gen_model.eval().half()
72
+
73
+ print("Models ready")
74
+
75
+
76
+ @spaces.GPU
77
+ def process_image(image_path, threshold, req: gr.Request):
78
+ """Process image and generate initial segmentation"""
79
+ global size_th
80
+
81
+ user_dir = os.path.join(TMP_ROOT, str(req.session_hash))
82
+ os.makedirs(user_dir, exist_ok=True)
83
+
84
+ img_name = os.path.basename(image_path).split(".")[0]
85
+
86
+ size_th = threshold
87
+
88
+ img = Image.open(image_path).convert("RGB")
89
+ processed_image = prepare_image(img, rmbg_net=rmbg_model.to(DEVICE))
90
+
91
+ processed_image = resize_and_pad_to_square(processed_image)
92
+ white_bg = Image.new("RGBA", processed_image.size, (255, 255, 255, 255))
93
+ white_bg_img = Image.alpha_composite(white_bg, processed_image.convert("RGBA"))
94
+ image = np.array(white_bg_img.convert('RGB'))
95
+
96
+ rgba_path = os.path.join(user_dir, f"{img_name}_processed.png")
97
+ processed_image.save(rgba_path)
98
+
99
+ print("Generating raw SAM masks without post-processing...")
100
+ raw_masks = sam_mask_generator.generate(image)
101
+
102
+ raw_sam_vis = np.copy(image)
103
+ raw_sam_vis = np.ones_like(image) * 255
104
+
105
+ sorted_masks = sorted(raw_masks, key=lambda x: x["area"], reverse=True)
106
+
107
+ for i, mask_data in enumerate(sorted_masks):
108
+ if mask_data["area"] < size_th:
109
+ continue
110
+
111
+ color_r = (i * 50 + 80) % 256
112
+ color_g = (i * 120 + 40) % 256
113
+ color_b = (i * 180 + 20) % 256
114
+ color = np.array([color_r, color_g, color_b])
115
+
116
+ mask = mask_data["segmentation"]
117
+ raw_sam_vis[mask] = color
118
+
119
+ visual = Visualizer(image)
120
+
121
+ group_ids, pre_merge_im = get_sam_mask(
122
+ image,
123
+ sam_mask_generator,
124
+ visual,
125
+ merge_groups=None,
126
+ rgba_image=processed_image,
127
+ img_name=img_name,
128
+ save_dir=user_dir,
129
+ size_threshold=size_th
130
+ )
131
+
132
+ pre_merge_path = os.path.join(user_dir, f"{img_name}_mask_pre_merge.png")
133
+ Image.fromarray(pre_merge_im).save(pre_merge_path)
134
+ pre_split_vis = np.ones_like(image) * 255
135
+
136
+ unique_ids = np.unique(group_ids)
137
+ unique_ids = unique_ids[unique_ids >= 0]
138
+
139
+ for i, unique_id in enumerate(unique_ids):
140
+ color_r = (i * 50 + 80) % 256
141
+ color_g = (i * 120 + 40) % 256
142
+ color_b = (i * 180 + 20) % 256
143
+ color = np.array([color_r, color_g, color_b])
144
+
145
+ mask = (group_ids == unique_id)
146
+ pre_split_vis[mask] = color
147
+
148
+ y_indices, x_indices = np.where(mask)
149
+ if len(y_indices) > 0:
150
+ center_y = int(np.mean(y_indices))
151
+ center_x = int(np.mean(x_indices))
152
+ cv2.putText(pre_split_vis, str(unique_id),
153
+ (center_x, center_y), cv2.FONT_HERSHEY_SIMPLEX,
154
+ 0.5, (0, 0, 0), 1, cv2.LINE_AA)
155
+
156
+ pre_split_path = os.path.join(user_dir, f"{img_name}_pre_split.png")
157
+ Image.fromarray(pre_split_vis).save(pre_split_path)
158
+ print(f"Pre-split segmentation (before disconnected parts handling) saved to {pre_split_path}")
159
+
160
+ get_mask(group_ids, image, ids=2, img_name=img_name, save_dir=user_dir)
161
+
162
+ init_seg_path = os.path.join(user_dir, f"{img_name}_mask_segments_2.png")
163
+
164
+ seg_img = Image.open(init_seg_path)
165
+ if seg_img.mode == 'RGBA':
166
+ white_bg = Image.new('RGBA', seg_img.size, (255, 255, 255, 255))
167
+ seg_img = Image.alpha_composite(white_bg, seg_img)
168
+ seg_img.save(init_seg_path)
169
+
170
+ state = {
171
+ "image": image.tolist(),
172
+ "processed_image": rgba_path,
173
+ "group_ids": group_ids.tolist() if isinstance(group_ids, np.ndarray) else group_ids,
174
+ "original_group_ids": group_ids.tolist() if isinstance(group_ids, np.ndarray) else group_ids,
175
+ "img_name": img_name,
176
+ "pre_split_path": pre_split_path,
177
+ }
178
+
179
+ return init_seg_path, pre_merge_path, state
180
+
181
+
182
+ def apply_merge(merge_input, state, req: gr.Request):
183
+ """Apply merge parameters and generate merged segmentation"""
184
+ global sam_mask_generator
185
+
186
+ if not state:
187
+ return None, state
188
+
189
+ user_dir = os.path.join(TMP_ROOT, str(req.session_hash))
190
+
191
+ # Convert back from list to numpy array
192
+ image = np.array(state["image"])
193
+ # Use original group IDs instead of the most recent ones
194
+ group_ids = np.array(state["original_group_ids"])
195
+ img_name = state["img_name"]
196
+
197
+ # Load processed image from path
198
+ processed_image = Image.open(state["processed_image"])
199
+
200
+ # Display the original IDs before merging, SORTED for easier reading
201
+ unique_ids = np.unique(group_ids)
202
+ unique_ids = unique_ids[unique_ids >= 0] # Exclude background
203
+ print(f"Original segment IDs (used for merging): {sorted(unique_ids.tolist())}")
204
+
205
+ # Parse merge groups
206
+ merge_groups = None
207
+ try:
208
+ if merge_input:
209
+ merge_groups = []
210
+ group_sets = merge_input.split(';')
211
+ for group_set in group_sets:
212
+ ids = [int(x) for x in group_set.split(',')]
213
+ if ids:
214
+ # Validate if these IDs exist in the segmentation
215
+ existing_ids = [id for id in ids if id in unique_ids]
216
+ missing_ids = [id for id in ids if id not in unique_ids]
217
+
218
+ if missing_ids:
219
+ print(f"Warning: These IDs don't exist in the segmentation: {missing_ids}")
220
+
221
+ # Only add group if it has valid IDs
222
+ if existing_ids:
223
+ merge_groups.append(ids)
224
+ print(f"Valid merge group: {ids} (missing: {missing_ids if missing_ids else 'none'})")
225
+ else:
226
+ print(f"Skipping merge group with no valid IDs: {ids}")
227
+
228
+ print(f"Using merge groups: {merge_groups}")
229
+ except Exception as e:
230
+ print(f"Error parsing merge groups: {e}")
231
+ return None, state
232
+
233
+ # Initialize visualizer
234
+ visual = Visualizer(image)
235
+
236
+ # Generate merged segmentation starting from original IDs
237
+ # Add skip_split=True to prevent splitting after merging
238
+ new_group_ids, merged_im = get_sam_mask(
239
+ image,
240
+ sam_mask_generator,
241
+ visual,
242
+ merge_groups=merge_groups,
243
+ existing_group_ids=group_ids,
244
+ rgba_image=processed_image,
245
+ skip_split=True,
246
+ img_name=img_name,
247
+ save_dir=user_dir,
248
+ size_threshold=size_th
249
+ )
250
+
251
+ # Display the new IDs after merging for future reference
252
+ new_unique_ids = np.unique(new_group_ids)
253
+ new_unique_ids = new_unique_ids[new_unique_ids >= 0] # Exclude background
254
+ print(f"New segment IDs (after merging): {new_unique_ids.tolist()}")
255
+
256
+ # Clean edges
257
+ new_group_ids = clean_segment_edges(new_group_ids)
258
+
259
+ # Save merged segmentation visualization
260
+ get_mask(new_group_ids, image, ids=3, img_name=img_name, save_dir=user_dir)
261
+
262
+ # Path to merged segmentation
263
+ merged_seg_path = os.path.join(user_dir, f"{img_name}_mask_segments_3.png")
264
+
265
+ save_mask = new_group_ids + 1
266
+ save_mask = save_mask.reshape(518, 518, 1).repeat(3, axis=-1)
267
+ cv2.imwrite(os.path.join(user_dir, f"{img_name}_mask.exr"), save_mask.astype(np.float32))
268
+
269
+ # Update state with the new group IDs but keep original IDs unchanged
270
+ state["group_ids"] = new_group_ids.tolist() if isinstance(new_group_ids, np.ndarray) else new_group_ids
271
+ state["save_mask_path"] = os.path.join(user_dir, f"{img_name}_mask.exr")
272
+
273
+ return merged_seg_path, state
274
+
275
+
276
+ def explode_mesh(mesh, explosion_scale=0.4):
277
+
278
+ if isinstance(mesh, trimesh.Scene):
279
+ scene = mesh
280
+ elif isinstance(mesh, trimesh.Trimesh):
281
+ print("Warning: Single mesh provided, can't create exploded view")
282
+ scene = trimesh.Scene(mesh)
283
+ return scene
284
+ else:
285
+ print(f"Warning: Unexpected mesh type: {type(mesh)}")
286
+ scene = mesh
287
+
288
+ if len(scene.geometry) <= 1:
289
+ print("Only one geometry found - nothing to explode")
290
+ return scene
291
+
292
+ print(f"[EXPLODE_MESH] Starting mesh explosion with scale {explosion_scale}")
293
+ print(f"[EXPLODE_MESH] Processing {len(scene.geometry)} parts")
294
+
295
+ exploded_scene = trimesh.Scene()
296
+
297
+ part_centers = []
298
+ geometry_names = []
299
+
300
+ for geometry_name, geometry in scene.geometry.items():
301
+ if hasattr(geometry, 'vertices'):
302
+ transform = scene.graph[geometry_name][0]
303
+ vertices_global = trimesh.transformations.transform_points(
304
+ geometry.vertices, transform)
305
+ center = np.mean(vertices_global, axis=0)
306
+ part_centers.append(center)
307
+ geometry_names.append(geometry_name)
308
+ print(f"[EXPLODE_MESH] Part {geometry_name}: center = {center}")
309
+
310
+ if not part_centers:
311
+ print("No valid geometries with vertices found")
312
+ return scene
313
+
314
+ part_centers = np.array(part_centers)
315
+ global_center = np.mean(part_centers, axis=0)
316
+
317
+ print(f"[EXPLODE_MESH] Global center: {global_center}")
318
+
319
+ for i, (geometry_name, geometry) in enumerate(scene.geometry.items()):
320
+ if hasattr(geometry, 'vertices'):
321
+ if i < len(part_centers):
322
+ part_center = part_centers[i]
323
+ direction = part_center - global_center
324
+
325
+ direction_norm = np.linalg.norm(direction)
326
+ if direction_norm > 1e-6:
327
+ direction = direction / direction_norm
328
+ else:
329
+ direction = np.random.randn(3)
330
+ direction = direction / np.linalg.norm(direction)
331
+
332
+ offset = direction * explosion_scale
333
+ else:
334
+ offset = np.zeros(3)
335
+
336
+ original_transform = scene.graph[geometry_name][0].copy()
337
+
338
+ new_transform = original_transform.copy()
339
+ new_transform[:3, 3] = new_transform[:3, 3] + offset
340
+
341
+ exploded_scene.add_geometry(
342
+ geometry,
343
+ transform=new_transform,
344
+ geom_name=geometry_name
345
+ )
346
+
347
+ print(f"[EXPLODE_MESH] Part {geometry_name}: moved by {np.linalg.norm(offset):.4f}")
348
+
349
+ print("[EXPLODE_MESH] Mesh explosion complete")
350
+ return exploded_scene
351
+
352
+ @spaces.GPU(duration=90)
353
+ def generate_parts(state, seed, cfg_strength, req: gr.Request):
354
+ explode_factor=0.3
355
+ img_path = state["processed_image"]
356
+ mask_path = state["save_mask_path"]
357
+ user_dir = os.path.join(TMP_ROOT, str(req.session_hash))
358
+ img_white_bg, img_black_bg, ordered_mask_input, img_mask_vis = load_img_mask(img_path, mask_path)
359
+ img_mask_vis.save(os.path.join(user_dir, "img_mask_vis.png"))
360
+
361
+ voxel_coords = part_synthesis_pipeline.get_coords(img_black_bg, num_samples=1, seed=seed, sparse_structure_sampler_params={"steps": 25, "cfg_strength": 7.5})
362
+ voxel_coords = voxel_coords.cpu().numpy()
363
+ np.save(os.path.join(user_dir, "voxel_coords.npy"), voxel_coords)
364
+ voxel_coords_ply = vis_voxel_coords(voxel_coords)
365
+ voxel_coords_ply.export(os.path.join(user_dir, "voxel_coords_vis.ply"))
366
+ print("[INFO] Voxel coordinates saved")
367
+
368
+ bbox_gen_input = prepare_bbox_gen_input(os.path.join(user_dir, "voxel_coords.npy"), img_white_bg, ordered_mask_input)
369
+ bbox_gen_output = bbox_gen_model.generate(bbox_gen_input)
370
+ np.save(os.path.join(user_dir, "bboxes.npy"), bbox_gen_output['bboxes'][0])
371
+ bboxes_vis = gen_mesh_from_bounds(bbox_gen_output['bboxes'][0])
372
+ bboxes_vis.export(os.path.join(user_dir, "bboxes_vis.glb"))
373
+ print("[INFO] BboxGen output saved")
374
+
375
+
376
+ part_synthesis_input = prepare_part_synthesis_input(os.path.join(user_dir, "voxel_coords.npy"), os.path.join(user_dir, "bboxes.npy"), ordered_mask_input)
377
+
378
+ torch.cuda.empty_cache()
379
+
380
+ part_synthesis_output = part_synthesis_pipeline.get_slat(
381
+ img_black_bg,
382
+ part_synthesis_input['coords'],
383
+ [part_synthesis_input['part_layouts']],
384
+ part_synthesis_input['masks'],
385
+ seed=seed,
386
+ slat_sampler_params={"steps": 25, "cfg_strength": cfg_strength},
387
+ formats=['mesh', 'gaussian'],
388
+ preprocess_image=False,
389
+ )
390
+ save_parts_outputs(
391
+ part_synthesis_output,
392
+ output_dir=user_dir,
393
+ simplify_ratio=0.0,
394
+ save_video=False,
395
+ save_glb=True,
396
+ textured=False,
397
+ )
398
+ merge_parts(user_dir)
399
+ print("[INFO] PartSynthesis output saved")
400
+
401
+ bbox_mesh_path = os.path.join(user_dir, "bboxes_vis.glb")
402
+ whole_mesh_path = os.path.join(user_dir, "mesh_segment.glb")
403
+
404
+ combined_mesh = trimesh.load(whole_mesh_path)
405
+ exploded_mesh_result = explode_mesh(combined_mesh, explosion_scale=explode_factor)
406
+ exploded_mesh_result.export(os.path.join(user_dir, "exploded_parts.glb"))
407
+
408
+ exploded_mesh_path = os.path.join(user_dir, "exploded_parts.glb")
409
+ combined_gs_path = os.path.join(user_dir, "merged_gs.ply")
410
+ exploded_gs_path = os.path.join(user_dir, "exploded_gs.ply")
411
+
412
+ return bbox_mesh_path, whole_mesh_path, exploded_mesh_path, combined_gs_path, exploded_gs_path
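`explode_mesh` above is a pure geometric post-process: every sub-mesh of the GLB scene is translated away from the average part center. A hedged usage sketch outside Gradio (the input path is a placeholder for the `mesh_segment.glb` that `generate_parts` writes into the session's temporary directory):

```python
import trimesh

from app_utils import explode_mesh

# Placeholder path: generate_parts() writes mesh_segment.glb under tmp/<session_hash>/.
scene = trimesh.load("tmp/<session_hash>/mesh_segment.glb")
exploded = explode_mesh(scene, explosion_scale=0.5)  # a larger scale pushes parts further apart
exploded.export("exploded_parts_0.5.glb")
```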
assets/example_data/Batman.png ADDED
Git LFS Details
  • SHA256: 9a9a80321c27ee38899bbc2bb4f346d449422898f3dc3214dba4dcd6e5cf6397
  • Pointer size: 131 Bytes
  • Size of remote file: 510 kB
assets/example_data/astronaut.png ADDED
Git LFS Details
  • SHA256: 49712b3a29aa24862e8a4d3c1c69326459585ef9f7aa15ff8c2b2d90101f3784
  • Pointer size: 131 Bytes
  • Size of remote file: 252 kB
assets/example_data/car.png ADDED
Git LFS Details
  • SHA256: 82239d215901c12d12ddaa5fbb5b6c3f928e3a6fec19bc1a2b26a4aa084d482d
  • Pointer size: 133 Bytes
  • Size of remote file: 10.1 MB
assets/example_data/crossbow.jpg ADDED
assets/example_data/knight.png ADDED
Git LFS Details
  • SHA256: 291db3fca9c1d63b91609d28352b3f6fbc1e9f143f7783b70dc9ec35a911d77c
  • Pointer size: 131 Bytes
  • Size of remote file: 604 kB
assets/example_data/robot.jpg ADDED
assets/example_data/robot1.jpeg ADDED
Git LFS Details
  • SHA256: 7131acb0e194caf8bac6bee72d668def184a18df14848ce731380b96486e996b
  • Pointer size: 131 Bytes
  • Size of remote file: 133 kB
assets/example_data/robot_dog.jpg ADDED
assets/example_data/ship.jpg ADDED
assets/example_data/snake.png ADDED
Git LFS Details
  • SHA256: fa4ec58625fed4dd0e5b65e323333c89ecace02fbe4161327d4e06e0ea4b678a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.41 MB
assets/example_data/warhammer.png ADDED
Git LFS Details
  • SHA256: bc63bda34774288092d069808b7cb28c9544dd253cfbcfb33a98b22c9ec19537
  • Pointer size: 131 Bytes
  • Size of remote file: 163 kB
configs/bbox_gen.yaml ADDED
@@ -0,0 +1,34 @@
+ model:
+   name: bbox_gen
+   args:
+     encoder_dim_feat: 448
+     encoder_dim: 64
+     encoder_heads: 4
+     encoder_token_num: 2048
+     encoder_qkv_bias: false
+     encoder_use_ln_post: true
+     encoder_use_checkpoint: true
+     encoder_num_embed_freqs: 8
+     encoder_embed_include_pi: false
+     encoder_init_scale: 0.25
+     encoder_random_fps: true
+     encoder_learnable_query: true
+     encoder_layers: 8
+
+     max_group_size: 50
+
+     vocab_size: 67
+     decoder_hidden_size: 1024
+     decoder_num_hidden_layers: 24
+     decoder_ffn_dim: 4096
+     decoder_heads: 16
+     decoder_use_flash_attention: true
+     decoder_gradient_checkpointing: false
+
+     bins: 64
+     BOS_id: 64
+     EOS_id: 65
+     PAD_id: 66
+     max_length: 2187
+     voxel_token_length: 1886
+     voxel_token_placeholder: -1
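This YAML is consumed by `prepare_models()` in `app_utils.py`: the `model.args` node is loaded with OmegaConf, the PartField encoder path is injected at runtime, and the result is handed to `BboxGen`. A condensed sketch of that flow (the checkpoint paths are placeholders for the files downloaded in `app.py`):

```python
import torch
from omegaconf import OmegaConf

from modules.bbox_gen.models.autogressive_bbox_gen import BboxGen

cfg = OmegaConf.load("configs/bbox_gen.yaml").model.args
cfg.partfield_encoder_path = "ckpt/partfield_encoder.ckpt"  # injected at runtime, not stored in the YAML
bbox_gen_model = BboxGen(cfg)
bbox_gen_model.load_state_dict(torch.load("ckpt/bbox_gen.ckpt"), strict=False)
bbox_gen_model.eval().half()
```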
modules/PartField/configs/final/correspondence_demo.yaml ADDED
@@ -0,0 +1,44 @@
+ result_name: partfield_features/correspondence_demo
+
+ continue_ckpt: model/model.ckpt
+
+ triplane_channels_low: 128
+ triplane_channels_high: 512
+ triplane_resolution: 128
+
+ vertex_feature: True
+ n_point_per_face: 1000
+ n_sample_each: 10000
+ is_pc: True
+ remesh_demo: False
+ correspondence_demo: True
+
+ preprocess_mesh: True
+
+ dataset:
+   type: "Mix"
+   data_path: data/DenseCorr3D
+   train_batch_size: 1
+   val_batch_size: 1
+   train_num_workers: 8
+   all_files:
+     # pairs of examples to run correspondence
+     - animals/071b8_toy_animals_017/simple_mesh.obj
+     - animals/bdfd0_toy_animals_016/simple_mesh.obj
+     - animals/2d6b3_toy_animals_009/simple_mesh.obj
+     - animals/96615_toy_animals_018/simple_mesh.obj
+     - chairs/063d1_chair_006/simple_mesh.obj
+     - chairs/bea57_chair_012/simple_mesh.obj
+     - chairs/fe0fe_chair_004/simple_mesh.obj
+     - chairs/288dc_chair_011/simple_mesh.obj
+     # consider decimating animals/../color_mesh.obj yourself for better mesh topology than the provided simple_mesh.obj
+     # (e.g. <50k vertices for functional map efficiency).
+
+ loss:
+   triplet: 1.0
+
+ use_2d_feat: False
+ pvcnn:
+   point_encoder_type: 'pvcnn'
+   z_triplane_channels: 256
+   z_triplane_resolution: 128
modules/PartField/configs/final/demo.yaml ADDED
@@ -0,0 +1,28 @@
+ result_name: demo_test
+
+ continue_ckpt: model/model.ckpt
+
+ triplane_channels_low: 128
+ triplane_channels_high: 512
+ triplane_resolution: 128
+
+ n_point_per_face: 1000
+ n_sample_each: 10000
+ is_pc: True
+ remesh_demo: False
+
+ dataset:
+   type: "Mix"
+   data_path: "objaverse_data"
+   train_batch_size: 1
+   val_batch_size: 1
+   train_num_workers: 8
+
+ loss:
+   triplet: 1.0
+
+ use_2d_feat: False
+ pvcnn:
+   point_encoder_type: 'pvcnn'
+   z_triplane_channels: 256
+   z_triplane_resolution: 128
modules/PartField/partfield/config/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import argparse
+ import os.path as osp
+ from datetime import datetime
+ import pytz
+
+ def default_argument_parser(add_help=True, default_config_file=""):
+     parser = argparse.ArgumentParser(add_help=add_help)
+     parser.add_argument("--config-file", '-c', default=default_config_file, metavar="FILE", help="path to config file")
+     parser.add_argument(
+         "--opts",
+         help="Modify config options using the command-line",
+         default=None,
+         nargs=argparse.REMAINDER,
+     )
+     return parser
+
+ def setup(args, freeze=True):
+     from .defaults import _C as cfg
+     cfg = cfg.clone()
+     cfg.merge_from_file(args.config_file)
+     cfg.merge_from_list(args.opts)
+     dt = datetime.now(pytz.timezone('America/Los_Angeles')).strftime('%y%m%d-%H%M%S')
+     cfg.output_dir = osp.join(cfg.output_dir, cfg.name, dt)
+     if freeze:
+         cfg.freeze()
+     return cfg
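A hedged usage sketch for these helpers, assuming it is run from `modules/PartField/` so that the `partfield` package is importable; the `--opts` override shown is only an example:

```python
from partfield.config import default_argument_parser, setup

# Parse a config file plus key/value overrides (yacs merge_from_list style).
args = default_argument_parser(default_config_file="configs/final/demo.yaml").parse_args(
    ["--opts", "dataset.val_batch_size", "1"]
)
cfg = setup(args, freeze=True)  # merges YAML and overrides, stamps output_dir with a timestamp
print(cfg.result_name, cfg.output_dir)
```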
modules/PartField/partfield/config/defaults.py ADDED
@@ -0,0 +1,92 @@
1
+ from yacs.config import CfgNode as CN
2
+
3
+ _C = CN()
4
+ _C.seed = 0
5
+ _C.output_dir = "results"
6
+ _C.result_name = "test_all"
7
+
8
+ _C.triplet_sampling = "random"
9
+ _C.load_original_mesh = False
10
+
11
+ _C.num_pos = 64
12
+ _C.num_neg_random = 256
13
+ _C.num_neg_hard_pc = 128
14
+ _C.num_neg_hard_emb = 128
15
+
16
+ _C.vertex_feature = False # if true, sample feature on vertices; if false, sample feature on faces
17
+ _C.n_point_per_face = 2000
18
+ _C.n_sample_each = 10000
19
+ _C.preprocess_mesh = False
20
+
21
+ _C.regress_2d_feat = False
22
+
23
+ _C.is_pc = False
24
+
25
+ _C.cut_manifold = False
26
+ _C.remesh_demo = False
27
+ _C.correspondence_demo = False
28
+
29
+ _C.save_every_epoch = 10
30
+ _C.training_epochs = 30
31
+ _C.continue_training = False
32
+
33
+ _C.continue_ckpt = None
34
+ _C.epoch_selected = "epoch=50.ckpt"
35
+
36
+ _C.triplane_resolution = 128
37
+ _C.triplane_channels_low = 128
38
+ _C.triplane_channels_high = 512
39
+ _C.lr = 1e-3
40
+ _C.train = True
41
+ _C.test = False
42
+
43
+ _C.inference_save_pred_sdf_to_mesh=True
44
+ _C.inference_save_feat_pca=True
45
+ _C.name = "test"
46
+ _C.test_subset = False
47
+ _C.test_corres = False
48
+ _C.test_partobjaversetiny = False
49
+
50
+ _C.dataset = CN()
51
+ _C.dataset.type = "Demo_Dataset"
52
+ _C.dataset.data_path = "objaverse_data/"
53
+ _C.dataset.train_num_workers = 64
54
+ _C.dataset.val_num_workers = 32
55
+ _C.dataset.train_batch_size = 2
56
+ _C.dataset.val_batch_size = 2
57
+ _C.dataset.all_files = [] # only used for correspondence demo
58
+
59
+ _C.voxel2triplane = CN()
60
+ _C.voxel2triplane.transformer_dim = 1024
61
+ _C.voxel2triplane.transformer_layers = 6
62
+ _C.voxel2triplane.transformer_heads = 8
63
+ _C.voxel2triplane.triplane_low_res = 32
64
+ _C.voxel2triplane.triplane_high_res = 256
65
+ _C.voxel2triplane.triplane_dim = 64
66
+ _C.voxel2triplane.normalize_vox_feat = False
67
+
68
+
69
+ _C.loss = CN()
70
+ _C.loss.triplet = 0.0
71
+ _C.loss.sdf = 1.0
72
+ _C.loss.feat = 10.0
73
+ _C.loss.l1 = 0.0
74
+
75
+ _C.use_pvcnn = False
76
+ _C.use_pvcnnonly = True
77
+
78
+ _C.pvcnn = CN()
79
+ _C.pvcnn.point_encoder_type = 'pvcnn'
80
+ _C.pvcnn.use_point_scatter = True
81
+ _C.pvcnn.z_triplane_channels = 64
82
+ _C.pvcnn.z_triplane_resolution = 256
83
+ _C.pvcnn.unet_cfg = CN()
84
+ _C.pvcnn.unet_cfg.depth = 3
85
+ _C.pvcnn.unet_cfg.enabled = True
86
+ _C.pvcnn.unet_cfg.rolled = True
87
+ _C.pvcnn.unet_cfg.use_3d_aware = True
88
+ _C.pvcnn.unet_cfg.start_hidden_channels = 32
89
+ _C.pvcnn.unet_cfg.use_initial_conv = False
90
+
91
+ _C.use_2d_feat = False
92
+ _C.inference_metrics_only = False
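The defaults above form a yacs `CfgNode`, so nested options are plain attribute accesses. A small sketch of cloning and overriding them programmatically, under the same import-root assumption as the previous sketch (the printed values are the defaults defined above):

```python
from partfield.config.defaults import _C

cfg = _C.clone()                 # work on a copy, not the module-level defaults
cfg.triplane_resolution = 256    # example override before freezing
cfg.freeze()

print(cfg.pvcnn.point_encoder_type)      # 'pvcnn'
print(cfg.pvcnn.unet_cfg.use_3d_aware)   # True
```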
modules/PartField/partfield/model/PVCNN/conv_pointnet.py ADDED
@@ -0,0 +1,251 @@
1
+ """
2
+ Taken from gensdf
3
+ https://github.com/princeton-computational-imaging/gensdf
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ # from dnnlib.util import printarr
10
+ try:
11
+ from torch_scatter import scatter_mean, scatter_max
12
+ except:
13
+ pass
14
+ # from .unet import UNet
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.nn.functional as F
18
+
19
+
20
+ # Resnet Blocks
21
+ class ResnetBlockFC(nn.Module):
22
+ ''' Fully connected ResNet Block class.
23
+ Args:
24
+ size_in (int): input dimension
25
+ size_out (int): output dimension
26
+ size_h (int): hidden dimension
27
+ '''
28
+
29
+ def __init__(self, size_in, size_out=None, size_h=None):
30
+ super().__init__()
31
+ # Attributes
32
+ if size_out is None:
33
+ size_out = size_in
34
+
35
+ if size_h is None:
36
+ size_h = min(size_in, size_out)
37
+
38
+ self.size_in = size_in
39
+ self.size_h = size_h
40
+ self.size_out = size_out
41
+ # Submodules
42
+ self.fc_0 = nn.Linear(size_in, size_h)
43
+ self.fc_1 = nn.Linear(size_h, size_out)
44
+ self.actvn = nn.ReLU()
45
+
46
+ if size_in == size_out:
47
+ self.shortcut = None
48
+ else:
49
+ self.shortcut = nn.Linear(size_in, size_out, bias=False)
50
+ # Initialization
51
+ nn.init.zeros_(self.fc_1.weight)
52
+
53
+ def forward(self, x):
54
+ net = self.fc_0(self.actvn(x))
55
+ dx = self.fc_1(self.actvn(net))
56
+
57
+ if self.shortcut is not None:
58
+ x_s = self.shortcut(x)
59
+ else:
60
+ x_s = x
61
+
62
+ return x_s + dx
63
+
64
+
65
+ class ConvPointnet(nn.Module):
66
+ ''' PointNet-based encoder network with ResNet blocks for each point.
67
+ The number of input points is fixed.
68
+
69
+ Args:
70
+ c_dim (int): dimension of latent code c
71
+ dim (int): input points dimension
72
+ hidden_dim (int): hidden dimension of the network
73
+ scatter_type (str): feature aggregation when doing local pooling
74
+ unet (bool): whether to use U-Net
75
+ unet_kwargs (str): U-Net parameters
76
+ plane_resolution (int): defined resolution for plane feature
77
+ plane_type (str): feature type, 'xz' - 1-plane, ['xz', 'xy', 'yz'] - 3-plane, ['grid'] - 3D grid volume
78
+ padding (float): conventional padding parameter of ONet for unit cube, so [-0.5, 0.5] -> [-0.55, 0.55]
79
+ n_blocks (int): number of blocks ResNetBlockFC layers
80
+ '''
81
+
82
+ def __init__(self, c_dim=128, dim=3, hidden_dim=128, scatter_type='max',
83
+ # unet=False, unet_kwargs=None,
84
+ plane_resolution=None, plane_type=['xz', 'xy', 'yz'], padding=0.1, n_blocks=5):
85
+ super().__init__()
86
+ self.c_dim = c_dim
87
+
88
+ self.fc_pos = nn.Linear(dim, 2*hidden_dim)
89
+ self.blocks = nn.ModuleList([
90
+ ResnetBlockFC(2*hidden_dim, hidden_dim) for i in range(n_blocks)
91
+ ])
92
+ self.fc_c = nn.Linear(hidden_dim, c_dim)
93
+
94
+ self.actvn = nn.ReLU()
95
+ self.hidden_dim = hidden_dim
96
+
97
+ # if unet:
98
+ # self.unet = UNet(c_dim, in_channels=c_dim, **unet_kwargs)
99
+ # else:
100
+ # self.unet = None
101
+
102
+ self.reso_plane = plane_resolution
103
+ self.plane_type = plane_type
104
+ self.padding = padding
105
+
106
+ if scatter_type == 'max':
107
+ self.scatter = scatter_max
108
+ elif scatter_type == 'mean':
109
+ self.scatter = scatter_mean
110
+
111
+
112
+ # takes in "p": point cloud and "query": sdf_xyz
113
+ # sample plane features for unlabeled_query as well
114
+ def forward(self, p):#, query2):
115
+ batch_size, T, D = p.size()
116
+
117
+ # acquire the index for each point
118
+ coord = {}
119
+ index = {}
120
+ if 'xz' in self.plane_type:
121
+ coord['xz'] = self.normalize_coordinate(p.clone(), plane='xz', padding=self.padding)
122
+ index['xz'] = self.coordinate2index(coord['xz'], self.reso_plane)
123
+ if 'xy' in self.plane_type:
124
+ coord['xy'] = self.normalize_coordinate(p.clone(), plane='xy', padding=self.padding)
125
+ index['xy'] = self.coordinate2index(coord['xy'], self.reso_plane)
126
+ if 'yz' in self.plane_type:
127
+ coord['yz'] = self.normalize_coordinate(p.clone(), plane='yz', padding=self.padding)
128
+ index['yz'] = self.coordinate2index(coord['yz'], self.reso_plane)
129
+
130
+
131
+ net = self.fc_pos(p)
132
+
133
+ net = self.blocks[0](net)
134
+ for block in self.blocks[1:]:
135
+ pooled = self.pool_local(coord, index, net)
136
+ net = torch.cat([net, pooled], dim=2)
137
+ net = block(net)
138
+
139
+ c = self.fc_c(net)
140
+
141
+ fea = {}
142
+ plane_feat_sum = 0
143
+ #second_sum = 0
144
+ if 'xz' in self.plane_type:
145
+ fea['xz'] = self.generate_plane_features(p, c, plane='xz') # shape: batch, latent size, resolution, resolution (e.g. 16, 256, 64, 64)
146
+ # plane_feat_sum += self.sample_plane_feature(query, fea['xz'], 'xz')
147
+ #second_sum += self.sample_plane_feature(query2, fea['xz'], 'xz')
148
+ if 'xy' in self.plane_type:
149
+ fea['xy'] = self.generate_plane_features(p, c, plane='xy')
150
+ # plane_feat_sum += self.sample_plane_feature(query, fea['xy'], 'xy')
151
+ #second_sum += self.sample_plane_feature(query2, fea['xy'], 'xy')
152
+ if 'yz' in self.plane_type:
153
+ fea['yz'] = self.generate_plane_features(p, c, plane='yz')
154
+ # plane_feat_sum += self.sample_plane_feature(query, fea['yz'], 'yz')
155
+ #second_sum += self.sample_plane_feature(query2, fea['yz'], 'yz')
156
+ return fea
157
+
158
+ # return plane_feat_sum.transpose(2,1)#, second_sum.transpose(2,1)
159
+
160
+
161
+ def normalize_coordinate(self, p, padding=0.1, plane='xz'):
162
+ ''' Normalize coordinate to [0, 1] for unit cube experiments
163
+
164
+ Args:
165
+ p (tensor): point
166
+ padding (float): conventional padding parameter of ONet for unit cube, so [-0.5, 0.5] -> [-0.55, 0.55]
167
+ plane (str): plane feature type, ['xz', 'xy', 'yz']
168
+ '''
169
+ if plane == 'xz':
170
+ xy = p[:, :, [0, 2]]
171
+ elif plane =='xy':
172
+ xy = p[:, :, [0, 1]]
173
+ else:
174
+ xy = p[:, :, [1, 2]]
175
+
176
+ xy_new = xy / (1 + padding + 10e-6) # (-0.5, 0.5)
177
+ xy_new = xy_new + 0.5 # range (0, 1)
178
+
179
+ # if there are outliers out of the range
180
+ if xy_new.max() >= 1:
181
+ xy_new[xy_new >= 1] = 1 - 10e-6
182
+ if xy_new.min() < 0:
183
+ xy_new[xy_new < 0] = 0.0
184
+ return xy_new
185
+
186
+
187
+ def coordinate2index(self, x, reso):
188
+ ''' Normalize coordinate to [0, 1] for unit cube experiments.
189
+ Corresponds to our 3D model
190
+
191
+ Args:
192
+ x (tensor): coordinate
193
+ reso (int): defined resolution
194
+ coord_type (str): coordinate type
195
+ '''
196
+ x = (x * reso).long()
197
+ index = x[:, :, 0] + reso * x[:, :, 1]
198
+ index = index[:, None, :]
199
+ return index
200
+
201
+
202
+ # xy is the normalized coordinates of the point cloud of each plane
203
+ # I'm pretty sure the keys of xy are the same as those of index, so xy isn't needed here as input
204
+ def pool_local(self, xy, index, c):
205
+ bs, fea_dim = c.size(0), c.size(2)
206
+ keys = xy.keys()
207
+
208
+ c_out = 0
209
+ for key in keys:
210
+ # scatter plane features from points
211
+ fea = self.scatter(c.permute(0, 2, 1), index[key], dim_size=self.reso_plane**2)
212
+ if self.scatter == scatter_max:
213
+ fea = fea[0]
214
+ # gather feature back to points
215
+ fea = fea.gather(dim=2, index=index[key].expand(-1, fea_dim, -1))
216
+ c_out += fea
217
+ return c_out.permute(0, 2, 1)
218
+
219
+
220
+ def generate_plane_features(self, p, c, plane='xz'):
221
+ # acquire indices of features in plane
222
+ xy = self.normalize_coordinate(p.clone(), plane=plane, padding=self.padding) # normalize to the range of (0, 1)
223
+ index = self.coordinate2index(xy, self.reso_plane)
224
+
225
+ # scatter plane features from points
226
+ fea_plane = c.new_zeros(p.size(0), self.c_dim, self.reso_plane**2)
227
+ c = c.permute(0, 2, 1) # B x 512 x T
228
+ fea_plane = scatter_mean(c, index, out=fea_plane) # B x 512 x reso^2
229
+ fea_plane = fea_plane.reshape(p.size(0), self.c_dim, self.reso_plane, self.reso_plane) # sparse matrix (B x 512 x reso x reso)
230
+
231
+ # printarr(fea_plane, c, p, xy, index)
232
+ # import pdb; pdb.set_trace()
233
+
234
+ # process the plane features with UNet
235
+ # if self.unet is not None:
236
+ # fea_plane = self.unet(fea_plane)
237
+
238
+ return fea_plane
239
+
240
+
241
+ # sample_plane_feature function copied from /src/conv_onet/models/decoder.py
242
+ # uses values from plane_feature and pixel locations from vgrid to interpolate feature
243
+ def sample_plane_feature(self, query, plane_feature, plane):
244
+ xy = self.normalize_coordinate(query.clone(), plane=plane, padding=self.padding)
245
+ xy = xy[:, :, None].float()
246
+ vgrid = 2.0 * xy - 1.0 # normalize to (-1, 1)
247
+ sampled_feat = F.grid_sample(plane_feature, vgrid, padding_mode='border', align_corners=True, mode='bilinear').squeeze(-1)
248
+ return sampled_feat
249
+
250
+
251
+
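A minimal forward-pass sketch for `ConvPointnet`. It assumes `torch_scatter` is installed (the guarded import above fails silently, but `pool_local` and `generate_plane_features` do need it) and that the repository root is on `PYTHONPATH`; the output shape follows `generate_plane_features`:

```python
import torch

from modules.PartField.partfield.model.PVCNN.conv_pointnet import ConvPointnet

encoder = ConvPointnet(c_dim=128, dim=3, hidden_dim=128, plane_resolution=64)
points = torch.rand(2, 2048, 3) - 0.5   # B x T x 3 point cloud roughly inside the unit cube
planes = encoder(points)                # dict of triplane features: 'xz', 'xy', 'yz'
print(planes["xz"].shape)               # torch.Size([2, 128, 64, 64])
```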
modules/PartField/partfield/model/PVCNN/dnnlib_util.py ADDED
@@ -0,0 +1,1074 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
8
+
9
+ """Miscellaneous utility classes and functions."""
10
+ from collections import namedtuple
11
+ import time
12
+ import ctypes
13
+ import fnmatch
14
+ import importlib
15
+ import inspect
16
+ import numpy as np
17
+ import json
18
+ import os
19
+ import shutil
20
+ import sys
21
+ import types
22
+ import io
23
+ import pickle
24
+ import re
25
+ # import requests
26
+ import html
27
+ import hashlib
28
+ import glob
29
+ import tempfile
30
+ import urllib
31
+ import urllib.request
32
+ import uuid
33
+ import boto3
34
+ import threading
35
+ from contextlib import ContextDecorator
36
+ from contextlib import contextmanager, nullcontext
37
+
38
+ from distutils.util import strtobool
39
+ from typing import Any, List, Tuple, Union
40
+ import importlib
41
+ from loguru import logger
42
+ # import wandb
43
+ import torch
44
+ import psutil
45
+ import subprocess
46
+
47
+ import random
48
+ import string
49
+ import pdb
50
+
51
+ # Util classes
52
+ # ------------------------------------------------------------------------------------------
53
+
54
+
55
+ class EasyDict(dict):
56
+ """Convenience class that behaves like a dict but allows access with the attribute syntax."""
57
+
58
+ def __getattr__(self, name: str) -> Any:
59
+ try:
60
+ return self[name]
61
+ except KeyError:
62
+ raise AttributeError(name)
63
+
64
+ def __setattr__(self, name: str, value: Any) -> None:
65
+ self[name] = value
66
+
67
+ def __delattr__(self, name: str) -> None:
68
+ del self[name]
69
+
70
+
71
+ class Logger(object):
72
+ """Redirect stderr to stdout, optionally print stdout to a file, and optionally force flushing on both stdout and the file."""
73
+
74
+ def __init__(self, file_name: str = None, file_mode: str = "w", should_flush: bool = True):
75
+ self.file = None
76
+
77
+ if file_name is not None:
78
+ self.file = open(file_name, file_mode)
79
+
80
+ self.should_flush = should_flush
81
+ self.stdout = sys.stdout
82
+ self.stderr = sys.stderr
83
+
84
+ sys.stdout = self
85
+ sys.stderr = self
86
+
87
+ def __enter__(self) -> "Logger":
88
+ return self
89
+
90
+ def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
91
+ self.close()
92
+
93
+ def write(self, text: Union[str, bytes]) -> None:
94
+ """Write text to stdout (and a file) and optionally flush."""
95
+ if isinstance(text, bytes):
96
+ text = text.decode()
97
+ if len(text) == 0: # workaround for a bug in VSCode debugger: sys.stdout.write(''); sys.stdout.flush() => crash
98
+ return
99
+
100
+ if self.file is not None:
101
+ self.file.write(text)
102
+
103
+ self.stdout.write(text)
104
+
105
+ if self.should_flush:
106
+ self.flush()
107
+
108
+ def flush(self) -> None:
109
+ """Flush written text to both stdout and a file, if open."""
110
+ if self.file is not None:
111
+ self.file.flush()
112
+
113
+ self.stdout.flush()
114
+
115
+ def close(self) -> None:
116
+ """Flush, close possible files, and remove stdout/stderr mirroring."""
117
+ self.flush()
118
+
119
+ # if using multiple loggers, prevent closing in wrong order
120
+ if sys.stdout is self:
121
+ sys.stdout = self.stdout
122
+ if sys.stderr is self:
123
+ sys.stderr = self.stderr
124
+
125
+ if self.file is not None:
126
+ self.file.close()
127
+ self.file = None
128
+
129
+
130
+ # Cache directories
131
+ # ------------------------------------------------------------------------------------------
132
+
133
+ _dnnlib_cache_dir = None
134
+
135
+
136
+ def set_cache_dir(path: str) -> None:
137
+ global _dnnlib_cache_dir
138
+ _dnnlib_cache_dir = path
139
+
140
+
141
+ def make_cache_dir_path(*paths: str) -> str:
142
+ if _dnnlib_cache_dir is not None:
143
+ return os.path.join(_dnnlib_cache_dir, *paths)
144
+ if 'DNNLIB_CACHE_DIR' in os.environ:
145
+ return os.path.join(os.environ['DNNLIB_CACHE_DIR'], *paths)
146
+ if 'HOME' in os.environ:
147
+ return os.path.join(os.environ['HOME'], '.cache', 'dnnlib', *paths)
148
+ if 'USERPROFILE' in os.environ:
149
+ return os.path.join(os.environ['USERPROFILE'], '.cache', 'dnnlib', *paths)
150
+ return os.path.join(tempfile.gettempdir(), '.cache', 'dnnlib', *paths)
151
+
152
+
153
+ # Small util functions
154
+ # ------------------------------------------------------------------------------------------
155
+
156
+
157
+ def format_time(seconds: Union[int, float]) -> str:
158
+ """Convert the seconds to human readable string with days, hours, minutes and seconds."""
159
+ s = int(np.rint(seconds))
160
+
161
+ if s < 60:
162
+ return "{0}s".format(s)
163
+ elif s < 60 * 60:
164
+ return "{0}m {1:02}s".format(s // 60, s % 60)
165
+ elif s < 24 * 60 * 60:
166
+ return "{0}h {1:02}m {2:02}s".format(s // (60 * 60), (s // 60) % 60, s % 60)
167
+ else:
168
+ return "{0}d {1:02}h {2:02}m".format(s // (24 * 60 * 60), (s // (60 * 60)) % 24, (s // 60) % 60)
169
+
170
+
171
+ def format_time_brief(seconds: Union[int, float]) -> str:
172
+ """Convert seconds to a brief human-readable string with days, hours, minutes and seconds."""
173
+ s = int(np.rint(seconds))
174
+
175
+ if s < 60:
176
+ return "{0}s".format(s)
177
+ elif s < 60 * 60:
178
+ return "{0}m {1:02}s".format(s // 60, s % 60)
179
+ elif s < 24 * 60 * 60:
180
+ return "{0}h {1:02}m".format(s // (60 * 60), (s // 60) % 60)
181
+ else:
182
+ return "{0}d {1:02}h".format(s // (24 * 60 * 60), (s // (60 * 60)) % 24)
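A few hand-checked examples of the two formatters above:

assert format_time(75) == "1m 15s"
assert format_time(3 * 60 * 60 + 5) == "3h 00m 05s"
assert format_time_brief(90000) == "1d 01h"  # smaller units are dropped for long durations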
183
+
184
+
185
+ def ask_yes_no(question: str) -> bool:
186
+ """Ask the user the question until the user inputs a valid answer."""
187
+ while True:
188
+ try:
189
+ print("{0} [y/n]".format(question))
190
+ return strtobool(input().lower())
191
+ except ValueError:
192
+ pass
193
+
194
+
195
+ def tuple_product(t: Tuple) -> Any:
196
+ """Calculate the product of the tuple elements."""
197
+ result = 1
198
+
199
+ for v in t:
200
+ result *= v
201
+
202
+ return result
203
+
204
+
205
+ _str_to_ctype = {
206
+ "uint8": ctypes.c_ubyte,
207
+ "uint16": ctypes.c_uint16,
208
+ "uint32": ctypes.c_uint32,
209
+ "uint64": ctypes.c_uint64,
210
+ "int8": ctypes.c_byte,
211
+ "int16": ctypes.c_int16,
212
+ "int32": ctypes.c_int32,
213
+ "int64": ctypes.c_int64,
214
+ "float32": ctypes.c_float,
215
+ "float64": ctypes.c_double
216
+ }
217
+
218
+
219
+ def get_dtype_and_ctype(type_obj: Any) -> Tuple[np.dtype, Any]:
220
+ """Given a type name string (or an object having a __name__ attribute), return matching Numpy and ctypes types that have the same size in bytes."""
221
+ type_str = None
222
+
223
+ if isinstance(type_obj, str):
224
+ type_str = type_obj
225
+ elif hasattr(type_obj, "__name__"):
226
+ type_str = type_obj.__name__
227
+ elif hasattr(type_obj, "name"):
228
+ type_str = type_obj.name
229
+ else:
230
+ raise RuntimeError("Cannot infer type name from input")
231
+
232
+ assert type_str in _str_to_ctype.keys()
233
+
234
+ my_dtype = np.dtype(type_str)
235
+ my_ctype = _str_to_ctype[type_str]
236
+
237
+ assert my_dtype.itemsize == ctypes.sizeof(my_ctype)
238
+
239
+ return my_dtype, my_ctype
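Example of get_dtype_and_ctype: a type-name string and a numpy scalar type resolve to the same (numpy dtype, ctypes type) pair of equal byte size.

import ctypes
import numpy as np

dtype, ctype = get_dtype_and_ctype("float32")
assert dtype == np.dtype("float32") and ctype is ctypes.c_float
dtype, ctype = get_dtype_and_ctype(np.float32)  # resolved via its __name__
assert dtype.itemsize == ctypes.sizeof(ctype) == 4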
240
+
241
+
242
+ def is_pickleable(obj: Any) -> bool:
243
+ try:
244
+ with io.BytesIO() as stream:
245
+ pickle.dump(obj, stream)
246
+ return True
247
+ except:
248
+ return False
249
+
250
+
251
+ # Functionality to import modules/objects by name, and call functions by name
252
+ # ------------------------------------------------------------------------------------------
253
+
254
+ def get_module_from_obj_name(obj_name: str) -> Tuple[types.ModuleType, str]:
255
+ """Searches for the underlying module behind the name of some python object.
256
+ Returns the module and the object name (original name with module part removed)."""
257
+
258
+ # allow convenience shorthands, substitute them by full names
259
+ obj_name = re.sub("^np.", "numpy.", obj_name)
260
+ obj_name = re.sub("^tf.", "tensorflow.", obj_name)
261
+
262
+ # list alternatives for (module_name, local_obj_name)
263
+ parts = obj_name.split(".")
264
+ name_pairs = [(".".join(parts[:i]), ".".join(parts[i:])) for i in range(len(parts), 0, -1)]
265
+
266
+ # try each alternative in turn
267
+ for module_name, local_obj_name in name_pairs:
268
+ try:
269
+ module = importlib.import_module(module_name) # may raise ImportError
270
+ get_obj_from_module(module, local_obj_name) # may raise AttributeError
271
+ return module, local_obj_name
272
+ except:
273
+ pass
274
+
275
+ # maybe some of the modules themselves contain errors?
276
+ for module_name, _local_obj_name in name_pairs:
277
+ try:
278
+ importlib.import_module(module_name) # may raise ImportError
279
+ except ImportError:
280
+ if not str(sys.exc_info()[1]).startswith("No module named '" + module_name + "'"):
281
+ raise
282
+
283
+ # maybe the requested attribute is missing?
284
+ for module_name, local_obj_name in name_pairs:
285
+ try:
286
+ module = importlib.import_module(module_name) # may raise ImportError
287
+ get_obj_from_module(module, local_obj_name) # may raise AttributeError
288
+ except ImportError:
289
+ pass
290
+
291
+ # we are out of luck, but we have no idea why
292
+ raise ImportError(obj_name)
293
+
294
+
295
+ def get_obj_from_module(module: types.ModuleType, obj_name: str) -> Any:
296
+ """Traverses the object name and returns the last (rightmost) python object."""
297
+ if obj_name == '':
298
+ return module
299
+ obj = module
300
+ for part in obj_name.split("."):
301
+ obj = getattr(obj, part)
302
+ return obj
303
+
304
+
305
+ def get_obj_by_name(name: str) -> Any:
306
+ """Finds the python object with the given name."""
307
+ module, obj_name = get_module_from_obj_name(name)
308
+ return get_obj_from_module(module, obj_name)
309
+
310
+
311
+ def call_func_by_name(*args, func_name: str = None, **kwargs) -> Any:
312
+ """Finds the python object with the given name and calls it as a function."""
313
+ assert func_name is not None
314
+ func_obj = get_obj_by_name(func_name)
315
+ assert callable(func_obj)
316
+ return func_obj(*args, **kwargs)
317
+
318
+
319
+ def construct_class_by_name(*args, class_name: str = None, **kwargs) -> Any:
320
+ """Finds the python class with the given name and constructs it with the given arguments."""
321
+ return call_func_by_name(*args, func_name=class_name, **kwargs)
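A sketch of the name-based lookup helpers above; note that the "np." shorthand is expanded to "numpy." by get_module_from_obj_name.

arr_type = get_obj_by_name("np.ndarray")                  # numpy.ndarray
zeros = call_func_by_name(3, func_name="np.zeros")        # same as numpy.zeros(3)
od = construct_class_by_name(class_name="collections.OrderedDict", a=1)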
322
+
323
+
324
+ def get_module_dir_by_obj_name(obj_name: str) -> str:
325
+ """Get the directory path of the module containing the given object name."""
326
+ module, _ = get_module_from_obj_name(obj_name)
327
+ return os.path.dirname(inspect.getfile(module))
328
+
329
+
330
+ def is_top_level_function(obj: Any) -> bool:
331
+ """Determine whether the given object is a top-level function, i.e., defined at module scope using 'def'."""
332
+ return callable(obj) and obj.__name__ in sys.modules[obj.__module__].__dict__
333
+
334
+
335
+ def get_top_level_function_name(obj: Any) -> str:
336
+ """Return the fully-qualified name of a top-level function."""
337
+ assert is_top_level_function(obj)
338
+ module = obj.__module__
339
+ if module == '__main__':
340
+ module = os.path.splitext(os.path.basename(sys.modules[module].__file__))[0]
341
+ return module + "." + obj.__name__
342
+
343
+
344
+ # File system helpers
345
+ # ------------------------------------------------------------------------------------------
346
+
347
+ def list_dir_recursively_with_ignore(dir_path: str, ignores: List[str] = None, add_base_to_relative: bool = False) -> List[Tuple[str, str]]:
348
+ """List all files recursively in a given directory while ignoring given file and directory names.
349
+ Returns list of tuples containing both absolute and relative paths."""
350
+ assert os.path.isdir(dir_path)
351
+ base_name = os.path.basename(os.path.normpath(dir_path))
352
+
353
+ if ignores is None:
354
+ ignores = []
355
+
356
+ result = []
357
+
358
+ for root, dirs, files in os.walk(dir_path, topdown=True):
359
+ for ignore_ in ignores:
360
+ dirs_to_remove = [d for d in dirs if fnmatch.fnmatch(d, ignore_)]
361
+
362
+ # dirs need to be edited in-place
363
+ for d in dirs_to_remove:
364
+ dirs.remove(d)
365
+
366
+ files = [f for f in files if not fnmatch.fnmatch(f, ignore_)]
367
+
368
+ absolute_paths = [os.path.join(root, f) for f in files]
369
+ relative_paths = [os.path.relpath(p, dir_path) for p in absolute_paths]
370
+
371
+ if add_base_to_relative:
372
+ relative_paths = [os.path.join(base_name, p) for p in relative_paths]
373
+
374
+ assert len(absolute_paths) == len(relative_paths)
375
+ result += zip(absolute_paths, relative_paths)
376
+
377
+ return result
378
+
379
+
380
+ def copy_files_and_create_dirs(files: List[Tuple[str, str]]) -> None:
381
+ """Takes in a list of tuples of (src, dst) paths and copies files.
382
+ Will create all necessary directories."""
383
+ for file in files:
384
+ target_dir_name = os.path.dirname(file[1])
385
+
386
+ # will create all intermediate-level directories
387
+ if not os.path.exists(target_dir_name):
388
+ os.makedirs(target_dir_name)
389
+
390
+ shutil.copyfile(file[0], file[1])
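A sketch combining the two file-system helpers above; the directory names are hypothetical.

import os

files = list_dir_recursively_with_ignore("src", ignores=["__pycache__", "*.ckpt"], add_base_to_relative=True)
files = [(src, os.path.join("runs/00000", rel)) for src, rel in files]  # (src, dst) pairs
copy_files_and_create_dirs(files)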
391
+
392
+
393
+ # URL helpers
394
+ # ------------------------------------------------------------------------------------------
395
+
396
+ def is_url(obj: Any, allow_file_urls: bool = False) -> bool:
397
+ """Determine whether the given object is a valid URL string."""
398
+ if not isinstance(obj, str) or not "://" in obj:
399
+ return False
400
+ if allow_file_urls and obj.startswith('file://'):
401
+ return True
402
+ try:
403
+ res = requests.compat.urlparse(obj)
404
+ if not res.scheme or not res.netloc or not "." in res.netloc:
405
+ return False
406
+ res = requests.compat.urlparse(requests.compat.urljoin(obj, "/"))
407
+ if not res.scheme or not res.netloc or not "." in res.netloc:
408
+ return False
409
+ except:
410
+ return False
411
+ return True
412
+
413
+
414
+ def open_url(url: str, cache_dir: str = None, num_attempts: int = 10, verbose: bool = True, return_filename: bool = False, cache: bool = True) -> Any:
415
+ """Download the given URL and return a binary-mode file object to access the data."""
416
+ assert num_attempts >= 1
417
+ assert not (return_filename and (not cache))
418
+
419
+ # Doesn't look like a URL scheme, so interpret it as a local filename.
420
+ if not re.match('^[a-z]+://', url):
421
+ return url if return_filename else open(url, "rb")
422
+
423
+ # Handle file URLs. This code handles unusual file:// patterns that
424
+ # arise on Windows:
425
+ #
426
+ # file:///c:/foo.txt
427
+ #
428
+ # which would translate to a local '/c:/foo.txt' filename that's
429
+ # invalid. Drop the forward slash for such pathnames.
430
+ #
431
+ # If you touch this code path, you should test it on both Linux and
432
+ # Windows.
433
+ #
434
+ # Some internet resources suggest using urllib.request.url2pathname(),
+ # but that converts forward slashes to backslashes and this causes
436
+ # its own set of problems.
437
+ if url.startswith('file://'):
438
+ filename = urllib.parse.urlparse(url).path
439
+ if re.match(r'^/[a-zA-Z]:', filename):
440
+ filename = filename[1:]
441
+ return filename if return_filename else open(filename, "rb")
442
+
443
+ assert is_url(url)
444
+
445
+ # Lookup from cache.
446
+ if cache_dir is None:
447
+ cache_dir = make_cache_dir_path('downloads')
448
+
449
+ url_md5 = hashlib.md5(url.encode("utf-8")).hexdigest()
450
+ if cache:
451
+ cache_files = glob.glob(os.path.join(cache_dir, url_md5 + "_*"))
452
+ if len(cache_files) == 1:
453
+ filename = cache_files[0]
454
+ return filename if return_filename else open(filename, "rb")
455
+
456
+ # Download.
457
+ url_name = None
458
+ url_data = None
459
+ with requests.Session() as session:
460
+ if verbose:
461
+ print("Downloading %s ..." % url, end="", flush=True)
462
+ for attempts_left in reversed(range(num_attempts)):
463
+ try:
464
+ with session.get(url) as res:
465
+ res.raise_for_status()
466
+ if len(res.content) == 0:
467
+ raise IOError("No data received")
468
+
469
+ if len(res.content) < 8192:
470
+ content_str = res.content.decode("utf-8")
471
+ if "download_warning" in res.headers.get("Set-Cookie", ""):
472
+ links = [html.unescape(link) for link in content_str.split('"') if "export=download" in link]
473
+ if len(links) == 1:
474
+ url = requests.compat.urljoin(url, links[0])
475
+ raise IOError("Google Drive virus checker nag")
476
+ if "Google Drive - Quota exceeded" in content_str:
477
+ raise IOError("Google Drive download quota exceeded -- please try again later")
478
+
479
+ match = re.search(r'filename="([^"]*)"', res.headers.get("Content-Disposition", ""))
480
+ url_name = match[1] if match else url
481
+ url_data = res.content
482
+ if verbose:
483
+ print(" done")
484
+ break
485
+ except KeyboardInterrupt:
486
+ raise
487
+ except:
488
+ if not attempts_left:
489
+ if verbose:
490
+ print(" failed")
491
+ raise
492
+ if verbose:
493
+ print(".", end="", flush=True)
494
+
495
+ # Save to cache.
496
+ if cache:
497
+ safe_name = re.sub(r"[^0-9a-zA-Z-._]", "_", url_name)
498
+ cache_file = os.path.join(cache_dir, url_md5 + "_" + safe_name)
499
+ temp_file = os.path.join(cache_dir, "tmp_" + uuid.uuid4().hex + "_" + url_md5 + "_" + safe_name)
500
+ os.makedirs(cache_dir, exist_ok=True)
501
+ with open(temp_file, "wb") as f:
502
+ f.write(url_data)
503
+ os.replace(temp_file, cache_file) # atomic
504
+ if return_filename:
505
+ return cache_file
506
+
507
+ # Return data as file object.
508
+ assert not return_filename
509
+ return io.BytesIO(url_data)
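A usage sketch of open_url with a hypothetical URL: the download is cached under the dnnlib cache dir and returned either as a binary file object or as the cached filename.

with open_url("https://example.com/weights.pkl", cache=True) as f:
    data = f.read()
local_path = open_url("https://example.com/weights.pkl", return_filename=True)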
510
+
511
+ # ------------------------------------------------------------------------------------------
512
+ # util function modified from https://github.com/nv-tlabs/LION/blob/0467d2199076e95a7e88bafd99dcd7d48a04b4a7/utils/model_helper.py
513
+ def import_class(model_str):
514
+ from torch_utils.dist_utils import is_rank0
515
+ if is_rank0():
516
+ logger.info('import: {}', model_str)
517
+ p, m = model_str.rsplit('.', 1)
518
+ mod = importlib.import_module(p)
519
+ Model = getattr(mod, m)
520
+ return Model
521
+
522
+ class ScopedTorchProfiler(ContextDecorator):
523
+ """
524
+ Marks ranges for both nvtx profiling (with nsys) and torch autograd profiler
525
+ """
526
+ __global_counts = {}
527
+ enabled=False
528
+
529
+ def __init__(self, unique_name: str):
530
+ """
531
+ Names must be unique!
532
+ """
533
+ ScopedTorchProfiler.__global_counts[unique_name] = 0
534
+ self._name = unique_name
535
+ self._autograd_scope = torch.profiler.record_function(unique_name)
536
+
537
+ def __enter__(self):
538
+ if ScopedTorchProfiler.enabled:
539
+ torch.cuda.nvtx.range_push(self._name)
540
+ self._autograd_scope.__enter__()
541
+
542
+ def __exit__(self, exc_type, exc_value, traceback):
543
+ self._autograd_scope.__exit__(exc_type, exc_value, traceback)
544
+ if ScopedTorchProfiler.enabled:
545
+ torch.cuda.nvtx.range_pop()
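A sketch of ScopedTorchProfiler: it wraps a block (or, via ContextDecorator, a function) so the region shows up in both nsys/NVTX and the autograd profiler; 'model' and 'x' are placeholders for the profiled workload.

ScopedTorchProfiler.enabled = True  # emit NVTX ranges (assumes a CUDA build)
with ScopedTorchProfiler("forward_pass"):
    y = model(x)  # placeholder workload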
546
+
547
+ class TimingsMonitor():
548
+ CUDATimer = namedtuple('CUDATimer', ['start', 'end'])
549
+ def __init__(self, device, enabled=True, timing_names:List[str]=[], cuda_timing_names:List[str]=[]):
550
+ """
551
+ Usage:
552
+ tmonitor = TimingsMonitor(device)
553
+ for i in range(n_iter):
554
+ # Record arbitrary scopes
555
+ with tmonitor.timing_scope('regular_scope_name'):
556
+ ...
557
+ with tmonitor.cuda_timing_scope('nested_scope_name'):
558
+ ...
559
+ with tmonitor.cuda_timing_scope('cuda_scope_name'):
560
+ ...
561
+ tmonitor.record_timing('duration_name', end_time - start_time)
562
+
563
+ # Gather timings
564
+ tmonitor.record_all_cuda_timings()
565
+ tmonitor.update_all_averages()
566
+ averages = tmonitor.get_average_timings()
567
+ all_timings = tmonitor.get_timings()
568
+
569
+ Two types of timers, standard report timing and cuda timings.
570
+ Cuda timing supports the scoped context manager cuda_timing_scope.
571
+ Args:
572
+ device: device to time on (needed for cuda timers)
573
+ # enabled: HACK to only report timings from rank 0, set enabled=(global_rank==0)
574
+ timing_names: timings to report optional (will auto add new names)
575
+ cuda_timing_names: cuda periods to time optional (will auto add new names)
576
+ """
577
+ self.enabled=enabled
578
+ self.device = device
579
+
580
+ # Normal timing
581
+ # self.all_timings_dict = {k:None for k in timing_names + cuda_timing_names}
582
+ self.all_timings_dict = {}
583
+ self.avg_meter_dict = {}
584
+
585
+ # Cuda event timers to measure time spent on pushing data to gpu and on training step
586
+ self.cuda_event_timers = {}
587
+
588
+ for k in timing_names:
589
+ self.add_new_timing(k)
590
+
591
+ for k in cuda_timing_names:
592
+ self.add_new_cuda_timing(k)
593
+
594
+ # Running averages
595
+ # self.avg_meter_dict = {k:AverageMeter() for k in self.all_timings_dict}
596
+
597
+ def add_new_timing(self, name):
598
+ self.avg_meter_dict[name] = AverageMeter()
599
+ self.all_timings_dict[name] = None
600
+
601
+ def add_new_cuda_timing(self, name):
602
+ start_event = torch.cuda.Event(enable_timing=True)
603
+ end_event = torch.cuda.Event(enable_timing=True)
604
+ self.cuda_event_timers[name] = self.CUDATimer(start=start_event, end=end_event)
605
+ self.add_new_timing(name)
606
+
607
+ def clear_timings(self):
608
+ self.all_timings_dict = {k:None for k in self.all_timings_dict}
609
+
610
+ def get_timings(self):
611
+ return self.all_timings_dict
612
+
613
+ def get_average_timings(self):
614
+ return {k:v.avg for k,v in self.avg_meter_dict.items()}
615
+
616
+ def update_all_averages(self):
617
+ """
618
+ Once per iter, when timings have been finished recording, one should
619
+ call update_average_iter to keep running average of timings.
620
+ """
621
+ for k,v in self.all_timings_dict.items():
622
+ if v is None:
623
+ print("none_timing", k)
624
+ continue
625
+ self.avg_meter_dict[k].update(v)
626
+
627
+ def record_timing(self, name, value):
628
+ if name not in self.all_timings_dict: self.add_new_timing(name)
629
+ # assert name in self.all_timings_dict
630
+ self.all_timings_dict[name] = value
631
+
632
+ def _record_cuda_event_start(self, name):
633
+ if name in self.cuda_event_timers:
634
+ self.cuda_event_timers[name].start.record(
635
+ torch.cuda.current_stream(self.device))
636
+
637
+ def _record_cuda_event_end(self, name):
638
+ if name in self.cuda_event_timers:
639
+ self.cuda_event_timers[name].end.record(
640
+ torch.cuda.current_stream(self.device))
641
+
642
+ @contextmanager
643
+ def cuda_timing_scope(self, name, profile=True):
644
+ if name not in self.all_timings_dict: self.add_new_cuda_timing(name)
645
+ with ScopedTorchProfiler(name) if profile else nullcontext():
646
+ self._record_cuda_event_start(name)
647
+ try:
648
+ yield
649
+ finally:
650
+ self._record_cuda_event_end(name)
651
+
652
+ @contextmanager
653
+ def timing_scope(self, name, profile=True):
654
+ if name not in self.all_timings_dict: self.add_new_timing(name)
655
+ with ScopedTorchProfiler(name) if profile else nullcontext():
656
+ start_time = time.time()
657
+ try:
658
+ yield
659
+ finally:
660
+ self.record_timing(name, time.time()-start_time)
661
+
662
+ def record_all_cuda_timings(self):
663
+ """ After all the cuda events call this to synchronize and record down the cuda timings. """
664
+ for k, events in self.cuda_event_timers.items():
665
+ with torch.no_grad():
666
+ events.end.synchronize()
667
+ # Convert to seconds
668
+ time_elapsed = events.start.elapsed_time(events.end)/1000.
669
+ self.all_timings_dict[k] = time_elapsed
670
+
671
+ def init_s3(config_file):
672
+ config = json.load(open(config_file, 'r'))
673
+ s3_client = boto3.client("s3", **config)
674
+ return s3_client
675
+
676
+ def download_from_s3(file_path, target_path, cfg):
677
+ tic = time.time()
678
+ s3_client = init_s3(cfg.checkpoint.write_s3_config) # also verifies that the s3 client can be initialized
679
+ bucket_name = file_path.split('/')[2]
680
+ file_key = file_path.split(bucket_name+'/')[-1]
681
+ print(bucket_name, file_key)
682
+ s3_client.download_file(bucket_name, file_key, target_path)
683
+ logger.info(f'finished download from s3://{bucket_name}/{file_key} to {target_path} in %.1f sec' % (
684
+ time.time() - tic))
685
+
686
+ def upload_to_s3(buffer, bucket_name, key, config_dict):
687
+ logger.info(f'start upload_to_s3! bucket_name={bucket_name}, key={key}')
688
+ tic = time.time()
689
+ s3 = boto3.client('s3', **config_dict)
690
+ s3.put_object(Bucket=bucket_name, Key=key, Body=buffer.getvalue())
691
+ logger.info(f'finish upload_to_s3! s3://{bucket_name}/{key} %.1f sec'%(time.time() - tic))
692
+
693
+ def write_ckpt_to_s3(cfg, all_model_dict, ckpt_name):
694
+ buffer = io.BytesIO()
695
+ tic = time.time()
696
+ torch.save(all_model_dict, buffer) # take ~0.25 sec
697
+ # logger.info('write ckpt to buffer: %.2f sec'%(time.time() - tic))
698
+ group, name = cfg.outdir.rstrip("/").split("/")[-2:]
699
+ key = f"checkpoints/{group}/{name}/ckpt/{ckpt_name}"
700
+ bucket_name = cfg.checkpoint.write_s3_bucket
701
+
702
+ s3_client = init_s3(cfg.checkpoint.write_s3_config) # also verifies that the s3 client can be initialized
703
+
704
+ config_dict = json.load(open(cfg.checkpoint.write_s3_config, 'r'))
705
+ upload_thread = threading.Thread(target=upload_to_s3, args=(buffer, bucket_name, key, config_dict))
706
+ upload_thread.start()
707
+ path = f"s3://{bucket_name}/{key}"
708
+ return path
709
+
710
+ def upload_file_to_s3(cfg, file_path, key_name=None):
711
+ # file_path is the local file path, can be a yaml file
712
+ # this function is used to upload the checkpoint only
713
+ tic = time.time()
714
+ group, name = cfg.outdir.rstrip("/").split("/")[-2:]
715
+ if key_name is None:
716
+ key = os.path.basename(file_path)
717
+ key = f"checkpoints/{group}/{name}/{key}"
718
+ bucket_name = cfg.checkpoint.write_s3_bucket
719
+ s3_client = init_s3(cfg.checkpoint.write_s3_config)
720
+ # Upload the file
721
+ with open(file_path, 'rb') as f:
722
+ s3_client.upload_fileobj(f, bucket_name, key)
723
+ full_s3_path = f"s3://{bucket_name}/{key}"
724
+ logger.info(f'upload_to_s3: {file_path} {full_s3_path} | use time: {time.time()-tic}')
725
+
726
+ return full_s3_path
727
+
728
+
729
+ def load_from_s3(file_path, cfg, load_fn):
730
+ """
731
+ ckpt_path example:
732
+ s3://xzeng/checkpoints/2023_0413/vae_kl_5e-1/ckpt/snapshot_epo000163_iter164000.pt
733
+ """
734
+ s3_client = init_s3(cfg.checkpoint.write_s3_config) # also verifies that the s3 client can be initialized
735
+ bucket_name = file_path.split("s3://")[-1].split('/')[0]
736
+ key = file_path.split(f'{bucket_name}/')[-1]
737
+ # logger.info(f"-> try to load s3://{bucket_name}/{key} ")
738
+ tic = time.time()
739
+ for attempt in range(10):
740
+ try:
741
+ # Download the state dict from S3 into memory (as a binary stream)
742
+ with io.BytesIO() as buffer:
743
+ s3_client.download_fileobj(bucket_name, key, buffer)
744
+ buffer.seek(0)
745
+
746
+ # Load the state dict into a PyTorch model
747
+ # out = torch.load(buffer, map_location=torch.device("cpu"))
748
+ out = load_fn(buffer)
749
+ break
750
+ except:
751
+ logger.info(f"failed to load s3://{bucket_name}/{key}, attempt: {attempt}")
752
+ from torch_utils.dist_utils import is_rank0
753
+ if is_rank0():
754
+ logger.info(f'loaded {file_path} | use time: {time.time()-tic:.1f} sec')
755
+ return out
756
+
757
+ def load_torch_dict_from_s3(ckpt_path, cfg):
758
+ """
759
+ ckpt_path example:
760
+ s3://xzeng/checkpoints/2023_0413/vae_kl_5e-1/ckpt/snapshot_epo000163_iter164000.pt
761
+ """
762
+ s3_client = init_s3(cfg.checkpoint.write_s3_config) # also verifies that the s3 client can be initialized
763
+ bucket_name = ckpt_path.split("s3://")[-1].split('/')[0]
764
+ key = ckpt_path.split(f'{bucket_name}/')[-1]
765
+ for attempt in range(10):
766
+ try:
767
+ # Download the state dict from S3 into memory (as a binary stream)
768
+ with io.BytesIO() as buffer:
769
+ s3_client.download_fileobj(bucket_name, key, buffer)
770
+ buffer.seek(0)
771
+
772
+ # Load the state dict into a PyTorch model
773
+ out = torch.load(buffer, map_location=torch.device("cpu"))
774
+ break
775
+ except:
776
+ logger.info(f"failed to load s3://{bucket_name}/{key}, attempt: {attempt}")
777
+ return out
778
+
779
+ def count_parameters_in_M(model):
780
+ return np.sum(np.prod(v.size()) for name, v in model.named_parameters() if "auxiliary" not in name) / 1e6
781
+
782
+ def printarr(*arrs, float_width=6, **kwargs):
783
+ """
784
+ Print a pretty table giving name, shape, dtype, type, and content information for input tensors or scalars.
785
+
786
+ Call like: printarr(my_arr, some_other_arr, maybe_a_scalar). Accepts a variable number of arguments.
787
+
788
+ Inputs can be:
789
+ - Numpy tensor arrays
790
+ - Pytorch tensor arrays
791
+ - Jax tensor arrays
792
+ - Python ints / floats
793
+ - None
794
+
795
+ It may also work with other array-like types, but they have not been tested.
796
+
797
+ Use the `float_width` option to specify the precision to which floating point types are printed.
798
+
799
+ Author: Nicholas Sharp (nmwsharp.com)
800
+ Canonical source: https://gist.github.com/nmwsharp/54d04af87872a4988809f128e1a1d233
801
+ License: This snippet may be used under an MIT license, and it is also released into the public domain.
802
+ Please retain this docstring as a reference.
803
+ """
804
+
805
+ frame = inspect.currentframe().f_back
806
+ default_name = "[temporary]"
807
+
808
+ ## helpers to gather data about each array
809
+ def name_from_outer_scope(a):
810
+ if a is None:
811
+ return '[None]'
812
+ name = default_name
813
+ for k, v in frame.f_locals.items():
814
+ if v is a:
815
+ name = k
816
+ break
817
+ return name
818
+
819
+ def type_strip(type_str):
820
+ return type_str.lstrip('<class ').rstrip('>').replace('torch.', '').strip("'")
821
+
822
+ def dtype_str(a):
823
+ if a is None:
824
+ return 'None'
825
+ if isinstance(a, int):
826
+ return 'int'
827
+ if isinstance(a, float):
828
+ return 'float'
829
+ if isinstance(a, list) and len(a)>0:
830
+ return type_strip(str(type(a[0])))
831
+ if hasattr(a, 'dtype'):
832
+ return type_strip(str(a.dtype))
833
+ else:
834
+ return ''
835
+ def shape_str(a):
836
+ if a is None:
837
+ return 'N/A'
838
+ if isinstance(a, int):
839
+ return 'scalar'
840
+ if isinstance(a, float):
841
+ return 'scalar'
842
+ if isinstance(a, list):
843
+ return f"[{shape_str(a[0]) if len(a)>0 else '?'}]*{len(a)}"
844
+ if hasattr(a, 'shape'):
845
+ return str(tuple(a.shape))
846
+ else:
847
+ return ''
848
+ def type_str(a):
849
+ return type_strip(str(type(a))) # TODO this is weird... what's the better way?
850
+ def device_str(a):
851
+ if hasattr(a, 'device'):
852
+ device_str = str(a.device)
853
+ if len(device_str) < 10:
854
+ # heuristic: jax returns some goofy long string we don't want, ignore it
855
+ return device_str
856
+ return ""
857
+ def format_float(x):
858
+ return f"{x:{float_width}g}"
859
+ def minmaxmean_str(a):
860
+ if a is None:
861
+ return ('N/A', 'N/A', 'N/A', 'N/A')
862
+ if isinstance(a, int) or isinstance(a, float):
863
+ return (format_float(a),)*4
864
+
865
+ # compute min/max/mean. if anything goes wrong, just print 'N/A'
866
+ min_str = "N/A"
867
+ try: min_str = format_float(a.min())
868
+ except: pass
869
+ max_str = "N/A"
870
+ try: max_str = format_float(a.max())
871
+ except: pass
872
+ mean_str = "N/A"
873
+ try: mean_str = format_float(a.mean())
874
+ except: pass
875
+ try: median_str = format_float(a.median())
876
+ except:
877
+ try: median_str = format_float(np.median(np.array(a)))
878
+ except: median_str = 'N/A'
879
+ return (min_str, max_str, mean_str, median_str)
880
+
881
+ def get_prop_dict(a,k=None):
882
+ minmaxmean = minmaxmean_str(a)
883
+ props = {
884
+ 'name' : name_from_outer_scope(a) if k is None else k,
885
+ # 'type' : str(type(a)).replace('torch.',''),
886
+ 'dtype' : dtype_str(a),
887
+ 'shape' : shape_str(a),
888
+ 'type' : type_str(a),
889
+ 'device' : device_str(a),
890
+ 'min' : minmaxmean[0],
891
+ 'max' : minmaxmean[1],
892
+ 'mean' : minmaxmean[2],
893
+ 'median': minmaxmean[3]
894
+ }
895
+ return props
896
+
897
+ try:
898
+
899
+ props = ['name', 'type', 'dtype', 'shape', 'device', 'min', 'max', 'mean', 'median']
900
+
901
+ # precompute all of the properties for each input
902
+ str_props = []
903
+ for a in arrs:
904
+ str_props.append(get_prop_dict(a))
905
+ for k,a in kwargs.items():
906
+ str_props.append(get_prop_dict(a, k=k))
907
+
908
+ # for each property, compute its length
909
+ maxlen = {}
910
+ for p in props: maxlen[p] = 0
911
+ for sp in str_props:
912
+ for p in props:
913
+ maxlen[p] = max(maxlen[p], len(sp[p]))
914
+
915
+ # if any property got all empty strings, don't bother printing it, remove it from the list
916
+ props = [p for p in props if maxlen[p] > 0]
917
+
918
+ # print a header
919
+ header_str = ""
920
+ for p in props:
921
+ prefix = "" if p == 'name' else " | "
922
+ fmt_key = ">" if p == 'name' else "<"
923
+ header_str += f"{prefix}{p:{fmt_key}{maxlen[p]}}"
924
+ print(header_str)
925
+ print("-"*len(header_str))
926
+
927
+ # now print the actual arrays
928
+ for strp in str_props:
929
+ for p in props:
930
+ prefix = "" if p == 'name' else " | "
931
+ fmt_key = ">" if p == 'name' else "<"
932
+ print(f"{prefix}{strp[p]:{fmt_key}{maxlen[p]}}", end='')
933
+ print("")
934
+
935
+ finally:
936
+ del frame
937
+
938
+ def debug_print_all_tensor_sizes(min_tot_size = 0):
939
+ import gc
940
+ print("---------------------------------------"*3)
941
+ for obj in gc.get_objects():
942
+ try:
943
+ if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
944
+ if np.prod(obj.size())>=min_tot_size:
945
+ print(type(obj), obj.size())
946
+ except:
947
+ pass
948
+ def print_cpu_usage():
949
+
950
+ # Get current CPU usage as a percentage
951
+ cpu_usage = psutil.cpu_percent()
952
+
953
+ # Get current memory usage
954
+ memory_usage = psutil.virtual_memory().used
955
+
956
+ # Convert memory usage to a human-readable format
957
+ memory_usage_str = psutil._common.bytes2human(memory_usage)
958
+
959
+ # Print CPU and memory usage
960
+ msg = f"Current CPU usage: {cpu_usage}% | "
961
+ msg += f"Current memory usage: {memory_usage_str}"
962
+ return msg
963
+
964
+ def calmsize(num_bytes):
965
+ if math.isnan(num_bytes):
966
+ return ''
967
+ for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
968
+ if abs(num_bytes) < 1024.0:
969
+ return "{:.1f}{}B".format(num_bytes, unit)
970
+ num_bytes /= 1024.0
971
+ return "{:.1f}{}B".format(num_bytes, 'Y')
972
+
973
+ def readable_size(num_bytes: int) -> str:
974
+ return calmsize(num_bytes) ## '' if math.isnan(num_bytes) else '{:.1f}'.format(calmsize(num_bytes))
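Hand-checked examples of the byte-size formatter above:

assert readable_size(512) == "512.0B"
assert readable_size(1536) == "1.5KB"
assert readable_size(3 * 1024**3) == "3.0GB"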
975
+
976
+ def get_gpu_memory():
977
+ """
978
+ Get the current GPU memory usage for each device as a dictionary
979
+ """
980
+ output = subprocess.check_output(["nvidia-smi", "--query-gpu=memory.used", "--format=csv"])
981
+ output = output.decode("utf-8")
982
+ gpu_memory_values = output.split("\n")[1:-1]
983
+ gpu_memory_values = [int(x.strip().split()[0]) for x in gpu_memory_values]
984
+ gpu_memory = dict(zip(range(len(gpu_memory_values)), gpu_memory_values))
985
+ return gpu_memory
986
+
987
+ def get_gpu_util():
988
+ """
989
+ Get the current GPU utilization (in percent) for each device as a dictionary
990
+ """
991
+ output = subprocess.check_output(["nvidia-smi", "--query-gpu=utilization.gpu", "--format=csv"])
992
+ output = output.decode("utf-8")
993
+ gpu_memory_values = output.split("\n")[1:-1]
994
+ gpu_memory_values = [int(x.strip().split()[0]) for x in gpu_memory_values]
995
+ gpu_util = dict(zip(range(len(gpu_memory_values)), gpu_memory_values))
996
+ return gpu_util
997
+
998
+
999
+ def print_gpu_usage():
1000
+ usage = get_gpu_memory()
+ msg = " | GPU usage: "
+ for k, v in usage.items():
1003
+ msg += f"{k}: {v} MB "
1004
+ # utilization = get_gpu_util()
1005
+ # msg + ' | util '
1006
+ # for k, v in utilization.items():
1007
+ # msg += f"{k}: {v} % "
1008
+ return msg
1009
+
1010
+ class AverageMeter(object):
1011
+
1012
+ def __init__(self):
1013
+ self.reset()
1014
+
1015
+ def reset(self):
1016
+ self.avg = 0
1017
+ self.sum = 0
1018
+ self.cnt = 0
1019
+
1020
+ def update(self, val, n=1):
1021
+ self.sum += val * n
1022
+ self.cnt += n
1023
+ self.avg = self.sum / self.cnt
1024
+
1025
+
1026
+ def generate_random_string(length):
1027
+ # Generate a string of `length` random ASCII letters (both lowercase and uppercase).
1028
+ # You can adjust the length parameter to fit your needs.
1029
+ letters = string.ascii_letters
1030
+ return ''.join(random.choice(letters) for _ in range(length))
1031
+
1032
+
1033
+ class ForkedPdb(pdb.Pdb):
1034
+ """
1035
+ PDB Subclass for debugging multi-processed code
1036
+ Suggested in: https://stackoverflow.com/questions/4716533/how-to-attach-debugger-to-a-python-subproccess
1037
+ """
1038
+ def interaction(self, *args, **kwargs):
1039
+ _stdin = sys.stdin
1040
+ try:
1041
+ sys.stdin = open('/dev/stdin')
1042
+ pdb.Pdb.interaction(self, *args, **kwargs)
1043
+ finally:
1044
+ sys.stdin = _stdin
1045
+
1046
+ def check_exist_in_s3(file_path, s3_config):
1047
+ s3 = init_s3(s3_config)
1048
+ bucket_name, object_name = s3path_to_bucket_key(file_path)
1049
+
1050
+ try:
1051
+ s3.head_object(Bucket=bucket_name, Key=object_name)
1052
+ return 1
1053
+ except:
1054
+ logger.info(f'file not found: s3://{bucket_name}/{object_name}')
1055
+ return 0
1056
+
1057
+ def s3path_to_bucket_key(file_path):
1058
+ bucket_name = file_path.split('/')[2]
1059
+ object_name = file_path.split(bucket_name + '/')[-1]
1060
+ return bucket_name, object_name
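Example of splitting an s3:// URI with the helper above (bucket and key are hypothetical):

bucket, key = s3path_to_bucket_key("s3://my-bucket/checkpoints/run/ckpt.pt")
assert bucket == "my-bucket" and key == "checkpoints/run/ckpt.pt"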
1061
+
1062
+ def copy_file_to_s3(cfg, file_path_local, file_path_s3):
1063
+ # work similar as upload_file_to_s3, but not trying to parse the file path
1064
+ # file_path_s3: s3://{bucket}/{key}
1065
+ bucket_name, key = s3path_to_bucket_key(file_path_s3)
1066
+ tic = time.time()
1067
+ s3_client = init_s3(cfg.checkpoint.write_s3_config)
1068
+
1069
+ # Upload the file
1070
+ with open(file_path_local, 'rb') as f:
1071
+ s3_client.upload_fileobj(f, bucket_name, key)
1072
+ full_s3_path = f"s3://{bucket_name}/{key}"
1073
+ logger.info(f'copy file: {file_path_local} {full_s3_path} | use time: {time.time()-tic}')
1074
+ return full_s3_path
modules/PartField/partfield/model/PVCNN/encoder_pc.py ADDED
@@ -0,0 +1,243 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
8
+
9
+ from typing import Dict
10
+ import math
11
+
12
+ import numpy as np
13
+ import torch
14
+ from torch import nn
15
+ import torch.nn.functional as F
16
+ from torch_scatter import scatter_mean #, scatter_max
17
+
18
+ from .unet_3daware import setup_unet #UNetTriplane3dAware
19
+ from .conv_pointnet import ConvPointnet
20
+
21
+ from .pc_encoder import PVCNNEncoder #PointNet
22
+
23
+ import einops
24
+
25
+ from .dnnlib_util import ScopedTorchProfiler, printarr
26
+
27
+ def generate_plane_features(p, c, resolution, plane='xz'):
28
+ """
29
+ Args:
30
+ p: (B,3,n_p)
31
+ c: (B,C,n_p)
32
+ """
33
+ padding = 0.
34
+ c_dim = c.size(1)
35
+ # acquire indices of features in plane
36
+ xy = normalize_coordinate(p.clone(), plane=plane, padding=padding) # normalize to the range of (0, 1)
37
+ index = coordinate2index(xy, resolution)
38
+
39
+ # scatter plane features from points
40
+ fea_plane = c.new_zeros(p.size(0), c_dim, resolution**2)
41
+ fea_plane = scatter_mean(c, index, out=fea_plane) # B x 512 x reso^2
42
+ fea_plane = fea_plane.reshape(p.size(0), c_dim, resolution, resolution) # sparse matrix (B x c_dim x reso x reso)
43
+ return fea_plane
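A shape sketch for generate_plane_features, assuming the imports at the top of this file: per-point features are scattered (mean-pooled) onto a 2D feature plane.

import torch

B, C, N, R = 2, 32, 1024, 64
p = torch.rand(B, N, 3) - 0.5   # (B, n_p, 3) coordinates in [-0.5, 0.5)
c = torch.randn(B, C, N)        # (B, C, n_p) per-point features
plane = generate_plane_features(p, c, resolution=R, plane='xy')
assert plane.shape == (B, C, R, R)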
44
+
45
+ def normalize_coordinate(p, padding=0.1, plane='xz'):
46
+ ''' Normalize coordinate to [0, 1] for unit cube experiments
47
+
48
+ Args:
49
+ p (tensor): point
50
+ padding (float): conventional padding parameter of ONet for unit cube, so [-0.5, 0.5] -> [-0.55, 0.55]
51
+ plane (str): plane feature type, ['xz', 'xy', 'yz']
52
+ '''
53
+ if plane == 'xz':
54
+ xy = p[:, :, [0, 2]]
55
+ elif plane =='xy':
56
+ xy = p[:, :, [0, 1]]
57
+ else:
58
+ xy = p[:, :, [1, 2]]
59
+
60
+ xy_new = xy / (1 + padding + 10e-6) # (-0.5, 0.5)
61
+ xy_new = xy_new + 0.5 # range (0, 1)
62
+
63
+ # if there are outliers out of the range
64
+ if xy_new.max() >= 1:
65
+ xy_new[xy_new >= 1] = 1 - 10e-6
66
+ if xy_new.min() < 0:
67
+ xy_new[xy_new < 0] = 0.0
68
+ return xy_new
69
+
70
+
71
+ def coordinate2index(x, resolution):
72
+ ''' Normalize coordinate to [0, 1] for unit cube experiments.
73
+ Corresponds to our 3D model
74
+
75
+ Args:
76
+ x (tensor): coordinate
77
+ reso (int): defined resolution
78
+ coord_type (str): coordinate type
79
+ '''
80
+ x = (x * resolution).long()
81
+ index = x[:, :, 0] + resolution * x[:, :, 1]
82
+ index = index[:, None, :]
83
+ return index
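A small hand-checked example of coordinate2index: normalized (x, y) pairs map to flat cell indices.

import torch

xy = torch.tensor([[[0.0, 0.0], [0.99, 0.0], [0.0, 0.99]]])  # (B=1, n_p=3, 2)
idx = coordinate2index(xy, resolution=4)
# idx == tensor([[[0, 3, 12]]]), shape (1, 1, 3)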
84
+
85
+ def softclip(x, min, max, hardness=5):
86
+ # Soft clipping for the logsigma
87
+ x = min + F.softplus(hardness*(x - min))/hardness
88
+ x = max - F.softplus(-hardness*(x - max))/hardness
89
+ return x
90
+
91
+
92
+ def sample_triplane_feat(feature_triplane, normalized_pos):
93
+ '''
94
+ normalized_pos [-1, 1]
95
+ '''
96
+ tri_plane = torch.unbind(feature_triplane, dim=1)
97
+
98
+ x_feat = F.grid_sample(
99
+ tri_plane[0],
100
+ torch.cat(
101
+ [normalized_pos[:, :, 0:1], normalized_pos[:, :, 1:2]],
102
+ dim=-1).unsqueeze(dim=1), padding_mode='border',
103
+ align_corners=True)
104
+ y_feat = F.grid_sample(
105
+ tri_plane[1],
106
+ torch.cat(
107
+ [normalized_pos[:, :, 1:2], normalized_pos[:, :, 2:3]],
108
+ dim=-1).unsqueeze(dim=1), padding_mode='border',
109
+ align_corners=True)
110
+
111
+ z_feat = F.grid_sample(
112
+ tri_plane[2],
113
+ torch.cat(
114
+ [normalized_pos[:, :, 0:1], normalized_pos[:, :, 2:3]],
115
+ dim=-1).unsqueeze(dim=1), padding_mode='border',
116
+ align_corners=True)
117
+ final_feat = (x_feat + y_feat + z_feat)
118
+ final_feat = final_feat.squeeze(dim=2).permute(0, 2, 1) # (B, n_points, C)
119
+ return final_feat
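A shape sketch for sample_triplane_feat: a (B, 3, C, H, W) triplane is queried at normalized positions in [-1, 1], yielding one aggregated feature vector per query point.

import torch

B, C, R, M = 2, 32, 64, 500
planes = torch.randn(B, 3, C, R, R)
queries = torch.rand(B, M, 3) * 2 - 1   # (B, n_points, 3) in [-1, 1]
feats = sample_triplane_feat(planes, queries)
assert feats.shape == (B, M, C)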
120
+
121
+
122
+ # @persistence.persistent_class
123
+ class TriPlanePC2Encoder(torch.nn.Module):
124
+ # Encoder that encode point cloud to triplane feature vector similar to ConvOccNet
125
+ def __init__(
126
+ self,
127
+ cfg,
128
+ device='cuda',
129
+ shape_min=-1.0,
130
+ shape_length=2.0,
131
+ use_2d_feat=False,
132
+ # point_encoder='pvcnn',
133
+ # use_point_scatter=False
134
+ ):
135
+ """
136
+ Outputs latent triplane from PC input
137
+ Configs:
138
+ max_logsigma: (float) Soft clip upper range for logsigma
139
+ min_logsigma: (float)
140
+ point_encoder_type: (str) one of ['pvcnn', 'pointnet']
141
+ pvcnn_flatten_voxels: (bool) for pvcnn whether to reduce voxel
142
+ features (instead of scattering point features)
143
+ unet_cfg: (dict)
144
+ z_triplane_channels: (int) output latent triplane
145
+ z_triplane_resolution: (int)
146
+ Args:
147
+
148
+ """
149
+ # assert img_resolution >= 4 and img_resolution & (img_resolution - 1) == 0
150
+ super().__init__()
151
+ self.device = device
152
+
153
+ self.cfg = cfg
154
+
155
+ self.shape_min = shape_min
156
+ self.shape_length = shape_length
157
+
158
+ self.z_triplane_resolution = cfg.z_triplane_resolution
159
+ z_triplane_channels = cfg.z_triplane_channels
160
+
161
+ point_encoder_out_dim = z_triplane_channels #* 2
162
+
163
+ in_channels = 6
164
+ # self.resample_filter=[1, 3, 3, 1]
165
+ if cfg.point_encoder_type == 'pvcnn':
166
+ self.pc_encoder = PVCNNEncoder(point_encoder_out_dim,
167
+ device=self.device, in_channels=in_channels, use_2d_feat=use_2d_feat) # Encode it to a volume vector.
168
+ elif cfg.point_encoder_type == 'pointnet':
169
+ # TODO the pointnet was buggy, investigate
170
+ self.pc_encoder = ConvPointnet(c_dim=point_encoder_out_dim,
171
+ dim=in_channels, hidden_dim=32,
172
+ plane_resolution=self.z_triplane_resolution,
173
+ padding=0)
174
+ else:
175
+ raise NotImplementedError(f"Point encoder {cfg.point_encoder_type} not implemented")
176
+
177
+ if cfg.unet_cfg.enabled:
178
+ self.unet_encoder = setup_unet(
179
+ output_channels=point_encoder_out_dim,
180
+ input_channels=point_encoder_out_dim,
181
+ unet_cfg=cfg.unet_cfg)
182
+ else:
183
+ self.unet_encoder = None
184
+
185
+ # @ScopedTorchProfiler('encode')
186
+ def encode(self, point_cloud_xyz, point_cloud_feature, mv_feat=None, pc2pc_idx=None) -> Dict:
187
+ # output = AttrDict()
188
+ point_cloud_xyz = (point_cloud_xyz - self.shape_min) / self.shape_length # [0, 1]
189
+ point_cloud_xyz = point_cloud_xyz - 0.5 # [-0.5, 0.5]
190
+ point_cloud = torch.cat([point_cloud_xyz, point_cloud_feature], dim=-1)
191
+
192
+ if self.cfg.point_encoder_type == 'pvcnn':
193
+ if mv_feat is not None:
194
+ pc_feat, points_feat = self.pc_encoder(point_cloud, mv_feat, pc2pc_idx)
195
+ else:
196
+ pc_feat, points_feat = self.pc_encoder(point_cloud) # 3D feature volume: BxDx32x32x32
197
+ if self.cfg.use_point_scatter:
198
+ # Scattering from PVCNN point features
199
+ points_feat_ = points_feat[0]
200
+ # shape: batch, latent size, resolution, resolution (e.g. 16, 256, 64, 64)
201
+ pc_feat_1 = generate_plane_features(point_cloud_xyz, points_feat_,
202
+ resolution=self.z_triplane_resolution, plane='xy')
203
+ pc_feat_2 = generate_plane_features(point_cloud_xyz, points_feat_,
204
+ resolution=self.z_triplane_resolution, plane='yz')
205
+ pc_feat_3 = generate_plane_features(point_cloud_xyz, points_feat_,
206
+ resolution=self.z_triplane_resolution, plane='xz')
207
+ pc_feat = pc_feat[0]
208
+
209
+ else:
210
+ pc_feat = pc_feat[0]
211
+ sf = self.z_triplane_resolution//32 # 32 is PVCNN's voxel dim
212
+
213
+ pc_feat_1 = torch.mean(pc_feat, dim=-1) #xy_plane, normalize in z plane
214
+ pc_feat_2 = torch.mean(pc_feat, dim=-3) #yz_plane, normalize in x plane
215
+ pc_feat_3 = torch.mean(pc_feat, dim=-2) #xz_plane, normalize in y plane
216
+
217
+ # nearest upsample
218
+ pc_feat_1 = einops.repeat(pc_feat_1, 'b c h w -> b c (h hm ) (w wm)', hm = sf, wm = sf)
219
+ pc_feat_2 = einops.repeat(pc_feat_2, 'b c h w -> b c (h hm) (w wm)', hm = sf, wm = sf)
220
+ pc_feat_3 = einops.repeat(pc_feat_3, 'b c h w -> b c (h hm) (w wm)', hm = sf, wm = sf)
221
+ elif self.cfg.point_encoder_type == 'pointnet':
222
+ assert self.cfg.use_point_scatter
223
+ # Run ConvPointnet
224
+ pc_feat = self.pc_encoder(point_cloud)
225
+ pc_feat_1 = pc_feat['xy'] #
226
+ pc_feat_2 = pc_feat['yz']
227
+ pc_feat_3 = pc_feat['xz']
228
+ else:
229
+ raise NotImplementedError()
230
+
231
+ if self.unet_encoder is not None:
232
+ # TODO eval adding a skip connection
233
+ # Unet expects B, 3, C, H, W
234
+ pc_feat_tri_plane_stack_pre = torch.stack([pc_feat_1, pc_feat_2, pc_feat_3], dim=1)
235
+ # dpc_feat_tri_plane_stack = self.unet_encoder(pc_feat_tri_plane_stack_pre)
236
+ # pc_feat_tri_plane_stack = pc_feat_tri_plane_stack_pre + dpc_feat_tri_plane_stack
237
+ pc_feat_tri_plane_stack = self.unet_encoder(pc_feat_tri_plane_stack_pre)
238
+ pc_feat_1, pc_feat_2, pc_feat_3 = torch.unbind(pc_feat_tri_plane_stack, dim=1)
239
+
240
+ return torch.stack([pc_feat_1, pc_feat_2, pc_feat_3], dim=1)
241
+
242
+ def forward(self, point_cloud_xyz, point_cloud_feature=None, mv_feat=None, pc2pc_idx=None):
243
+ return self.encode(point_cloud_xyz, point_cloud_feature=point_cloud_feature, mv_feat=mv_feat, pc2pc_idx=pc2pc_idx)
modules/PartField/partfield/model/PVCNN/pc_encoder.py ADDED
@@ -0,0 +1,90 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import functools
5
+
6
+ from .pv_module import SharedMLP, PVConv
7
+
8
+ def create_pointnet_components(
9
+ blocks, in_channels, with_se=False, normalize=True, eps=0,
10
+ width_multiplier=1, voxel_resolution_multiplier=1, scale_pvcnn=False, device='cuda'):
11
+ r, vr = width_multiplier, voxel_resolution_multiplier
12
+ layers, concat_channels = [], 0
13
+ for out_channels, num_blocks, voxel_resolution in blocks:
14
+ out_channels = int(r * out_channels)
15
+ if voxel_resolution is None:
16
+ block = functools.partial(SharedMLP, device=device)
17
+ else:
18
+ block = functools.partial(
19
+ PVConv, kernel_size=3, resolution=int(vr * voxel_resolution),
20
+ with_se=with_se, normalize=normalize, eps=eps, scale_pvcnn=scale_pvcnn, device=device)
21
+ for _ in range(num_blocks):
22
+ layers.append(block(in_channels, out_channels))
23
+ in_channels = out_channels
24
+ concat_channels += out_channels
25
+ return layers, in_channels, concat_channels
26
+
27
+ class PCMerger(nn.Module):
28
+ # merge surface sampled PC and rendering backprojected PC (w/ 2D features):
29
+ def __init__(self, in_channels=204, device="cuda"):
30
+ super(PCMerger, self).__init__()
31
+ self.mlp_normal = SharedMLP(3, [128, 128], device=device)
32
+ self.mlp_rgb = SharedMLP(3, [128, 128], device=device)
33
+ self.mlp_sam = SharedMLP(204 - 6, [128, 128], device=device)
34
+
35
+ def forward(self, feat, mv_feat, pc2pc_idx):
36
+ mv_feat_normal = self.mlp_normal(mv_feat[:, :3, :])
37
+ mv_feat_rgb = self.mlp_rgb(mv_feat[:, 3:6, :])
38
+ mv_feat_sam = self.mlp_sam(mv_feat[:, 6:, :])
39
+
40
+ mv_feat_normal = mv_feat_normal.permute(0, 2, 1)
41
+ mv_feat_rgb = mv_feat_rgb.permute(0, 2, 1)
42
+ mv_feat_sam = mv_feat_sam.permute(0, 2, 1)
43
+ feat = feat.permute(0, 2, 1)
44
+
45
+ for i in range(mv_feat.shape[0]):
46
+ mask = (pc2pc_idx[i] != -1).reshape(-1)
47
+ idx = pc2pc_idx[i][mask].reshape(-1)
48
+ feat[i][mask] += mv_feat_normal[i][idx] + mv_feat_rgb[i][idx] + mv_feat_sam[i][idx]
49
+
50
+ return feat.permute(0, 2, 1)
51
+
52
+
53
+ class PVCNNEncoder(nn.Module):
54
+ def __init__(self, pvcnn_feat_dim, device='cuda', in_channels=3, use_2d_feat=False):
55
+ super(PVCNNEncoder, self).__init__()
56
+ self.device = device
57
+ self.blocks = ((pvcnn_feat_dim, 1, 32), (128, 2, 16), (256, 1, 8))
58
+ self.use_2d_feat=use_2d_feat
59
+ if in_channels == 6:
60
+ self.append_channel = 2
61
+ elif in_channels == 3:
62
+ self.append_channel = 1
63
+ else:
64
+ raise NotImplementedError
65
+ layers, channels_point, concat_channels_point = create_pointnet_components(
66
+ blocks=self.blocks, in_channels=in_channels + self.append_channel, with_se=False, normalize=False,
67
+ width_multiplier=1, voxel_resolution_multiplier=1, scale_pvcnn=True,
68
+ device=device
69
+ )
70
+ self.encoder = nn.ModuleList(layers)#.to(self.device)
71
+ if self.use_2d_feat:
72
+ self.merger = PCMerger()
73
+
74
+
75
+
76
+ def forward(self, input_pc, mv_feat=None, pc2pc_idx=None):
77
+ features = input_pc.permute(0, 2, 1) * 2 # make point cloud [-1, 1]
78
+ coords = features[:, :3, :]
79
+ out_features_list = []
80
+ voxel_feature_list = []
81
+ zero_padding = torch.zeros(features.shape[0], self.append_channel, features.shape[-1], device=features.device, dtype=features.dtype)
82
+ features = torch.cat([features, zero_padding], dim=1) # append zero channels expected by the voxel encoder
83
+
84
+ for i in range(len(self.encoder)):
85
+ features, _, voxel_feature = self.encoder[i]((features, coords))
86
+ if i == 0 and mv_feat is not None:
87
+ features = self.merger(features, mv_feat.permute(0, 2, 1), pc2pc_idx)
88
+ out_features_list.append(features)
89
+ voxel_feature_list.append(voxel_feature)
90
+ return voxel_feature_list, out_features_list
modules/PartField/partfield/model/PVCNN/pv_module/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .pvconv import PVConv
2
+ from .shared_mlp import SharedMLP
modules/PartField/partfield/model/PVCNN/pv_module/ball_query.py ADDED
@@ -0,0 +1,34 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from . import functional as F
5
+
6
+ __all__ = ['BallQuery']
7
+
8
+
9
+ class BallQuery(nn.Module):
10
+ def __init__(self, radius, num_neighbors, include_coordinates=True):
11
+ super().__init__()
12
+ self.radius = radius
13
+ self.num_neighbors = num_neighbors
14
+ self.include_coordinates = include_coordinates
15
+
16
+ def forward(self, points_coords, centers_coords, points_features=None):
17
+ points_coords = points_coords.contiguous()
18
+ centers_coords = centers_coords.contiguous()
19
+ neighbor_indices = F.ball_query(centers_coords, points_coords, self.radius, self.num_neighbors)
20
+ neighbor_coordinates = F.grouping(points_coords, neighbor_indices)
21
+ neighbor_coordinates = neighbor_coordinates - centers_coords.unsqueeze(-1)
22
+
23
+ if points_features is None:
24
+ assert self.include_coordinates, 'No Features For Grouping'
25
+ neighbor_features = neighbor_coordinates
26
+ else:
27
+ neighbor_features = F.grouping(points_features, neighbor_indices)
28
+ if self.include_coordinates:
29
+ neighbor_features = torch.cat([neighbor_coordinates, neighbor_features], dim=1)
30
+ return neighbor_features
31
+
32
+ def extra_repr(self):
33
+ return 'radius={}, num_neighbors={}{}'.format(
34
+ self.radius, self.num_neighbors, ', include coordinates' if self.include_coordinates else '')
modules/PartField/partfield/model/PVCNN/pv_module/frustum.py ADDED
@@ -0,0 +1,141 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ from . import functional as PF
7
+
8
+ __all__ = ['FrustumPointNetLoss', 'get_box_corners_3d']
9
+
10
+
11
+ class FrustumPointNetLoss(nn.Module):
12
+ def __init__(
13
+ self, num_heading_angle_bins, num_size_templates, size_templates, box_loss_weight=1.0,
14
+ corners_loss_weight=10.0, heading_residual_loss_weight=20.0, size_residual_loss_weight=20.0):
15
+ super().__init__()
16
+ self.box_loss_weight = box_loss_weight
17
+ self.corners_loss_weight = corners_loss_weight
18
+ self.heading_residual_loss_weight = heading_residual_loss_weight
19
+ self.size_residual_loss_weight = size_residual_loss_weight
20
+
21
+ self.num_heading_angle_bins = num_heading_angle_bins
22
+ self.num_size_templates = num_size_templates
23
+ self.register_buffer('size_templates', size_templates.view(self.num_size_templates, 3))
24
+ self.register_buffer(
25
+ 'heading_angle_bin_centers', torch.arange(0, 2 * np.pi, 2 * np.pi / self.num_heading_angle_bins)
26
+ )
27
+
28
+ def forward(self, inputs, targets):
29
+ mask_logits = inputs['mask_logits'] # (B, 2, N)
30
+ center_reg = inputs['center_reg'] # (B, 3)
31
+ center = inputs['center'] # (B, 3)
32
+ heading_scores = inputs['heading_scores'] # (B, NH)
33
+ heading_residuals_normalized = inputs['heading_residuals_normalized'] # (B, NH)
34
+ heading_residuals = inputs['heading_residuals'] # (B, NH)
35
+ size_scores = inputs['size_scores'] # (B, NS)
36
+ size_residuals_normalized = inputs['size_residuals_normalized'] # (B, NS, 3)
37
+ size_residuals = inputs['size_residuals'] # (B, NS, 3)
38
+
39
+ mask_logits_target = targets['mask_logits'] # (B, N)
40
+ center_target = targets['center'] # (B, 3)
41
+ heading_bin_id_target = targets['heading_bin_id'] # (B, )
42
+ heading_residual_target = targets['heading_residual'] # (B, )
43
+ size_template_id_target = targets['size_template_id'] # (B, )
44
+ size_residual_target = targets['size_residual'] # (B, 3)
45
+
46
+ batch_size = center.size(0)
47
+ batch_id = torch.arange(batch_size, device=center.device)
48
+
49
+ # Basic Classification and Regression losses
50
+ mask_loss = F.cross_entropy(mask_logits, mask_logits_target)
51
+ heading_loss = F.cross_entropy(heading_scores, heading_bin_id_target)
52
+ size_loss = F.cross_entropy(size_scores, size_template_id_target)
53
+ center_loss = PF.huber_loss(torch.norm(center_target - center, dim=-1), delta=2.0)
54
+ center_reg_loss = PF.huber_loss(torch.norm(center_target - center_reg, dim=-1), delta=1.0)
55
+
56
+ # Refinement losses for size/heading
57
+ heading_residuals_normalized = heading_residuals_normalized[batch_id, heading_bin_id_target] # (B, )
58
+ heading_residual_normalized_target = heading_residual_target / (np.pi / self.num_heading_angle_bins)
59
+ heading_residual_normalized_loss = PF.huber_loss(
60
+ heading_residuals_normalized - heading_residual_normalized_target, delta=1.0
61
+ )
62
+ size_residuals_normalized = size_residuals_normalized[batch_id, size_template_id_target] # (B, 3)
63
+ size_residual_normalized_target = size_residual_target / self.size_templates[size_template_id_target]
64
+ size_residual_normalized_loss = PF.huber_loss(
65
+ torch.norm(size_residual_normalized_target - size_residuals_normalized, dim=-1), delta=1.0
66
+ )
67
+
68
+ # Bounding box losses
69
+ heading = (heading_residuals[batch_id, heading_bin_id_target]
70
+ + self.heading_angle_bin_centers[heading_bin_id_target]) # (B, )
71
+ # Warning: in origin code, size_residuals are added twice (issue #43 and #49 in charlesq34/frustum-pointnets)
72
+ size = (size_residuals[batch_id, size_template_id_target]
73
+ + self.size_templates[size_template_id_target]) # (B, 3)
74
+ corners = get_box_corners_3d(centers=center, headings=heading, sizes=size, with_flip=False) # (B, 3, 8)
75
+ heading_target = self.heading_angle_bin_centers[heading_bin_id_target] + heading_residual_target # (B, )
76
+ size_target = self.size_templates[size_template_id_target] + size_residual_target # (B, 3)
77
+ corners_target, corners_target_flip = get_box_corners_3d(
78
+ centers=center_target, headings=heading_target,
79
+ sizes=size_target, with_flip=True) # (B, 3, 8)
80
+ corners_loss = PF.huber_loss(
81
+ torch.min(
82
+ torch.norm(corners - corners_target, dim=1), torch.norm(corners - corners_target_flip, dim=1)
83
+ ), delta=1.0)
84
+ # Summing up
85
+ loss = mask_loss + self.box_loss_weight * (
86
+ center_loss + center_reg_loss + heading_loss + size_loss
87
+ + self.heading_residual_loss_weight * heading_residual_normalized_loss
88
+ + self.size_residual_loss_weight * size_residual_normalized_loss
89
+ + self.corners_loss_weight * corners_loss
90
+ )
91
+
92
+ return loss
93
+
94
+
95
+ def get_box_corners_3d(centers, headings, sizes, with_flip=False):
96
+ """
97
+ :param centers: coords of box centers, FloatTensor[N, 3]
98
+ :param headings: heading angles, FloatTensor[N, ]
99
+ :param sizes: box sizes, FloatTensor[N, 3]
100
+ :param with_flip: bool, whether to return flipped box (headings + np.pi)
101
+ :return:
102
+ coords of box corners, FloatTensor[N, 3, 8]
103
+ NOTE: corner points are in counter clockwise order, e.g.,
104
+ 2--1
105
+ 3--0 5
106
+ 7--4
107
+ """
108
+ l = sizes[:, 0] # (N,)
109
+ w = sizes[:, 1] # (N,)
110
+ h = sizes[:, 2] # (N,)
111
+ x_corners = torch.stack([l / 2, l / 2, -l / 2, -l / 2, l / 2, l / 2, -l / 2, -l / 2], dim=1) # (N, 8)
112
+ y_corners = torch.stack([h / 2, h / 2, h / 2, h / 2, -h / 2, -h / 2, -h / 2, -h / 2], dim=1) # (N, 8)
113
+ z_corners = torch.stack([w / 2, -w / 2, -w / 2, w / 2, w / 2, -w / 2, -w / 2, w / 2], dim=1) # (N, 8)
114
+
115
+ c = torch.cos(headings) # (N,)
116
+ s = torch.sin(headings) # (N,)
117
+ o = torch.ones_like(headings) # (N,)
118
+ z = torch.zeros_like(headings) # (N,)
119
+
120
+ centers = centers.unsqueeze(-1) # (B, 3, 1)
121
+ corners = torch.stack([x_corners, y_corners, z_corners], dim=1) # (N, 3, 8)
122
+ R = torch.stack([c, z, s, z, o, z, -s, z, c], dim=1).view(-1, 3, 3) # roty matrix: (N, 3, 3)
123
+ if with_flip:
124
+ R_flip = torch.stack([-c, z, -s, z, o, z, s, z, -c], dim=1).view(-1, 3, 3)
125
+ return torch.matmul(R, corners) + centers, torch.matmul(R_flip, corners) + centers
126
+ else:
127
+ return torch.matmul(R, corners) + centers
128
+
129
+ # centers = centers.unsqueeze(1) # (B, 1, 3)
130
+ # corners = torch.stack([x_corners, y_corners, z_corners], dim=-1) # (N, 8, 3)
131
+ # RT = torch.stack([c, z, -s, z, o, z, s, z, c], dim=1).view(-1, 3, 3) # (N, 3, 3)
132
+ # if with_flip:
133
+ # RT_flip = torch.stack([-c, z, s, z, o, z, -s, z, -c], dim=1).view(-1, 3, 3) # (N, 3, 3)
134
+ # return torch.matmul(corners, RT) + centers, torch.matmul(corners, RT_flip) + centers # (N, 8, 3)
135
+ # else:
136
+ # return torch.matmul(corners, RT) + centers # (N, 8, 3)
137
+
138
+ # corners = torch.stack([x_corners, y_corners, z_corners], dim=1) # (N, 3, 8)
139
+ # R = torch.stack([c, z, s, z, o, z, -s, z, c], dim=1).view(-1, 3, 3) # (N, 3, 3)
140
+ # corners = torch.matmul(R, corners) + centers.unsqueeze(2) # (N, 3, 8)
141
+ # corners = corners.transpose(1, 2) # (N, 8, 3)
modules/PartField/partfield/model/PVCNN/pv_module/functional/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .devoxelization import trilinear_devoxelize
modules/PartField/partfield/model/PVCNN/pv_module/functional/devoxelization.py ADDED
@@ -0,0 +1,12 @@
1
+ from torch.autograd import Function
2
+ import torch
3
+ import torch.nn.functional as F
4
+
5
+ __all__ = ['trilinear_devoxelize']
6
+
7
+ def trilinear_devoxelize(c, coords, r, training=None):
8
+ coords = (coords * 2 + 1.0) / r - 1.0
9
+ coords = coords.permute(0, 2, 1).reshape(c.shape[0], 1, 1, -1, 3)
10
+ f = F.grid_sample(input=c, grid=coords, padding_mode='border', align_corners=False)
11
+ f = f.squeeze(dim=2).squeeze(dim=2)
12
+ return f
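A shape sketch for trilinear_devoxelize (not part of the commit): it expects a 5D voxel grid and per-point voxel-space coordinates in [0, r-1], and returns per-point features via F.grid_sample. Dummy tensors:

import torch

B, C, r, P = 2, 8, 32, 1024
voxel_grid = torch.randn(B, C, r, r, r)      # 5D grid expected by grid_sample
coords = torch.rand(B, 3, P) * (r - 1)       # per-point voxel coordinates

point_feats = trilinear_devoxelize(voxel_grid, coords, r)
print(point_feats.shape)  # torch.Size([2, 8, 1024])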
modules/PartField/partfield/model/PVCNN/pv_module/loss.py ADDED
@@ -0,0 +1,10 @@
1
+ import torch.nn as nn
2
+
3
+ from . import functional as F
4
+
5
+ __all__ = ['KLLoss']
6
+
7
+
8
+ class KLLoss(nn.Module):
9
+ def forward(self, x, y):
10
+ return F.kl_loss(x, y)
modules/PartField/partfield/model/PVCNN/pv_module/pointnet.py ADDED
@@ -0,0 +1,113 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from . import functional as F
5
+ from .ball_query import BallQuery
6
+ from .shared_mlp import SharedMLP
7
+
8
+ __all__ = ['PointNetAModule', 'PointNetSAModule', 'PointNetFPModule']
9
+
10
+
11
+ class PointNetAModule(nn.Module):
12
+ def __init__(self, in_channels, out_channels, include_coordinates=True):
13
+ super().__init__()
14
+ if not isinstance(out_channels, (list, tuple)):
15
+ out_channels = [[out_channels]]
16
+ elif not isinstance(out_channels[0], (list, tuple)):
17
+ out_channels = [out_channels]
18
+
19
+ mlps = []
20
+ total_out_channels = 0
21
+ for _out_channels in out_channels:
22
+ mlps.append(
23
+ SharedMLP(
24
+ in_channels=in_channels + (3 if include_coordinates else 0),
25
+ out_channels=_out_channels, dim=1)
26
+ )
27
+ total_out_channels += _out_channels[-1]
28
+
29
+ self.include_coordinates = include_coordinates
30
+ self.out_channels = total_out_channels
31
+ self.mlps = nn.ModuleList(mlps)
32
+
33
+ def forward(self, inputs):
34
+ features, coords = inputs
35
+ if self.include_coordinates:
36
+ features = torch.cat([features, coords], dim=1)
37
+ coords = torch.zeros((coords.size(0), 3, 1), device=coords.device)
38
+ if len(self.mlps) > 1:
39
+ features_list = []
40
+ for mlp in self.mlps:
41
+ features_list.append(mlp(features).max(dim=-1, keepdim=True).values)
42
+ return torch.cat(features_list, dim=1), coords
43
+ else:
44
+ return self.mlps[0](features).max(dim=-1, keepdim=True).values, coords
45
+
46
+ def extra_repr(self):
47
+ return f'out_channels={self.out_channels}, include_coordinates={self.include_coordinates}'
48
+
49
+
50
+ class PointNetSAModule(nn.Module):
51
+ def __init__(self, num_centers, radius, num_neighbors, in_channels, out_channels, include_coordinates=True):
52
+ super().__init__()
53
+ if not isinstance(radius, (list, tuple)):
54
+ radius = [radius]
55
+ if not isinstance(num_neighbors, (list, tuple)):
56
+ num_neighbors = [num_neighbors] * len(radius)
57
+ assert len(radius) == len(num_neighbors)
58
+ if not isinstance(out_channels, (list, tuple)):
59
+ out_channels = [[out_channels]] * len(radius)
60
+ elif not isinstance(out_channels[0], (list, tuple)):
61
+ out_channels = [out_channels] * len(radius)
62
+ assert len(radius) == len(out_channels)
63
+
64
+ groupers, mlps = [], []
65
+ total_out_channels = 0
66
+ for _radius, _out_channels, _num_neighbors in zip(radius, out_channels, num_neighbors):
67
+ groupers.append(
68
+ BallQuery(radius=_radius, num_neighbors=_num_neighbors, include_coordinates=include_coordinates)
69
+ )
70
+ mlps.append(
71
+ SharedMLP(
72
+ in_channels=in_channels + (3 if include_coordinates else 0),
73
+ out_channels=_out_channels, dim=2)
74
+ )
75
+ total_out_channels += _out_channels[-1]
76
+
77
+ self.num_centers = num_centers
78
+ self.out_channels = total_out_channels
79
+ self.groupers = nn.ModuleList(groupers)
80
+ self.mlps = nn.ModuleList(mlps)
81
+
82
+ def forward(self, inputs):
83
+ features, coords = inputs
84
+ centers_coords = F.furthest_point_sample(coords, self.num_centers)
85
+ features_list = []
86
+ for grouper, mlp in zip(self.groupers, self.mlps):
87
+ features_list.append(mlp(grouper(coords, centers_coords, features)).max(dim=-1).values)
88
+ if len(features_list) > 1:
89
+ return torch.cat(features_list, dim=1), centers_coords
90
+ else:
91
+ return features_list[0], centers_coords
92
+
93
+ def extra_repr(self):
94
+ return f'num_centers={self.num_centers}, out_channels={self.out_channels}'
95
+
96
+
97
+ class PointNetFPModule(nn.Module):
98
+ def __init__(self, in_channels, out_channels):
99
+ super().__init__()
100
+ self.mlp = SharedMLP(in_channels=in_channels, out_channels=out_channels, dim=1)
101
+
102
+ def forward(self, inputs):
103
+ if len(inputs) == 3:
104
+ points_coords, centers_coords, centers_features = inputs
105
+ points_features = None
106
+ else:
107
+ points_coords, centers_coords, centers_features, points_features = inputs
108
+ interpolated_features = F.nearest_neighbor_interpolate(points_coords, centers_coords, centers_features)
109
+ if points_features is not None:
110
+ interpolated_features = torch.cat(
111
+ [interpolated_features, points_features], dim=1
112
+ )
113
+ return self.mlp(interpolated_features), points_coords
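A minimal usage sketch for PointNetAModule (dummy tensors; a GPU is assumed because SharedMLP in this repo builds its layers on 'cuda' by default):

import torch

B, C, N = 2, 16, 1024
module = PointNetAModule(in_channels=C, out_channels=[32, 64])   # one shared MLP: 19 -> 32 -> 64
features = torch.randn(B, C, N, device='cuda')
coords = torch.randn(B, 3, N, device='cuda')

global_feat, global_coords = module((features, coords))
print(global_feat.shape, global_coords.shape)  # (2, 64, 1) and (2, 3, 1)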
modules/PartField/partfield/model/PVCNN/pv_module/pvconv.py ADDED
@@ -0,0 +1,38 @@
1
+ import torch.nn as nn
2
+
3
+ from . import functional as F
4
+ from .voxelization import Voxelization
5
+ from .shared_mlp import SharedMLP
6
+ import torch
7
+
8
+ __all__ = ['PVConv']
9
+
10
+
11
+ class PVConv(nn.Module):
12
+ def __init__(
13
+ self, in_channels, out_channels, kernel_size, resolution, with_se=False, normalize=True, eps=0, scale_pvcnn=False,
14
+ device='cuda'):
15
+ super().__init__()
16
+ self.in_channels = in_channels
17
+ self.out_channels = out_channels
18
+ self.kernel_size = kernel_size
19
+ self.resolution = resolution
20
+ self.voxelization = Voxelization(resolution, normalize=normalize, eps=eps, scale_pvcnn=scale_pvcnn)
21
+ voxel_layers = [
22
+ nn.Conv3d(in_channels, out_channels, kernel_size, stride=1, padding=kernel_size // 2, device=device),
23
+ nn.InstanceNorm3d(out_channels, eps=1e-4, device=device),
24
+ nn.LeakyReLU(0.1, True),
25
+ nn.Conv3d(out_channels, out_channels, kernel_size, stride=1, padding=kernel_size // 2, device=device),
26
+ nn.InstanceNorm3d(out_channels, eps=1e-4, device=device),
27
+ nn.LeakyReLU(0.1, True),
28
+ ]
29
+ self.voxel_layers = nn.Sequential(*voxel_layers)
30
+ self.point_features = SharedMLP(in_channels, out_channels, device=device)
31
+
32
+ def forward(self, inputs):
33
+ features, coords = inputs
34
+ voxel_features, voxel_coords = self.voxelization(features, coords)
35
+ voxel_features = self.voxel_layers(voxel_features)
36
+ devoxel_features = F.trilinear_devoxelize(voxel_features, voxel_coords, self.resolution, self.training)
37
+ fused_features = devoxel_features + self.point_features(features)
38
+ return fused_features, coords, voxel_features
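A shape sketch for PVConv (dummy tensors, not part of the commit). Voxelization asserts `not normalize`, so normalize=False is passed and point coordinates are assumed to lie in [-1, 1]; layers are built on 'cuda' by default:

import torch

B, C, N, R = 2, 16, 1024, 32
conv = PVConv(in_channels=C, out_channels=32, kernel_size=3, resolution=R, normalize=False)
features = torch.randn(B, C, N, device='cuda')
coords = torch.rand(B, 3, N, device='cuda') * 2 - 1   # points in [-1, 1]

fused, out_coords, voxel_feat = conv((features, coords))
print(fused.shape, voxel_feat.shape)  # (2, 32, 1024) and (2, 32, 32, 32, 32)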
modules/PartField/partfield/model/PVCNN/pv_module/shared_mlp.py ADDED
@@ -0,0 +1,35 @@
1
+ import torch.nn as nn
2
+
3
+ __all__ = ['SharedMLP']
4
+
5
+
6
+ class SharedMLP(nn.Module):
7
+ def __init__(self, in_channels, out_channels, dim=1, device='cuda'):
8
+ super().__init__()
9
+ # print('==> SharedMLP device: ', device)
10
+ if dim == 1:
11
+ conv = nn.Conv1d
12
+ bn = nn.InstanceNorm1d
13
+ elif dim == 2:
14
+ conv = nn.Conv2d
15
+ bn = nn.InstanceNorm2d
16
+ else:
17
+ raise ValueError
18
+ if not isinstance(out_channels, (list, tuple)):
19
+ out_channels = [out_channels]
20
+ layers = []
21
+ for oc in out_channels:
22
+ layers.extend(
23
+ [
24
+ conv(in_channels, oc, 1, device=device),
25
+ bn(oc, device=device),
26
+ nn.ReLU(True),
27
+ ])
28
+ in_channels = oc
29
+ self.layers = nn.Sequential(*layers)
30
+
31
+ def forward(self, inputs):
32
+ if isinstance(inputs, (list, tuple)):
33
+ return (self.layers(inputs[0]), *inputs[1:])
34
+ else:
35
+ return self.layers(inputs)
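SharedMLP is just a stack of 1x1 convolutions applied point-wise. A quick sketch (dummy tensors; GPU assumed since device defaults to 'cuda'):

import torch

mlp = SharedMLP(in_channels=16, out_channels=[32, 64], dim=1)
x = torch.randn(4, 16, 2048, device='cuda')   # (batch, channels, points)
y = mlp(x)
print(y.shape)  # torch.Size([4, 64, 2048])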
modules/PartField/partfield/model/PVCNN/pv_module/voxelization.py ADDED
@@ -0,0 +1,80 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from . import functional as F
5
+
6
+ __all__ = ['Voxelization']
7
+
8
+
9
+ def my_voxelization(features, coords, resolution):
10
+ b, c, _ = features.shape
11
+ result = torch.zeros(b, c + 1, resolution * resolution * resolution, device=features.device, dtype=features.dtype)
12
+ r = resolution
13
+ r2 = resolution * resolution
14
+ coords = coords.long()
15
+ indices = coords[:, 0] * r2 + coords[:, 1] * r + coords[:, 2]
16
+
17
+ # print(r, r2, coords[:, 0].max(), coords[:, 1].max(), coords[:, 2].max())
18
+
19
+ # print(f"Resolution: {resolution}")
20
+ # print(f"Coords shape: {coords.shape}")
21
+ # print(f"Coords max per dim: x={coords[:, 0].max()}, y={coords[:, 1].max()}, z={coords[:, 2].max()}")
22
+ # print(f"Coords min per dim: x={coords[:, 0].min()}, y={coords[:, 1].min()}, z={coords[:, 2].min()}")
23
+ # print(f"Indices shape: {indices.shape}")
24
+ # print(f"Indices max: {indices.max()}, min: {indices.min()}")
25
+ # print(f"Expected max index: {resolution * resolution * resolution - 1}")
26
+
27
+ # # Check whether any indices are out of range
28
+ # max_valid_index = resolution * resolution * resolution - 1
29
+ # invalid_mask = (indices > max_valid_index) | (indices < 0)
30
+ # if invalid_mask.any():
31
+ # print(f"Found {invalid_mask.sum()} invalid indices!")
32
+ # print(f"Invalid indices: {indices[invalid_mask]}")
33
+ # # Find the corresponding coordinates
34
+ # invalid_coords = coords[:, :, invalid_mask.any(dim=0)]
35
+ # print(f"Invalid coords shape: {invalid_coords.shape}")
36
+ # if invalid_coords.numel() > 0:
37
+ # print(f"Sample invalid coords: {invalid_coords[:, :, :5]}") # show the first 5 invalid coords
38
+
39
+ indices = indices.unsqueeze(dim=1).expand(-1, result.shape[1], -1)
40
+ features = torch.cat([features, torch.ones(features.shape[0], 1, features.shape[2], device=features.device, dtype=features.dtype)], dim=1)
41
+ out_feature = result.scatter_(index=indices.long(), src=features, dim=2, reduce='add')
42
+ cnt = out_feature[:, -1:, :]
43
+ zero_mask = (cnt == 0).to(features.dtype)
44
+ cnt = cnt * (1 - zero_mask) + zero_mask * 1e-5
45
+ vox_feature = out_feature[:, :-1, :] / cnt
46
+ return vox_feature.view(b, c, resolution, resolution, resolution)
47
+
48
+ class Voxelization(nn.Module):
49
+ def __init__(self, resolution, normalize=True, eps=0, scale_pvcnn=False):
50
+ super().__init__()
51
+ self.r = int(resolution)
52
+ self.normalize = normalize
53
+ self.eps = eps
54
+ self.scale_pvcnn = scale_pvcnn
55
+ assert not normalize
56
+
57
+ def forward(self, features, coords):
58
+ # import pdb; pdb.set_trace()
59
+ with torch.no_grad():
60
+ coords = coords.detach()
61
+
62
+ if self.normalize:
63
+ norm_coords = coords / (coords.norm(dim=1, keepdim=True).max(dim=2, keepdim=True).values * 2.0 + self.eps) + 0.5
64
+ else:
65
+ if self.scale_pvcnn:
66
+ norm_coords = (coords + 1) / 2.0 # [0, 1]
67
+ # print(norm_coords.shape, norm_coords.max(), norm_coords.min())
68
+ else:
69
+ # norm_coords = (norm_coords + 1) / 2.0
70
+ norm_coords = (coords + 1) / 2.0
71
+ norm_coords = torch.clamp(norm_coords * self.r, 0, self.r - 1)
72
+ # print(norm_coords.shape, norm_coords.max(), norm_coords.min())
73
+ vox_coords = torch.round(norm_coords)
74
+ # print(vox_coords.shape, vox_coords.max(), vox_coords.min())
75
+ # print(features.shape)
76
+ new_vox_feat = my_voxelization(features, vox_coords, self.r)
77
+ return new_vox_feat, norm_coords
78
+
79
+ def extra_repr(self):
80
+ return 'resolution={}{}'.format(self.r, ', normalized eps = {}'.format(self.eps) if self.normalize else '')
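A round-trip sketch (dummy tensors, not part of the commit): voxelize point features with Voxelization, then sample them back with trilinear_devoxelize from the functional module above. Coordinates are assumed to lie in [-1, 1], and normalize must be False per the assert:

import torch
# assuming: from .functional import trilinear_devoxelize

B, C, N, R = 2, 8, 4096, 32
vox = Voxelization(resolution=R, normalize=False, scale_pvcnn=False)
features = torch.randn(B, C, N)
coords = torch.rand(B, 3, N) * 2 - 1

voxel_feat, norm_coords = vox(features, coords)                 # (2, 8, 32, 32, 32), coords in [0, R-1]
point_feat = trilinear_devoxelize(voxel_feat, norm_coords, R)   # (2, 8, 4096)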
modules/PartField/partfield/model/PVCNN/unet_3daware.py ADDED
@@ -0,0 +1,427 @@
1
+ import numpy as np
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from torch.nn import init
7
+
8
+ import einops
9
+
10
+ def conv3x3(in_channels, out_channels, stride=1,
11
+ padding=1, bias=True, groups=1):
12
+ return nn.Conv2d(
13
+ in_channels,
14
+ out_channels,
15
+ kernel_size=3,
16
+ stride=stride,
17
+ padding=padding,
18
+ bias=bias,
19
+ groups=groups)
20
+
21
+ def upconv2x2(in_channels, out_channels, mode='transpose'):
22
+ if mode == 'transpose':
23
+ return nn.ConvTranspose2d(
24
+ in_channels,
25
+ out_channels,
26
+ kernel_size=2,
27
+ stride=2)
28
+ else:
29
+ # out_channels is always going to be the same
30
+ # as in_channels
31
+ return nn.Sequential(
32
+ nn.Upsample(mode='bilinear', scale_factor=2),
33
+ conv1x1(in_channels, out_channels))
34
+
35
+ def conv1x1(in_channels, out_channels, groups=1):
36
+ return nn.Conv2d(
37
+ in_channels,
38
+ out_channels,
39
+ kernel_size=1,
40
+ groups=groups,
41
+ stride=1)
42
+
43
+ class ConvTriplane3dAware(nn.Module):
44
+ """ 3D aware triplane conv (as described in RODIN) """
45
+ def __init__(self, internal_conv_f, in_channels, out_channels, order='xz'):
46
+ """
47
+ Args:
48
+ internal_conv_f: function that should return a 2D convolution Module
49
+ given in and out channels
50
+ order: if triplane input is in 'xz' order
51
+ """
52
+ super(ConvTriplane3dAware, self).__init__()
53
+ # Need 3 separate convolutions
54
+ self.in_channels = in_channels
55
+ self.out_channels = out_channels
56
+ assert order in ['xz', 'zx']
57
+ self.order = order
58
+ # Going to stack from other planes
59
+ self.plane_convs = nn.ModuleList([
60
+ internal_conv_f(3*self.in_channels, self.out_channels) for _ in range(3)])
61
+
62
+ def forward(self, triplanes_list):
63
+ """
64
+ Args:
65
+ triplanes_list: [(B,Ci,H,W)]*3 in xy,yz,(zx or xz) depending on order
66
+ Returns:
67
+ out_triplanes_list: [(B,Co,H,W)]*3 in xy,yz,(zx or xz) depending on order
68
+ """
69
+ inps = list(triplanes_list)
70
+ xp = 1 #(yz)
71
+ yp = 2 #(zx)
72
+ zp = 0 #(xy)
73
+
74
+ if self.order == 'xz':
75
+ # get into zx order
76
+ inps[yp] = einops.rearrange(inps[yp], 'b c x z -> b c z x')
77
+
78
+
79
+ oplanes = [None]*3
80
+ # order shouldn't matter
81
+ for iplane in [zp, xp, yp]:
82
+ # i_plane -> (j,k)
83
+
84
+ # need to average out i and convert to (j,k)
85
+ # j_plane -> (k,i)
86
+ # k_plane -> (i,j)
87
+ jplane = (iplane+1)%3
88
+ kplane = (iplane+2)%3
89
+
90
+ ifeat = inps[iplane]
91
+ # need to average out nonshared dim
92
+ # Average pool across
93
+
94
+ # j_plane -> (k,i) -> (k,1) -> (1,k) -> (j,k)
95
+ # b c k i -> b c k 1
96
+ jpool = torch.mean(inps[jplane], dim=3 ,keepdim=True)
97
+ jpool = einops.rearrange(jpool, 'b c k 1 -> b c 1 k')
98
+ jpool = einops.repeat(jpool, 'b c 1 k -> b c j k', j=ifeat.size(2))
99
+
100
+ # k_plane -> (i,j) -> (1,j) -> (j,1) -> (j,k)
101
+ # b c i j -> b c 1 j
102
+ kpool = torch.mean(inps[kplane], dim=2 ,keepdim=True)
103
+ kpool = einops.rearrange(kpool, 'b c 1 j -> b c j 1')
104
+ kpool = einops.repeat(kpool, 'b c j 1 -> b c j k', k=ifeat.size(3))
105
+
106
+ # b c h w
107
+ # jpool = jpool.expand_as(ifeat)
108
+ # kpool = kpool.expand_as(ifeat)
109
+
110
+ # concat and conv on feature dim
111
+ catfeat = torch.cat([ifeat, jpool, kpool], dim=1)
112
+ oplane = self.plane_convs[iplane](catfeat)
113
+ oplanes[iplane] = oplane
114
+
115
+ if self.order == 'xz':
116
+ # get back into xz order
117
+ oplanes[yp] = einops.rearrange(oplanes[yp], 'b c z x -> b c x z')
118
+
119
+ return oplanes
120
+
121
+ def roll_triplanes(triplanes_list):
122
+ # B, C, tri, h, w
123
+ tristack = torch.stack((triplanes_list),dim=2)
124
+ return einops.rearrange(tristack, 'b c tri h w -> b c (tri h) w', tri=3)
125
+
126
+ def unroll_triplanes(rolled_triplane):
127
+ # B, C, tri*h, w
128
+ tristack = einops.rearrange(rolled_triplane, 'b c (tri h) w -> b c tri h w', tri=3)
129
+ return torch.unbind(tristack, dim=2)
130
+
131
+ def conv1x1triplane3daware(in_channels, out_channels, order='xz', **kwargs):
132
+ return ConvTriplane3dAware(lambda inp, out: conv1x1(inp,out,**kwargs),
133
+ in_channels, out_channels,order=order)
134
+
135
+ def Normalize(in_channels, num_groups=32):
136
+ num_groups = min(in_channels, num_groups) # avoid error if in_channels < 32
137
+ return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
138
+
139
+ def nonlinearity(x):
140
+ # return F.relu(x)
141
+ # Swish
142
+ return x*torch.sigmoid(x)
143
+
144
+ class Upsample(nn.Module):
145
+ def __init__(self, in_channels, with_conv):
146
+ super().__init__()
147
+ self.with_conv = with_conv
148
+ if self.with_conv:
149
+ self.conv = torch.nn.Conv2d(in_channels,
150
+ in_channels,
151
+ kernel_size=3,
152
+ stride=1,
153
+ padding=1)
154
+
155
+ def forward(self, x):
156
+ x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
157
+ if self.with_conv:
158
+ x = self.conv(x)
159
+ return x
160
+
161
+ class Downsample(nn.Module):
162
+ def __init__(self, in_channels, with_conv):
163
+ super().__init__()
164
+ self.with_conv = with_conv
165
+ if self.with_conv:
166
+ # no asymmetric padding in torch conv, must do it ourselves
167
+ self.conv = torch.nn.Conv2d(in_channels,
168
+ in_channels,
169
+ kernel_size=3,
170
+ stride=2,
171
+ padding=0)
172
+
173
+ def forward(self, x):
174
+ if self.with_conv:
175
+ pad = (0,1,0,1)
176
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
177
+ x = self.conv(x)
178
+ else:
179
+ x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
180
+ return x
181
+
182
+ class ResnetBlock3dAware(nn.Module):
183
+ def __init__(self, in_channels, out_channels=None):
184
+ #, conv_shortcut=False):
185
+ super().__init__()
186
+ self.in_channels = in_channels
187
+ out_channels = in_channels if out_channels is None else out_channels
188
+ self.out_channels = out_channels
189
+ # self.use_conv_shortcut = conv_shortcut
190
+
191
+ self.norm1 = Normalize(in_channels)
192
+ self.conv1 = conv3x3(self.in_channels, self.out_channels)
193
+
194
+ self.norm_mid = Normalize(out_channels)
195
+ self.conv_3daware = conv1x1triplane3daware(self.out_channels, self.out_channels)
196
+
197
+ self.norm2 = Normalize(out_channels)
198
+ self.conv2 = conv3x3(self.out_channels, self.out_channels)
199
+
200
+ if self.in_channels != self.out_channels:
201
+ self.nin_shortcut = torch.nn.Conv2d(in_channels,
202
+ out_channels,
203
+ kernel_size=1,
204
+ stride=1,
205
+ padding=0)
206
+
207
+ def forward(self, x):
208
+ # 3x3 plane comm
209
+ h = x
210
+ h = self.norm1(h)
211
+ h = nonlinearity(h)
212
+ h = self.conv1(h)
213
+
214
+ # 1x1 3d aware, crossplane comm
215
+ h = self.norm_mid(h)
216
+ h = nonlinearity(h)
217
+ h = unroll_triplanes(h)
218
+ h = self.conv_3daware(h)
219
+ h = roll_triplanes(h)
220
+
221
+ # 3x3 plane comm
222
+ h = self.norm2(h)
223
+ h = nonlinearity(h)
224
+ h = self.conv2(h)
225
+
226
+ if self.in_channels != self.out_channels:
227
+ x = self.nin_shortcut(x)
228
+
229
+ return x+h
230
+
231
+ class DownConv3dAware(nn.Module):
232
+ """
233
+ A helper Module that performs 2 convolutions and 1 MaxPool.
234
+ A ReLU activation follows each convolution.
235
+ """
236
+ def __init__(self, in_channels, out_channels, downsample=True, with_conv=False):
237
+ super(DownConv3dAware, self).__init__()
238
+
239
+ self.in_channels = in_channels
240
+ self.out_channels = out_channels
241
+
242
+ self.block = ResnetBlock3dAware(in_channels=in_channels,
243
+ out_channels=out_channels)
244
+
245
+ self.do_downsample = downsample
246
+ self.downsample = Downsample(out_channels, with_conv=with_conv)
247
+
248
+ def forward(self, x):
249
+ """
250
+ rolled input, rolled output
251
+ Args:
252
+ x: rolled (b c (tri*h) w)
253
+ """
254
+ x = self.block(x)
255
+ before_pool = x
256
+ # if self.pooling:
257
+ # x = self.pool(x)
258
+ if self.do_downsample:
259
+ # unroll and cat channel-wise (to prevent pooling across triplane boundaries)
260
+ x = einops.rearrange(x, 'b c (tri h) w -> b (c tri) h w', tri=3)
261
+ x = self.downsample(x)
262
+ # undo
263
+ x = einops.rearrange(x, 'b (c tri) h w -> b c (tri h) w', tri=3)
264
+ return x, before_pool
265
+
266
+ class UpConv3dAware(nn.Module):
267
+ """
268
+ A helper Module that performs 2 convolutions and 1 UpConvolution.
269
+ A ReLU activation follows each convolution.
270
+ """
271
+ def __init__(self, in_channels, out_channels,
272
+ merge_mode='concat', with_conv=False): #up_mode='transpose', ):
273
+ super(UpConv3dAware, self).__init__()
274
+
275
+ self.in_channels = in_channels
276
+ self.out_channels = out_channels
277
+ self.merge_mode = merge_mode
278
+
279
+ self.upsample = Upsample(in_channels, with_conv)
280
+
281
+ if self.merge_mode == 'concat':
282
+ self.norm1 = Normalize(in_channels+out_channels)
283
+ self.block = ResnetBlock3dAware(in_channels=in_channels+out_channels,
284
+ out_channels=out_channels)
285
+ else:
286
+ self.norm1 = Normalize(in_channels)
287
+ self.block = ResnetBlock3dAware(in_channels=in_channels,
288
+ out_channels=out_channels)
289
+
290
+
291
+ def forward(self, from_down, from_up):
292
+ """ Forward pass
293
+ rolled inputs, rolled output
294
+ rolled (b c (tri*h) w)
295
+ Arguments:
296
+ from_down: tensor from the encoder pathway
297
+ from_up: upconv'd tensor from the decoder pathway
298
+ """
299
+ # from_up = self.upconv(from_up)
300
+ from_up = self.upsample(from_up)
301
+ if self.merge_mode == 'concat':
302
+ x = torch.cat((from_up, from_down), 1)
303
+ else:
304
+ x = from_up + from_down
305
+
306
+ x = self.norm1(x)
307
+ x = self.block(x)
308
+ return x
309
+
310
+ class UNetTriplane3dAware(nn.Module):
311
+ def __init__(self, out_channels, in_channels=3, depth=5,
312
+ start_filts=64,# up_mode='transpose',
313
+ use_initial_conv=False,
314
+ merge_mode='concat', **kwargs):
315
+ """
316
+ Arguments:
317
+ in_channels: int, number of channels in the input tensor.
318
+ Default is 3 for RGB images.
319
+ depth: int, number of MaxPools in the U-Net.
320
+ start_filts: int, number of convolutional filters for the
321
+ first conv.
322
+ """
323
+ super(UNetTriplane3dAware, self).__init__()
324
+
325
+
326
+ self.out_channels = out_channels
327
+ self.in_channels = in_channels
328
+ self.start_filts = start_filts
329
+ self.depth = depth
330
+
331
+ self.use_initial_conv = use_initial_conv
332
+ if use_initial_conv:
333
+ self.conv_initial = conv1x1(self.in_channels, self.start_filts)
334
+
335
+ self.down_convs = []
336
+ self.up_convs = []
337
+
338
+ # create the encoder pathway and add to a list
339
+ for i in range(depth):
340
+ if i == 0:
341
+ ins = self.start_filts if use_initial_conv else self.in_channels
342
+ else:
343
+ ins = outs
344
+ outs = self.start_filts*(2**i)
345
+ downsamp_it = True if i < depth-1 else False
346
+
347
+ down_conv = DownConv3dAware(ins, outs, downsample = downsamp_it)
348
+ self.down_convs.append(down_conv)
349
+
350
+ for i in range(depth-1):
351
+ ins = outs
352
+ outs = ins // 2
353
+ up_conv = UpConv3dAware(ins, outs,
354
+ merge_mode=merge_mode)
355
+ self.up_convs.append(up_conv)
356
+
357
+ # add the list of modules to current module
358
+ self.down_convs = nn.ModuleList(self.down_convs)
359
+ self.up_convs = nn.ModuleList(self.up_convs)
360
+
361
+ self.norm_out = Normalize(outs)
362
+ self.conv_final = conv1x1(outs, self.out_channels)
363
+
364
+ self.reset_params()
365
+
366
+ @staticmethod
367
+ def weight_init(m):
368
+ if isinstance(m, nn.Conv2d):
369
+ # init.xavier_normal_(m.weight, gain=0.1)
370
+ init.xavier_normal_(m.weight)
371
+ init.constant_(m.bias, 0)
372
+
373
+
374
+ def reset_params(self):
375
+ for i, m in enumerate(self.modules()):
376
+ self.weight_init(m)
377
+
378
+
379
+ def forward(self, x):
380
+ """
381
+ Args:
382
+ x: Stacked triplane expected to be in (B,3,C,H,W)
383
+ """
384
+ # Roll
385
+ x = einops.rearrange(x, 'b tri c h w -> b c (tri h) w', tri=3)
386
+
387
+ if self.use_initial_conv:
388
+ x = self.conv_initial(x)
389
+
390
+ encoder_outs = []
391
+ # encoder pathway, save outputs for merging
392
+ for i, module in enumerate(self.down_convs):
393
+ x, before_pool = module(x)
394
+ encoder_outs.append(before_pool)
395
+
396
+ # Spend a block in the middle
397
+ # x = self.block_mid(x)
398
+
399
+ for i, module in enumerate(self.up_convs):
400
+ before_pool = encoder_outs[-(i+2)]
401
+ x = module(before_pool, x)
402
+
403
+ x = self.norm_out(x)
404
+
405
+ # No softmax is used. This means you need to use
406
+ # nn.CrossEntropyLoss in your training script,
407
+ # since that loss applies the softmax itself.
408
+ x = self.conv_final(nonlinearity(x))
409
+
410
+ # Unroll
411
+ x = einops.rearrange(x, 'b c (tri h) w -> b tri c h w', tri=3)
412
+ return x
413
+
414
+
415
+ def setup_unet(output_channels, input_channels, unet_cfg):
416
+ if unet_cfg['use_3d_aware']:
417
+ assert(unet_cfg['rolled'])
418
+ unet = UNetTriplane3dAware(
419
+ out_channels=output_channels,
420
+ in_channels=input_channels,
421
+ depth=unet_cfg['depth'],
422
+ use_initial_conv=unet_cfg['use_initial_conv'],
423
+ start_filts=unet_cfg['start_hidden_channels'],)
424
+ else:
425
+ raise NotImplementedError
426
+ return unet
427
+
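A sketch of building the 3D-aware triplane U-Net through setup_unet (config keys taken from the code above; the shapes are dummy values chosen for illustration):

import torch

unet_cfg = {
    'use_3d_aware': True,
    'rolled': True,
    'depth': 3,
    'use_initial_conv': False,
    'start_hidden_channels': 32,
}
unet = setup_unet(output_channels=64, input_channels=16, unet_cfg=unet_cfg)

x = torch.randn(1, 3, 16, 64, 64)   # stacked triplanes: (B, 3, C, H, W)
y = unet(x)
print(y.shape)  # torch.Size([1, 3, 64, 64, 64])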
modules/PartField/partfield/model/UNet/buildingblocks.py ADDED
@@ -0,0 +1,546 @@
1
+ #https://github.com/wolny/pytorch-3dunet/blob/master/pytorch3dunet/unet3d/buildingblocks.py
2
+ # MIT License
3
+
4
+ # Copyright (c) 2018 Adrian Wolny
5
+
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+
13
+ # The above copyright notice and this permission notice shall be included in all
14
+ # copies or substantial portions of the Software.
15
+
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ # SOFTWARE.
23
+
24
+ from functools import partial
25
+
26
+ import torch
27
+ from torch import nn as nn
28
+ from torch.nn import functional as F
29
+
30
+ # from pytorch3dunet.unet3d.se import ChannelSELayer3D, ChannelSpatialSELayer3D, SpatialSELayer3D
31
+
32
+
33
+ def create_conv(in_channels, out_channels, kernel_size, order, num_groups, padding,
34
+ dropout_prob, is3d):
35
+ """
36
+ Create a list of modules that together constitute a single conv layer with non-linearity
37
+ and optional batchnorm/groupnorm.
38
+
39
+ Args:
40
+ in_channels (int): number of input channels
41
+ out_channels (int): number of output channels
42
+ kernel_size(int or tuple): size of the convolving kernel
43
+ order (string): order of things, e.g.
44
+ 'cr' -> conv + ReLU
45
+ 'gcr' -> groupnorm + conv + ReLU
46
+ 'cl' -> conv + LeakyReLU
47
+ 'ce' -> conv + ELU
48
+ 'bcr' -> batchnorm + conv + ReLU
49
+ 'cbrd' -> conv + batchnorm + ReLU + dropout
50
+ 'cbrD' -> conv + batchnorm + ReLU + dropout2d
51
+ num_groups (int): number of groups for the GroupNorm
52
+ padding (int or tuple): add zero-padding added to all three sides of the input
53
+ dropout_prob (float): dropout probability
54
+ is3d (bool): if True use Conv3d, otherwise use Conv2d
55
+ Return:
56
+ list of tuple (name, module)
57
+ """
58
+ assert 'c' in order, "Conv layer MUST be present"
59
+ assert order[0] not in 'rle', 'Non-linearity cannot be the first operation in the layer'
60
+
61
+ modules = []
62
+ for i, char in enumerate(order):
63
+ if char == 'r':
64
+ modules.append(('ReLU', nn.ReLU(inplace=True)))
65
+ elif char == 'l':
66
+ modules.append(('LeakyReLU', nn.LeakyReLU(inplace=True)))
67
+ elif char == 'e':
68
+ modules.append(('ELU', nn.ELU(inplace=True)))
69
+ elif char == 'c':
70
+ # add learnable bias only in the absence of batchnorm/groupnorm
71
+ bias = not ('g' in order or 'b' in order)
72
+ if is3d:
73
+ conv = nn.Conv3d(in_channels, out_channels, kernel_size, padding=padding, bias=bias)
74
+ else:
75
+ conv = nn.Conv2d(in_channels, out_channels, kernel_size, padding=padding, bias=bias)
76
+
77
+ modules.append(('conv', conv))
78
+ elif char == 'g':
79
+ is_before_conv = i < order.index('c')
80
+ if is_before_conv:
81
+ num_channels = in_channels
82
+ else:
83
+ num_channels = out_channels
84
+
85
+ # use only one group if the given number of groups is greater than the number of channels
86
+ if num_channels < num_groups:
87
+ num_groups = 1
88
+
89
+ assert num_channels % num_groups == 0, f'Expected number of channels in input to be divisible by num_groups. num_channels={num_channels}, num_groups={num_groups}'
90
+ modules.append(('groupnorm', nn.GroupNorm(num_groups=num_groups, num_channels=num_channels)))
91
+ elif char == 'b':
92
+ is_before_conv = i < order.index('c')
93
+ if is3d:
94
+ bn = nn.BatchNorm3d
95
+ else:
96
+ bn = nn.BatchNorm2d
97
+
98
+ if is_before_conv:
99
+ modules.append(('batchnorm', bn(in_channels)))
100
+ else:
101
+ modules.append(('batchnorm', bn(out_channels)))
102
+ elif char == 'd':
103
+ modules.append(('dropout', nn.Dropout(p=dropout_prob)))
104
+ elif char == 'D':
105
+ modules.append(('dropout2d', nn.Dropout2d(p=dropout_prob)))
106
+ else:
107
+ raise ValueError(f"Unsupported layer type '{char}'. MUST be one of ['b', 'g', 'r', 'l', 'e', 'c', 'd', 'D']")
108
+
109
+ return modules
110
+
111
+
112
+ class SingleConv(nn.Sequential):
113
+ """
114
+ Basic convolutional module consisting of a Conv3d, non-linearity and optional batchnorm/groupnorm. The order
115
+ of operations can be specified via the `order` parameter
116
+
117
+ Args:
118
+ in_channels (int): number of input channels
119
+ out_channels (int): number of output channels
120
+ kernel_size (int or tuple): size of the convolving kernel
121
+ order (string): determines the order of layers, e.g.
122
+ 'cr' -> conv + ReLU
123
+ 'crg' -> conv + ReLU + groupnorm
124
+ 'cl' -> conv + LeakyReLU
125
+ 'ce' -> conv + ELU
126
+ num_groups (int): number of groups for the GroupNorm
127
+ padding (int or tuple): add zero-padding
128
+ dropout_prob (float): dropout probability, default 0.1
129
+ is3d (bool): if True use Conv3d, otherwise use Conv2d
130
+ """
131
+
132
+ def __init__(self, in_channels, out_channels, kernel_size=3, order='gcr', num_groups=8,
133
+ padding=1, dropout_prob=0.1, is3d=True):
134
+ super(SingleConv, self).__init__()
135
+
136
+ for name, module in create_conv(in_channels, out_channels, kernel_size, order,
137
+ num_groups, padding, dropout_prob, is3d):
138
+ self.add_module(name, module)
139
+
140
+
141
+ class DoubleConv(nn.Sequential):
142
+ """
143
+ A module consisting of two consecutive convolution layers (e.g. BatchNorm3d+ReLU+Conv3d).
144
+ We use (Conv3d+ReLU+GroupNorm3d) by default.
145
+ This can be changed however by providing the 'order' argument, e.g. in order
146
+ to change to Conv3d+BatchNorm3d+ELU use order='cbe'.
147
+ Use padded convolutions to make sure that the output (H_out, W_out) is the same
148
+ as (H_in, W_in), so that you don't have to crop in the decoder path.
149
+
150
+ Args:
151
+ in_channels (int): number of input channels
152
+ out_channels (int): number of output channels
153
+ encoder (bool): if True we're in the encoder path, otherwise we're in the decoder
154
+ kernel_size (int or tuple): size of the convolving kernel
155
+ order (string): determines the order of layers, e.g.
156
+ 'cr' -> conv + ReLU
157
+ 'crg' -> conv + ReLU + groupnorm
158
+ 'cl' -> conv + LeakyReLU
159
+ 'ce' -> conv + ELU
160
+ num_groups (int): number of groups for the GroupNorm
161
+ padding (int or tuple): add zero-padding added to all three sides of the input
162
+ upscale (int): number of the convolution to upscale in encoder if DoubleConv, default: 2
163
+ dropout_prob (float or tuple): dropout probability for each convolution, default 0.1
164
+ is3d (bool): if True use Conv3d instead of Conv2d layers
165
+ """
166
+
167
+ def __init__(self, in_channels, out_channels, encoder, kernel_size=3, order='gcr',
168
+ num_groups=8, padding=1, upscale=2, dropout_prob=0.1, is3d=True):
169
+ super(DoubleConv, self).__init__()
170
+ if encoder:
171
+ # we're in the encoder path
172
+ conv1_in_channels = in_channels
173
+ if upscale == 1:
174
+ conv1_out_channels = out_channels
175
+ else:
176
+ conv1_out_channels = out_channels // 2
177
+ if conv1_out_channels < in_channels:
178
+ conv1_out_channels = in_channels
179
+ conv2_in_channels, conv2_out_channels = conv1_out_channels, out_channels
180
+ else:
181
+ # we're in the decoder path, decrease the number of channels in the 1st convolution
182
+ conv1_in_channels, conv1_out_channels = in_channels, out_channels
183
+ conv2_in_channels, conv2_out_channels = out_channels, out_channels
184
+
185
+ # check if dropout_prob is a tuple and if so
186
+ # split it for different dropout probabilities for each convolution.
187
+ if isinstance(dropout_prob, list) or isinstance(dropout_prob, tuple):
188
+ dropout_prob1 = dropout_prob[0]
189
+ dropout_prob2 = dropout_prob[1]
190
+ else:
191
+ dropout_prob1 = dropout_prob2 = dropout_prob
192
+
193
+ # conv1
194
+ self.add_module('SingleConv1',
195
+ SingleConv(conv1_in_channels, conv1_out_channels, kernel_size, order, num_groups,
196
+ padding=padding, dropout_prob=dropout_prob1, is3d=is3d))
197
+ # conv2
198
+ self.add_module('SingleConv2',
199
+ SingleConv(conv2_in_channels, conv2_out_channels, kernel_size, order, num_groups,
200
+ padding=padding, dropout_prob=dropout_prob2, is3d=is3d))
201
+
202
+
203
+ class ResNetBlock(nn.Module):
204
+ """
205
+ Residual block that can be used instead of standard DoubleConv in the Encoder module.
206
+ Motivated by: https://arxiv.org/pdf/1706.00120.pdf
207
+
208
+ Notice we use ELU instead of ReLU (order='cge') and put non-linearity after the groupnorm.
209
+ """
210
+
211
+ def __init__(self, in_channels, out_channels, kernel_size=3, order='cge', num_groups=8, is3d=True, **kwargs):
212
+ super(ResNetBlock, self).__init__()
213
+
214
+ if in_channels != out_channels:
215
+ # conv1x1 for increasing the number of channels
216
+ if is3d:
217
+ self.conv1 = nn.Conv3d(in_channels, out_channels, 1)
218
+ else:
219
+ self.conv1 = nn.Conv2d(in_channels, out_channels, 1)
220
+ else:
221
+ self.conv1 = nn.Identity()
222
+
223
+ self.conv2 = SingleConv(in_channels, out_channels, kernel_size=kernel_size, order=order, num_groups=num_groups,
224
+ is3d=is3d)
225
+ # remove non-linearity from the 3rd convolution since it's going to be applied after adding the residual
226
+ n_order = order
227
+ for c in 'rel':
228
+ n_order = n_order.replace(c, '')
229
+ self.conv3 = SingleConv(out_channels, out_channels, kernel_size=kernel_size, order=n_order,
230
+ num_groups=num_groups, is3d=is3d)
231
+
232
+ # create non-linearity separately
233
+ if 'l' in order:
234
+ self.non_linearity = nn.LeakyReLU(negative_slope=0.1, inplace=True)
235
+ elif 'e' in order:
236
+ self.non_linearity = nn.ELU(inplace=True)
237
+ else:
238
+ self.non_linearity = nn.ReLU(inplace=True)
239
+
240
+ def forward(self, x):
241
+ # apply first convolution to bring the number of channels to out_channels
242
+ residual = self.conv1(x)
243
+
244
+ out = self.conv2(x)
245
+ out = self.conv3(out)
246
+
247
+ out += residual
248
+ out = self.non_linearity(out)
249
+
250
+ return out
251
+
252
+ class Encoder(nn.Module):
253
+ """
254
+ A single module from the encoder path consisting of the optional max
255
+ pooling layer (one may specify the MaxPool kernel_size to be different
256
+ from the standard (2,2,2), e.g. if the volumetric data is anisotropic
257
+ (make sure to use complementary scale_factor in the decoder path) followed by
258
+ a basic module (DoubleConv or ResNetBlock).
259
+
260
+ Args:
261
+ in_channels (int): number of input channels
262
+ out_channels (int): number of output channels
263
+ conv_kernel_size (int or tuple): size of the convolving kernel
264
+ apply_pooling (bool): if True use MaxPool3d before DoubleConv
265
+ pool_kernel_size (int or tuple): the size of the window
266
+ pool_type (str): pooling layer: 'max' or 'avg'
267
+ basic_module(nn.Module): either ResNetBlock or DoubleConv
268
+ conv_layer_order (string): determines the order of layers
269
+ in `DoubleConv` module. See `DoubleConv` for more info.
270
+ num_groups (int): number of groups for the GroupNorm
271
+ padding (int or tuple): add zero-padding added to all three sides of the input
272
+ upscale (int): number of the convolution to upscale in encoder if DoubleConv, default: 2
273
+ dropout_prob (float or tuple): dropout probability, default 0.1
274
+ is3d (bool): use 3d or 2d convolutions/pooling operation
275
+ """
276
+
277
+ def __init__(self, in_channels, out_channels, conv_kernel_size=3, apply_pooling=True,
278
+ pool_kernel_size=2, pool_type='max', basic_module=DoubleConv, conv_layer_order='gcr',
279
+ num_groups=8, padding=1, upscale=2, dropout_prob=0.1, is3d=True):
280
+ super(Encoder, self).__init__()
281
+ assert pool_type in ['max', 'avg']
282
+ if apply_pooling:
283
+ if pool_type == 'max':
284
+ if is3d:
285
+ self.pooling = nn.MaxPool3d(kernel_size=pool_kernel_size)
286
+ else:
287
+ self.pooling = nn.MaxPool2d(kernel_size=pool_kernel_size)
288
+ else:
289
+ if is3d:
290
+ self.pooling = nn.AvgPool3d(kernel_size=pool_kernel_size)
291
+ else:
292
+ self.pooling = nn.AvgPool2d(kernel_size=pool_kernel_size)
293
+ else:
294
+ self.pooling = None
295
+
296
+ self.basic_module = basic_module(in_channels, out_channels,
297
+ encoder=True,
298
+ kernel_size=conv_kernel_size,
299
+ order=conv_layer_order,
300
+ num_groups=num_groups,
301
+ padding=padding,
302
+ upscale=upscale,
303
+ dropout_prob=dropout_prob,
304
+ is3d=is3d)
305
+
306
+ def forward(self, x):
307
+ if self.pooling is not None:
308
+ x = self.pooling(x)
309
+ x = self.basic_module(x)
310
+ return x
311
+
312
+
313
+ class Decoder(nn.Module):
314
+ """
315
+ A single module for decoder path consisting of the upsampling layer
316
+ (either learned ConvTranspose3d or nearest neighbor interpolation)
317
+ followed by a basic module (DoubleConv or ResNetBlock).
318
+
319
+ Args:
320
+ in_channels (int): number of input channels
321
+ out_channels (int): number of output channels
322
+ conv_kernel_size (int or tuple): size of the convolving kernel
323
+ scale_factor (int or tuple): used as the multiplier for the image H/W/D in
324
+ case of nn.Upsample or as stride in case of ConvTranspose3d, must reverse the MaxPool3d operation
325
+ from the corresponding encoder
326
+ basic_module(nn.Module): either ResNetBlock or DoubleConv
327
+ conv_layer_order (string): determines the order of layers
328
+ in `DoubleConv` module. See `DoubleConv` for more info.
329
+ num_groups (int): number of groups for the GroupNorm
330
+ padding (int or tuple): add zero-padding added to all three sides of the input
331
+ upsample (str): algorithm used for upsampling:
332
+ InterpolateUpsampling: 'nearest' | 'linear' | 'bilinear' | 'trilinear' | 'area'
333
+ TransposeConvUpsampling: 'deconv'
334
+ No upsampling: None
335
+ Default: 'default' (chooses automatically)
336
+ dropout_prob (float or tuple): dropout probability, default 0.1
337
+ """
338
+
339
+ def __init__(self, in_channels, out_channels, conv_kernel_size=3, scale_factor=2, basic_module=DoubleConv,
340
+ conv_layer_order='gcr', num_groups=8, padding=1, upsample='default',
341
+ dropout_prob=0.1, is3d=True):
342
+ super(Decoder, self).__init__()
343
+
344
+ # perform concat joining per default
345
+ concat = True
346
+
347
+ # don't adapt channels after join operation
348
+ adapt_channels = False
349
+
350
+ if upsample is not None and upsample != 'none':
351
+ if upsample == 'default':
352
+ if basic_module == DoubleConv:
353
+ upsample = 'nearest' # use nearest neighbor interpolation for upsampling
354
+ concat = True # use concat joining
355
+ adapt_channels = False # don't adapt channels
356
+ elif basic_module == ResNetBlock: #or basic_module == ResNetBlockSE:
357
+ upsample = 'deconv' # use deconvolution upsampling
358
+ concat = False # use summation joining
359
+ adapt_channels = True # adapt channels after joining
360
+
361
+ # perform deconvolution upsampling if mode is deconv
362
+ if upsample == 'deconv':
363
+ self.upsampling = TransposeConvUpsampling(in_channels=in_channels, out_channels=out_channels,
364
+ kernel_size=conv_kernel_size, scale_factor=scale_factor,
365
+ is3d=is3d)
366
+ else:
367
+ self.upsampling = InterpolateUpsampling(mode=upsample)
368
+ else:
369
+ # no upsampling
370
+ self.upsampling = NoUpsampling()
371
+ # concat joining
372
+ self.joining = partial(self._joining, concat=True)
373
+
374
+ # perform joining operation
375
+ self.joining = partial(self._joining, concat=concat)
376
+
377
+ # adapt the number of in_channels for the ResNetBlock
378
+ if adapt_channels is True:
379
+ in_channels = out_channels
380
+
381
+ self.basic_module = basic_module(in_channels, out_channels,
382
+ encoder=False,
383
+ kernel_size=conv_kernel_size,
384
+ order=conv_layer_order,
385
+ num_groups=num_groups,
386
+ padding=padding,
387
+ dropout_prob=dropout_prob,
388
+ is3d=is3d)
389
+
390
+ def forward(self, encoder_features, x):
391
+ x = self.upsampling(encoder_features=encoder_features, x=x)
392
+ x = self.joining(encoder_features, x)
393
+ x = self.basic_module(x)
394
+ return x
395
+
396
+ @staticmethod
397
+ def _joining(encoder_features, x, concat):
398
+ if concat:
399
+ return torch.cat((encoder_features, x), dim=1)
400
+ else:
401
+ return encoder_features + x
402
+
403
+
404
+ def create_encoders(in_channels, f_maps, basic_module, conv_kernel_size, conv_padding,
405
+ conv_upscale, dropout_prob,
406
+ layer_order, num_groups, pool_kernel_size, is3d):
407
+ # create encoder path consisting of Encoder modules. Depth of the encoder is equal to `len(f_maps)`
408
+ encoders = []
409
+ for i, out_feature_num in enumerate(f_maps):
410
+ if i == 0:
411
+ # apply conv_coord only in the first encoder if any
412
+ encoder = Encoder(in_channels, out_feature_num,
413
+ apply_pooling=False, # skip pooling in the first encoder
414
+ basic_module=basic_module,
415
+ conv_layer_order=layer_order,
416
+ conv_kernel_size=conv_kernel_size,
417
+ num_groups=num_groups,
418
+ padding=conv_padding,
419
+ upscale=conv_upscale,
420
+ dropout_prob=dropout_prob,
421
+ is3d=is3d)
422
+ else:
423
+ encoder = Encoder(f_maps[i - 1], out_feature_num,
424
+ basic_module=basic_module,
425
+ conv_layer_order=layer_order,
426
+ conv_kernel_size=conv_kernel_size,
427
+ num_groups=num_groups,
428
+ pool_kernel_size=pool_kernel_size,
429
+ padding=conv_padding,
430
+ upscale=conv_upscale,
431
+ dropout_prob=dropout_prob,
432
+ is3d=is3d)
433
+
434
+ encoders.append(encoder)
435
+
436
+ return nn.ModuleList(encoders)
437
+
438
+
439
+ def create_decoders(f_maps, basic_module, conv_kernel_size, conv_padding, layer_order,
440
+ num_groups, upsample, dropout_prob, is3d):
441
+ # create decoder path consisting of the Decoder modules. The length of the decoder list is equal to `len(f_maps) - 1`
442
+ decoders = []
443
+ reversed_f_maps = list(reversed(f_maps[1:]))
444
+ for i in range(len(reversed_f_maps) - 1):
445
+ if basic_module == DoubleConv and upsample != 'deconv':
446
+ in_feature_num = reversed_f_maps[i] + reversed_f_maps[i + 1]
447
+ else:
448
+ in_feature_num = reversed_f_maps[i]
449
+
450
+ out_feature_num = reversed_f_maps[i + 1]
451
+
452
+ decoder = Decoder(in_feature_num, out_feature_num,
453
+ basic_module=basic_module,
454
+ conv_layer_order=layer_order,
455
+ conv_kernel_size=conv_kernel_size,
456
+ num_groups=num_groups,
457
+ padding=conv_padding,
458
+ upsample=upsample,
459
+ dropout_prob=dropout_prob,
460
+ is3d=is3d)
461
+ decoders.append(decoder)
462
+ return nn.ModuleList(decoders)
463
+
464
+
465
+ class AbstractUpsampling(nn.Module):
466
+ """
467
+ Abstract class for upsampling. A given implementation should upsample a given 5D input tensor using either
468
+ interpolation or learned transposed convolution.
469
+ """
470
+
471
+ def __init__(self, upsample):
472
+ super(AbstractUpsampling, self).__init__()
473
+ self.upsample = upsample
474
+
475
+ def forward(self, encoder_features, x):
476
+ # get the spatial dimensions of the output given the encoder_features
477
+ output_size = encoder_features.size()[2:]
478
+ # upsample the input and return
479
+ return self.upsample(x, output_size)
480
+
481
+
482
+ class InterpolateUpsampling(AbstractUpsampling):
483
+ """
484
+ Args:
485
+ mode (str): algorithm used for upsampling:
486
+ 'nearest' | 'linear' | 'bilinear' | 'trilinear' | 'area'. Default: 'nearest'
487
+ used only if transposed_conv is False
488
+ """
489
+
490
+ def __init__(self, mode='nearest'):
491
+ upsample = partial(self._interpolate, mode=mode)
492
+ super().__init__(upsample)
493
+
494
+ @staticmethod
495
+ def _interpolate(x, size, mode):
496
+ return F.interpolate(x, size=size, mode=mode)
497
+
498
+
499
+ class TransposeConvUpsampling(AbstractUpsampling):
500
+ """
501
+ Args:
502
+ in_channels (int): number of input channels for transposed conv
503
+ used only if transposed_conv is True
504
+ out_channels (int): number of output channels for transpose conv
505
+ used only if transposed_conv is True
506
+ kernel_size (int or tuple): size of the convolving kernel
507
+ used only if transposed_conv is True
508
+ scale_factor (int or tuple): stride of the convolution
509
+ used only if transposed_conv is True
510
+ is3d (bool): if True use ConvTranspose3d, otherwise use ConvTranspose2d
511
+ """
512
+
513
+ class Upsample(nn.Module):
514
+ """
515
+ Workaround the 'ValueError: requested an output size...' in the `_output_padding` method in
516
+ transposed convolution. It performs transposed conv followed by the interpolation to the correct size if necessary.
517
+ """
518
+
519
+ def __init__(self, conv_transposed, is3d):
520
+ super().__init__()
521
+ self.conv_transposed = conv_transposed
522
+ self.is3d = is3d
523
+
524
+ def forward(self, x, size):
525
+ x = self.conv_transposed(x)
526
+ return F.interpolate(x, size=size)
527
+
528
+ def __init__(self, in_channels, out_channels, kernel_size=3, scale_factor=2, is3d=True):
529
+ # make sure that the output size reverses the MaxPool3d from the corresponding encoder
530
+ if is3d is True:
531
+ conv_transposed = nn.ConvTranspose3d(in_channels, out_channels, kernel_size=kernel_size,
532
+ stride=scale_factor, padding=1, bias=False)
533
+ else:
534
+ conv_transposed = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=kernel_size,
535
+ stride=scale_factor, padding=1, bias=False)
536
+ upsample = self.Upsample(conv_transposed, is3d)
537
+ super().__init__(upsample)
538
+
539
+
540
+ class NoUpsampling(AbstractUpsampling):
541
+ def __init__(self):
542
+ super().__init__(self._no_upsampling)
543
+
544
+ @staticmethod
545
+ def _no_upsampling(x, size):
546
+ return x
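A small sketch of the `order` string convention used by SingleConv/DoubleConv ('gcr' = GroupNorm + Conv + ReLU); dummy volume, not part of the commit:

import torch

block = DoubleConv(in_channels=8, out_channels=16, encoder=True, order='gcr', num_groups=8)
x = torch.randn(1, 8, 32, 32, 32)
print(block(x).shape)  # torch.Size([1, 16, 32, 32, 32])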
modules/PartField/partfield/model/UNet/model.py ADDED
@@ -0,0 +1,170 @@
1
+ # https://github.com/wolny/pytorch-3dunet/blob/master/pytorch3dunet/unet3d/buildingblocks.py
2
+ # MIT License
3
+
4
+ # Copyright (c) 2018 Adrian Wolny
5
+
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+
13
+ # The above copyright notice and this permission notice shall be included in all
14
+ # copies or substantial portions of the Software.
15
+
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ # SOFTWARE.
23
+
24
+ import torch.nn as nn
25
+
26
+ from partfield.model.UNet.buildingblocks import DoubleConv, ResNetBlock, \
27
+ create_decoders, create_encoders
28
+
29
+ def number_of_features_per_level(init_channel_number, num_levels):
30
+ return [init_channel_number * 2 ** k for k in range(num_levels)]
31
+
32
+ class AbstractUNet(nn.Module):
33
+ """
34
+ Base class for standard and residual UNet.
35
+
36
+ Args:
37
+ in_channels (int): number of input channels
38
+ out_channels (int): number of output segmentation masks;
39
+ Note that the of out_channels might correspond to either
40
+ different semantic classes or to different binary segmentation mask.
41
+ It's up to the user of the class to interpret the out_channels and
42
+ use the proper loss criterion during training (i.e. CrossEntropyLoss (multi-class)
43
+ or BCEWithLogitsLoss (two-class) respectively)
44
+ f_maps (int, tuple): number of feature maps at each level of the encoder; if it's an integer the number
45
+ of feature maps is given by the geometric progression: f_maps ^ k, k=1,2,3,4
46
+ final_sigmoid (bool): if True apply element-wise nn.Sigmoid after the final 1x1 convolution,
47
+ otherwise apply nn.Softmax. In effect only if `self.training == False`, i.e. during validation/testing
48
+ basic_module: basic model for the encoder/decoder (DoubleConv, ResNetBlock, ....)
49
+ layer_order (string): determines the order of layers in `SingleConv` module.
50
+ E.g. 'crg' stands for GroupNorm3d+Conv3d+ReLU. See `SingleConv` for more info
51
+ num_groups (int): number of groups for the GroupNorm
52
+ num_levels (int): number of levels in the encoder/decoder path (applied only if f_maps is an int)
53
+ default: 4
54
+ is_segmentation (bool): if True and the model is in eval mode, Sigmoid/Softmax normalization is applied
55
+ after the final convolution; if False (regression problem) the normalization layer is skipped
56
+ conv_kernel_size (int or tuple): size of the convolving kernel in the basic_module
57
+ pool_kernel_size (int or tuple): the size of the window
58
+ conv_padding (int or tuple): add zero-padding added to all three sides of the input
59
+ conv_upscale (int): number of the convolution to upscale in encoder if DoubleConv, default: 2
60
+ upsample (str): algorithm used for decoder upsampling:
61
+ InterpolateUpsampling: 'nearest' | 'linear' | 'bilinear' | 'trilinear' | 'area'
62
+ TransposeConvUpsampling: 'deconv'
63
+ No upsampling: None
64
+ Default: 'default' (chooses automatically)
65
+ dropout_prob (float or tuple): dropout probability, default: 0.1
66
+ is3d (bool): if True the model is 3D, otherwise 2D, default: True
67
+ """
68
+
69
+ def __init__(self, in_channels, out_channels, final_sigmoid, basic_module, f_maps=64, layer_order='gcr',
70
+ num_groups=8, num_levels=4, is_segmentation=False, conv_kernel_size=3, pool_kernel_size=2,
71
+ conv_padding=1, conv_upscale=2, upsample='default', dropout_prob=0.1, is3d=True, encoder_only=False):
72
+ super(AbstractUNet, self).__init__()
73
+
74
+ if isinstance(f_maps, int):
75
+ f_maps = number_of_features_per_level(f_maps, num_levels=num_levels)
76
+
77
+ assert isinstance(f_maps, list) or isinstance(f_maps, tuple)
78
+ assert len(f_maps) > 1, "Required at least 2 levels in the U-Net"
79
+ if 'g' in layer_order:
80
+ assert num_groups is not None, "num_groups must be specified if GroupNorm is used"
81
+
82
+ # create encoder path
83
+ self.encoders = create_encoders(in_channels, f_maps, basic_module, conv_kernel_size,
84
+ conv_padding, conv_upscale, dropout_prob,
85
+ layer_order, num_groups, pool_kernel_size, is3d)
86
+
87
+ self.encoder_only = encoder_only
88
+
89
+ if encoder_only == False:
90
+ # create decoder path
91
+ self.decoders = create_decoders(f_maps, basic_module, conv_kernel_size, conv_padding,
92
+ layer_order, num_groups, upsample, dropout_prob,
93
+ is3d)
94
+
95
+ # in the last layer a 1×1 convolution reduces the number of output channels to the number of labels
96
+ if is3d:
97
+ self.final_conv = nn.Conv3d(f_maps[1], out_channels, 1)
98
+ else:
99
+ self.final_conv = nn.Conv2d(f_maps[1], out_channels, 1)
100
+
101
+ if is_segmentation:
102
+ # semantic segmentation problem
103
+ if final_sigmoid:
104
+ self.final_activation = nn.Sigmoid()
105
+ else:
106
+ self.final_activation = nn.Softmax(dim=1)
107
+ else:
108
+ # regression problem
109
+ self.final_activation = None
110
+
111
+ def forward(self, x, return_bottleneck_feat=False):
112
+ # encoder part
113
+ encoders_features = []
114
+ for encoder in self.encoders:
115
+ x = encoder(x)
116
+ # reverse the encoder outputs to be aligned with the decoder
117
+ encoders_features.insert(0, x)
118
+
119
+ # remove the last encoder's output from the list
120
+ # !!remember: it's the 1st in the list
121
+ bottleneck_feat = encoders_features[0]
122
+ if self.encoder_only:
123
+ return bottleneck_feat
124
+ else:
125
+ encoders_features = encoders_features[1:]
126
+
127
+ # decoder part
128
+ for decoder, encoder_features in zip(self.decoders, encoders_features):
129
+ # pass the output from the corresponding encoder and the output
130
+ # of the previous decoder
131
+ x = decoder(encoder_features, x)
132
+
133
+ x = self.final_conv(x)
134
+ # During training the network outputs logits
135
+ if self.final_activation is not None:
136
+ x = self.final_activation(x)
137
+
138
+ if return_bottleneck_feat:
139
+ return x, bottleneck_feat
140
+ else:
141
+ return x
142
+
143
+ class ResidualUNet3D(AbstractUNet):
144
+ """
145
+ Residual 3DUnet model implementation based on https://arxiv.org/pdf/1706.00120.pdf.
146
+ Uses ResNetBlock as a basic building block, summation joining instead
147
+ of concatenation joining and transposed convolutions for upsampling (watch out for block artifacts).
148
+ Since the model effectively becomes a residual net, in theory it allows for deeper UNet.
149
+ """
150
+
151
+ def __init__(self, in_channels, out_channels, final_sigmoid=True, f_maps=(8, 16, 64, 256, 1024), layer_order='gcr',
152
+ num_groups=8, num_levels=5, is_segmentation=True, conv_padding=1,
153
+ conv_upscale=2, upsample='default', dropout_prob=0.1, encoder_only=False, **kwargs):
154
+ super(ResidualUNet3D, self).__init__(in_channels=in_channels,
155
+ out_channels=out_channels,
156
+ final_sigmoid=final_sigmoid,
157
+ basic_module=ResNetBlock,
158
+ f_maps=f_maps,
159
+ layer_order=layer_order,
160
+ num_groups=num_groups,
161
+ num_levels=num_levels,
162
+ is_segmentation=is_segmentation,
163
+ conv_padding=conv_padding,
164
+ conv_upscale=conv_upscale,
165
+ upsample=upsample,
166
+ dropout_prob=dropout_prob,
167
+ encoder_only=encoder_only,
168
+ is3d=True)
169
+
170
+
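
A quick usage sketch for the ResidualUNet3D defined above (illustrative only, not one of the added files): it assumes the bundled buildingblocks helpers behave like upstream pytorch-3dunet, and the channel counts and input resolution are made up. With encoder_only=True the forward pass returns just the bottleneck feature map and never touches the decoder path.

import torch
from partfield.model.UNet.model import ResidualUNet3D

# encoder-only mode: run the encoder stack and return the deepest feature map
net = ResidualUNet3D(in_channels=4, out_channels=2, f_maps=(16, 32, 64),
                     layer_order='cr', is_segmentation=False, encoder_only=True)
x = torch.randn(1, 4, 32, 32, 32)      # (batch, channels, D, H, W)
bottleneck = net(x)                    # two 2x poolings -> expected (1, 64, 8, 8, 8)
print(bottleneck.shape)
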
modules/PartField/partfield/model/model_utils.py ADDED
@@ -0,0 +1,54 @@
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ class VanillaMLP(nn.Module):
5
+ def __init__(self, input_dim, output_dim, out_activation, n_hidden_layers=4, n_neurons=64, activation="ReLU"):
6
+ super().__init__()
7
+ self.n_neurons = n_neurons
8
+ self.n_hidden_layers = n_hidden_layers
9
+ self.activation = activation
10
+ self.out_activation = out_activation
11
+ layers = [
12
+ self.make_linear(input_dim, self.n_neurons, is_first=True, is_last=False),
13
+ self.make_activation(),
14
+ ]
15
+ for i in range(self.n_hidden_layers - 1):
16
+ layers += [
17
+ self.make_linear(
18
+ self.n_neurons, self.n_neurons, is_first=False, is_last=False
19
+ ),
20
+ self.make_activation(),
21
+ ]
22
+ layers += [
23
+ self.make_linear(self.n_neurons, output_dim, is_first=False, is_last=True)
24
+ ]
25
+ if self.out_activation == "sigmoid":
26
+ layers += [nn.Sigmoid()]
27
+ elif self.out_activation == "tanh":
28
+ layers += [nn.Tanh()]
29
+ elif self.out_activation == "hardtanh":
30
+ layers += [nn.Hardtanh()]
31
+ elif self.out_activation == "GELU":
32
+ layers += [nn.GELU()]
33
+ elif self.out_activation == "RELU":
34
+ layers += [nn.ReLU()]
35
+ else:
36
+ raise NotImplementedError
37
+ self.layers = nn.Sequential(*layers)
38
+
39
+ def forward(self, x, split_size=100000):
40
+ with torch.cuda.amp.autocast(enabled=False):
41
+ out = self.layers(x)
42
+ return out
43
+
44
+ def make_linear(self, dim_in, dim_out, is_first, is_last):
45
+ layer = nn.Linear(dim_in, dim_out, bias=False)
46
+ return layer
47
+
48
+ def make_activation(self):
49
+ if self.activation == "ReLU":
50
+ return nn.ReLU(inplace=True)
51
+ elif self.activation == "GELU":
52
+ return nn.GELU()
53
+ else:
54
+ raise NotImplementedError
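
A small usage sketch for VanillaMLP (dimensions here are arbitrary; the trainer later in this commit uses input_dim=64 with a tanh or GELU head): any tensor whose last dimension equals input_dim can be fed straight through.

import torch
from partfield.model.model_utils import VanillaMLP

mlp = VanillaMLP(input_dim=64, output_dim=1, out_activation="tanh",
                 n_hidden_layers=2, n_neurons=32)
feats = torch.randn(8, 64)
out = mlp(feats)       # the tanh head squashes values into (-1, 1)
print(out.shape)       # torch.Size([8, 1])
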
modules/PartField/partfield/model/triplane.py ADDED
@@ -0,0 +1,331 @@
 
1
+ #https://github.com/3DTopia/OpenLRM/blob/main/openlrm/models/modeling_lrm.py
2
+ # Copyright (c) 2023-2024, Zexin He
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # https://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ from functools import partial
19
+
20
+ def project_onto_planes(planes, coordinates):
21
+ """
22
+ Does a projection of a 3D point onto a batch of 2D planes,
23
+ returning 2D plane coordinates.
24
+
25
+ Takes plane axes of shape n_planes, 3, 3
26
+ Takes coordinates of shape N, M, 3
27
+ Returns projections of shape N*n_planes, M, 2
28
+ """
29
+ N, M, C = coordinates.shape
30
+ n_planes, _, _ = planes.shape
31
+ coordinates = coordinates.unsqueeze(1).expand(-1, n_planes, -1, -1).reshape(N*n_planes, M, 3)
32
+ inv_planes = torch.linalg.inv(planes).unsqueeze(0).expand(N, -1, -1, -1).reshape(N*n_planes, 3, 3)
33
+ projections = torch.bmm(coordinates, inv_planes)
34
+ return projections[..., :2]
35
+
36
+ def sample_from_planes(plane_features, coordinates, mode='bilinear', padding_mode='zeros', box_warp=None):
37
+ plane_axes = torch.tensor([[[1, 0, 0],
38
+ [0, 1, 0],
39
+ [0, 0, 1]],
40
+ [[1, 0, 0],
41
+ [0, 0, 1],
42
+ [0, 1, 0]],
43
+ [[0, 0, 1],
44
+ [0, 1, 0],
45
+ [1, 0, 0]]], dtype=torch.float32).cuda()
46
+
47
+ assert padding_mode == 'zeros'
48
+ N, n_planes, C, H, W = plane_features.shape
49
+ _, M, _ = coordinates.shape
50
+ plane_features = plane_features.view(N*n_planes, C, H, W)
51
+
52
+ projected_coordinates = project_onto_planes(plane_axes, coordinates).unsqueeze(1)
53
+ output_features = torch.nn.functional.grid_sample(plane_features, projected_coordinates.float(), mode=mode, padding_mode=padding_mode, align_corners=False).permute(0, 3, 2, 1).reshape(N, n_planes, M, C)
54
+ return output_features
55
+
56
+ def get_grid_coord(grid_size = 256, align_corners=False):
57
+ if not align_corners:
58
+ coords = torch.linspace(-1 + 1/(grid_size), 1 - 1/(grid_size), steps=grid_size)
59
+ else:
60
+ coords = torch.linspace(-1, 1, steps=grid_size)
61
+ i, j, k = torch.meshgrid(coords, coords, coords, indexing='ij')
62
+ coordinates = torch.stack((i, j, k), dim=-1).reshape(-1, 3)
63
+ return coordinates
64
+
65
+ class BasicBlock(nn.Module):
66
+ """
67
+ Transformer block in its simplest form: pre-norm self-attention followed by an MLP.
68
+ Designed for PF-LRM architecture.
69
+ """
70
+ # Block contains a self-attention layer and an MLP
71
+ def __init__(self, inner_dim: int, num_heads: int, eps: float,
72
+ attn_drop: float = 0., attn_bias: bool = False,
73
+ mlp_ratio: float = 4., mlp_drop: float = 0.):
74
+ super().__init__()
75
+ self.norm1 = nn.LayerNorm(inner_dim, eps=eps)
76
+ self.self_attn = nn.MultiheadAttention(
77
+ embed_dim=inner_dim, num_heads=num_heads,
78
+ dropout=attn_drop, bias=attn_bias, batch_first=True)
79
+ self.norm2 = nn.LayerNorm(inner_dim, eps=eps)
80
+ self.mlp = nn.Sequential(
81
+ nn.Linear(inner_dim, int(inner_dim * mlp_ratio)),
82
+ nn.GELU(),
83
+ nn.Dropout(mlp_drop),
84
+ nn.Linear(int(inner_dim * mlp_ratio), inner_dim),
85
+ nn.Dropout(mlp_drop),
86
+ )
87
+
88
+ def forward(self, x):
89
+ # x: [N, L, D]
90
+ before_sa = self.norm1(x)
91
+ x = x + self.self_attn(before_sa, before_sa, before_sa, need_weights=False)[0]
92
+ x = x + self.mlp(self.norm2(x))
93
+ return x
94
+
95
+ class ConditionBlock(nn.Module):
96
+ """
97
+ Transformer block that takes in a cross-attention condition.
98
+ Designed for SparseLRM architecture.
99
+ """
100
+ # Block contains a cross-attention layer, a self-attention layer, and an MLP
101
+ def __init__(self, inner_dim: int, cond_dim: int, num_heads: int, eps: float,
102
+ attn_drop: float = 0., attn_bias: bool = False,
103
+ mlp_ratio: float = 4., mlp_drop: float = 0.):
104
+ super().__init__()
105
+ self.norm1 = nn.LayerNorm(inner_dim, eps=eps)
106
+ self.cross_attn = nn.MultiheadAttention(
107
+ embed_dim=inner_dim, num_heads=num_heads, kdim=cond_dim, vdim=cond_dim,
108
+ dropout=attn_drop, bias=attn_bias, batch_first=True)
109
+ self.norm2 = nn.LayerNorm(inner_dim, eps=eps)
110
+ self.self_attn = nn.MultiheadAttention(
111
+ embed_dim=inner_dim, num_heads=num_heads,
112
+ dropout=attn_drop, bias=attn_bias, batch_first=True)
113
+ self.norm3 = nn.LayerNorm(inner_dim, eps=eps)
114
+ self.mlp = nn.Sequential(
115
+ nn.Linear(inner_dim, int(inner_dim * mlp_ratio)),
116
+ nn.GELU(),
117
+ nn.Dropout(mlp_drop),
118
+ nn.Linear(int(inner_dim * mlp_ratio), inner_dim),
119
+ nn.Dropout(mlp_drop),
120
+ )
121
+
122
+ def forward(self, x, cond):
123
+ # x: [N, L, D]
124
+ # cond: [N, L_cond, D_cond]
125
+ x = x + self.cross_attn(self.norm1(x), cond, cond, need_weights=False)[0]
126
+ before_sa = self.norm2(x)
127
+ x = x + self.self_attn(before_sa, before_sa, before_sa, need_weights=False)[0]
128
+ x = x + self.mlp(self.norm3(x))
129
+ return x
130
+
131
+ class TransformerDecoder(nn.Module):
132
+ def __init__(self, block_type: str,
133
+ num_layers: int, num_heads: int,
134
+ inner_dim: int, cond_dim: int = None,
135
+ eps: float = 1e-6):
136
+ super().__init__()
137
+ self.block_type = block_type
138
+ self.layers = nn.ModuleList([
139
+ self._block_fn(inner_dim, cond_dim)(
140
+ num_heads=num_heads,
141
+ eps=eps,
142
+ )
143
+ for _ in range(num_layers)
144
+ ])
145
+ self.norm = nn.LayerNorm(inner_dim, eps=eps)
146
+
147
+ @property
148
+ def block_type(self):
149
+ return self._block_type
150
+
151
+ @block_type.setter
152
+ def block_type(self, block_type):
153
+ assert block_type in ['cond', 'basic'], \
154
+ f"Unsupported block type: {block_type}"
155
+ self._block_type = block_type
156
+
157
+ def _block_fn(self, inner_dim, cond_dim):
158
+ assert inner_dim is not None, f"inner_dim must always be specified"
159
+ if self.block_type == 'basic':
160
+ return partial(BasicBlock, inner_dim=inner_dim)
161
+ elif self.block_type == 'cond':
162
+ assert cond_dim is not None, f"Condition dimension must be specified for ConditionBlock"
163
+ return partial(ConditionBlock, inner_dim=inner_dim, cond_dim=cond_dim)
164
+ else:
165
+ raise ValueError(f"Unsupported block type during runtime: {self.block_type}")
166
+
167
+
168
+ def forward_layer(self, layer: nn.Module, x: torch.Tensor, cond: torch.Tensor,):
169
+ if self.block_type == 'basic':
170
+ return layer(x)
171
+ elif self.block_type == 'cond':
172
+ return layer(x, cond)
173
+ else:
174
+ raise NotImplementedError
175
+
176
+ def forward(self, x: torch.Tensor, cond: torch.Tensor = None):
177
+ # x: [N, L, D]
178
+ # cond: [N, L_cond, D_cond] or None
179
+ for layer in self.layers:
180
+ x = self.forward_layer(layer, x, cond)
181
+ x = self.norm(x)
182
+ return x
183
+
184
+ class Voxel2Triplane(nn.Module):
185
+ """
186
+ Transformer that lifts voxel features to triplane tokens via cross-attention and upsamples them into triplanes (adapted from the OpenLRM single-view LRM).
187
+ """
188
+ def __init__(self, transformer_dim: int, transformer_layers: int, transformer_heads: int,
189
+ triplane_low_res: int, triplane_high_res: int, triplane_dim: int, voxel_feat_dim: int, normalize_vox_feat=False, voxel_dim=16):
190
+ super().__init__()
191
+
192
+ # attributes
193
+ self.triplane_low_res = triplane_low_res
194
+ self.triplane_high_res = triplane_high_res
195
+ self.triplane_dim = triplane_dim
196
+ self.voxel_feat_dim = voxel_feat_dim
197
+
198
+ # initialize pos_embed with 1/sqrt(dim) * N(0, 1)
199
+ self.pos_embed = nn.Parameter(torch.randn(1, 3*triplane_low_res**2, transformer_dim) * (1. / transformer_dim) ** 0.5)
200
+ self.transformer = TransformerDecoder(
201
+ block_type='cond',
202
+ num_layers=transformer_layers, num_heads=transformer_heads,
203
+ inner_dim=transformer_dim, cond_dim=voxel_feat_dim
204
+ )
205
+ self.upsampler = nn.ConvTranspose2d(transformer_dim, triplane_dim, kernel_size=8, stride=8, padding=0)
206
+
207
+ self.normalize_vox_feat = normalize_vox_feat
208
+ if normalize_vox_feat:
209
+ self.vox_norm = nn.LayerNorm(voxel_feat_dim, eps=1e-6)
210
+ self.vox_pos_embed = nn.Parameter(torch.randn(1, voxel_dim * voxel_dim * voxel_dim, voxel_feat_dim) * (1. / voxel_feat_dim) ** 0.5)
211
+
212
+ def forward_transformer(self, voxel_feats):
213
+ N = voxel_feats.shape[0]
214
+ x = self.pos_embed.repeat(N, 1, 1) # [N, L, D]
215
+ if self.normalize_vox_feat:
216
+ vox_pos_embed = self.vox_pos_embed.repeat(N, 1, 1) # [N, L, D]
217
+ voxel_feats = self.vox_norm(voxel_feats + vox_pos_embed)
218
+ x = self.transformer(
219
+ x,
220
+ cond=voxel_feats
221
+ )
222
+ return x
223
+
224
+ def reshape_upsample(self, tokens):
225
+ N = tokens.shape[0]
226
+ H = W = self.triplane_low_res
227
+ x = tokens.view(N, 3, H, W, -1)
228
+ x = torch.einsum('nihwd->indhw', x) # [3, N, D, H, W]
229
+ x = x.contiguous().view(3*N, -1, H, W) # [3*N, D, H, W]
230
+ x = self.upsampler(x) # [3*N, D', H', W']
231
+ x = x.view(3, N, *x.shape[-3:]) # [3, N, D', H', W']
232
+ x = torch.einsum('indhw->nidhw', x) # [N, 3, D', H', W']
233
+ x = x.contiguous()
234
+ return x
235
+
236
+ def forward(self, voxel_feats):
237
+ N = voxel_feats.shape[0]
238
+
239
+ # encode image
240
+ assert voxel_feats.shape[-1] == self.voxel_feat_dim, \
241
+ f"Feature dimension mismatch: {voxel_feats.shape[-1]} vs {self.voxel_feat_dim}"
242
+
243
+ # transformer generating planes
244
+ tokens = self.forward_transformer(voxel_feats)
245
+ planes = self.reshape_upsample(tokens)
246
+ assert planes.shape[0] == N, "Batch size mismatch for planes"
247
+ assert planes.shape[1] == 3, "Planes should have 3 channels"
248
+
249
+ return planes
250
+
251
+
252
+ class TriplaneTransformer(nn.Module):
253
+ """
254
+ Triplane refinement transformer: downsamples the input planes to tokens, applies self-attention, upsamples them back, and adds a per-pixel MLP residual (adapted from the OpenLRM single-view LRM).
255
+ """
256
+ def __init__(self, input_dim: int, transformer_dim: int, transformer_layers: int, transformer_heads: int,
257
+ triplane_low_res: int, triplane_high_res: int, triplane_dim: int):
258
+ super().__init__()
259
+
260
+ # attributes
261
+ self.triplane_low_res = triplane_low_res
262
+ self.triplane_high_res = triplane_high_res
263
+ self.triplane_dim = triplane_dim
264
+
265
+ # initialize pos_embed with 1/sqrt(dim) * N(0, 1)
266
+ self.pos_embed = nn.Parameter(torch.randn(1, 3*triplane_low_res**2, transformer_dim) * (1. / transformer_dim) ** 0.5)
267
+ self.transformer = TransformerDecoder(
268
+ block_type='basic',
269
+ num_layers=transformer_layers, num_heads=transformer_heads,
270
+ inner_dim=transformer_dim,
271
+ )
272
+
273
+ self.downsampler = nn.Sequential(
274
+ nn.Conv2d(input_dim, transformer_dim, kernel_size=3, stride=1, padding=1),
275
+ nn.ReLU(),
276
+ nn.MaxPool2d(kernel_size=2, stride=2), # Reduces size from 128x128 to 64x64
277
+
278
+ nn.Conv2d(transformer_dim, transformer_dim, kernel_size=3, stride=1, padding=1),
279
+ nn.ReLU(),
280
+ nn.MaxPool2d(kernel_size=2, stride=2), # Reduces size from 64x64 to 32x32
281
+ )
282
+
283
+ self.upsampler = nn.ConvTranspose2d(transformer_dim, triplane_dim, kernel_size=4, stride=4, padding=0)
284
+
285
+ self.mlp = nn.Sequential(
286
+ nn.Linear(input_dim, triplane_dim),
287
+ nn.ReLU(),
288
+ nn.Linear(triplane_dim, triplane_dim)
289
+ )
290
+
291
+ def forward_transformer(self, triplanes):
292
+ N = triplanes.shape[0]
293
+ tokens = torch.einsum('nidhw->nihwd', triplanes).reshape(N, self.pos_embed.shape[1], -1) # [N, L, D]
294
+ x = self.pos_embed.repeat(N, 1, 1) + tokens # [N, L, D]
295
+ x = self.transformer(x)
296
+ return x
297
+
298
+ def reshape_downsample(self, triplanes):
299
+ N = triplanes.shape[0]
300
+ H = W = self.triplane_high_res
301
+ x = triplanes.view(N, 3, -1, H, W)
302
+ x = torch.einsum('nidhw->indhw', x) # [3, N, D, H, W]
303
+ x = x.contiguous().view(3*N, -1, H, W) # [3*N, D, H, W]
304
+ x = self.downsampler(x) # [3*N, D', H', W']
305
+ x = x.view(3, N, *x.shape[-3:]) # [3, N, D', H', W']
306
+ x = torch.einsum('indhw->nidhw', x) # [N, 3, D', H', W']
307
+ x = x.contiguous()
308
+ return x
309
+
310
+ def reshape_upsample(self, tokens):
311
+ N = tokens.shape[0]
312
+ H = W = self.triplane_low_res
313
+ x = tokens.view(N, 3, H, W, -1)
314
+ x = torch.einsum('nihwd->indhw', x) # [3, N, D, H, W]
315
+ x = x.contiguous().view(3*N, -1, H, W) # [3*N, D, H, W]
316
+ x = self.upsampler(x) # [3*N, D', H', W']
317
+ x = x.view(3, N, *x.shape[-3:]) # [3, N, D', H', W']
318
+ x = torch.einsum('indhw->nidhw', x) # [N, 3, D', H', W']
319
+ x = x.contiguous()
320
+ return x
321
+
322
+ def forward(self, triplanes):
323
+ downsampled_triplanes = self.reshape_downsample(triplanes)
324
+ tokens = self.forward_transformer(downsampled_triplanes)
325
+ residual = self.reshape_upsample(tokens)
326
+
327
+ triplanes = triplanes.permute(0, 1, 3, 4, 2).contiguous()
328
+ triplanes = self.mlp(triplanes)
329
+ triplanes = triplanes.permute(0, 1, 4, 2, 3).contiguous()
330
+ planes = triplanes + residual
331
+ return planes
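
A shape sketch for the triplane utilities above, with deliberately small sizes so it runs on CPU (the demo model uses transformer_dim=1024 and six layers). Only get_grid_coord and TriplaneTransformer are exercised here, since sample_from_planes builds its plane axes directly on CUDA.

import torch
from partfield.model.triplane import TriplaneTransformer, get_grid_coord

coords = get_grid_coord(grid_size=4)          # cell-centre coordinates in (-1, 1)
print(coords.shape)                           # torch.Size([64, 3])

refiner = TriplaneTransformer(input_dim=32, transformer_dim=64,
                              transformer_layers=1, transformer_heads=4,
                              triplane_low_res=32, triplane_high_res=128,
                              triplane_dim=16)
triplanes = torch.randn(1, 3, 32, 128, 128)   # (N, planes, C, H, W)
refined = refiner(triplanes)                  # downsample -> self-attention -> upsample + MLP residual
print(refined.shape)                          # torch.Size([1, 3, 16, 128, 128])
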
modules/PartField/partfield/model_trainer_pvcnn_only_demo.py ADDED
@@ -0,0 +1,283 @@
 
1
+ import torch
2
+ import lightning.pytorch as pl
3
+ from .dataloader import Demo_Dataset, Demo_Remesh_Dataset, Correspondence_Demo_Dataset
4
+ from torch.utils.data import DataLoader
5
+ from partfield.model.UNet.model import ResidualUNet3D
6
+ from partfield.model.triplane import TriplaneTransformer, get_grid_coord #, sample_from_planes, Voxel2Triplane
7
+ from partfield.model.model_utils import VanillaMLP
8
+ import torch.nn.functional as F
9
+ import torch.nn as nn
10
+ import os
11
+ import trimesh
12
+ import skimage
13
+ import numpy as np
14
+ import h5py
15
+ import torch.distributed as dist
16
+ from partfield.model.PVCNN.encoder_pc import TriPlanePC2Encoder, sample_triplane_feat
17
+ import json
18
+ import gc
19
+ import time
20
+ from plyfile import PlyData, PlyElement
21
+
22
+
23
+ class Model(pl.LightningModule):
24
+ def __init__(self, cfg):
25
+ super().__init__()
26
+
27
+ self.save_hyperparameters()
28
+ self.cfg = cfg
29
+ self.automatic_optimization = False
30
+ self.triplane_resolution = cfg.triplane_resolution
31
+ self.triplane_channels_low = cfg.triplane_channels_low
32
+ self.triplane_transformer = TriplaneTransformer(
33
+ input_dim=cfg.triplane_channels_low * 2,
34
+ transformer_dim=1024,
35
+ transformer_layers=6,
36
+ transformer_heads=8,
37
+ triplane_low_res=32,
38
+ triplane_high_res=128,
39
+ triplane_dim=cfg.triplane_channels_high,
40
+ )
41
+ self.sdf_decoder = VanillaMLP(input_dim=64,
42
+ output_dim=1,
43
+ out_activation="tanh",
44
+ n_neurons=64, #64
45
+ n_hidden_layers=6) #6
46
+ self.use_pvcnn = cfg.use_pvcnnonly
47
+ self.use_2d_feat = cfg.use_2d_feat
48
+ if self.use_pvcnn:
49
+ self.pvcnn = TriPlanePC2Encoder(
50
+ cfg.pvcnn,
51
+ device="cuda",
52
+ shape_min=-1,
53
+ shape_length=2,
54
+ use_2d_feat=self.use_2d_feat) #.cuda()
55
+ self.logit_scale = nn.Parameter(torch.tensor([1.0], requires_grad=True))
56
+ self.grid_coord = get_grid_coord(256)
57
+ self.mse_loss = torch.nn.MSELoss()
58
+ self.l1_loss = torch.nn.L1Loss(reduction='none')
59
+
60
+ if cfg.regress_2d_feat:
61
+ self.feat_decoder = VanillaMLP(input_dim=64,
62
+ output_dim=192,
63
+ out_activation="GELU",
64
+ n_neurons=64, #64
65
+ n_hidden_layers=6) #6
66
+
67
+ def predict_dataloader(self):
68
+ if self.cfg.remesh_demo:
69
+ dataset = Demo_Remesh_Dataset(self.cfg)
70
+ elif self.cfg.correspondence_demo:
71
+ dataset = Correspondence_Demo_Dataset(self.cfg)
72
+ else:
73
+ dataset = Demo_Dataset(self.cfg)
74
+
75
+ dataloader = DataLoader(dataset,
76
+ num_workers=self.cfg.dataset.val_num_workers,
77
+ batch_size=self.cfg.dataset.val_batch_size,
78
+ shuffle=False,
79
+ pin_memory=True,
80
+ drop_last=False)
81
+
82
+ return dataloader
83
+
84
+
85
+ @torch.no_grad()
86
+ def predict_step(self, batch, batch_idx):
87
+ save_dir = f"{self.cfg.result_name}"
88
+ os.makedirs(save_dir, exist_ok=True)
89
+
90
+ uid = batch['uid'][0]
91
+ view_id = 0
92
+ starttime = time.time()
93
+
94
+ if uid == "car" or uid == "complex_car":
95
+ # if uid == "complex_car":
96
+ print("Skipping this for now.")
97
+ print(uid)
98
+ return
99
+
100
+ ### Skip if model already processed
101
+ if os.path.exists(f'{save_dir}/part_feat_{uid}_{view_id}.npy') or os.path.exists(f'{save_dir}/part_feat_{uid}_{view_id}_batch.npy'):
102
+ print("Already processed "+uid)
103
+ return
104
+
105
+ N = batch['pc'].shape[0]
106
+ assert N == 1
107
+
108
+ if self.use_2d_feat:
109
+ print("ERROR. Dataloader not implemented with input 2d feat.")
110
+ exit()
111
+ else:
112
+ pc_feat = self.pvcnn(batch['pc'], batch['pc'])
113
+
114
+ planes = pc_feat
115
+ planes = self.triplane_transformer(planes)
116
+ sdf_planes, part_planes = torch.split(planes, [64, planes.shape[2] - 64], dim=2)
117
+
118
+ if self.cfg.is_pc:
119
+ tensor_vertices = batch['pc'].reshape(1, -1, 3).cuda().to(torch.float16)
120
+ point_feat = sample_triplane_feat(part_planes, tensor_vertices) # N, M, C
121
+ point_feat = point_feat.cpu().detach().numpy().reshape(-1, 448)
122
+
123
+ np.save(f'{save_dir}/part_feat_{uid}_{view_id}.npy', point_feat)
124
+ print(f"Exported part_feat_{uid}_{view_id}.npy")
125
+
126
+ ###########
127
+ from sklearn.decomposition import PCA
128
+ data_scaled = point_feat / np.linalg.norm(point_feat, axis=-1, keepdims=True)
129
+
130
+ pca = PCA(n_components=3)
131
+
132
+ data_reduced = pca.fit_transform(data_scaled)
133
+ data_reduced = (data_reduced - data_reduced.min()) / (data_reduced.max() - data_reduced.min())
134
+ colors_255 = (data_reduced * 255).astype(np.uint8)
135
+
136
+ points = batch['pc'].squeeze().detach().cpu().numpy()
137
+
138
+ if colors_255 is None:
139
+ colors_255 = np.full_like(points, 255) # Default to white color (255,255,255)
140
+ else:
141
+ assert colors_255.shape == points.shape, "Colors must have the same shape as points"
142
+
143
+ # Convert to structured array for PLY format
144
+ vertex_data = np.array(
145
+ [(*point, *color) for point, color in zip(points, colors_255)],
146
+ dtype=[("x", "f4"), ("y", "f4"), ("z", "f4"), ("red", "u1"), ("green", "u1"), ("blue", "u1")]
147
+ )
148
+
149
+ # Create PLY element
150
+ el = PlyElement.describe(vertex_data, "vertex")
151
+ # Write to file
152
+ filename = f'{save_dir}/feat_pca_{uid}_{view_id}.ply'
153
+ PlyData([el], text=True).write(filename)
154
+ print(f"Saved PLY file: {filename}")
155
+ ############
156
+
157
+ else:
158
+ use_cuda_version = True
159
+ if use_cuda_version:
160
+
161
+ def sample_points(vertices, faces, n_point_per_face):
162
+ # Generate random barycentric coordinates
163
+ # borrowed from Kaolin https://github.com/NVIDIAGameWorks/kaolin/blob/master/kaolin/ops/mesh/trianglemesh.py#L43
164
+ n_f = faces.shape[0]
165
+ u = torch.sqrt(torch.rand((n_f, n_point_per_face, 1),
166
+ device=vertices.device,
167
+ dtype=vertices.dtype))
168
+ v = torch.rand((n_f, n_point_per_face, 1),
169
+ device=vertices.device,
170
+ dtype=vertices.dtype)
171
+ w0 = 1 - u
172
+ w1 = u * (1 - v)
173
+ w2 = u * v
174
+
175
+ face_v_0 = torch.index_select(vertices, 0, faces[:, 0].reshape(-1))
176
+ face_v_1 = torch.index_select(vertices, 0, faces[:, 1].reshape(-1))
177
+ face_v_2 = torch.index_select(vertices, 0, faces[:, 2].reshape(-1))
178
+ points = w0 * face_v_0.unsqueeze(dim=1) + w1 * face_v_1.unsqueeze(dim=1) + w2 * face_v_2.unsqueeze(dim=1)
179
+ return points
180
+
181
+ def sample_and_mean_memory_save_version(part_planes, tensor_vertices, n_point_per_face):
182
+ n_sample_each = self.cfg.n_sample_each # we iterate over this to avoid OOM
183
+ n_v = tensor_vertices.shape[1]
184
+ n_sample = n_v // n_sample_each + 1
185
+ all_sample = []
186
+ for i_sample in range(n_sample):
187
+ sampled_feature = sample_triplane_feat(part_planes, tensor_vertices[:, i_sample * n_sample_each: i_sample * n_sample_each + n_sample_each,])
188
+ assert sampled_feature.shape[1] % n_point_per_face == 0
189
+ sampled_feature = sampled_feature.reshape(1, -1, n_point_per_face, sampled_feature.shape[-1])
190
+ sampled_feature = torch.mean(sampled_feature, axis=-2)
191
+ all_sample.append(sampled_feature)
192
+ return torch.cat(all_sample, dim=1)
193
+
194
+ if self.cfg.vertex_feature:
195
+ tensor_vertices = batch['vertices'][0].reshape(1, -1, 3).to(torch.float32)
196
+ point_feat = sample_and_mean_memory_save_version(part_planes, tensor_vertices, 1)
197
+ else:
198
+ n_point_per_face = self.cfg.n_point_per_face
199
+ tensor_vertices = sample_points(batch['vertices'][0], batch['faces'][0], n_point_per_face)
200
+ tensor_vertices = tensor_vertices.reshape(1, -1, 3).to(torch.float32)
201
+ point_feat = sample_and_mean_memory_save_version(part_planes, tensor_vertices, n_point_per_face) # N, M, C
202
+
203
+ #### Take mean feature in the triangle
204
+ print("Time elapsed for feature prediction: " + str(time.time() - starttime))
205
+ point_feat = point_feat.reshape(-1, 448).cpu().numpy()
206
+ np.save(f'{save_dir}/part_feat_{uid}_{view_id}_batch.npy', point_feat)
207
+ print(f"Exported part_feat_{uid}_{view_id}.npy")
208
+
209
+ ###########
210
+ from sklearn.decomposition import PCA
211
+ data_scaled = point_feat / np.linalg.norm(point_feat, axis=-1, keepdims=True)
212
+
213
+ pca = PCA(n_components=3)
214
+
215
+ data_reduced = pca.fit_transform(data_scaled)
216
+ data_reduced = (data_reduced - data_reduced.min()) / (data_reduced.max() - data_reduced.min())
217
+ colors_255 = (data_reduced * 255).astype(np.uint8)
218
+ V = batch['vertices'][0].cpu().numpy()
219
+ F = batch['faces'][0].cpu().numpy()
220
+ if self.cfg.vertex_feature:
221
+ colored_mesh = trimesh.Trimesh(vertices=V, faces=F, vertex_colors=colors_255, process=False)
222
+ else:
223
+ colored_mesh = trimesh.Trimesh(vertices=V, faces=F, face_colors=colors_255, process=False)
224
+ colored_mesh.export(f'{save_dir}/feat_pca_{uid}_{view_id}.ply')
225
+ ############
226
+ torch.cuda.empty_cache()
227
+
228
+ else:
229
+ ### Mesh input (obj file)
230
+ V = batch['vertices'][0].cpu().numpy()
231
+ F = batch['faces'][0].cpu().numpy()
232
+
233
+ ##### Loop through faces #####
234
+ num_samples_per_face = self.cfg.n_point_per_face
235
+
236
+ all_point_feats = []
237
+ for face in F:
238
+ # Get the vertices of the current face
239
+ v0, v1, v2 = V[face]
240
+
241
+ # Generate random barycentric coordinates
242
+ u = np.random.rand(num_samples_per_face, 1)
243
+ v = np.random.rand(num_samples_per_face, 1)
244
+ is_prob = (u+v) >1
245
+ u[is_prob] = 1 - u[is_prob]
246
+ v[is_prob] = 1 - v[is_prob]
247
+ w = 1 - u - v
248
+
249
+ # Calculate points in Cartesian coordinates
250
+ points = u * v0 + v * v1 + w * v2
251
+
252
+ tensor_vertices = torch.from_numpy(points.copy()).reshape(1, -1, 3).cuda().to(torch.float32)
253
+ point_feat = sample_triplane_feat(part_planes, tensor_vertices) # N, M, C
254
+
255
+ #### Take mean feature in the triangle
256
+ point_feat = torch.mean(point_feat, axis=1).cpu().detach().numpy()
257
+ all_point_feats.append(point_feat)
258
+ ##############################
259
+
260
+ all_point_feats = np.array(all_point_feats).reshape(-1, 448)
261
+
262
+ point_feat = all_point_feats
263
+
264
+ np.save(f'{save_dir}/part_feat_{uid}_{view_id}.npy', point_feat)
265
+ print(f"Exported part_feat_{uid}_{view_id}.npy")
266
+
267
+ ###########
268
+ from sklearn.decomposition import PCA
269
+ data_scaled = point_feat / np.linalg.norm(point_feat, axis=-1, keepdims=True)
270
+
271
+ pca = PCA(n_components=3)
272
+
273
+ data_reduced = pca.fit_transform(data_scaled)
274
+ data_reduced = (data_reduced - data_reduced.min()) / (data_reduced.max() - data_reduced.min())
275
+ colors_255 = (data_reduced * 255).astype(np.uint8)
276
+
277
+ colored_mesh = trimesh.Trimesh(vertices=V, faces=F, face_colors=colors_255, process=False)
278
+ colored_mesh.export(f'{save_dir}/feat_pca_{uid}_{view_id}.ply')
279
+ ############
280
+
281
+ print("Time elapsed: " + str(time.time()-starttime))
282
+
283
+ return
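
The PCA-to-RGB visualisation used three times in predict_step above can be run on its own; the random array below merely stands in for the 448-dimensional PartField features.

import numpy as np
from sklearn.decomposition import PCA

point_feat = np.random.randn(1000, 448).astype(np.float32)    # stand-in features
scaled = point_feat / np.linalg.norm(point_feat, axis=-1, keepdims=True)
reduced = PCA(n_components=3).fit_transform(scaled)
reduced = (reduced - reduced.min()) / (reduced.max() - reduced.min())
colors_255 = (reduced * 255).astype(np.uint8)                 # one RGB triple per point/face
print(colors_255.shape, colors_255.dtype)                     # (1000, 3) uint8
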
modules/PartField/partfield/partfield_encoder.py ADDED
@@ -0,0 +1,103 @@
 
1
+ import torch
2
+ import lightning.pytorch as pl
3
+ # from .dataloader import Demo_Dataset, Demo_Remesh_Dataset, Correspondence_Demo_Dataset
4
+ from torch.utils.data import DataLoader
5
+ from partfield.model.UNet.model import ResidualUNet3D
6
+ from partfield.model.triplane import TriplaneTransformer, get_grid_coord #, sample_from_planes, Voxel2Triplane
7
+ from partfield.model.model_utils import VanillaMLP
8
+ import torch.nn.functional as F
9
+ import torch.nn as nn
10
+ import os
11
+ import trimesh
12
+ import skimage
13
+ import numpy as np
14
+ import h5py
15
+ import torch.distributed as dist
16
+ from partfield.model.PVCNN.encoder_pc import TriPlanePC2Encoder, sample_triplane_feat
17
+ import json
18
+ import gc
19
+ import time
20
+ from plyfile import PlyData, PlyElement
21
+
22
+
23
+ class Model(pl.LightningModule):
24
+ def __init__(self, cfg):
25
+ super().__init__()
26
+
27
+ self.save_hyperparameters()
28
+ self.cfg = cfg
29
+ self.automatic_optimization = False
30
+ self.triplane_resolution = cfg.triplane_resolution
31
+ self.triplane_channels_low = cfg.triplane_channels_low
32
+ self.triplane_transformer = TriplaneTransformer(
33
+ input_dim=cfg.triplane_channels_low * 2,
34
+ transformer_dim=1024,
35
+ transformer_layers=6,
36
+ transformer_heads=8,
37
+ triplane_low_res=32,
38
+ triplane_high_res=128,
39
+ triplane_dim=cfg.triplane_channels_high,
40
+ )
41
+ self.sdf_decoder = VanillaMLP(input_dim=64,
42
+ output_dim=1,
43
+ out_activation="tanh",
44
+ n_neurons=64, #64
45
+ n_hidden_layers=6) #6
46
+ self.use_pvcnn = cfg.use_pvcnnonly
47
+ self.use_2d_feat = cfg.use_2d_feat
48
+ if self.use_pvcnn:
49
+ self.pvcnn = TriPlanePC2Encoder(
50
+ cfg.pvcnn,
51
+ device="cuda",
52
+ shape_min=-1,
53
+ shape_length=2,
54
+ use_2d_feat=self.use_2d_feat) #.cuda()
55
+ self.logit_scale = nn.Parameter(torch.tensor([1.0], requires_grad=True))
56
+ self.grid_coord = get_grid_coord(256)
57
+ self.mse_loss = torch.nn.MSELoss()
58
+ self.l1_loss = torch.nn.L1Loss(reduction='none')
59
+
60
+ if cfg.regress_2d_feat:
61
+ self.feat_decoder = VanillaMLP(input_dim=64,
62
+ output_dim=192,
63
+ out_activation="GELU",
64
+ n_neurons=64, #64
65
+ n_hidden_layers=6) #6
66
+
67
+ # def predict_dataloader(self):
68
+ # if self.cfg.remesh_demo:
69
+ # dataset = Demo_Remesh_Dataset(self.cfg)
70
+ # elif self.cfg.correspondence_demo:
71
+ # dataset = Correspondence_Demo_Dataset(self.cfg)
72
+ # else:
73
+ # dataset = Demo_Dataset(self.cfg)
74
+
75
+ # dataloader = DataLoader(dataset,
76
+ # num_workers=self.cfg.dataset.val_num_workers,
77
+ # batch_size=self.cfg.dataset.val_batch_size,
78
+ # shuffle=False,
79
+ # pin_memory=True,
80
+ # drop_last=False)
81
+
82
+ # return dataloader
83
+
84
+
85
+ @torch.no_grad()
86
+ def encode(self, points):
87
+
88
+ N = points.shape[0]
89
+ # assert N == 1
90
+ pcd = points[..., :3]
91
+
92
+ pc_feat = self.pvcnn(pcd, pcd)
93
+
94
+ planes = pc_feat
95
+ planes = self.triplane_transformer(planes)
96
+ sdf_planes, part_planes = torch.split(planes, [64, planes.shape[2] - 64], dim=2)
97
+
98
+ tensor_vertices = pcd.reshape(N, -1, 3).cuda().to(pcd.dtype)
99
+ point_feat = sample_triplane_feat(part_planes, tensor_vertices) # N, M, C
100
+ # point_feat = point_feat.cpu().detach().numpy().reshape(-1, 448)
101
+ point_feat = point_feat.reshape(N, -1, 448)
102
+
103
+ return point_feat
modules/PartField/partfield/utils.py ADDED
@@ -0,0 +1,5 @@
 
1
+ import trimesh
2
+
3
+ def load_mesh_util(input_fname):
4
+ mesh = trimesh.load(input_fname, force='mesh', process=False)
5
+ return mesh
modules/bbox_gen/config.py ADDED
@@ -0,0 +1,57 @@
 
1
+ import os
2
+ from omegaconf import OmegaConf, DictConfig
3
+ from dataclasses import dataclass, field
4
+ from typing import Any, Dict, List, Optional, Union
5
+ from datetime import datetime
6
+
7
+ @dataclass
8
+ class ExperimentConfig:
9
+ name: str = "default"
10
+ tag: str = ""
11
+ use_timestamp: bool = False
12
+ timestamp: Optional[str] = None
13
+ exp_root_dir: str = "outputs"
14
+
15
+ ### these shouldn't be set manually
16
+ exp_dir: str = "outputs/default"
17
+ trial_name: str = "exp"
18
+ trial_dir: str = "outputs/default/exp"
19
+ ###
20
+
21
+ resume: Optional[str] = None
22
+ ckpt_path: Optional[str] = None
23
+
24
+ data: dict = field(default_factory=dict)
25
+ model_pl: dict = field(default_factory=dict)
26
+
27
+ trainer: dict = field(default_factory=dict)
28
+ checkpoint: dict = field(default_factory=dict)
29
+ checkpoint_epoch: Optional[dict] = None
30
+ wandb: dict = field(default_factory=dict)
31
+
32
+
33
+ def load_config(*yamls: str, cli_args: list = [], from_string=False, **kwargs) -> Any:
34
+ if from_string:
35
+ yaml_confs = [OmegaConf.create(s) for s in yamls]
36
+ else:
37
+ yaml_confs = [OmegaConf.load(f) for f in yamls]
38
+ cli_conf = OmegaConf.from_cli(cli_args)
39
+ cfg = OmegaConf.merge(*yaml_confs, cli_conf, kwargs)
40
+ OmegaConf.resolve(cfg)
41
+ assert isinstance(cfg, DictConfig)
42
+ scfg = parse_structured(ExperimentConfig, cfg)
43
+ return scfg
44
+
45
+
46
+ def config_to_primitive(config, resolve: bool = True) -> Any:
47
+ return OmegaConf.to_container(config, resolve=resolve)
48
+
49
+
50
+ def dump_config(path: str, config) -> None:
51
+ with open(path, "w") as fp:
52
+ OmegaConf.save(config=config, f=fp)
53
+
54
+
55
+ def parse_structured(fields: Any, cfg: Optional[Union[dict, DictConfig]] = None) -> Any:
56
+ scfg = OmegaConf.structured(fields(**cfg))
57
+ return scfg
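
A minimal sketch of how these helpers compose (the inline YAML and its values are made up; only fields declared on ExperimentConfig above are accepted):

from modules.bbox_gen.config import load_config, config_to_primitive

cfg = load_config("name: bbox_gen\nexp_root_dir: outputs", from_string=True)
print(cfg.name, cfg.trial_dir)                 # bbox_gen outputs/default/exp
print(config_to_primitive(cfg)["model_pl"])    # {} until a real YAML such as configs/bbox_gen.yaml fills it in
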
modules/bbox_gen/models/autogressive_bbox_gen.py ADDED
@@ -0,0 +1,305 @@
 
1
+ from dataclasses import dataclass
2
+ import os
3
+ import sys
4
+ import torch
5
+ import trimesh
6
+ from torch import nn
7
+ from transformers import AutoModelForCausalLM
8
+ from transformers.generation.logits_process import LogitsProcessorList
9
+ from einops import rearrange
10
+
11
+ from modules.bbox_gen.models.image_encoder import DINOv2ImageEncoder
12
+ from modules.bbox_gen.config import parse_structured
13
+ from modules.bbox_gen.models.bboxopt import BBoxOPT, BBoxOPTConfig
14
+ from modules.bbox_gen.utils.bbox_tokenizer import BoundsTokenizerDiag
15
+ from modules.bbox_gen.models.bbox_gen_models import GroupEmbedding, MultiModalProjector, MeshDecodeLogitsProcessor, SparseStructureEncoder
16
+
17
+ current_dir = os.path.dirname(os.path.abspath(__file__))
18
+ modules_dir = os.path.dirname(os.path.dirname(current_dir))
19
+ partfield_dir = os.path.join(modules_dir, 'PartField')
20
+ if partfield_dir not in sys.path:
21
+ sys.path.insert(0, partfield_dir)
22
+ import importlib.util
23
+ from partfield.config import default_argument_parser, setup
24
+
25
+
26
+ class BboxGen(nn.Module):
27
+
28
+ @dataclass
29
+ class Config:
30
+ # encoder config
31
+ encoder_dim_feat: int = 3
32
+ encoder_dim: int = 64
33
+ encoder_heads: int = 4
34
+ encoder_token_num: int = 256
35
+ encoder_qkv_bias: bool = False
36
+ encoder_use_ln_post: bool = True
37
+ encoder_use_checkpoint: bool = False
38
+ encoder_num_embed_freqs: int = 8
39
+ encoder_embed_include_pi: bool = False
40
+ encoder_init_scale: float = 0.25
41
+ encoder_random_fps: bool = True
42
+ encoder_learnable_query: bool = False
43
+ encoder_layers: int = 4
44
+ group_embedding_dim: int = 64
45
+
46
+ # decoder config
47
+ vocab_size: int = 518
48
+ decoder_hidden_size: int = 1536
49
+ decoder_num_hidden_layers: int = 24
50
+ decoder_ffn_dim: int = 6144
51
+ decoder_heads: int = 16
52
+ decoder_use_flash_attention: bool = True
53
+ decoder_gradient_checkpointing: bool = True
54
+
55
+ # data config
56
+ bins: int = 64
57
+ BOS_id: int = 64
58
+ EOS_id: int = 65
59
+ PAD_id: int = 66
60
+ max_length: int = 2187 # bos + 50x2x3 + 1374 + 512
61
+ voxel_token_length: int = 1886
62
+ voxel_token_placeholder: int = -1
63
+
64
+ # tokenizer config
65
+ max_group_size: int = 50
66
+
67
+ # voxel encoder
68
+ partfield_encoder_path: str = ""
69
+
70
+ cfg: Config
71
+
72
+ def __init__(self, cfg):
73
+ super().__init__()
74
+ self.cfg = parse_structured(self.Config, cfg)
75
+
76
+ self.image_encoder = DINOv2ImageEncoder(
77
+ model_name="facebook/dinov2-with-registers-large",
78
+ )
79
+
80
+ self.image_projector = MultiModalProjector(
81
+ in_features=(1024 + self.cfg.group_embedding_dim),
82
+ out_features=self.cfg.decoder_hidden_size,
83
+ )
84
+
85
+ self.group_embedding = GroupEmbedding(
86
+ max_group_size=self.cfg.max_group_size,
87
+ hidden_size=self.cfg.group_embedding_dim,
88
+ )
89
+
90
+ self.decoder_config = BBoxOPTConfig(
91
+ vocab_size=self.cfg.vocab_size,
92
+ hidden_size=self.cfg.decoder_hidden_size,
93
+ num_hidden_layers=self.cfg.decoder_num_hidden_layers,
94
+ ffn_dim=self.cfg.decoder_ffn_dim,
95
+ max_position_embeddings=self.cfg.max_length,
96
+ num_attention_heads=self.cfg.decoder_heads,
97
+ pad_token_id=self.cfg.PAD_id,
98
+ bos_token_id=self.cfg.BOS_id,
99
+ eos_token_id=self.cfg.EOS_id,
100
+ use_cache=True,
101
+ init_std=0.02,
102
+ )
103
+
104
+ if self.cfg.decoder_use_flash_attention:
105
+ self.decoder: BBoxOPT = AutoModelForCausalLM.from_config(
106
+ self.decoder_config,
107
+ torch_dtype=torch.bfloat16,
108
+ attn_implementation="flash_attention_2"
109
+ )
110
+ else:
111
+ self.decoder: BBoxOPT = AutoModelForCausalLM.from_config(
112
+ self.decoder_config,
113
+ )
114
+ if self.cfg.decoder_gradient_checkpointing:
115
+ self.decoder.gradient_checkpointing_enable()
116
+
117
+ self.logits_processor = LogitsProcessorList()
118
+
119
+ self.logits_processor.append(MeshDecodeLogitsProcessor(
120
+ bins=self.cfg.bins,
121
+ BOS_id=self.cfg.BOS_id,
122
+ EOS_id=self.cfg.EOS_id,
123
+ PAD_id=self.cfg.PAD_id,
124
+ vertices_num=2,
125
+ ))
126
+ self.tokenizer = BoundsTokenizerDiag(
127
+ bins=self.cfg.bins,
128
+ BOS_id=self.cfg.BOS_id,
129
+ EOS_id=self.cfg.EOS_id,
130
+ PAD_id=self.cfg.PAD_id,
131
+ )
132
+
133
+ self._load_partfield_encoder()
134
+
135
+ self.partfield_voxel_encoder = SparseStructureEncoder(
136
+ in_channels=451,
137
+ channels=[448, 448, 448, 1024],
138
+ latent_channels=448,
139
+ num_res_blocks=1,
140
+ num_res_blocks_middle=1,
141
+ norm_type="layer",
142
+ )
143
+
144
+
145
+ def _load_partfield_encoder(self):
146
+ # Load PartField encoder
147
+ model_spec = importlib.util.spec_from_file_location(
148
+ "partfield.partfield_encoder",
149
+ os.path.join(partfield_dir, "partfield", "partfield_encoder.py")
150
+ )
151
+ model_module = importlib.util.module_from_spec(model_spec)
152
+ model_spec.loader.exec_module(model_module)
153
+ Model = model_module.Model
154
+ parser = default_argument_parser()
155
+ args = []
156
+ args.extend(["-c", os.path.join(partfield_dir, "configs/final/demo.yaml")])
157
+ args.append("--opts")
158
+ args.extend(["continue_ckpt", self.cfg.partfield_encoder_path])
159
+ parsed_args = parser.parse_args(args)
160
+ cfg = setup(parsed_args, freeze=False)
161
+ self.partfield_encoder = Model(cfg)
162
+ self.partfield_encoder.eval()
163
+ weights = torch.load(self.cfg.partfield_encoder_path)["state_dict"]
164
+ self.partfield_encoder.load_state_dict(weights)
165
+ for param in self.partfield_encoder.parameters():
166
+ param.requires_grad = False
167
+ print("PartField encoder loaded")
168
+
169
+ def _prepare_lm_inputs(self, voxel_token, input_ids):
170
+ inputs_embeds = torch.zeros(input_ids.shape[0], input_ids.shape[1], self.cfg.decoder_hidden_size, device=input_ids.device, dtype=voxel_token.dtype)
171
+ voxel_token_mask = (input_ids == self.cfg.voxel_token_placeholder)
172
+ inputs_embeds[voxel_token_mask] = voxel_token.view(-1, self.cfg.decoder_hidden_size)
173
+
174
+ inputs_embeds[~voxel_token_mask] = self.decoder.get_input_embeddings()(input_ids[~voxel_token_mask]).to(dtype=inputs_embeds.dtype)
175
+
176
+ attention_mask = (input_ids != self.cfg.PAD_id)
177
+ return inputs_embeds, attention_mask.long()
178
+
179
+ def forward(self, batch):
180
+
181
+ image_latents = self.image_encoder(batch['images'])
182
+ masks = batch['masks']
183
+ masks_emb = self.group_embedding(masks)
184
+ masks_emb = rearrange(masks_emb, 'b c h w -> b (h w) c') # B x Q x C
185
+ group_emb = torch.zeros((image_latents.shape[0], image_latents.shape[1], masks_emb.shape[2]), device=image_latents.device, dtype=image_latents.dtype)
186
+ group_emb[:, :masks_emb.shape[1], :] = masks_emb
187
+ image_latents = torch.cat([image_latents, group_emb], dim=-1)
188
+ image_latents = self.image_projector(image_latents)
189
+
190
+ points = batch['points'][..., :3]
191
+ rot_matrix = torch.tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]], device=points.device, dtype=points.dtype)
192
+ rot_points = torch.matmul(points, rot_matrix)
193
+ rot_points = rot_points * (2 * 0.9) # scale (-0.5, 0.5) to (-0.9, 0.9), i.e. 90% of the (-1, 1) range PartField expects
194
+
195
+ partfield_feat = self.partfield_encoder.encode(rot_points)
196
+ feat_volume = torch.zeros((points.shape[0], 448, 64, 64, 64), device=partfield_feat.device, dtype=partfield_feat.dtype)
197
+ whole_voxel_index = batch['whole_voxel_index'] # (b, m, 3)
198
+
199
+ batch_size, num_points = whole_voxel_index.shape[0], whole_voxel_index.shape[1]
200
+ batch_indices = torch.arange(batch_size, device=whole_voxel_index.device).unsqueeze(1).expand(-1, num_points) # (b, m)
201
+ batch_flat = batch_indices.flatten() # (b*m,)
202
+ x_flat = whole_voxel_index[..., 0].flatten() # (b*m,)
203
+ y_flat = whole_voxel_index[..., 1].flatten() # (b*m,)
204
+ z_flat = whole_voxel_index[..., 2].flatten() # (b*m,)
205
+ partfield_feat_flat = partfield_feat.reshape(-1, 448) # (b*m, 448)
206
+ feat_volume[batch_flat, :, x_flat, y_flat, z_flat] = partfield_feat_flat
207
+
208
+ xyz_volume = torch.zeros((points.shape[0], 3, 64, 64, 64), device=points.device, dtype=points.dtype)
209
+ xyz_volume[batch_flat, :, x_flat, y_flat, z_flat] = points.reshape(-1, 3)
210
+ feat_volume = torch.cat([feat_volume, xyz_volume], dim=1)
211
+
212
+ feat_volume = self.partfield_voxel_encoder(feat_volume)
213
+ feat_volume = rearrange(feat_volume, 'b c x y z -> b (x y z) c')
214
+
215
+ voxel_token = torch.cat([image_latents, feat_volume], dim=1) # B x N x D
216
+
217
+ input_ids = batch['input_ids']
218
+ inputs_embeds, attention_mask = self._prepare_lm_inputs(voxel_token, input_ids)
219
+ output = self.decoder(
220
+ attention_mask=attention_mask,
221
+ inputs_embeds=inputs_embeds,
222
+ return_dict=True,
223
+ )
224
+ return {
225
+ "logits": output.logits,
226
+ }
227
+
228
+ def gen_mesh_from_bounds(self, bounds, random_color):
229
+ bboxes = []
230
+ for j in range(bounds.shape[0]):
231
+ bbox = trimesh.primitives.Box(bounds=bounds[j])
232
+ color = random_color[j]
233
+ bbox.visual.vertex_colors = color
234
+ bboxes.append(bbox)
235
+ mesh = trimesh.Scene(bboxes)
236
+ return mesh
237
+
238
+ def generate(self, batch):
239
+
240
+ image_latents = self.image_encoder(batch['images'])
241
+ masks = batch['masks']
242
+ masks_emb = self.group_embedding(masks)
243
+ masks_emb = rearrange(masks_emb, 'b c h w -> b (h w) c') # B x Q x C
244
+ group_emb = torch.zeros((image_latents.shape[0], image_latents.shape[1], masks_emb.shape[2]), device=image_latents.device, dtype=image_latents.dtype)
245
+ group_emb[:, :masks_emb.shape[1], :] = masks_emb
246
+ image_latents = torch.cat([image_latents, group_emb], dim=-1)
247
+ image_latents = self.image_projector(image_latents)
248
+
249
+ points = batch['points'][..., :3]
250
+ rot_matrix = torch.tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]], device=points.device, dtype=points.dtype)
251
+ rot_points = torch.matmul(points, rot_matrix)
252
+ rot_points = rot_points * (2 * 0.9) # scale (-0.5, 0.5) to (-0.9, 0.9), i.e. 90% of the (-1, 1) range PartField expects
253
+
254
+ partfield_feat = self.partfield_encoder.encode(rot_points)
255
+ feat_volume = torch.zeros((points.shape[0], 448, 64, 64, 64), device=partfield_feat.device, dtype=partfield_feat.dtype)
256
+ whole_voxel_index = batch['whole_voxel_index'] # (b, m, 3)
257
+
258
+ batch_size, num_points = whole_voxel_index.shape[0], whole_voxel_index.shape[1]
259
+ batch_indices = torch.arange(batch_size, device=whole_voxel_index.device).unsqueeze(1).expand(-1, num_points) # (b, m)
260
+ batch_flat = batch_indices.flatten() # (b*m,)
261
+ x_flat = whole_voxel_index[..., 0].flatten() # (b*m,)
262
+ y_flat = whole_voxel_index[..., 1].flatten() # (b*m,)
263
+ z_flat = whole_voxel_index[..., 2].flatten() # (b*m,)
264
+ partfield_feat_flat = partfield_feat.reshape(-1, 448) # (b*m, 448)
265
+ feat_volume[batch_flat, :, x_flat, y_flat, z_flat] = partfield_feat_flat
266
+
267
+ xyz_volume = torch.zeros((points.shape[0], 3, 64, 64, 64), device=points.device, dtype=points.dtype)
268
+ xyz_volume[batch_flat, :, x_flat, y_flat, z_flat] = points.reshape(-1, 3)
269
+ feat_volume = torch.cat([feat_volume, xyz_volume], dim=1)
270
+
271
+ feat_volume = self.partfield_voxel_encoder(feat_volume)
272
+ feat_volume = rearrange(feat_volume, 'b c x y z -> b (x y z) c')
273
+
274
+ voxel_token = torch.cat([image_latents, feat_volume], dim=1) # B x N x D
275
+
276
+ meshes = []
277
+ mesh_names = []
278
+ bboxes = []
279
+
280
+ output = self.decoder.generate(
281
+ inputs_embeds=voxel_token,
282
+ max_new_tokens=self.cfg.max_length - voxel_token.shape[1],
283
+ logits_processor=self.logits_processor,
284
+ do_sample=True,
285
+ top_k=5,
286
+ top_p=0.95,
287
+ temperature=0.5,
288
+ use_cache=True,
289
+ )
290
+
291
+ for i in range(output.shape[0]):
292
+ bounds = self.tokenizer.decode(output[i].detach().cpu().numpy(), coord_rg=(-0.5, 0.5))
293
+ # mesh = self.gen_mesh_from_bounds(bounds, batch['random_color'][i])
294
+ # meshes.append(mesh)
295
+ mesh_names.append("topk=5")
296
+ bboxes.append(bounds)
297
+
298
+ return {
299
+ # 'meshes': meshes,
300
+ 'mesh_names': mesh_names,
301
+ 'bboxes': bboxes,
302
+ }
303
+
304
+
305
+
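
The scatter that builds the dense feature volume in both forward and generate above reduces to a single advanced-indexing assignment; here is a standalone, down-scaled sketch of that pattern (C=8 instead of 448):

import torch

B, M, C = 2, 5, 8                                # batch, occupied voxels, channels
feat = torch.randn(B, M, C)                      # per-voxel features
idx = torch.randint(0, 64, (B, M, 3))            # integer voxel coordinates in a 64^3 grid

volume = torch.zeros(B, C, 64, 64, 64)
b = torch.arange(B).unsqueeze(1).expand(-1, M).flatten()
x, y, z = idx[..., 0].flatten(), idx[..., 1].flatten(), idx[..., 2].flatten()
volume[b, :, x, y, z] = feat.reshape(-1, C)      # all voxels written in one shot
print(torch.allclose(volume[b[0], :, x[0], y[0], z[0]], feat[0, 0]))  # True, barring an index collision
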
modules/bbox_gen/models/bbox_gen_models.py ADDED
@@ -0,0 +1,215 @@
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+ from diffusers.models.normalization import FP32LayerNorm
5
+ from diffusers.models.attention import FeedForward
6
+ from transformers.generation.logits_process import LogitsProcessor
7
+ from typing import List, Literal, Optional
8
+
9
+ from modules.bbox_gen.modules.norm import GroupNorm32, ChannelLayerNorm32
10
+
11
+
12
+ class GroupEmbedding(nn.Module):
13
+ def __init__(self, max_group_size, hidden_size=64):
14
+ super().__init__()
15
+
16
+ self.group_embedding = nn.Embedding(max_group_size + 1, hidden_size) # +1 for background
17
+ self.group_embedding.weight.data.normal_(mean=0.0, std=0.02)
18
+
19
+ def forward(self, masks):
20
+ batch_size, height, width = masks.shape
21
+ masks_flat = masks.reshape(batch_size, -1)
22
+ embeddings = self.group_embedding(masks_flat)
23
+ embeddings = embeddings.reshape(batch_size, height, width, -1)
24
+ embeddings = embeddings.permute(0, 3, 1, 2)
25
+ return embeddings
26
+
27
+
28
+ class MultiModalProjector(torch.nn.Module):
29
+ def __init__(self, in_features: int, out_features: int, pos_embed_seq_len=None):
30
+ super().__init__()
31
+
32
+ self.norm1 = FP32LayerNorm(in_features)
33
+ self.ff = FeedForward(in_features, out_features, mult=1, activation_fn="gelu")
34
+ self.norm2 = FP32LayerNorm(out_features)
35
+ if pos_embed_seq_len is not None:
36
+ self.pos_embed = nn.Parameter(torch.zeros(1, pos_embed_seq_len, in_features))
37
+ else:
38
+ self.pos_embed = None
39
+
40
+ def forward(self, encoder_hidden_states_image: torch.Tensor) -> torch.Tensor:
41
+ if self.pos_embed is not None:
42
+ batch_size, seq_len, embed_dim = encoder_hidden_states_image.shape
43
+ encoder_hidden_states_image = encoder_hidden_states_image.view(-1, 2 * seq_len, embed_dim)
44
+ encoder_hidden_states_image = encoder_hidden_states_image + self.pos_embed
45
+
46
+ hidden_states = self.norm1(encoder_hidden_states_image)
47
+ hidden_states = self.ff(hidden_states)
48
+ hidden_states = self.norm2(hidden_states)
49
+ return hidden_states
50
+
51
+
52
+ class MeshDecodeLogitsProcessor(LogitsProcessor):
53
+ def __init__(self, bins, BOS_id, EOS_id, PAD_id, vertices_num=8):
54
+ super().__init__()
55
+ self.bins = bins
56
+ self.BOS_id = BOS_id
57
+ self.EOS_id = EOS_id
58
+ self.PAD_id = PAD_id
59
+ self.filter_value = -float('inf')
60
+ self.vertices_num = vertices_num
61
+
62
+ def force_token(self, scores, token_id):
63
+ mask = torch.ones_like(scores, dtype=torch.bool)
64
+ mask[:, token_id] = False
65
+ scores[mask] = self.filter_value
66
+
67
+ def __call__(self, input_ids, scores):
68
+ # decoding rules: the first token must be BOS; the tokens that follow
69
+ # must be coordinate bins; EOS is only allowed once a whole box is complete
70
+ current_len = input_ids.shape[-1]
71
+ if current_len == 0:
72
+ # force bos
73
+ self.force_token(scores, self.BOS_id)
74
+ elif current_len <= self.vertices_num * 3 + 1:
75
+ scores[:, self.bins:] = self.filter_value
76
+ else:
77
+ scores[:, self.BOS_id] = self.filter_value
78
+ scores[:, self.PAD_id] = self.filter_value
79
+
80
+ effective_tokens = current_len - 1
81
+ complete_boxes = effective_tokens % (self.vertices_num * 3) == 0
82
+ # print(effective_tokens, complete_boxes)
83
+ if not complete_boxes:
84
+ scores[:, self.EOS_id] = self.filter_value
85
+
86
+ return scores
87
+
88
+
89
+ def norm_layer(norm_type: str, *args, **kwargs) -> nn.Module:
90
+ """
91
+ Return a normalization layer.
92
+ """
93
+ if norm_type == "group":
94
+ return GroupNorm32(32, *args, **kwargs)
95
+ elif norm_type == "layer":
96
+ return ChannelLayerNorm32(*args, **kwargs)
97
+ else:
98
+ raise ValueError(f"Invalid norm type {norm_type}")
99
+
100
+
101
+ class ResBlock3d(nn.Module):
102
+ def __init__(
103
+ self,
104
+ channels: int,
105
+ out_channels: Optional[int] = None,
106
+ norm_type: Literal["group", "layer"] = "layer",
107
+ ):
108
+ super().__init__()
109
+ self.channels = channels
110
+ self.out_channels = out_channels or channels
111
+
112
+ self.norm1 = norm_layer(norm_type, channels)
113
+ self.norm2 = norm_layer(norm_type, self.out_channels)
114
+ self.conv1 = nn.Conv3d(channels, self.out_channels, 3, padding=1)
115
+ self.conv2 = zero_module(nn.Conv3d(self.out_channels, self.out_channels, 3, padding=1))
116
+ self.skip_connection = nn.Conv3d(channels, self.out_channels, 1) if channels != self.out_channels else nn.Identity()
117
+
118
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
119
+ h = self.norm1(x)
120
+ h = F.silu(h)
121
+ h = self.conv1(h)
122
+ h = self.norm2(h)
123
+ h = F.silu(h)
124
+ h = self.conv2(h)
125
+ h = h + self.skip_connection(x)
126
+ return h
127
+
128
+
129
+ class DownsampleBlock3d(nn.Module):
130
+ def __init__(
131
+ self,
132
+ in_channels: int,
133
+ out_channels: int,
134
+ mode: Literal["conv", "avgpool"] = "conv",
135
+ ):
136
+ assert mode in ["conv", "avgpool"], f"Invalid mode {mode}"
137
+
138
+ super().__init__()
139
+ self.in_channels = in_channels
140
+ self.out_channels = out_channels
141
+
142
+ if mode == "conv":
143
+ self.conv = nn.Conv3d(in_channels, out_channels, 2, stride=2)
144
+ elif mode == "avgpool":
145
+ assert in_channels == out_channels, "Pooling mode requires in_channels to be equal to out_channels"
146
+
147
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
148
+ if hasattr(self, "conv"):
149
+ return self.conv(x)
150
+ else:
151
+ return F.avg_pool3d(x, 2)
152
+
153
+
154
+ def zero_module(module):
155
+ """
156
+ Zero out the parameters of a module and return it.
157
+ """
158
+ for p in module.parameters():
159
+ p.detach().zero_()
160
+ return module
161
+
162
+
163
+ class SparseStructureEncoder(nn.Module):
164
+ def __init__(
165
+ self,
166
+ in_channels: int,
167
+ latent_channels: int,
168
+ num_res_blocks: int,
169
+ channels: List[int],
170
+ num_res_blocks_middle: int = 2,
171
+ norm_type: Literal["group", "layer"] = "layer",
172
+ ):
173
+ super().__init__()
174
+ self.in_channels = in_channels
175
+ self.latent_channels = latent_channels
176
+ self.num_res_blocks = num_res_blocks
177
+ self.channels = channels
178
+ self.num_res_blocks_middle = num_res_blocks_middle
179
+ self.norm_type = norm_type
180
+ self.dtype = torch.float16
181
+ self.input_layer = nn.Conv3d(in_channels, channels[0], 3, padding=1)
182
+
183
+ self.blocks = nn.ModuleList([])
184
+ for i, ch in enumerate(channels):
185
+ self.blocks.extend([
186
+ ResBlock3d(ch, ch)
187
+ for _ in range(num_res_blocks)
188
+ ])
189
+ if i < len(channels) - 1:
190
+ self.blocks.append(
191
+ DownsampleBlock3d(ch, channels[i+1])
192
+ )
193
+
194
+ self.middle_block = nn.Sequential(*[
195
+ ResBlock3d(channels[-1], channels[-1])
196
+ for _ in range(num_res_blocks_middle)
197
+ ])
198
+
199
+ @property
200
+ def device(self) -> torch.device:
201
+ """
202
+ Return the device of the model.
203
+ """
204
+ return next(self.parameters()).device
205
+
206
+ def forward(self, x: torch.Tensor):
207
+ h = self.input_layer(x)
208
+ h = h.type(self.dtype)
209
+
210
+ for block in self.blocks:
211
+ h = block(h)
212
+ h = self.middle_block(h)
213
+
214
+ h = h.type(x.dtype)
215
+ return h
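
A usage sketch for GroupEmbedding above (sizes are illustrative): per-pixel part-group ids are looked up in an embedding table and returned channels-first, ready to be concatenated with image features as BboxGen does.

import torch
from modules.bbox_gen.models.bbox_gen_models import GroupEmbedding

embed = GroupEmbedding(max_group_size=50, hidden_size=64)
masks = torch.randint(0, 51, (2, 37, 37))   # ids 0..50, with one slot reserved for background
emb = embed(masks)                          # (B, hidden_size, H, W)
print(emb.shape)                            # torch.Size([2, 64, 37, 37])
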
modules/bbox_gen/models/bboxopt.py ADDED
@@ -0,0 +1,221 @@
 
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from transformers import AutoModelForCausalLM, AutoConfig
+from transformers.models.opt.modeling_opt import OPTForCausalLM, OPTModel, OPTDecoder, OPTConfig
+
+from transformers.utils import logging
+from typing import Optional, Union
+
+from transformers.generation.logits_process import LogitsProcessorList
+from transformers.generation.utils import GenerateNonBeamOutput, GenerateEncoderDecoderOutput, GenerateDecoderOnlyOutput
+from transformers.generation.stopping_criteria import StoppingCriteriaList
+from transformers.generation.configuration_utils import GenerationConfig
+from transformers.generation.streamers import BaseStreamer
+
+logger = logging.get_logger(__name__)
+
+class BBoxOPTConfig(OPTConfig):
+    model_type = "mesh_opt"
+
+class BBoxOPTDecoder(OPTDecoder):
+    config_class = BBoxOPTConfig
+
+class BBoxOPTModel(OPTModel):
+    config_class = BBoxOPTConfig
+    def __init__(self, config: BBoxOPTConfig):
+        super(OPTModel, self).__init__(config)
+        self.decoder = BBoxOPTDecoder(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+class BBoxOPT(OPTForCausalLM):
+    config_class = BBoxOPTConfig
+
+    def __init__(self, config: BBoxOPTConfig):
+        super(OPTForCausalLM, self).__init__(config)
+        self.model = BBoxOPTModel(config)
+
+        # the lm_head weight is automatically tied to the embed tokens weight
+        self.lm_head = nn.Linear(config.word_embed_proj_dim, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def _sample(
+        self,
+        input_ids: torch.LongTensor,
+        logits_processor: LogitsProcessorList,
+        stopping_criteria: StoppingCriteriaList,
+        generation_config: GenerationConfig,
+        synced_gpus: bool,
+        streamer: Optional["BaseStreamer"],
+        **model_kwargs,
+    ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
+        r"""
+        Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
+        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
+
+        Parameters:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                The sequence used as a prompt for the generation.
+            logits_processor (`LogitsProcessorList`):
+                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+                used to modify the prediction scores of the language modeling head applied at each generation step.
+            stopping_criteria (`StoppingCriteriaList`):
+                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+                used to tell if the generation loop should stop.
+            generation_config ([`~generation.GenerationConfig`]):
+                The generation configuration to be used as parametrization of the decoding method.
+            synced_gpus (`bool`):
+                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
+            streamer (`BaseStreamer`, *optional*):
+                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
+                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
+            model_kwargs:
+                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
+                an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+        Return:
+            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`:
+            A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
+        """
+        # init values
+        pad_token_id = generation_config._pad_token_tensor
+        output_attentions = generation_config.output_attentions
+        output_hidden_states = generation_config.output_hidden_states
+        output_scores = generation_config.output_scores
+        output_logits = generation_config.output_logits
+        return_dict_in_generate = generation_config.return_dict_in_generate
+        max_length = generation_config.max_length
+        has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
+        do_sample = generation_config.do_sample
+
+        # init attention / hidden states / scores tuples
+        scores = () if (return_dict_in_generate and output_scores) else None
+        raw_logits = () if (return_dict_in_generate and output_logits) else None
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+        if return_dict_in_generate and self.config.is_encoder_decoder:
+            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+            encoder_hidden_states = (
+                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+            )
+
+        # keep track of which sequences are already finished
+        batch_size, cur_len = input_ids.shape
+        this_peer_finished = False
+        unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
+        model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
+
+        while self._has_unfinished_sequences(
+            this_peer_finished, synced_gpus, device=input_ids.device
+        ) and cur_len < max_length:
+            # prepare model inputs
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+            # prepare variable output controls (note: some models won't accept all output controls)
+            model_inputs.update({"output_attentions": output_attentions} if output_attentions else {})
+            model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
+
+            # forward pass to get next token
+            outputs = self(**model_inputs, return_dict=True)
+
+            if synced_gpus and this_peer_finished:
+                continue  # don't waste resources running the code we don't need
+
+            # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration
+            # (the clone itself is always small)
+            next_token_logits = outputs.logits.clone()[:, -1, :].float()
+
+            # pre-process distribution
+            next_token_scores = logits_processor(input_ids, next_token_logits)
+
+            # Store scores, attentions and hidden_states when required
+            if return_dict_in_generate:
+                if output_scores:
+                    scores += (next_token_scores,)
+                if output_logits:
+                    raw_logits += (next_token_logits,)
+                if output_attentions:
+                    decoder_attentions += (
+                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+                    )
+                    if self.config.is_encoder_decoder:
+                        cross_attentions += (outputs.cross_attentions,)
+
+                if output_hidden_states:
+                    decoder_hidden_states += (
+                        (outputs.decoder_hidden_states,)
+                        if self.config.is_encoder_decoder
+                        else (outputs.hidden_states,)
+                    )
+
+            # token selection
+            if do_sample:
+                probs = nn.functional.softmax(next_token_scores, dim=-1)
+                # TODO (joao): this OP throws "skipping cudagraphs due to ['incompatible ops']", find solution
+                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+            else:
+                next_tokens = torch.argmax(next_token_scores, dim=-1)
+
+            # finished sentences should have their next token be a padding token
+            if has_eos_stopping_criteria:
+                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
+
+            # update generated ids, model inputs, and length for next step
+            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+            if streamer is not None:
+                streamer.put(next_tokens.cpu())
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs,
+                model_kwargs,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+            )
+
+            unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
+            this_peer_finished = unfinished_sequences.max() == 0
+            cur_len += 1
+
+            # This is needed to properly delete outputs.logits which may be very large for first iteration
+            # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration
+            del outputs
+
+        if streamer is not None:
+            streamer.end()
+
+        if return_dict_in_generate:
+            if self.config.is_encoder_decoder:
+                return GenerateEncoderDecoderOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    logits=raw_logits,
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+            else:
+                return GenerateDecoderOnlyOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    logits=raw_logits,
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+        else:
+            return input_ids
+
+
+AutoConfig.register("mesh_opt", BBoxOPTConfig)
+AutoModelForCausalLM.register(BBoxOPTConfig, BBoxOPT)
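A minimal usage sketch for the registration above, assuming only the classes defined in this file. The tiny config sizes are hypothetical smoke-test values, and `generate` is the stock Hugging Face entry point, which routes through the overridden `_sample` for greedy and multinomial decoding.

# Illustrative only: resolve BBoxOPT through the Auto* registry ("mesh_opt")
# and run greedy decoding, which goes through the _sample override above.
import torch
from transformers import AutoModelForCausalLM

config = BBoxOPTConfig(            # hypothetical, tiny sizes for a smoke test
    vocab_size=1024,
    hidden_size=256,
    num_hidden_layers=2,
    num_attention_heads=4,
    ffn_dim=512,
    max_position_embeddings=512,
    word_embed_proj_dim=256,
)
model = AutoModelForCausalLM.from_config(config)   # resolves to BBoxOPT
assert isinstance(model, BBoxOPT)

prompt = torch.randint(4, config.vocab_size, (1, 8))   # avoid special token ids
tokens = model.generate(prompt, max_length=32, do_sample=False)
print(tokens.shape)                                # (1, <=32)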
modules/bbox_gen/models/image_encoder.py ADDED
@@ -0,0 +1,41 @@
+from typing import Literal
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from PIL import Image
+
+from transformers import AutoModel
+
+
+class DINOv2ImageEncoder(nn.Module):
+    def __init__(self, model_name: Literal[
+        "facebook/dinov2-with-registers-large",
+        "facebook/dinov2-large"
+    ]):
+        super().__init__()
+        self.model = AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16)
+        self.model.requires_grad_(False)
+        self.model.eval()
+
+        DINOv2_INPUT_MEAN = torch.as_tensor([0.485, 0.456, 0.406], dtype=torch.float32)[
+            None, :, None, None
+        ]
+        DINOv2_INPUT_STD = torch.as_tensor([0.229, 0.224, 0.225], dtype=torch.float32)[
+            None, :, None, None
+        ]
+        self.register_buffer("DINOv2_INPUT_MEAN", DINOv2_INPUT_MEAN, persistent=False)
+        self.register_buffer("DINOv2_INPUT_STD", DINOv2_INPUT_STD, persistent=False)
+        self.max_size = 518
+        self.hidden_size = self.model.config.hidden_size
+
+    def preprocess(self, image: torch.Tensor):
+        B, C, H, W = image.shape
+        assert C == 3 and H <= self.max_size and W <= self.max_size
+        image = (image - self.DINOv2_INPUT_MEAN.to(image)) / self.DINOv2_INPUT_STD.to(image)
+        return image
+
+    def forward(self, image: torch.Tensor):
+        image = self.preprocess(image)
+        features = self.model(image).last_hidden_state
+        return features
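A minimal usage sketch for the encoder above, assuming a CUDA device and an image path (`example.png`) that are purely illustrative; resizing to 518x518 is one simple way to satisfy the `max_size` assertion and is not a policy taken from this repository.

# Illustrative only: encode a single RGB image with DINOv2ImageEncoder.
import numpy as np
import torch
from PIL import Image

encoder = DINOv2ImageEncoder("facebook/dinov2-large").cuda()

img = Image.open("example.png").convert("RGB").resize((518, 518))        # hypothetical input
x = torch.from_numpy(np.asarray(img)).permute(2, 0, 1).float().div(255)  # (3, 518, 518) in [0, 1]
x = x.unsqueeze(0).cuda().to(torch.bfloat16)      # match the bfloat16 DINOv2 weights

with torch.no_grad():
    feats = encoder(x)                            # (1, 1 + num_patches, hidden_size)
print(feats.shape)                                # (1, 1370, 1024) for ViT-L/14 at 518x518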