Publications
Chang, Di; Xu, Hongyi; Xie, You; Gao, Yipeng; Kuang, Zhengfei; Cai, Shengqu; Zhang, Chenxu; Song, Guoxian; Wang, Chao; Shi, Yichun; Chen, Zeyuan; Zhou, Shijie; Luo, Linjie; Wetzstein, Gordon; Soleymani, Mohammad
X-Dyna: Expressive Dynamic Human Image Animation Miscellaneous
2025, (arXiv:2501.10021 [cs]).
@misc{chang_x-dyna_2025,
title = {X-Dyna: Expressive Dynamic Human Image Animation},
author = {Di Chang and Hongyi Xu and You Xie and Yipeng Gao and Zhengfei Kuang and Shengqu Cai and Chenxu Zhang and Guoxian Song and Chao Wang and Yichun Shi and Zeyuan Chen and Shijie Zhou and Linjie Luo and Gordon Wetzstein and Mohammad Soleymani},
url = {http://arxiv.org/abs/2501.10021},
doi = {10.48550/arXiv.2501.10021},
year = {2025},
date = {2025-01-01},
urldate = {2025-02-20},
publisher = {arXiv},
abstract = {We introduce X-Dyna, a novel zero-shot, diffusion-based pipeline for animating a single human image using facial expressions and body movements derived from a driving video, that generates realistic, context-aware dynamics for both the subject and the surrounding environment. Building on prior approaches centered on human pose control, X-Dyna addresses key shortcomings causing the loss of dynamic details, enhancing the lifelike qualities of human video animations. At the core of our approach is the Dynamics-Adapter, a lightweight module that effectively integrates reference appearance context into the spatial attentions of the diffusion backbone while preserving the capacity of motion modules in synthesizing fluid and intricate dynamic details. Beyond body pose control, we connect a local control module with our model to capture identity-disentangled facial expressions, facilitating accurate expression transfer for enhanced realism in animated scenes. Together, these components form a unified framework capable of learning physical human motion and natural scene dynamics from a diverse blend of human and scene videos. Comprehensive qualitative and quantitative evaluations demonstrate that X-Dyna outperforms state-of-the-art methods, creating highly lifelike and expressive animations. The code is available at https://github.com/bytedance/X-Dyna.},
note = {arXiv:2501.10021 [cs]},
keywords = {VGL},
pubstate = {published},
tppubtype = {misc}
}
Hu, Yue; Liu, Rong; Chen, Meida; Beerel, Peter; Feng, Andrew
SplatMAP: Online Dense Monocular SLAM with 3D Gaussian Splatting Miscellaneous
2025, (arXiv:2501.07015 [cs]).
@misc{hu_splatmap_2025,
title = {SplatMAP: Online Dense Monocular SLAM with 3D Gaussian Splatting},
author = {Yue Hu and Rong Liu and Meida Chen and Peter Beerel and Andrew Feng},
url = {http://arxiv.org/abs/2501.07015},
doi = {10.48550/arXiv.2501.07015},
year = {2025},
date = {2025-01-01},
urldate = {2025-01-16},
publisher = {arXiv},
abstract = {Achieving high-fidelity 3D reconstruction from monocular video remains challenging due to the inherent limitations of traditional methods like Structure-from-Motion (SfM) and monocular SLAM in accurately capturing scene details. While differentiable rendering techniques such as Neural Radiance Fields (NeRF) address some of these challenges, their high computational costs make them unsuitable for real-time applications. Additionally, existing 3D Gaussian Splatting (3DGS) methods often focus on photometric consistency, neglecting geometric accuracy and failing to exploit SLAM's dynamic depth and pose updates for scene refinement. We propose a framework integrating dense SLAM with 3DGS for real-time, high-fidelity dense reconstruction. Our approach introduces SLAM-Informed Adaptive Densification, which dynamically updates and densifies the Gaussian model by leveraging dense point clouds from SLAM. Additionally, we incorporate Geometry-Guided Optimization, which combines edge-aware geometric constraints and photometric consistency to jointly optimize the appearance and geometry of the 3DGS scene representation, enabling detailed and accurate SLAM mapping reconstruction. Experiments on the Replica and TUM-RGBD datasets demonstrate the effectiveness of our approach, achieving state-of-the-art results among monocular systems. Specifically, our method achieves a PSNR of 36.864, SSIM of 0.985, and LPIPS of 0.040 on Replica, representing improvements of 10.7%, 6.4%, and 49.4%, respectively, over the previous SOTA. On TUM-RGBD, our method outperforms the closest baseline by 10.2%, 6.6%, and 34.7% in the same metrics. These results highlight the potential of our framework in bridging the gap between photometric and geometric dense 3D scene representations, paving the way for practical and efficient monocular dense reconstruction.},
note = {arXiv:2501.07015 [cs]},
keywords = {VGL},
pubstate = {published},
tppubtype = {misc}
}
Chen, Meida; Han, Kangle; Yu, Zifan; Feng, Andrew; Hou, Yu; You, Suya; Soibelman, Lucio
An Aerial Photogrammetry Benchmark Dataset for Point Cloud Segmentation and Style Translation Journal Article
In: Remote Sensing, vol. 16, no. 22, pp. 4240, 2024, ISSN: 2072-4292.
@article{chen_aerial_2024,
title = {An Aerial Photogrammetry Benchmark Dataset for Point Cloud Segmentation and Style Translation},
author = {Meida Chen and Kangle Han and Zifan Yu and Andrew Feng and Yu Hou and Suya You and Lucio Soibelman},
url = {https://www.mdpi.com/2072-4292/16/22/4240},
doi = {10.3390/rs16224240},
issn = {2072-4292},
year = {2024},
date = {2024-11-01},
urldate = {2024-12-05},
journal = {Remote Sensing},
volume = {16},
number = {22},
pages = {4240},
abstract = {The recent surge in diverse 3D datasets spanning various scales and applications marks a significant advancement in the field. However, the comprehensive process of data acquisition, refinement, and annotation at a large scale poses a formidable challenge, particularly for individual researchers and small teams. To this end, we present a novel synthetic 3D point cloud generation framework that can produce detailed outdoor aerial photogrammetric 3D datasets with accurate ground truth annotations without the labor-intensive and time-consuming data collection/annotation processes. Our pipeline procedurally generates synthetic environments, mirroring real-world data collection and 3D reconstruction processes. A key feature of our framework is its ability to replicate consistent quality, noise patterns, and diversity similar to real-world datasets. This is achieved by adopting UAV flight patterns that resemble those used in real-world data collection processes (e.g., the cross-hatch flight pattern) across various synthetic terrains that are procedurally generated, thereby ensuring data consistency akin to real-world scenarios. Moreover, the generated datasets are enriched with precise semantic and instance annotations, eliminating the need for manual labeling. Our approach has led to the development and release of the Semantic Terrain Points Labeling—Synthetic 3D (STPLS3D) benchmark, an extensive outdoor 3D dataset encompassing over 16 km2, featuring up to 19 semantic labels. We also collected, reconstructed, and annotated four real-world datasets for validation purposes. Extensive experiments on these datasets demonstrate our synthetic datasets’ effectiveness, superior quality, and their value as a benchmark dataset for further point cloud research.},
keywords = {DTIC, VGL},
pubstate = {published},
tppubtype = {article}
}
Xiao, Hanyuan; Chen, Yingshu; Huang, Huajian; Xiong, Haolin; Yang, Jing; Prasad, Pratusha; Zhao, Yajie
Localized Gaussian Splatting Editing with Contextual Awareness Miscellaneous
2024, (arXiv:2408.00083 [cs]).
@misc{xiao_localized_2024,
title = {Localized Gaussian Splatting Editing with Contextual Awareness},
author = {Hanyuan Xiao and Yingshu Chen and Huajian Huang and Haolin Xiong and Jing Yang and Pratusha Prasad and Yajie Zhao},
url = {http://arxiv.org/abs/2408.00083},
year = {2024},
date = {2024-07-01},
urldate = {2024-08-16},
publisher = {arXiv},
abstract = {Recent text-guided generation of individual 3D object has achieved great success using diffusion priors. However, these methods are not suitable for object insertion and replacement tasks as they do not consider the background, leading to illumination mismatches within the environment. To bridge the gap, we introduce an illumination-aware 3D scene editing pipeline for 3D Gaussian Splatting (3DGS) representation. Our key observation is that inpainting by the state-of-the-art conditional 2D diffusion model is consistent with background in lighting. To leverage the prior knowledge from the well-trained diffusion models for 3D object generation, our approach employs a coarse-to-fine objection optimization pipeline with inpainted views. In the first coarse step, we achieve image-to-3D lifting given an ideal inpainted view. The process employs 3D-aware diffusion prior from a view-conditioned diffusion model, which preserves illumination present in the conditioning image. To acquire an ideal inpainted image, we introduce an Anchor View Proposal (AVP) algorithm to find a single view that best represents the scene illumination in target region. In the second Texture Enhancement step, we introduce a novel Depth-guided Inpainting Score Distillation Sampling (DI-SDS), which enhances geometry and texture details with the inpainting diffusion prior, beyond the scope of the 3D-aware diffusion prior knowledge in the first coarse step. DI-SDS not only provides fine-grained texture enhancement, but also urges optimization to respect scene lighting. Our approach efficiently achieves local editing with global illumination consistency without explicitly modeling light transport. We demonstrate robustness of our method by evaluating editing in real scenes containing explicit highlight and shadows, and compare against the state-of-the-art text-to-3D editing methods.},
note = {arXiv:2408.00083 [cs]},
keywords = {DTIC, VGL},
pubstate = {published},
tppubtype = {misc}
}
Chen, Meida; Lal, Devashish; Yu, Zifan; Xu, Jiuyi; Feng, Andrew; You, Suya; Nurunnabi, Abdul; Shi, Yangming
Large-Scale 3D Terrain Reconstruction Using 3D Gaussian Splatting for Visualization and Simulation Journal Article
In: Int. Arch. Photogramm. Remote Sens. Spatial Inf. Sci., vol. XLVIII-2-2024, pp. 49–54, 2024, ISSN: 2194-9034.
@article{chen_large-scale_2024,
title = {Large-Scale 3D Terrain Reconstruction Using 3D Gaussian Splatting for Visualization and Simulation},
author = {Meida Chen and Devashish Lal and Zifan Yu and Jiuyi Xu and Andrew Feng and Suya You and Abdul Nurunnabi and Yangming Shi},
url = {https://isprs-archives.copernicus.org/articles/XLVIII-2-2024/49/2024/},
doi = {10.5194/isprs-archives-XLVIII-2-2024-49-2024},
issn = {2194-9034},
year = {2024},
date = {2024-06-01},
urldate = {2024-06-20},
journal = {Int. Arch. Photogramm. Remote Sens. Spatial Inf. Sci.},
volume = {XLVIII-2-2024},
pages = {49–54},
abstract = {The fusion of low-cost unmanned aerial systems (UAS) with advanced photogrammetric techniques has revolutionized 3D terrain reconstruction, enabling the automated creation of detailed models. Concurrently, the advent of 3D Gaussian Splatting has introduced a paradigm shift in 3D data representation, offering visually realistic renditions distinct from traditional polygon-based models. Our research builds upon this foundation, aiming to integrate Gaussian Splatting into interactive simulations for immersive virtual environments. We address challenges such as collision detection by adopting a hybrid approach, combining Gaussian Splatting with photogrammetry-derived meshes. Through comprehensive experimentation covering varying terrain sizes and Gaussian densities, we evaluate scalability, performance, and limitations. Our findings contribute to advancing the use of advanced computer graphics techniques for enhanced 3D terrain visualization and simulation.},
keywords = {DTIC, Graphics, VGL},
pubstate = {published},
tppubtype = {article}
}
Zhang, Mingyuan; Cai, Zhongang; Pan, Liang; Hong, Fangzhou; Guo, Xinying; Yang, Lei; Liu, Ziwei
MotionDiffuse: Text-Driven Human Motion Generation With Diffusion Model Journal Article
In: IEEE Trans. Pattern Anal. Mach. Intell., vol. 46, no. 6, pp. 4115–4128, 2024, ISSN: 0162-8828, 2160-9292, 1939-3539.
@article{zhang_motiondiffuse_2024,
title = {MotionDiffuse: Text-Driven Human Motion Generation With Diffusion Model},
author = {Mingyuan Zhang and Zhongang Cai and Liang Pan and Fangzhou Hong and Xinying Guo and Lei Yang and Ziwei Liu},
url = {https://ieeexplore.ieee.org/document/10416192/},
doi = {10.1109/TPAMI.2024.3355414},
issn = {0162-8828, 2160-9292, 1939-3539},
year = {2024},
date = {2024-06-01},
urldate = {2024-07-18},
journal = {IEEE Trans. Pattern Anal. Mach. Intell.},
volume = {46},
number = {6},
pages = {4115–4128},
keywords = {VGL},
pubstate = {published},
tppubtype = {article}
}
Nurunnabi, Abdul; Teferle, Felicia; Laefer, Debra F.; Chen, Meida; Ali, Mir Masoom
Development of a Precise Tree Structure from LiDAR Point Clouds Journal Article
In: Int. Arch. Photogramm. Remote Sens. Spatial Inf. Sci., vol. XLVIII-2-2024, pp. 301–308, 2024, ISSN: 2194-9034.
@article{nurunnabi_development_2024,
title = {Development of a Precise Tree Structure from LiDAR Point Clouds},
author = {Abdul Nurunnabi and Felicia Teferle and Debra F. Laefer and Meida Chen and Mir Masoom Ali},
url = {https://isprs-archives.copernicus.org/articles/XLVIII-2-2024/301/2024/},
doi = {10.5194/isprs-archives-XLVIII-2-2024-301-2024},
issn = {2194-9034},
year = {2024},
date = {2024-06-01},
urldate = {2024-07-11},
journal = {Int. Arch. Photogramm. Remote Sens. Spatial Inf. Sci.},
volume = {XLVIII-2-2024},
pages = {301–308},
abstract = {A precise tree structure that represents the distribution of tree stem, branches, and leaves is crucial for accurately capturing the full representation of a tree. Light Detection and Ranging (LiDAR)-based three-dimensional (3D) point clouds (PCs) capture the geometry of scanned objects including forest stands and individual trees. PCs are irregular, unstructured, often noisy, and contaminated by outliers. Researchers have struggled to develop methods to separate leaves and wood without losing the tree geometry. This paper proposes a solution that employs only the spatial coordinates (x, y, z) of the PC. The new algorithm works as a filtering approach, utilizing multi-scale neighborhood-based geometric features (GFs), e.g., linearity, planarity, and verticality, to classify linear (wood) and non-linear (leaf) points. This involves finding potential wood points and coupling them with an octree-based segmentation to develop a tree architecture. The main contributions of this paper are (i) investigating the potential of different GFs to split linear and non-linear points, (ii) introducing a novel method that pointwise classifies leaf and wood points, and (iii) developing a precise 3D tree structure. The performance of the new algorithm has been demonstrated through terrestrial laser scanning PCs. For a Scots pine tree, the new method classifies leaf and wood points with an overall accuracy of 97.9%.},
keywords = {Narrative, VGL},
pubstate = {published},
tppubtype = {article}
}
Zhang, Hao; Chang, Di; Li, Fang; Soleymani, Mohammad; Ahuja, Narendra
MagicPose4D: Crafting Articulated Models with Appearance and Motion Control Miscellaneous
2024, (Version Number: 1).
@misc{zhang_magicpose4d_2024,
title = {MagicPose4D: Crafting Articulated Models with Appearance and Motion Control},
author = {Hao Zhang and Di Chang and Fang Li and Mohammad Soleymani and Narendra Ahuja},
url = {https://arxiv.org/abs/2405.14017},
doi = {10.48550/ARXIV.2405.14017},
year = {2024},
date = {2024-05-01},
urldate = {2024-06-25},
publisher = {arXiv},
abstract = {With the success of 2D and 3D visual generative models, there is growing interest in generating 4D content. Existing methods primarily rely on text prompts to produce 4D content, but they often fall short of accurately defining complex or rare motions. To address this limitation, we propose MagicPose4D, a novel framework for refined control over both appearance and motion in 4D generation. Unlike traditional methods, MagicPose4D accepts monocular videos as motion prompts, enabling precise and customizable motion generation. MagicPose4D comprises two key modules:
i) Dual-Phase 4D Reconstruction Module, which operates in two phases. The first phase focuses on capturing the model's shape using accurate 2D supervision and less accurate but geometrically informative 3D pseudo-supervision without imposing skeleton constraints. The second phase refines the model using more accurate pseudo-3D supervision obtained in the first phase, and introduces kinematic chain-based skeleton constraints to ensure physical plausibility. Additionally, we propose a Global-local Chamfer loss that aligns the overall distribution of predicted mesh vertices with the supervision while maintaining part-level alignment without extra annotations.
ii) Cross-category Motion Transfer Module, which leverages the predictions from the 4D reconstruction module and uses a kinematic-chain-based skeleton to achieve cross-category motion transfer. It ensures smooth transitions between frames through dynamic rigidity, facilitating robust generalization without additional training.
Through extensive experiments, we demonstrate that MagicPose4D significantly improves the accuracy and consistency of 4D content generation, outperforming existing methods in various benchmarks.},
note = {Version Number: 1},
keywords = {VGL, Virtual Humans},
pubstate = {published},
tppubtype = {misc}
}
Liu, Rong; Xu, Rui; Hu, Yue; Chen, Meida; Feng, Andrew
AtomGS: Atomizing Gaussian Splatting for High-Fidelity Radiance Field Miscellaneous
2024, (Version Number: 2).
@misc{liu_atomgs_2024,
title = {AtomGS: Atomizing Gaussian Splatting for High-Fidelity Radiance Field},
author = {Rong Liu and Rui Xu and Yue Hu and Meida Chen and Andrew Feng},
url = {https://arxiv.org/abs/2405.12369},
doi = {10.48550/ARXIV.2405.12369},
year = {2024},
date = {2024-05-01},
urldate = {2024-07-11},
publisher = {arXiv},
abstract = {3D Gaussian Splatting (3DGS) has recently advanced radiance field reconstruction by offering superior capabilities for novel view synthesis and real-time rendering speed. However, its strategy of blending optimization and adaptive density control might lead to sub-optimal results; it can sometimes yield noisy geometry and blurry artifacts due to prioritizing optimizing large Gaussians at the cost of adequately densifying smaller ones. To address this, we introduce AtomGS, consisting of Atomized Proliferation and Geometry-Guided Optimization. The Atomized Proliferation constrains ellipsoid Gaussians of various sizes into more uniform-sized Atom Gaussians. The strategy enhances the representation of areas with fine features by placing greater emphasis on densification in accordance with scene details. In addition, we proposed a Geometry-Guided Optimization approach that incorporates an Edge-Aware Normal Loss. This optimization method effectively smooths flat surfaces while preserving intricate details. Our evaluation shows that AtomGS outperforms existing state-of-the-art methods in rendering quality. Additionally, it achieves competitive accuracy in geometry reconstruction and offers a significant improvement in training speed over other SDF-based methods. More interactive demos can be found in our website (https://rongliu-leo.github.io/AtomGS/).},
note = {Version Number: 2},
keywords = {Graphics, VGL},
pubstate = {published},
tppubtype = {misc}
}
Zhang, Hui; Kuang, Bingran; Zhao, Yajie
Camera Calibration using a Single View of a Symmetric Object Proceedings Article
In: ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2705–2709, IEEE, Seoul, Korea, Republic of, 2024, ISBN: 979-8-3503-4485-1.
@inproceedings{zhang_camera_2024,
title = {Camera Calibration using a Single View of a Symmetric Object},
author = {Hui Zhang and Bingran Kuang and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/10446005/},
doi = {10.1109/ICASSP48485.2024.10446005},
isbn = {979-8-3503-4485-1},
year = {2024},
date = {2024-04-01},
urldate = {2024-06-25},
booktitle = {ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages = {2705–2709},
publisher = {IEEE},
address = {Seoul, Korea, Republic of},
keywords = {Graphics, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
Chen, Haiwei; Zhao, Yajie
Don't Look into the Dark: Latent Codes for Pluralistic Image Inpainting Miscellaneous
2024, (arXiv:2403.18186 [cs]).
@misc{chen_dont_2024,
title = {Don't Look into the Dark: Latent Codes for Pluralistic Image Inpainting},
author = {Haiwei Chen and Yajie Zhao},
url = {http://arxiv.org/abs/2403.18186},
year = {2024},
date = {2024-03-01},
urldate = {2024-08-15},
publisher = {arXiv},
abstract = {We present a method for large-mask pluralistic image inpainting based on the generative framework of discrete latent codes. Our method learns latent priors, discretized as tokens, by only performing computations at the visible locations of the image. This is realized by a restrictive partial encoder that predicts the token label for each visible block, a bidirectional transformer that infers the missing labels by only looking at these tokens, and a dedicated synthesis network that couples the tokens with the partial image priors to generate coherent and pluralistic complete image even under extreme mask settings. Experiments on public benchmarks validate our design choices as the proposed method outperforms strong baselines in both visual quality and diversity metrics.},
note = {arXiv:2403.18186 [cs]},
keywords = {VGL},
pubstate = {published},
tppubtype = {misc}
}
Yang, Jing; Xiao, Hanyuan; Teng, Wenbin; Cai, Yunxuan; Zhao, Yajie
Light Sampling Field and BRDF Representation for Physically-based Neural Rendering Journal Article
In: 2023, (Publisher: arXiv, Version Number: 1).
@article{yang_light_2023,
title = {Light Sampling Field and BRDF Representation for Physically-based Neural Rendering},
author = {Jing Yang and Hanyuan Xiao and Wenbin Teng and Yunxuan Cai and Yajie Zhao},
url = {https://arxiv.org/abs/2304.05472},
doi = {10.48550/ARXIV.2304.05472},
year = {2023},
date = {2023-04-01},
urldate = {2023-08-22},
abstract = {Physically-based rendering (PBR) is key for immersive rendering effects used widely in the industry to showcase detailed realistic scenes from computer graphics assets. A well-known caveat is that producing the same is computationally heavy and relies on complex capture devices. Inspired by the success in quality and efficiency of recent volumetric neural rendering, we want to develop a physically-based neural shader to eliminate device dependency and significantly boost performance. However, no existing lighting and material models in the current neural rendering approaches can accurately represent the comprehensive lighting models and BRDFs properties required by the PBR process. Thus, this paper proposes a novel lighting representation that models direct and indirect light locally through a light sampling strategy in a learned light sampling field. We also propose BRDF models to separately represent surface/subsurface scattering details to enable complex objects such as translucent material (i.e., skin, jade). We then implement our proposed representations with an end-to-end physically-based neural face skin shader, which takes a standard face asset (i.e., geometry, albedo map, and normal map) and an HDRI for illumination as inputs and generates a photo-realistic rendering as output. Extensive experiments showcase the quality and efficiency of our PBR face skin shader, indicating the effectiveness of our proposed lighting and material representations.},
note = {Publisher: arXiv, Version Number: 1},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Chemburkar, Ankur; Lu, Shuhong; Feng, Andrew
MoDDM: Text-to-Motion Synthesis using Discrete Diffusion Model Proceedings Article
In: British Machine Vision Conference (BMVC), 2023.
@inproceedings{chemburkar_moddm_2023,
title = {MoDDM: Text-to-Motion Synthesis using Discrete Diffusion Model},
author = {Ankur Chemburkar and Shuhong Lu and Andrew Feng},
url = {https://papers.bmvc2023.org/0624.pdf},
year = {2023},
date = {2023-01-01},
abstract = {We present the motion discrete diffusion model (MoDDM) for synthesizing human motion from text descriptions that addresses challenges in cross-modal mapping and motion diversity. The previous methods that utilized variational autoencoder (VAE) to learn the latent distributions for text-to-motion synthesis tend to produce motions with less diversity and fidelity. While the diffusion models show promising results by generating high quality motions, they require higher computational costs and may produce motions less aligned with the input text. The proposed method combines the discrete latent space and diffusion models to learn an expressive conditional probabilistic mapping for motion synthesis. Our method utilizes vector quantization variational autoencoder (VQ-VAE) to learn discrete motion tokens and then applies discrete denoising diffusion probabilistic models (D3PM) to learn the conditional probability distributions for the motion tokens. The discrete classifier-free guidance is further utilized in the training process with proper guidance scale for aligning the motions and the corresponding text descriptions. By learning the denoising model in the discrete latent space, the method produces high quality motion results while greatly reducing computational costs compared to training the diffusion models on raw motion sequences. The evaluation results show that the proposed approach outperforms previous methods in both motion quality and text-to-motion matching accuracy.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Liu, Shichen; Cai, Yunxuan; Chen, Haiwei; Zhou, Yichao; Zhao, Yajie
Rapid Face Asset Acquisition with Recurrent Feature Alignment Journal Article
In: ACM Trans. Graph., vol. 41, no. 6, pp. 214:1–214:17, 2022, ISSN: 0730-0301.
@article{liu_rapid_2022,
title = {Rapid Face Asset Acquisition with Recurrent Feature Alignment},
author = {Shichen Liu and Yunxuan Cai and Haiwei Chen and Yichao Zhou and Yajie Zhao},
url = {https://dl.acm.org/doi/10.1145/3550454.3555509},
doi = {10.1145/3550454.3555509},
issn = {0730-0301},
year = {2022},
date = {2022-11-01},
urldate = {2023-03-31},
journal = {ACM Trans. Graph.},
volume = {41},
number = {6},
pages = {214:1–214:17},
abstract = {We present Recurrent Feature Alignment (ReFA), an end-to-end neural network for the very rapid creation of production-grade face assets from multi-view images. ReFA is on par with the industrial pipelines in quality for producing accurate, complete, registered, and textured assets directly applicable to physically-based rendering, but produces the asset end-to-end, fully automatically at a significantly faster speed at 4.5 FPS, which is unprecedented among neural-based techniques. Our method represents face geometry as a position map in the UV space. The network first extracts per-pixel features in both the multi-view image space and the UV space. A recurrent module then iteratively optimizes the geometry by projecting the image-space features to the UV space and comparing them with a reference UV-space feature. The optimized geometry then provides pixel-aligned signals for the inference of high-resolution textures. Experiments have validated that ReFA achieves a median error of 0.603mm in geometry reconstruction, is robust to extreme pose and expression, and excels in sparse-view settings. We believe that the progress achieved by our network enables lightweight, fast face assets acquisition that significantly boosts the downstream applications, such as avatar creation and facial performance capture. It will also enable massive database capturing for deep learning purposes.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Kuang, Zhengfei; Li, Jiaman; He, Mingming; Wang, Tong; Zhao, Yajie
DenseGAP: Graph-Structured Dense Correspondence Learning with Anchor Points Proceedings Article
In: International Conference on Pattern Recognition (ICPR), pp. 542–549, IEEE Computer Society, 2022, ISBN: 978-1-6654-9062-7.
@inproceedings{kuang_densegap_2022,
title = {DenseGAP: Graph-Structured Dense Correspondence Learning with Anchor Points},
author = {Zhengfei Kuang and Jiaman Li and Mingming He and Tong Wang and Yajie Zhao},
url = {https://www.computer.org/csdl/proceedings-article/icpr/2022/09956472/1IHpppIuqOc},
doi = {10.1109/ICPR56361.2022.9956472},
isbn = {978-1-6654-9062-7},
year = {2022},
date = {2022-08-01},
urldate = {2023-03-31},
pages = {542–549},
publisher = {IEEE Computer Society},
abstract = {Establishing dense correspondence between two images is a fundamental computer vision problem, which is typically tackled by matching local feature descriptors. However, without global awareness, such local features are often insufficient for disambiguating similar regions. And computing the pairwise feature correlation across images is both computation-expensive and memory-intensive. To make the local features aware of the global context and improve their matching accuracy, we introduce DenseGAP, a new solution for efficient Dense correspondence learning with a Graph-structured neural network conditioned on Anchor Points. Specifically, we first propose a graph structure that utilizes anchor points to provide sparse but reliable prior on inter- and intra-image context and propagates them to all image points via directed edges. We also design a graph-structured network to broadcast multi-level contexts via light-weighted message-passing layers and generate high-resolution feature maps at low memory cost. Finally, based on the predicted feature maps, we introduce a coarse-to-fine framework for accurate correspondence prediction using cycle consistency. Our feature descriptors capture both local and global information, thus enabling a continuous feature field for querying arbitrary points at high resolution. Through comprehensive ablative experiments and evaluations on large-scale indoor and outdoor datasets, we demonstrate that our method advances the state-of-the-art of correspondence learning on most benchmarks.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Chen, Haiwei; Liu, Jiayi; Chen, Weikai; Liu, Shichen; Zhao, Yajie
Exemplar-based Pattern Synthesis with Implicit Periodic Field Network Proceedings Article
In: 2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3698–3707, IEEE, New Orleans, LA, USA, 2022, ISBN: 978-1-6654-6946-3.
@inproceedings{chen_exemplar-based_2022,
title = {Exemplar-based Pattern Synthesis with Implicit Periodic Field Network},
author = {Haiwei Chen and Jiayi Liu and Weikai Chen and Shichen Liu and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/9879904/},
doi = {10.1109/CVPR52688.2022.00369},
isbn = {978-1-6654-6946-3},
year = {2022},
date = {2022-06-01},
urldate = {2023-02-10},
booktitle = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
pages = {3698–3707},
publisher = {IEEE},
address = {New Orleans, LA, USA},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Danieau, Fabien; Guillotel, Philippe; Hoyet, Ludovic; Tonneau, Steve; Zhao, Yajie
Editorial: Creating Lifelike Digital Humans Journal Article
In: Front. Virtual Real., vol. 3, pp. 906118, 2022, ISSN: 2673-4192.
@article{danieau_editorial_2022,
title = {Editorial: Creating Lifelike Digital Humans},
author = {Fabien Danieau and Philippe Guillotel and Ludovic Hoyet and Steve Tonneau and Yajie Zhao},
url = {https://www.frontiersin.org/articles/10.3389/frvir.2022.906118/full},
doi = {10.3389/frvir.2022.906118},
issn = {2673-4192},
year = {2022},
date = {2022-04-01},
urldate = {2024-08-13},
journal = {Front. Virtual Real.},
volume = {3},
pages = {906118},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Liu, Shichen; Li, Tianye; Chen, Weikai; Li, Hao
A General Differentiable Mesh Renderer for Image-Based 3D Reasoning Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 44, no. 1, pp. 50–62, 2022, ISSN: 1939-3539, (Conference Name: IEEE Transactions on Pattern Analysis and Machine Intelligence).
@article{liu_general_2022,
title = {A General Differentiable Mesh Renderer for Image-Based 3D Reasoning},
author = {Shichen Liu and Tianye Li and Weikai Chen and Hao Li},
doi = {10.1109/TPAMI.2020.3007759},
issn = {1939-3539},
year = {2022},
date = {2022-01-01},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
volume = {44},
number = {1},
pages = {50–62},
abstract = {Rendering bridges the gap between 2D vision and 3D scenes by simulating the physical process of image formation. By inverting such renderer, one can think of a learning approach to infer 3D information from 2D images. However, standard graphics renderers involve a fundamental step called rasterization, which prevents rendering to be differentiable. Unlike the state-of-the-art differentiable renderers (Kato et al. 2018 and Loper 2018), which only approximate the rendering gradient in the backpropagation, we propose a natually differentiable rendering framework that is able to (1) directly render colorized mesh using differentiable functions and (2) back-propagate efficient supervisions to mesh vertices and their attributes from various forms of image representations. The key to our framework is a novel formulation that views rendering as an aggregation function that fuses the probabilistic contributions of all mesh triangles with respect to the rendered pixels. Such formulation enables our framework to flow gradients to the occluded and distant vertices, which cannot be achieved by the previous state-of-the-arts. We show that by using the proposed renderer, one can achieve significant improvement in 3D unsupervised single-view reconstruction both qualitatively and quantitatively. Experiments also demonstrate that our approach can handle the challenging tasks in image-based shape fitting, which remain nontrivial to existing differentiable renders.},
note = {Conference Name: IEEE Transactions on Pattern Analysis and Machine Intelligence},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Li, Jiaman; Villegas, Ruben; Ceylan, Duygu; Yang, Jimei; Kuang, Zhengfei; Li, Hao; Zhao, Yajie
Task-Generic Hierarchical Human Motion Prior using VAEs Proceedings Article
In: 2021 International Conference on 3D Vision (3DV), pp. 771–781, IEEE, London, United Kingdom, 2021, ISBN: 978-1-6654-2688-6.
@inproceedings{li_task-generic_2021,
title = {Task-Generic Hierarchical Human Motion Prior using VAEs},
author = {Jiaman Li and Ruben Villegas and Duygu Ceylan and Jimei Yang and Zhengfei Kuang and Hao Li and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/9665881/},
doi = {10.1109/3DV53792.2021.00086},
isbn = {978-1-6654-2688-6},
year = {2021},
date = {2021-12-01},
urldate = {2022-09-22},
booktitle = {2021 International Conference on 3D Vision (3DV)},
pages = {771–781},
publisher = {IEEE},
address = {London, United Kingdom},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Liu, Shichen; Zhou, Yichao; Zhao, Yajie
VaPiD: A Rapid Vanishing Point Detector via Learned Optimizers Proceedings Article
In: 2021 IEEE/CVF International Conference on Computer Vision (ICCV), pp. 12839–12848, IEEE, Montreal, QC, Canada, 2021, ISBN: 978-1-6654-2812-5.
@inproceedings{liu_vapid_2021,
title = {VaPiD: A Rapid Vanishing Point Detector via Learned Optimizers},
author = {Shichen Liu and Yichao Zhou and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/9711313/},
doi = {10.1109/ICCV48922.2021.01262},
isbn = {978-1-6654-2812-5},
year = {2021},
date = {2021-10-01},
urldate = {2022-09-22},
booktitle = {2021 IEEE/CVF International Conference on Computer Vision (ICCV)},
pages = {12839–12848},
publisher = {IEEE},
address = {Montreal, QC, Canada},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Filter
2025
Chang, Di; Xu, Hongyi; Xie, You; Gao, Yipeng; Kuang, Zhengfei; Cai, Shengqu; Zhang, Chenxu; Song, Guoxian; Wang, Chao; Shi, Yichun; Chen, Zeyuan; Zhou, Shijie; Luo, Linjie; Wetzstein, Gordon; Soleymani, Mohammad
X-Dyna: Expressive Dynamic Human Image Animation Miscellaneous
2025, (arXiv:2501.10021 [cs]).
Abstract | Links | BibTeX | Tags: VGL
@misc{chang_x-dyna_2025,
title = {X-Dyna: Expressive Dynamic Human Image Animation},
author = {Di Chang and Hongyi Xu and You Xie and Yipeng Gao and Zhengfei Kuang and Shengqu Cai and Chenxu Zhang and Guoxian Song and Chao Wang and Yichun Shi and Zeyuan Chen and Shijie Zhou and Linjie Luo and Gordon Wetzstein and Mohammad Soleymani},
url = {http://arxiv.org/abs/2501.10021},
doi = {10.48550/arXiv.2501.10021},
year = {2025},
date = {2025-01-01},
urldate = {2025-02-20},
publisher = {arXiv},
abstract = {We introduce X-Dyna, a novel zero-shot, diffusion-based pipeline for animating a single human image using facial expressions and body movements derived from a driving video, that generates realistic, context-aware dynamics for both the subject and the surrounding environment. Building on prior approaches centered on human pose control, X-Dyna addresses key shortcomings causing the loss of dynamic details, enhancing the lifelike qualities of human video animations. At the core of our approach is the Dynamics-Adapter, a lightweight module that effectively integrates reference appearance context into the spatial attentions of the diffusion backbone while preserving the capacity of motion modules in synthesizing fluid and intricate dynamic details. Beyond body pose control, we connect a local control module with our model to capture identity-disentangled facial expressions, facilitating accurate expression transfer for enhanced realism in animated scenes. Together, these components form a unified framework capable of learning physical human motion and natural scene dynamics from a diverse blend of human and scene videos. Comprehensive qualitative and quantitative evaluations demonstrate that X-Dyna outperforms state-of-the-art methods, creating highly lifelike and expressive animations. The code is available at https://github.com/bytedance/X-Dyna.},
note = {arXiv:2501.10021 [cs]},
keywords = {VGL},
pubstate = {published},
tppubtype = {misc}
}
Hu, Yue; Liu, Rong; Chen, Meida; Beerel, Peter; Feng, Andrew
SplatMAP: Online Dense Monocular SLAM with 3D Gaussian Splatting Miscellaneous
2025, (arXiv:2501.07015 [cs]).
Abstract | Links | BibTeX | Tags: VGL
@misc{hu_splatmap_2025,
title = {SplatMAP: Online Dense Monocular SLAM with 3D Gaussian Splatting},
author = {Yue Hu and Rong Liu and Meida Chen and Peter Beerel and Andrew Feng},
url = {http://arxiv.org/abs/2501.07015},
doi = {10.48550/arXiv.2501.07015},
year = {2025},
date = {2025-01-01},
urldate = {2025-01-16},
publisher = {arXiv},
abstract = {Achieving high-fidelity 3D reconstruction from monocular video remains challenging due to the inherent limitations of traditional methods like Structure-from-Motion (SfM) and monocular SLAM in accurately capturing scene details. While differentiable rendering techniques such as Neural Radiance Fields (NeRF) address some of these challenges, their high computational costs make them unsuitable for real-time applications. Additionally, existing 3D Gaussian Splatting (3DGS) methods often focus on photometric consistency, neglecting geometric accuracy and failing to exploit SLAM's dynamic depth and pose updates for scene refinement. We propose a framework integrating dense SLAM with 3DGS for real-time, high-fidelity dense reconstruction. Our approach introduces SLAM-Informed Adaptive Densification, which dynamically updates and densifies the Gaussian model by leveraging dense point clouds from SLAM. Additionally, we incorporate Geometry-Guided Optimization, which combines edge-aware geometric constraints and photometric consistency to jointly optimize the appearance and geometry of the 3DGS scene representation, enabling detailed and accurate SLAM mapping reconstruction. Experiments on the Replica and TUM-RGBD datasets demonstrate the effectiveness of our approach, achieving state-of-the-art results among monocular systems. Specifically, our method achieves a PSNR of 36.864, SSIM of 0.985, and LPIPS of 0.040 on Replica, representing improvements of 10.7%, 6.4%, and 49.4%, respectively, over the previous SOTA. On TUM-RGBD, our method outperforms the closest baseline by 10.2%, 6.6%, and 34.7% in the same metrics. These results highlight the potential of our framework in bridging the gap between photometric and geometric dense 3D scene representations, paving the way for practical and efficient monocular dense reconstruction.},
note = {arXiv:2501.07015 [cs]},
keywords = {VGL},
pubstate = {published},
tppubtype = {misc}
}
2024
Chen, Meida; Han, Kangle; Yu, Zifan; Feng, Andrew; Hou, Yu; You, Suya; Soibelman, Lucio
An Aerial Photogrammetry Benchmark Dataset for Point Cloud Segmentation and Style Translation Journal Article
In: Remote Sensing, vol. 16, no. 22, pp. 4240, 2024, ISSN: 2072-4292.
Abstract | Links | BibTeX | Tags: DTIC, VGL
@article{chen_aerial_2024,
title = {An Aerial Photogrammetry Benchmark Dataset for Point Cloud Segmentation and Style Translation},
author = {Meida Chen and Kangle Han and Zifan Yu and Andrew Feng and Yu Hou and Suya You and Lucio Soibelman},
url = {https://www.mdpi.com/2072-4292/16/22/4240},
doi = {10.3390/rs16224240},
issn = {2072-4292},
year = {2024},
date = {2024-11-01},
urldate = {2024-12-05},
journal = {Remote Sensing},
volume = {16},
number = {22},
pages = {4240},
abstract = {The recent surge in diverse 3D datasets spanning various scales and applications marks a significant advancement in the field. However, the comprehensive process of data acquisition, refinement, and annotation at a large scale poses a formidable challenge, particularly for individual researchers and small teams. To this end, we present a novel synthetic 3D point cloud generation framework that can produce detailed outdoor aerial photogrammetric 3D datasets with accurate ground truth annotations without the labor-intensive and time-consuming data collection/annotation processes. Our pipeline procedurally generates synthetic environments, mirroring real-world data collection and 3D reconstruction processes. A key feature of our framework is its ability to replicate consistent quality, noise patterns, and diversity similar to real-world datasets. This is achieved by adopting UAV flight patterns that resemble those used in real-world data collection processes (e.g., the cross-hatch flight pattern) across various synthetic terrains that are procedurally generated, thereby ensuring data consistency akin to real-world scenarios. Moreover, the generated datasets are enriched with precise semantic and instance annotations, eliminating the need for manual labeling. Our approach has led to the development and release of the Semantic Terrain Points Labeling—Synthetic 3D (STPLS3D) benchmark, an extensive outdoor 3D dataset encompassing over 16 km2, featuring up to 19 semantic labels. We also collected, reconstructed, and annotated four real-world datasets for validation purposes. Extensive experiments on these datasets demonstrate our synthetic datasets’ effectiveness, superior quality, and their value as a benchmark dataset for further point cloud research.},
keywords = {DTIC, VGL},
pubstate = {published},
tppubtype = {article}
}
Xiao, Hanyuan; Chen, Yingshu; Huang, Huajian; Xiong, Haolin; Yang, Jing; Prasad, Pratusha; Zhao, Yajie
Localized Gaussian Splatting Editing with Contextual Awareness Miscellaneous
2024, (arXiv:2408.00083 [cs]).
Abstract | Links | BibTeX | Tags: DTIC, VGL
@misc{xiao_localized_2024,
title = {Localized Gaussian Splatting Editing with Contextual Awareness},
author = {Hanyuan Xiao and Yingshu Chen and Huajian Huang and Haolin Xiong and Jing Yang and Pratusha Prasad and Yajie Zhao},
url = {http://arxiv.org/abs/2408.00083},
year = {2024},
date = {2024-07-01},
urldate = {2024-08-16},
publisher = {arXiv},
abstract = {Recent text-guided generation of individual 3D object has achieved great success using diffusion priors. However, these methods are not suitable for object insertion and replacement tasks as they do not consider the background, leading to illumination mismatches within the environment. To bridge the gap, we introduce an illumination-aware 3D scene editing pipeline for 3D Gaussian Splatting (3DGS) representation. Our key observation is that inpainting by the state-of-the-art conditional 2D diffusion model is consistent with background in lighting. To leverage the prior knowledge from the well-trained diffusion models for 3D object generation, our approach employs a coarse-to-fine objection optimization pipeline with inpainted views. In the first coarse step, we achieve image-to-3D lifting given an ideal inpainted view. The process employs 3D-aware diffusion prior from a view-conditioned diffusion model, which preserves illumination present in the conditioning image. To acquire an ideal inpainted image, we introduce an Anchor View Proposal (AVP) algorithm to find a single view that best represents the scene illumination in target region. In the second Texture Enhancement step, we introduce a novel Depth-guided Inpainting Score Distillation Sampling (DI-SDS), which enhances geometry and texture details with the inpainting diffusion prior, beyond the scope of the 3D-aware diffusion prior knowledge in the first coarse step. DI-SDS not only provides fine-grained texture enhancement, but also urges optimization to respect scene lighting. Our approach efficiently achieves local editing with global illumination consistency without explicitly modeling light transport. We demonstrate robustness of our method by evaluating editing in real scenes containing explicit highlight and shadows, and compare against the state-of-the-art text-to-3D editing methods.},
note = {arXiv:2408.00083 [cs]},
keywords = {DTIC, VGL},
pubstate = {published},
tppubtype = {misc}
}
Chen, Meida; Lal, Devashish; Yu, Zifan; Xu, Jiuyi; Feng, Andrew; You, Suya; Nurunnabi, Abdul; Shi, Yangming
Large-Scale 3D Terrain Reconstruction Using 3D Gaussian Splatting for Visualization and Simulation Journal Article
In: Int. Arch. Photogramm. Remote Sens. Spatial Inf. Sci., vol. XLVIII-2-2024, pp. 49–54, 2024, ISSN: 2194-9034.
Abstract | Links | BibTeX | Tags: DTIC, Graphics, VGL
@article{chen_large-scale_2024,
title = {Large-Scale 3D Terrain Reconstruction Using 3D Gaussian Splatting for Visualization and Simulation},
author = {Meida Chen and Devashish Lal and Zifan Yu and Jiuyi Xu and Andrew Feng and Suya You and Abdul Nurunnabi and Yangming Shi},
url = {https://isprs-archives.copernicus.org/articles/XLVIII-2-2024/49/2024/},
doi = {10.5194/isprs-archives-XLVIII-2-2024-49-2024},
issn = {2194-9034},
year = {2024},
date = {2024-06-01},
urldate = {2024-06-20},
journal = {Int. Arch. Photogramm. Remote Sens. Spatial Inf. Sci.},
volume = {XLVIII-2-2024},
pages = {49–54},
abstract = {Abstract. The fusion of low-cost unmanned aerial systems (UAS) with advanced photogrammetric techniques has revolutionized 3D terrain reconstruction, enabling the automated creation of detailed models. Concurrently, the advent of 3D Gaussian Splatting has introduced a paradigm shift in 3D data representation, offering visually realistic renditions distinct from traditional polygon-based models. Our research builds upon this foundation, aiming to integrate Gaussian Splatting into interactive simulations for immersive virtual environments. We address challenges such as collision detection by adopting a hybrid approach, combining Gaussian Splatting with photogrammetry-derived meshes. Through comprehensive experimentation covering varying terrain sizes and Gaussian densities, we evaluate scalability, performance, and limitations. Our findings contribute to advancing the use of advanced computer graphics techniques for enhanced 3D terrain visualization and simulation.},
keywords = {DTIC, Graphics, VGL},
pubstate = {published},
tppubtype = {article}
}
Zhang, Mingyuan; Cai, Zhongang; Pan, Liang; Hong, Fangzhou; Guo, Xinying; Yang, Lei; Liu, Ziwei
MotionDiffuse: Text-Driven Human Motion Generation With Diffusion Model Journal Article
In: IEEE Trans. Pattern Anal. Mach. Intell., vol. 46, no. 6, pp. 4115–4128, 2024, ISSN: 0162-8828, 2160-9292, 1939-3539.
@article{zhang_motiondiffuse_2024,
title = {MotionDiffuse: Text-Driven Human Motion Generation With Diffusion Model},
author = {Mingyuan Zhang and Zhongang Cai and Liang Pan and Fangzhou Hong and Xinying Guo and Lei Yang and Ziwei Liu},
url = {https://ieeexplore.ieee.org/document/10416192/},
doi = {10.1109/TPAMI.2024.3355414},
issn = {0162-8828, 2160-9292, 1939-3539},
year = {2024},
date = {2024-06-01},
urldate = {2024-07-18},
journal = {IEEE Trans. Pattern Anal. Mach. Intell.},
volume = {46},
number = {6},
pages = {4115–4128},
keywords = {VGL},
pubstate = {published},
tppubtype = {article}
}
Nurunnabi, Abdul; Teferle, Felicia; Laefer, Debra F.; Chen, Meida; Ali, Mir Masoom
Development of a Precise Tree Structure from LiDAR Point Clouds Journal Article
In: Int. Arch. Photogramm. Remote Sens. Spatial Inf. Sci., vol. XLVIII-2-2024, pp. 301–308, 2024, ISSN: 2194-9034.
Abstract | Links | BibTeX | Tags: Narrative, VGL
@article{nurunnabi_development_2024,
title = {Development of a Precise Tree Structure from LiDAR Point Clouds},
author = {Abdul Nurunnabi and Felicia Teferle and Debra F. Laefer and Meida Chen and Mir Masoom Ali},
url = {https://isprs-archives.copernicus.org/articles/XLVIII-2-2024/301/2024/},
doi = {10.5194/isprs-archives-XLVIII-2-2024-301-2024},
issn = {2194-9034},
year = {2024},
date = {2024-06-01},
urldate = {2024-07-11},
journal = {Int. Arch. Photogramm. Remote Sens. Spatial Inf. Sci.},
volume = {XLVIII-2-2024},
pages = {301–308},
abstract = {Abstract. A precise tree structure that represents the distribution of tree stem, branches, and leaves is crucial for accurately capturing the full representation of a tree. Light Detection and Ranging (LiDAR)-based three-dimensional (3D) point clouds (PCs) capture the geometry of scanned objects including forests stands and individual trees. PCs are irregular, unstructured, often noisy, and contaminated by outliers. Researchers have struggled to develop methods to separate leaves and wood without losing the tree geometry. This paper proposes a solution that employs only the spatial coordinates (x, y, z) of the PC. The new algorithm works as a filtering approach, utilizing multi-scale neighborhood-based geometric features (GFs) e.g., linearity, planarity, and verticality to classify linear (wood) and non-linear (leaf) points. This involves finding potential wood points and coupling them with an octree-based segmentation to develop a tree architecture. The main contributions of this paper are (i) investigating the potential of different GFs to split linear and non-linear points, (ii) introducing a novel method that pointwise classifies leaf and wood points, and (iii) developing a precise 3D tree structure. The performance of the new algorithm has been demonstrated through terrestrial laser scanning PCs. For a Scots pine tree, the new method classifies leaf and wood points with an overall accuracy of 97.9%.},
keywords = {Narrative, VGL},
pubstate = {published},
tppubtype = {article}
}
Zhang, Hao; Chang, Di; Li, Fang; Soleymani, Mohammad; Ahuja, Narendra
MagicPose4D: Crafting Articulated Models with Appearance and Motion Control Miscellaneous
2024, (Version Number: 1).
Abstract | Links | BibTeX | Tags: VGL, Virtual Humans
@misc{zhang_magicpose4d_2024,
title = {MagicPose4D: Crafting Articulated Models with Appearance and Motion Control},
author = {Hao Zhang and Di Chang and Fang Li and Mohammad Soleymani and Narendra Ahuja},
url = {https://arxiv.org/abs/2405.14017},
doi = {10.48550/ARXIV.2405.14017},
year = {2024},
date = {2024-05-01},
urldate = {2024-06-25},
publisher = {arXiv},
abstract = {With the success of 2D and 3D visual generative models, there is growing interest in generating 4D content. Existing methods primarily rely on text prompts to produce 4D content, but they often fall short of accurately defining complex or rare motions. To address this limitation, we propose MagicPose4D, a novel framework for refined control over both appearance and motion in 4D generation. Unlike traditional methods, MagicPose4D accepts monocular videos as motion prompts, enabling precise and customizable motion generation. MagicPose4D comprises two key modules:
i) Dual-Phase 4D Reconstruction Modulevphantom which operates in two phases. The first phase focuses on capturing the model's shape using accurate 2D supervision and less accurate but geometrically informative 3D pseudo-supervision without imposing skeleton constraints. The second phase refines the model using more accurate pseudo-3D supervision, obtained in the first phase and introduces kinematic chain-based skeleton constraints to ensure physical plausibility. Additionally, we propose a Global-local Chamfer loss that aligns the overall distribution of predicted mesh vertices with the supervision while maintaining part-level alignment without extra annotations.
ii) Cross-category Motion Transfer Modulevphantom leverages the predictions from the 4D reconstruction module and uses a kinematic-chain-based skeleton to achieve cross-category motion transfer. It ensures smooth transitions between frames through dynamic rigidity, facilitating robust generalization without additional training.
Through extensive experiments, we demonstrate that MagicPose4D significantly improves the accuracy and consistency of 4D content generation, outperforming existing methods in various benchmarks.},
note = {Version Number: 1},
keywords = {VGL, Virtual Humans},
pubstate = {published},
tppubtype = {misc}
}
i) Dual-Phase 4D Reconstruction Modulevphantom which operates in two phases. The first phase focuses on capturing the model's shape using accurate 2D supervision and less accurate but geometrically informative 3D pseudo-supervision without imposing skeleton constraints. The second phase refines the model using more accurate pseudo-3D supervision, obtained in the first phase and introduces kinematic chain-based skeleton constraints to ensure physical plausibility. Additionally, we propose a Global-local Chamfer loss that aligns the overall distribution of predicted mesh vertices with the supervision while maintaining part-level alignment without extra annotations.
ii) Cross-category Motion Transfer Modulevphantom leverages the predictions from the 4D reconstruction module and uses a kinematic-chain-based skeleton to achieve cross-category motion transfer. It ensures smooth transitions between frames through dynamic rigidity, facilitating robust generalization without additional training.
Through extensive experiments, we demonstrate that MagicPose4D significantly improves the accuracy and consistency of 4D content generation, outperforming existing methods in various benchmarks.
Liu, Rong; Xu, Rui; Hu, Yue; Chen, Meida; Feng, Andrew
AtomGS: Atomizing Gaussian Splatting for High-Fidelity Radiance Field Miscellaneous
2024, (Version Number: 2).
Abstract | Links | BibTeX | Tags: Graphics, VGL
@misc{liu_atomgs_2024,
title = {AtomGS: Atomizing Gaussian Splatting for High-Fidelity Radiance Field},
author = {Rong Liu and Rui Xu and Yue Hu and Meida Chen and Andrew Feng},
url = {https://arxiv.org/abs/2405.12369},
doi = {10.48550/ARXIV.2405.12369},
year = {2024},
date = {2024-05-01},
urldate = {2024-07-11},
publisher = {arXiv},
abstract = {3D Gaussian Splatting (3DGS) has recently advanced radiance field reconstruction by offering superior capabilities for novel view synthesis and real-time rendering speed. However, its strategy of blending optimization and adaptive density control might lead to sub-optimal results; it can sometimes yield noisy geometry and blurry artifacts due to prioritizing optimizing large Gaussians at the cost of adequately densifying smaller ones. To address this, we introduce AtomGS, consisting of Atomized Proliferation and Geometry-Guided Optimization. The Atomized Proliferation constrains ellipsoid Gaussians of various sizes into more uniform-sized Atom Gaussians. This strategy enhances the representation of areas with fine features by placing greater emphasis on densification in accordance with scene details. In addition, we propose a Geometry-Guided Optimization approach that incorporates an Edge-Aware Normal Loss. This optimization method effectively smooths flat surfaces while preserving intricate details. Our evaluation shows that AtomGS outperforms existing state-of-the-art methods in rendering quality. Additionally, it achieves competitive accuracy in geometry reconstruction and offers a significant improvement in training speed over other SDF-based methods. More interactive demos can be found on our website (https://rongliu-leo.github.io/AtomGS/).},
note = {Version Number: 2},
keywords = {Graphics, VGL},
pubstate = {published},
tppubtype = {misc}
}
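The Edge-Aware Normal Loss mentioned in the AtomGS abstract can be pictured with a standard edge-aware smoothness formulation: penalize normal-map gradients, but down-weight them where the RGB image itself has strong edges. The sketch below is this generic formulation, not necessarily the exact loss used in the paper.

import torch

def edge_aware_normal_loss(normal_map, image):
    # normal_map: (3, H, W) rendered normals; image: (3, H, W) RGB.
    # Smooth normals on flat regions, leave them alone where the image has edges.
    dn_x = (normal_map[:, :, 1:] - normal_map[:, :, :-1]).abs().mean(0)  # (H, W-1)
    dn_y = (normal_map[:, 1:, :] - normal_map[:, :-1, :]).abs().mean(0)  # (H-1, W)
    di_x = (image[:, :, 1:] - image[:, :, :-1]).abs().mean(0)
    di_y = (image[:, 1:, :] - image[:, :-1, :]).abs().mean(0)
    return (dn_x * torch.exp(-di_x)).mean() + (dn_y * torch.exp(-di_y)).mean()

print(edge_aware_normal_loss(torch.rand(3, 64, 64), torch.rand(3, 64, 64)))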
Zhang, Hui; Kuang, Bingran; Zhao, Yajie
Camera Calibration using a Single View of a Symmetric Object Proceedings Article
In: ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2705–2709, IEEE, Seoul, Korea, Republic of, 2024, ISBN: 979-8-3503-4485-1.
Links | BibTeX | Tags: Graphics, VGL
@inproceedings{zhang_camera_2024,
title = {Camera Calibration using a Single View of a Symmetric Object},
author = {Hui Zhang and Bingran Kuang and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/10446005/},
doi = {10.1109/ICASSP48485.2024.10446005},
isbn = {979-8-3503-4485-1},
year = {2024},
date = {2024-04-01},
urldate = {2024-06-25},
booktitle = {ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages = {2705–2709},
publisher = {IEEE},
address = {Seoul, Korea, Republic of},
keywords = {Graphics, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
Chen, Haiwei; Zhao, Yajie
Don't Look into the Dark: Latent Codes for Pluralistic Image Inpainting Miscellaneous
2024, (arXiv:2403.18186 [cs]).
Abstract | Links | BibTeX | Tags: VGL
@misc{chen_dont_2024,
title = {Don't Look into the Dark: Latent Codes for Pluralistic Image Inpainting},
author = {Haiwei Chen and Yajie Zhao},
url = {http://arxiv.org/abs/2403.18186},
year = {2024},
date = {2024-03-01},
urldate = {2024-08-15},
publisher = {arXiv},
abstract = {We present a method for large-mask pluralistic image inpainting based on the generative framework of discrete latent codes. Our method learns latent priors, discretized as tokens, by only performing computations at the visible locations of the image. This is realized by a restrictive partial encoder that predicts the token label for each visible block, a bidirectional transformer that infers the missing labels by only looking at these tokens, and a dedicated synthesis network that couples the tokens with the partial image priors to generate a coherent and pluralistic complete image even under extreme mask settings. Experiments on public benchmarks validate our design choices as the proposed method outperforms strong baselines in both visual quality and diversity metrics.},
note = {arXiv:2403.18186 [cs]},
keywords = {VGL},
pubstate = {published},
tppubtype = {misc}
}
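To make the pipeline in the abstract above concrete, the toy PyTorch module below mimics the bidirectional-transformer step: visible positions keep their token labels from the partial encoder, missing positions are replaced by a [MASK] token, and the transformer predicts logits over the codebook for every position. All module names and sizes are illustrative assumptions, not the paper's architecture.

import torch
import torch.nn as nn

class MaskedTokenInfiller(nn.Module):
    def __init__(self, vocab=512, dim=256, n_layers=4, n_heads=8, seq_len=256):
        super().__init__()
        self.tok = nn.Embedding(vocab + 1, dim)          # last index acts as [MASK]
        self.pos = nn.Parameter(torch.zeros(seq_len, dim))
        layer = nn.TransformerEncoderLayer(dim, n_heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, n_layers)
        self.head = nn.Linear(dim, vocab)
        self.mask_id = vocab

    def forward(self, tokens, visible):
        # tokens: (B, L) token labels from the partial encoder; visible: (B, L) bool.
        x = torch.where(visible, tokens, torch.full_like(tokens, self.mask_id))
        h = self.encoder(self.tok(x) + self.pos)
        return self.head(h)                              # (B, L, vocab) logits over the codebook

model = MaskedTokenInfiller()
logits = model(torch.randint(0, 512, (2, 256)), torch.rand(2, 256) > 0.6)
print(logits.shape)   # (2, 256, 512)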
2023
Yang, Jing; Xiao, Hanyuan; Teng, Wenbin; Cai, Yunxuan; Zhao, Yajie
Light Sampling Field and BRDF Representation for Physically-based Neural Rendering Journal Article
In: 2023, (Publisher: arXiv Version Number: 1).
Abstract | Links | BibTeX | Tags: DTIC, UARC, VGL
@article{yang_light_2023,
title = {Light Sampling Field and BRDF Representation for Physically-based Neural Rendering},
author = {Jing Yang and Hanyuan Xiao and Wenbin Teng and Yunxuan Cai and Yajie Zhao},
url = {https://arxiv.org/abs/2304.05472},
doi = {10.48550/ARXIV.2304.05472},
year = {2023},
date = {2023-04-01},
urldate = {2023-08-22},
abstract = {Physically-based rendering (PBR) is key for immersive rendering effects used widely in the industry to showcase detailed realistic scenes from computer graphics assets. A well-known caveat is that producing such renderings is computationally heavy and relies on complex capture devices. Inspired by the success in quality and efficiency of recent volumetric neural rendering, we want to develop a physically-based neural shader to eliminate device dependency and significantly boost performance. However, no existing lighting and material models in the current neural rendering approaches can accurately represent the comprehensive lighting models and BRDF properties required by the PBR process. Thus, this paper proposes a novel lighting representation that models direct and indirect light locally through a light sampling strategy in a learned light sampling field. We also propose BRDF models to separately represent surface/subsurface scattering details to enable complex objects such as translucent materials (e.g., skin, jade). We then implement our proposed representations with an end-to-end physically-based neural face skin shader, which takes a standard face asset (i.e., geometry, albedo map, and normal map) and an HDRI for illumination as inputs and generates a photo-realistic rendering as output. Extensive experiments showcase the quality and efficiency of our PBR face skin shader, indicating the effectiveness of our proposed lighting and material representations.},
note = {Publisher: arXiv
Version Number: 1},
keywords = {DTIC, UARC, VGL},
pubstate = {published},
tppubtype = {article}
}
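A heavily simplified reading of the lighting and shading representation described above: a small MLP (the "light sampling field") maps a surface point to a fixed set of local light samples, and a second MLP shades the point from those samples plus its albedo and normal. The layer sizes, sample count, and module structure below are assumptions for illustration only.

import torch
import torch.nn as nn

K = 16  # assumed number of local light samples per point
light_field = nn.Sequential(nn.Linear(3, 128), nn.ReLU(), nn.Linear(128, K * 6))
shader = nn.Sequential(nn.Linear(K * 6 + 6, 128), nn.ReLU(), nn.Linear(128, 3))

def shade(points, albedo, normals):
    # points, albedo, normals: (N, 3) each.
    samples = light_field(points)                   # (N, K*6): K directions + radiances
    x = torch.cat([samples, albedo, normals], dim=-1)
    return torch.sigmoid(shader(x))                 # (N, 3) shaded RGB

rgb = shade(torch.randn(1024, 3), torch.rand(1024, 3), torch.randn(1024, 3))
print(rgb.shape)  # (1024, 3)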
Chemburkar, Ankur; Lu, Shuhong; Feng, Andrew
MoDDM: Text-to-Motion Synthesis using Discrete Diffusion Model Proceedings Article
In: 2023.
Abstract | Links | BibTeX | Tags: VGL
@inproceedings{chemburkar_moddm_2023,
title = {MoDDM: Text-to-Motion Synthesis using Discrete Diffusion Model},
author = {Ankur Chemburkar and Shuhong Lu and Andrew Feng},
url = {https://papers.bmvc2023.org/0624.pdf},
year = {2023},
date = {2023-01-01},
abstract = {We present the motion discrete diffusion model (MoDDM) for synthesizing human motion from text descriptions that addresses challenges in cross-modal mapping and motion diversity. Previous methods that utilized variational autoencoders (VAEs) to learn the latent distributions for text-to-motion synthesis tend to produce motions with less diversity and fidelity. While diffusion models show promising results by generating high-quality motions, they require higher computational costs and may produce motions less aligned with the input text. The proposed method combines the discrete latent space and diffusion models to learn an expressive conditional probabilistic mapping for motion synthesis. Our method utilizes a vector quantization variational autoencoder (VQ-VAE) to learn discrete motion tokens and then applies discrete denoising diffusion probabilistic models (D3PM) to learn the conditional probability distributions for the motion tokens. Discrete classifier-free guidance is further utilized in the training process with a proper guidance scale for aligning the motions and the corresponding text descriptions. By learning the denoising model in the discrete latent space, the method produces high-quality motion results while greatly reducing computational costs compared to training diffusion models on raw motion sequences. The evaluation results show that the proposed approach outperforms previous methods in both motion quality and text-to-motion matching accuracy.},
keywords = {VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
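The discrete classifier-free guidance mentioned in the MoDDM abstract follows the usual recipe at sampling time: evaluate the denoiser with and without the text condition and extrapolate toward the conditional logits. The sketch below shows only that mixing step with a toy denoiser; the function names, codebook size, and guidance scale are illustrative assumptions.

import torch

def guided_logits(denoiser, tokens, t, text_emb, null_emb, scale=4.0):
    # Classifier-free guidance over discrete token logits: push the prediction
    # toward the conditional branch. The scale trades diversity for text alignment.
    logits_cond = denoiser(tokens, t, text_emb)
    logits_uncond = denoiser(tokens, t, null_emb)
    return logits_uncond + scale * (logits_cond - logits_uncond)

# Toy denoiser: random logits over a 1024-entry motion-token codebook.
toy = lambda tok, t, c: torch.randn(tok.shape[0], tok.shape[1], 1024)
out = guided_logits(toy, torch.randint(0, 1024, (2, 64)), 10,
                    text_emb=torch.randn(2, 512), null_emb=torch.zeros(2, 512))
print(out.shape)  # (2, 64, 1024)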
2022
Liu, Shichen; Cai, Yunxuan; Chen, Haiwei; Zhou, Yichao; Zhao, Yajie
Rapid Face Asset Acquisition with Recurrent Feature Alignment Journal Article
In: ACM Trans. Graph., vol. 41, no. 6, pp. 214:1–214:17, 2022, ISSN: 0730-0301.
Abstract | Links | BibTeX | Tags: DTIC, VGL
@article{liu_rapid_2022,
title = {Rapid Face Asset Acquisition with Recurrent Feature Alignment},
author = {Shichen Liu and Yunxuan Cai and Haiwei Chen and Yichao Zhou and Yajie Zhao},
url = {https://dl.acm.org/doi/10.1145/3550454.3555509},
doi = {10.1145/3550454.3555509},
issn = {0730-0301},
year = {2022},
date = {2022-11-01},
urldate = {2023-03-31},
journal = {ACM Trans. Graph.},
volume = {41},
number = {6},
pages = {214:1–214:17},
abstract = {We present Recurrent Feature Alignment (ReFA), an end-to-end neural network for the very rapid creation of production-grade face assets from multi-view images. ReFA is on par with the industrial pipelines in quality for producing accurate, complete, registered, and textured assets directly applicable to physically-based rendering, but produces the asset end-to-end, fully automatically, at a significantly faster speed of 4.5 FPS, which is unprecedented among neural-based techniques. Our method represents face geometry as a position map in the UV space. The network first extracts per-pixel features in both the multi-view image space and the UV space. A recurrent module then iteratively optimizes the geometry by projecting the image-space features to the UV space and comparing them with a reference UV-space feature. The optimized geometry then provides pixel-aligned signals for the inference of high-resolution textures. Experiments have validated that ReFA achieves a median error of 0.603mm in geometry reconstruction, is robust to extreme pose and expression, and excels in sparse-view settings. We believe that the progress achieved by our network enables lightweight, fast face asset acquisition that significantly boosts downstream applications, such as avatar creation and facial performance capture. It will also enable massive database capturing for deep learning purposes.},
keywords = {DTIC, VGL},
pubstate = {published},
tppubtype = {article}
}
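The recurrent UV-space refinement described in the ReFA abstract can be summarized as a loop of project, compare, and update steps. The sketch below captures only that control flow; the projection operator and update network are toy placeholders, not the paper's architecture.

import torch
import torch.nn as nn

def refa_refine(position_map, image_feats, uv_ref_feat, project, update_net, n_iters=5):
    # position_map: (B, 3, H, W) 3D points in UV space. Each iteration projects
    # image-space features into UV with the current geometry, compares them with a
    # reference UV feature, and predicts a per-pixel correction.
    for _ in range(n_iters):
        uv_feats = project(position_map, image_feats)
        delta = update_net(torch.cat([uv_feats, uv_ref_feat], dim=1))
        position_map = position_map + delta
    return position_map

B, H, W = 1, 64, 64
project = lambda pos, feats: feats.mean(dim=1)                 # dummy multi-view projection
update_net = nn.Conv2d(32 + 16, 3, kernel_size=3, padding=1)   # toy update head
pos = refa_refine(torch.zeros(B, 3, H, W),
                  torch.randn(B, 4, 32, H, W),                 # 4 views, 32-channel features
                  torch.randn(B, 16, H, W), project, update_net)
print(pos.shape)  # (1, 3, 64, 64)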
Kuang, Zhengfei; Li, Jiaman; He, Mingming; Wang, Tong; Zhao, Yajie
DenseGAP: Graph-Structured Dense Correspondence Learning with Anchor Points Proceedings Article
In: pp. 542–549, IEEE Computer Society, 2022, ISBN: 978-1-6654-9062-7.
Abstract | Links | BibTeX | Tags: VGL
@inproceedings{kuang_densegap_2022,
title = {DenseGAP: Graph-Structured Dense Correspondence Learning with Anchor Points},
author = {Zhengfei Kuang and Jiaman Li and Mingming He and Tong Wang and Yajie Zhao},
url = {https://www.computer.org/csdl/proceedings-article/icpr/2022/09956472/1IHpppIuqOc},
doi = {10.1109/ICPR56361.2022.9956472},
isbn = {978-1-6654-9062-7},
year = {2022},
date = {2022-08-01},
urldate = {2023-03-31},
pages = {542–549},
publisher = {IEEE Computer Society},
abstract = {Establishing dense correspondence between two images is a fundamental computer vision problem, which is typically tackled by matching local feature descriptors. However, without global awareness, such local features are often insufficient for disambiguating similar regions. Moreover, computing the pairwise feature correlation across images is both computationally expensive and memory-intensive. To make the local features aware of the global context and improve their matching accuracy, we introduce DenseGAP, a new solution for efficient Dense correspondence learning with a Graph-structured neural network conditioned on Anchor Points. Specifically, we first propose a graph structure that utilizes anchor points to provide sparse but reliable prior on inter- and intra-image context and propagates them to all image points via directed edges. We also design a graph-structured network to broadcast multi-level contexts via lightweight message-passing layers and generate high-resolution feature maps at low memory cost. Finally, based on the predicted feature maps, we introduce a coarse-to-fine framework for accurate correspondence prediction using cycle consistency. Our feature descriptors capture both local and global information, thus enabling a continuous feature field for querying arbitrary points at high resolution. Through comprehensive ablative experiments and evaluations on large-scale indoor and outdoor datasets, we demonstrate that our method advances the state of the art of correspondence learning on most benchmarks.},
keywords = {VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
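The anchor-point idea in the DenseGAP abstract amounts to routing global context through a small set of anchors instead of computing the full pairwise correlation between all image points. The sketch below uses plain dot-product attention in both directions purely for illustration; the paper's graph-structured message passing is more involved.

import torch

def propagate_via_anchors(point_feats, anchor_feats):
    # point_feats: (B, N, C) dense image-point features; anchor_feats: (B, K, C).
    # Anchors first gather context from all points, then broadcast it back,
    # keeping the cost O(N*K) instead of O(N^2).
    att_up = torch.softmax(anchor_feats @ point_feats.transpose(1, 2), dim=-1)   # (B, K, N)
    anchors = anchor_feats + att_up @ point_feats                                # anchors gather context
    att_down = torch.softmax(point_feats @ anchors.transpose(1, 2), dim=-1)      # (B, N, K)
    return point_feats + att_down @ anchors                                      # points receive global context

x = propagate_via_anchors(torch.randn(2, 4096, 64), torch.randn(2, 128, 64))
print(x.shape)  # (2, 4096, 64)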
Chen, Haiwei; Liu, Jiayi; Chen, Weikai; Liu, Shichen; Zhao, Yajie
Exemplar-based Pattern Synthesis with Implicit Periodic Field Network Proceedings Article
In: 2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3698–3707, IEEE, New Orleans, LA, USA, 2022, ISBN: 978-1-6654-6946-3.
Links | BibTeX | Tags: UARC, VGL
@inproceedings{chen_exemplar-based_2022,
title = {Exemplar-based Pattern Synthesis with Implicit Periodic Field Network},
author = {Haiwei Chen and Jiayi Liu and Weikai Chen and Shichen Liu and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/9879904/},
doi = {10.1109/CVPR52688.2022.00369},
isbn = {978-1-6654-6946-3},
year = {2022},
date = {2022-06-01},
urldate = {2023-02-10},
booktitle = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
pages = {3698–3707},
publisher = {IEEE},
address = {New Orleans, LA, USA},
keywords = {UARC, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
Danieau, Fabien; Guillotel, Philippe; Hoyet, Ludovic; Tonneau, Steve; Zhao, Yajie
Editorial: Creating Lifelike Digital Humans Journal Article
In: Front. Virtual Real., vol. 3, pp. 906118, 2022, ISSN: 2673-4192.
@article{danieau_editorial_2022,
title = {Editorial: Creating Lifelike Digital Humans},
author = {Fabien Danieau and Philippe Guillotel and Ludovic Hoyet and Steve Tonneau and Yajie Zhao},
url = {https://www.frontiersin.org/articles/10.3389/frvir.2022.906118/full},
doi = {10.3389/frvir.2022.906118},
issn = {2673-4192},
year = {2022},
date = {2022-04-01},
urldate = {2024-08-13},
journal = {Front. Virtual Real.},
volume = {3},
pages = {906118},
keywords = {VGL},
pubstate = {published},
tppubtype = {article}
}
Liu, Shichen; Li, Tianye; Chen, Weikai; Li, Hao
A General Differentiable Mesh Renderer for Image-Based 3D Reasoning Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 44, no. 1, pp. 50–62, 2022, ISSN: 1939-3539, (Conference Name: IEEE Transactions on Pattern Analysis and Machine Intelligence).
Abstract | Links | BibTeX | Tags: VGL
@article{liu_general_2022,
title = {A General Differentiable Mesh Renderer for Image-Based 3D Reasoning},
author = {Shichen Liu and Tianye Li and Weikai Chen and Hao Li},
doi = {10.1109/TPAMI.2020.3007759},
issn = {1939-3539},
year = {2022},
date = {2022-01-01},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
volume = {44},
number = {1},
pages = {50–62},
abstract = {Rendering bridges the gap between 2D vision and 3D scenes by simulating the physical process of image formation. By inverting such a renderer, one can think of a learning approach to infer 3D information from 2D images. However, standard graphics renderers involve a fundamental step called rasterization, which prevents rendering from being differentiable. Unlike the state-of-the-art differentiable renderers (Kato et al. 2018 and Loper 2018), which only approximate the rendering gradient in the backpropagation, we propose a naturally differentiable rendering framework that is able to (1) directly render colorized mesh using differentiable functions and (2) back-propagate efficient supervisions to mesh vertices and their attributes from various forms of image representations. The key to our framework is a novel formulation that views rendering as an aggregation function that fuses the probabilistic contributions of all mesh triangles with respect to the rendered pixels. Such a formulation enables our framework to flow gradients to the occluded and distant vertices, which cannot be achieved by previous state-of-the-art methods. We show that by using the proposed renderer, one can achieve significant improvement in 3D unsupervised single-view reconstruction both qualitatively and quantitatively. Experiments also demonstrate that our approach can handle the challenging tasks in image-based shape fitting, which remain nontrivial to existing differentiable renderers.},
note = {Conference Name: IEEE Transactions on Pattern Analysis and Machine Intelligence},
keywords = {VGL},
pubstate = {published},
tppubtype = {article}
}
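The aggregation view of rendering described in the abstract above can be illustrated for a single pixel: every triangle contributes a color weighted by a screen-space coverage probability and a depth-aware softmax, so gradients reach occluded and distant vertices. The distance and depth weighting functions below are simplified stand-ins for the ones defined in the paper.

import torch

def soft_pixel(colors, dists, depths, sigma=1e-4, gamma=0.1, bg=1.0):
    # colors: (T, 3) per-triangle colors; dists: (T,) signed screen-space distances
    # from the pixel to each triangle (positive = inside); depths: (T,) in (0, 1),
    # smaller = nearer. All triangles contribute, so the result stays differentiable.
    coverage = torch.sigmoid(dists / sigma)             # per-triangle coverage probability
    closeness = 1.0 - depths                            # nearer surfaces get larger scores
    w = coverage * torch.exp(closeness / gamma)
    w_bg = torch.exp(torch.tensor(0.0))                 # background "closeness" = 0
    weights = torch.cat([w, w_bg[None]])
    weights = weights / weights.sum()
    rgb = torch.cat([colors, torch.full((1, 3), bg)])   # append background color
    return weights @ rgb                                # (3,) blended pixel color

print(soft_pixel(torch.rand(4, 3), torch.randn(4) * 1e-4, torch.rand(4)))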
2021
Li, Jiaman; Villegas, Ruben; Ceylan, Duygu; Yang, Jimei; Kuang, Zhengfei; Li, Hao; Zhao, Yajie
Task-Generic Hierarchical Human Motion Prior using VAEs Proceedings Article
In: 2021 International Conference on 3D Vision (3DV), pp. 771–781, IEEE, London, United Kingdom, 2021, ISBN: 978-1-6654-2688-6.
Links | BibTeX | Tags: DTIC, UARC, VGL
@inproceedings{li_task-generic_2021,
title = {Task-Generic Hierarchical Human Motion Prior using VAEs},
author = {Jiaman Li and Ruben Villegas and Duygu Ceylan and Jimei Yang and Zhengfei Kuang and Hao Li and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/9665881/},
doi = {10.1109/3DV53792.2021.00086},
isbn = {978-1-6654-2688-6},
year = {2021},
date = {2021-12-01},
urldate = {2022-09-22},
booktitle = {2021 International Conference on 3D Vision (3DV)},
pages = {771–781},
publisher = {IEEE},
address = {London, United Kingdom},
keywords = {DTIC, UARC, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
Liu, Shichen; Zhou, Yichao; Zhao, Yajie
VaPiD: A Rapid Vanishing Point Detector via Learned Optimizers Proceedings Article
In: 2021 IEEE/CVF International Conference on Computer Vision (ICCV), pp. 12839–12848, IEEE, Montreal, QC, Canada, 2021, ISBN: 978-1-6654-2812-5.
Links | BibTeX | Tags: DTIC, UARC, VGL
@inproceedings{liu_vapid_2021,
title = {VaPiD: A Rapid Vanishing Point Detector via Learned Optimizers},
author = {Shichen Liu and Yichao Zhou and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/9711313/},
doi = {10.1109/ICCV48922.2021.01262},
isbn = {978-1-6654-2812-5},
year = {2021},
date = {2021-10-01},
urldate = {2022-09-22},
booktitle = {2021 IEEE/CVF International Conference on Computer Vision (ICCV)},
pages = {12839–12848},
publisher = {IEEE},
address = {Montreal, QC, Canada},
keywords = {DTIC, UARC, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
Li, Tianye; Liu, Shichen; Bolkart, Timo; Liu, Jiayi; Li, Hao; Zhao, Yajie
Topologically Consistent Multi-View Face Inference Using Volumetric Sampling Proceedings Article
In: 2021 IEEE/CVF International Conference on Computer Vision (ICCV), pp. 3804–3814, IEEE, Montreal, QC, Canada, 2021, ISBN: 978-1-6654-2812-5.
Links | BibTeX | Tags: DTIC, UARC, VGL
@inproceedings{li_topologically_2021,
title = {Topologically Consistent Multi-View Face Inference Using Volumetric Sampling},
author = {Tianye Li and Shichen Liu and Timo Bolkart and Jiayi Liu and Hao Li and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/9711264/},
doi = {10.1109/ICCV48922.2021.00380},
isbn = {978-1-6654-2812-5},
year = {2021},
date = {2021-10-01},
urldate = {2022-09-22},
booktitle = {2021 IEEE/CVF International Conference on Computer Vision (ICCV)},
pages = {3804–3814},
publisher = {IEEE},
address = {Montreal, QC, Canada},
keywords = {DTIC, UARC, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
Xiang, Sitao; Gu, Yuming; Xiang, Pengda; Chai, Menglei; Li, Hao; Zhao, Yajie; He, Mingming
DisUnknown: Distilling Unknown Factors for Disentanglement Learning Proceedings Article
In: 2021 IEEE/CVF International Conference on Computer Vision (ICCV), pp. 14790–14799, IEEE, Montreal, QC, Canada, 2021, ISBN: 978-1-6654-2812-5.
Links | BibTeX | Tags: DTIC, UARC, VGL
@inproceedings{xiang_disunknown_2021,
title = {DisUnknown: Distilling Unknown Factors for Disentanglement Learning},
author = {Sitao Xiang and Yuming Gu and Pengda Xiang and Menglei Chai and Hao Li and Yajie Zhao and Mingming He},
url = {https://ieeexplore.ieee.org/document/9709965/},
doi = {10.1109/ICCV48922.2021.01454},
isbn = {978-1-6654-2812-5},
year = {2021},
date = {2021-10-01},
urldate = {2022-09-23},
booktitle = {2021 IEEE/CVF International Conference on Computer Vision (ICCV)},
pages = {14790–14799},
publisher = {IEEE},
address = {Montreal, QC, Canada},
keywords = {DTIC, UARC, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
Xiang, Sitao
Eliminating topological errors in neural network rotation estimation using self-selecting ensembles Journal Article
In: ACM Trans. Graph., vol. 40, no. 4, pp. 167:1–167:21, 2021, ISSN: 0730-0301.
Abstract | Links | BibTeX | Tags: VGL
@article{xiang_eliminating_2021,
title = {Eliminating topological errors in neural network rotation estimation using self-selecting ensembles},
author = {Sitao Xiang},
url = {https://dl.acm.org/doi/10.1145/3450626.3459882},
doi = {10.1145/3450626.3459882},
issn = {0730-0301},
year = {2021},
date = {2021-07-01},
urldate = {2023-03-31},
journal = {ACM Trans. Graph.},
volume = {40},
number = {4},
pages = {167:1–167:21},
abstract = {Many problems in computer graphics and computer vision applications involve inferring a rotation from a variety of different forms of inputs. With the increasing use of deep learning, neural networks have been employed to solve such problems. However, the traditional representations for 3D rotations, the quaternions and Euler angles, are found to be problematic for neural networks in practice, producing seemingly unavoidable large estimation errors. Previous research has identified the discontinuity of the mapping from SO(3) to the quaternions or Euler angles as the source of such errors, and to solve it, embeddings of SO(3) have been proposed as the output representation of rotation estimation networks instead. In this paper, we argue that the argument against quaternions and Euler angles from local discontinuities of the mappings from SO(3) is flawed, and instead provide a different argument from the global topological properties of SO(3) that also establishes the lower bound of maximum error when using quaternions and Euler angles for rotation estimation networks. Extending from this view, we discover that rotation symmetries in the input object cause additional topological problems that even using embeddings of SO(3) as the output representation would not correctly handle. We propose the self-selecting ensemble, a topologically motivated approach, where the network makes multiple predictions and assigns weights to them. We show theoretically and with experiments that our methods can be combined with a wide range of different rotation representations and can handle all kinds of finite symmetries in 3D rotation estimation problems.},
keywords = {VGL},
pubstate = {published},
tppubtype = {article}
}
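A minimal sketch of the self-selecting ensemble idea from the abstract above: the network emits several candidate rotations together with selection weights, and the highest-weighted candidate is used at inference time. The quaternion parameterization and branch count here are illustrative choices, not the paper's configuration.

import torch
import torch.nn as nn

class SelfSelectingRotationHead(nn.Module):
    def __init__(self, feat_dim=128, n_branches=4):
        super().__init__()
        self.quats = nn.Linear(feat_dim, 4 * n_branches)   # candidate rotations
        self.weights = nn.Linear(feat_dim, n_branches)     # selection weights
        self.n = n_branches

    def forward(self, feat):
        q = self.quats(feat).view(-1, self.n, 4)
        q = nn.functional.normalize(q, dim=-1)             # unit quaternions
        w = torch.softmax(self.weights(feat), dim=-1)      # per-branch selection weights
        return q, w

head = SelfSelectingRotationHead()
q, w = head(torch.randn(8, 128))
best = q[torch.arange(8), w.argmax(dim=-1)]                # chosen rotation per sample
print(best.shape)                                          # (8, 4)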
Chen, Haiwei; Liu, Shichen; Chen, Weikai; Li, Hao; Hill, Randall
Equivariant Point Network for 3D Point Cloud Analysis Proceedings Article
In: pp. 14514–14523, 2021.
Links | BibTeX | Tags: UARC, VGL
@inproceedings{chen_equivariant_2021,
title = {Equivariant Point Network for 3D Point Cloud Analysis},
author = {Haiwei Chen and Shichen Liu and Weikai Chen and Hao Li and Randall Hill},
url = {https://openaccess.thecvf.com/content/CVPR2021/html/Chen_Equivariant_Point_Network_for_3D_Point_Cloud_Analysis_CVPR_2021_paper.html},
year = {2021},
date = {2021-01-01},
urldate = {2023-03-31},
pages = {14514–14523},
keywords = {UARC, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
2020
Zhou, Yi; Wu, Chenglei; Li, Zimo; Cao, Chen; Ye, Yuting; Saragih, Jason; Li, Hao; Sheikh, Yaser
Fully convolutional mesh autoencoder using efficient spatially varying kernels Proceedings Article
In: Proceedings of the 34th International Conference on Neural Information Processing Systems, pp. 9251–9262, Curran Associates Inc., Red Hook, NY, USA, 2020, ISBN: 978-1-7138-2954-6.
@inproceedings{zhou_fully_2020,
title = {Fully convolutional mesh autoencoder using efficient spatially varying kernels},
author = {Yi Zhou and Chenglei Wu and Zimo Li and Chen Cao and Yuting Ye and Jason Saragih and Hao Li and Yaser Sheikh},
isbn = {978-1-7138-2954-6},
year = {2020},
date = {2020-12-01},
urldate = {2023-03-31},
booktitle = {Proceedings of the 34th International Conference on Neural Information Processing Systems},
pages = {9251–9262},
publisher = {Curran Associates Inc.},
address = {Red Hook, NY, USA},
series = {NIPS'20},
abstract = {Learning latent representations of registered meshes is useful for many 3D tasks. Techniques have recently shifted to neural mesh autoencoders. Although they demonstrate higher precision than traditional methods, they remain unable to capture fine-grained deformations. Furthermore, these methods can only be applied to a template-specific surface mesh, and are not applicable to more general meshes, like tetrahedrons and non-manifold meshes. While more general graph convolution methods can be employed, they lack performance in reconstruction precision and require higher memory usage. In this paper, we propose a non-template-specific fully convolutional mesh autoencoder for arbitrary registered mesh data. It is enabled by our novel convolution and (un)pooling operators learned with globally shared weights and locally varying coefficients which can efficiently capture the spatially varying contents presented by irregular mesh connections. Our model outperforms state-of-the-art methods on reconstruction accuracy. In addition, the latent codes of our network are fully localized thanks to the fully convolutional structure, and thus have much higher interpolation capability than many traditional 3D mesh generation models.},
keywords = {VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
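The convolution operator described in the abstract above pairs globally shared weight bases with locally varying per-vertex, per-neighbor coefficients. The sketch below implements that pairing for a fixed-valence neighborhood table; the sizes, initialization, and the absence of the (un)pooling operators are simplifications for illustration.

import torch
import torch.nn as nn

class LocallyVaryingGraphConv(nn.Module):
    def __init__(self, in_ch, out_ch, n_basis, neighbors):
        super().__init__()
        # neighbors: (V, K) long tensor of neighbor indices for each vertex.
        V, K = neighbors.shape
        self.neighbors = neighbors
        self.basis = nn.Parameter(torch.randn(n_basis, in_ch, out_ch) * 0.02)  # shared weight bases
        self.coeff = nn.Parameter(torch.randn(V, K, n_basis) * 0.02)           # locally varying coefficients

    def forward(self, x):
        # x: (B, V, in_ch) vertex features.
        nb = x[:, self.neighbors]                                    # (B, V, K, in_ch) neighbor features
        w = torch.einsum('vkn,nio->vkio', self.coeff, self.basis)    # per-edge weights from shared bases
        return torch.einsum('bvki,vkio->bvo', nb, w)                 # (B, V, out_ch)

V, K = 100, 6
conv = LocallyVaryingGraphConv(16, 32, n_basis=8, neighbors=torch.randint(0, V, (V, K)))
out = conv(torch.randn(2, V, 16))
print(out.shape)   # (2, 100, 32)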
0000
Chen, Haiwei; Zhao, Yajie
Don't Look into the Dark: Latent Codes for Pluralistic Image Inpainting Proceedings Article
In: pp. 7591–7600, 0000.
Abstract | Links | BibTeX | Tags: DTIC, Graphics, VGL
@inproceedings{chen_dont_nodate,
title = {Don't Look into the Dark: Latent Codes for Pluralistic Image Inpainting},
author = {Haiwei Chen and Yajie Zhao},
url = {https://openaccess.thecvf.com/content/CVPR2024/html/Chen_Dont_Look_into_the_Dark_Latent_Codes_for_Pluralistic_Image_CVPR_2024_paper.html},
pages = {7591–7600},
abstract = {We present a method for large-mask pluralistic image inpainting based on the generative framework of discrete latent codes. Our method learns latent priors, discretized as tokens, by only performing computations at the visible locations of the image. This is realized by a restrictive partial encoder that predicts the token label for each visible block, a bidirectional transformer that infers the missing labels by only looking at these tokens, and a dedicated synthesis network that couples the tokens with the partial image priors to generate a coherent and pluralistic complete image even under extreme mask settings. Experiments on public benchmarks validate our design choices as the proposed method outperforms strong baselines in both visual quality and diversity metrics.},
keywords = {DTIC, Graphics, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}