2024
Kaibin Tian; Ruixiang Zhao; Zijie Xin; Bangxiang Lan; Xirong Li
Holistic Features are almost Sufficient for Text-to-Video Retrieval Proceedings Article
In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2024.
@inproceedings{cvpr24-TeachCLIP,
title = {Holistic Features are almost Sufficient for Text-to-Video Retrieval},
author = {Kaibin Tian and Ruixiang Zhao and Zijie Xin and Bangxiang Lan and Xirong Li},
url = {https://lixirong.net/pub/cvpr2024-TeachCLIP.pdf},
year = {2024},
date = {2024-06-19},
urldate = {2024-06-19},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Fan Hu; Yanlin Wang; Lun Du; Hongyu Zhang; Shi Han; Dongmei Zhang; Xirong Li
Tackling Long Code Search with Splitting, Encoding, and Aggregating Proceedings Article
In: The 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING), 2024.
@inproceedings{coling24-sea,
title = {Tackling Long Code Search with Splitting, Encoding, and Aggregating},
author = {Fan Hu and Yanlin Wang and Lun Du and Hongyu Zhang and Shi Han and Dongmei Zhang and Xirong Li},
year = {2024},
date = {2024-05-20},
urldate = {2024-05-20},
booktitle = {The 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Aozhu Chen; Fangming Zhou; Ziyuan Wang; Xirong Li
CLIPRerank: An Extremely Simple Method for Improving Ad-hoc Video Search Proceedings Article
In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), 2024.
@inproceedings{icassp2024-cliprerank,
title = {CLIPRerank: An Extremely Simple Method for Improving Ad-hoc Video Search},
author = {Aozhu Chen and Fangming Zhou and Ziyuan Wang and Xirong Li},
year = {2024},
date = {2024-04-14},
urldate = {2024-04-14},
booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Bing Li; Huan Chen; Weihong Yu; Ming Zhang; Fang Lu; Jingxue Ma; Yuhua Hao; Xiaorong Li; Bojie Hu; Lijun Shen; Jianbo Mao; Xixi He; Hao Wang; Dayong Ding; Xirong Li; Youxin Chen
The performance of a deep learning system in assisting junior ophthalmologists in diagnosing 13 major fundus diseases: A prospective multi-center clinical trial Journal Article
In: npj Digital Medicine, 2024.
@article{npjdm24-eval,
title = {The performance of a deep learning system in assisting junior ophthalmologists in diagnosing 13 major fundus diseases: A prospective multi-center clinical trial},
author = {Bing Li and Huan Chen and Weihong Yu and Ming Zhang and Fang Lu and Jingxue Ma and Yuhua Hao and Xiaorong Li and Bojie Hu and Lijun Shen and Jianbo Mao and Xixi He and Hao Wang and Dayong Ding and Xirong Li and Youxin Chen},
url = {https://www.nature.com/articles/s41746-023-00991-9},
year = {2024},
date = {2024-01-01},
urldate = {2024-01-01},
journal = {npj Digital Medicine},
abstract = {Artificial intelligence (AI)-based diagnostic systems have been reported to improve fundus disease screening in previous studies. This multicenter prospective self-controlled clinical trial aims to evaluate the diagnostic performance of a deep learning system (DLS) in assisting junior ophthalmologists in detecting 13 major fundus diseases. A total of 1493 fundus images from 748 patients were prospectively collected from five tertiary hospitals in China. Nine junior ophthalmologists were trained and annotated the images with or without the suggestions proposed by the DLS. The diagnostic performance was evaluated among three groups: DLS-assisted junior ophthalmologist group (test group), junior ophthalmologist group (control group) and DLS group. The diagnostic consistency was 84.9% (95%CI, 83.0% ~ 86.9%), 72.9% (95%CI, 70.3% ~ 75.6%) and 85.5% (95%CI, 83.5% ~ 87.4%) in the test group, control group and DLS group, respectively. With the help of the proposed DLS, the diagnostic consistency of junior ophthalmologists improved by approximately 12% (95% CI, 9.1% ~ 14.9%) with statistical significance (P < 0.001). For the detection of 13 diseases, the test group achieved significantly higher sensitivities (72.2% ~ 100.0%) and comparable specificities (90.8% ~ 98.7%) compared with the control group (sensitivities, 50% ~ 100%; specificities, 96.7% ~ 99.8%). The DLS group presented similar performance to the test group in the detection of any fundus abnormality (sensitivity, 95.7%; specificity, 87.2%) and each of the 13 diseases (sensitivity, 83.3% ~ 100.0%; specificity, 89.0% ~ 98.0%). The proposed DLS provided a novel approach for the automatic detection of 13 major fundus diseases with high diagnostic consistency and helped improve the performance of junior ophthalmologists, especially by reducing the risk of missed diagnoses.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2023
Qijie Wei; Jingyuan Yang; Bo Wang; Jinrui Wang; Jianchun Zhao; Xinyu Zhao; Sheng Yang; Niranchana Manivannan; Youxin Chen; Dayong Ding; Jing Zhou; Xirong Li
Supervised Domain Adaptation for Recognizing Retinal Diseases from Wide-Field Fundus Images Proceedings Article
In: IEEE International Conference on Bioinformatics & Biomedicine (BIBM), 2023.
@inproceedings{bibm23-sda,
title = {Supervised Domain Adaptation for Recognizing Retinal Diseases from Wide-Field Fundus Images},
author = {Qijie Wei and Jingyuan Yang and Bo Wang and Jinrui Wang and Jianchun Zhao and Xinyu Zhao and Sheng Yang and Niranchana Manivannan and Youxin Chen and Dayong Ding and Jing Zhou and Xirong Li},
year = {2023},
date = {2023-12-05},
urldate = {2023-12-05},
booktitle = {IEEE International Conference on Bioinformatics & Biomedicine (BIBM)},
abstract = {This paper addresses the emerging task of recognizing multiple retinal diseases from wide-field (WF) and ultra-wide-field (UWF) fundus images. For an effective use of the existing large amount of labeled color fundus photo (CFP) data and the relatively small amount of WF and UWF data, we propose a supervised domain adaptation method named Cross-domain Collaborative Learning (CdCL). Inspired by the success of fixed-ratio based mixup in unsupervised domain adaptation, we re-purpose this strategy for the current task. Due to the intrinsic disparity between the field-of-view of CFP and WF/UWF images, a scale bias naturally exists in a mixup sample, as the anatomic structure from a CFP image will be considerably larger than its WF/UWF counterpart. The CdCL method resolves the issue by Scale-bias Correction, which employs Transformers for producing scale-invariant features. As demonstrated by extensive experiments on multiple datasets covering both WF and UWF images, the proposed method compares favorably against a number of competitive baselines.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
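The CdCL abstract above builds on fixed-ratio mixup between labeled CFP (source) and WF/UWF (target) samples. The following minimal PyTorch-style sketch illustrates only the fixed-ratio mixing step under assumed tensor shapes and a hypothetical ratio of 0.5; it is not the authors' implementation and omits the Transformer-based Scale-bias Correction.
import torch

def fixed_ratio_mixup(cfp_images, wf_images, cfp_labels, wf_labels, ratio=0.5):
    """Mix a CFP (source) batch with a WF/UWF (target) batch at a fixed ratio.

    Unlike vanilla mixup, the mixing coefficient is a constant rather than one
    sampled from a Beta distribution. Assumed shapes: images (B, C, H, W),
    labels as multi-hot vectors (B, num_classes). All names are illustrative.
    """
    mixed_images = ratio * cfp_images + (1.0 - ratio) * wf_images
    mixed_labels = ratio * cfp_labels + (1.0 - ratio) * wf_labels
    return mixed_images, mixed_labels

# Toy usage with random tensors standing in for resized fundus images.
cfp = torch.rand(4, 3, 224, 224)
wf = torch.rand(4, 3, 224, 224)
y_cfp = torch.randint(0, 2, (4, 5)).float()
y_wf = torch.randint(0, 2, (4, 5)).float()
x_mix, y_mix = fixed_ratio_mixup(cfp, wf, y_cfp, y_wf)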
Aozhu Chen; Ziyuan Wang; Chengbo Dong; Kaibin Tian; Ruixiang Zhao; Xun Liang; Zhanhui Kang; Xirong Li
ChinaOpen: A Dataset for Open-world Multimodal Learning Proceedings Article
In: ACM Multimedia, 2023.
@inproceedings{mm23-ChinaOpen,
title = {ChinaOpen: A Dataset for Open-world Multimodal Learning},
author = {Aozhu Chen and Ziyuan Wang and Chengbo Dong and Kaibin Tian and Ruixiang Zhao and Xun Liang and Zhanhui Kang and Xirong Li},
year = {2023},
date = {2023-10-29},
booktitle = {ACM Multimedia},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jiazhen Liu; Xirong Li
Geometrized Transformer for Self-Supervised Homography Estimation Proceedings Article
In: International Conference on Computer Vision (ICCV), 2023.
@inproceedings{iccv2023-GeoFormer,
title = {Geometrized Transformer for Self-Supervised Homography Estimation},
author = {Jiazhen Liu and Xirong Li},
year = {2023},
date = {2023-10-03},
booktitle = {International Conference on Computer Vision (ICCV)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Zhihao Sun; Haoran Jiang; Danding Wang; Xirong Li; Juan Cao
SAFL-Net: Semantic-Agnostic Feature Learning Network with Auxiliary Plugins for Image Manipulation Detection Proceedings Article
In: International Conference on Computer Vision (ICCV), 2023.
@inproceedings{iccv23-saflnet,
title = {SAFL-Net: Semantic-Agnostic Feature Learning Network with Auxiliary Plugins for Image Manipulation Detection},
author = {Zhihao Sun and Haoran Jiang and Danding Wang and Xirong Li and Juan Cao},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
booktitle = {International Conference on Computer Vision (ICCV)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Fan Hu; Aozhu Chen; Xirong Li
Towards Making a Trojan-horse Attack on Text-to-Image Retrieval Proceedings Article
In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), 2023.
@inproceedings{icassp2023-tha,
title = {Towards Making a Trojan-horse Attack on Text-to-Image Retrieval},
author = {Fan Hu and Aozhu Chen and Xirong Li},
year = {2023},
date = {2023-06-04},
booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
abstract = {While deep learning based image retrieval is reported to be vulnerable to adversarial attacks, existing works are mainly on image-to-image retrieval with their attacks performed at the front end via query modification. By contrast, we present in this paper the first study about a threat that occurs at the back end of a text-to-image retrieval (T2IR) system. Our study is motivated by the fact that the image collection indexed by the system will be regularly updated due to the arrival of new images from various sources such as web crawlers and advertisers. With malicious images indexed, it is possible for an attacker to indirectly interfere with the retrieval process, letting users see certain images that are completely irrelevant w.r.t. their queries. We put this thought into practice by proposing a novel Trojan-horse attack (THA). In particular, we construct a set of Trojan-horse images by first embedding word-specific adversarial information into a QR code and then putting the code on benign advertising images. A proof-of-concept evaluation, conducted on two popular T2IR datasets (Flickr30k and MS-COCO), shows the effectiveness of the proposed THA in a white-box mode.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2022
Fan Hu; Aozhu Chen; Ziyue Wang; Fangming Zhou; Jianfeng Dong; Xirong Li
Lightweight Attentional Feature Fusion: A New Baseline for Text-to-Video Retrieval Proceedings Article
In: European Conference on Computer Vision (ECCV), 2022.
@inproceedings{eccv2022-laff-video-retrieval,
title = {Lightweight Attentional Feature Fusion: A New Baseline for Text-to-Video Retrieval},
author = {Fan Hu and Aozhu Chen and Ziyue Wang and Fangming Zhou and Jianfeng Dong and Xirong Li},
year = {2022},
date = {2022-10-23},
booktitle = {European Conference on Computer Vision (ECCV)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jiazhen Liu; Xirong Li; Qijie Wei; Jie Xu; Dayong Ding
Semi-Supervised Keypoint Detector and Descriptor for Retinal Image Matching Proceedings Article
In: European Conference on Computer Vision (ECCV), 2022.
@inproceedings{eccv2022-SuperRetina,
title = {Semi-Supervised Keypoint Detector and Descriptor for Retinal Image Matching},
author = {Jiazhen Liu and Xirong Li and Qijie Wei and Jie Xu and Dayong Ding},
year = {2022},
date = {2022-10-23},
booktitle = {European Conference on Computer Vision (ECCV)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Ziyue Wang; Aozhu Chen; Fan Hu; Xirong Li
Learn to Understand Negation in Video Retrieval Proceedings Article
In: ACM Multimedia, 2022.
@inproceedings{mm22-negation-learning,
title = {Learn to Understand Negation in Video Retrieval},
author = {Ziyue Wang and Aozhu Chen and Fan Hu and Xirong Li},
year = {2022},
date = {2022-10-10},
urldate = {2022-10-10},
booktitle = {ACM Multimedia},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jianfeng Dong; Xiaoke Chen; Minsong Zhang; Xun Yang; Shujie Chen; Xirong Li; Xun Wang
Partially Relevant Video Retrieval Proceedings Article
In: ACM Multimedia, 2022.
@inproceedings{mm2022-prvr,
title = {Partially Relevant Video Retrieval},
author = {Jianfeng Dong and Xiaoke Chen and Minsong Zhang and Xun Yang and Shujie Chen and Xirong Li and Xun Wang},
year = {2022},
date = {2022-10-10},
urldate = {2022-10-10},
booktitle = {ACM Multimedia},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Chengbo Dong; Xinru Chen; Ruohan Hu; Juan Cao; Xirong Li
MVSS-Net: Multi-View Multi-Scale Supervised Networks for Image Manipulation Detection Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI), 2022.
@article{tpami22-mvssnet,
title = {MVSS-Net: Multi-View Multi-Scale Supervised Networks for Image Manipulation Detection},
author = {Chengbo Dong and Xinru Chen and Ruohan Hu and Juan Cao and Xirong Li},
url = {pub/tpami2022-mvssnet.pdf},
doi = {10.1109/TPAMI.2022.3180556},
year = {2022},
date = {2022-07-01},
urldate = {2022-07-01},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)},
abstract = {As manipulating images by copy-move, splicing and/or inpainting may lead to misinterpretation of the visual content, detecting these sorts of manipulations is crucial for media forensics. Given the variety of possible attacks on the content, devising a generic method is nontrivial. Current deep learning based methods are promising when training and test data are well aligned, but perform poorly on independent tests. Moreover, due to the absence of authentic test images, their image-level detection specificity is in doubt. The key question is how to design and train a deep neural network capable of learning generalizable features sensitive to manipulations in novel data, whilst specific to prevent false alarms on the authentic. We propose multi-view feature learning to jointly exploit tampering boundary artifacts and the noise view of the input image. As both clues are meant to be semantic-agnostic, the learned features are thus generalizable. For effectively learning from authentic images, we train with multi-scale (pixel / edge / image) supervision. We term the new network MVSS-Net and its enhanced version MVSS-Net++. Experiments are conducted in both within-dataset and cross-dataset scenarios, showing that MVSS-Net++ performs the best, and exhibits better robustness against JPEG compression, Gaussian blur and screenshot based image re-capturing.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
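The MVSS-Net abstract above describes training with multi-scale (pixel / edge / image) supervision. As a rough, hedged sketch of that idea only, the snippet below sums three binary cross-entropy terms over assumed pixel-, edge- and image-level predictions; the paper's exact loss formulation (e.g., any Dice-based pixel term) and loss weights are not reproduced here.
import torch
import torch.nn.functional as F

def multi_scale_loss(pred_mask, pred_edge, pred_img, gt_mask, gt_edge, gt_img,
                     w_pixel=1.0, w_edge=1.0, w_image=1.0):
    """Combine pixel-, edge- and image-level supervision with BCE terms.

    pred_mask / pred_edge: (B, 1, H, W) logits; pred_img: (B,) logits;
    gt_* are the corresponding 0/1 targets. Loss weights are assumptions.
    """
    loss_pixel = F.binary_cross_entropy_with_logits(pred_mask, gt_mask)
    loss_edge = F.binary_cross_entropy_with_logits(pred_edge, gt_edge)
    loss_image = F.binary_cross_entropy_with_logits(pred_img, gt_img)
    return w_pixel * loss_pixel + w_edge * loss_edge + w_image * loss_image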
Yue Wu; Yang Zhou; Jianchun Zhao; Jingyuan Yang; Weihong Yu; Youxin Chen; Xirong Li
Lesion Localization in OCT by Semi-Supervised Object Detection Proceedings Article
In: ACM International Conference on Multimedia Retrieval (ICMR), 2022.
@inproceedings{icmr22-oct-ssod,
title = {Lesion Localization in OCT by Semi-Supervised Object Detection},
author = {Yue Wu and Yang Zhou and Jianchun Zhao and Jingyuan Yang and Weihong Yu and Youxin Chen and Xirong Li},
doi = {10.1145/3512527.3531418},
year = {2022},
date = {2022-06-27},
booktitle = {ACM International Conference on Multimedia Retrieval (ICMR)},
abstract = {Over 300 million people worldwide are affected by various retinal diseases. By noninvasive Optical Coherence Tomography (OCT) scans, a number of abnormal structural changes in the retina, namely retinal lesions, can be identified. Automated lesion localization in OCT is thus important for detecting retinal diseases at their early stage. To conquer the lack of manual annotation for deep supervised learning, this paper presents a first study on utilizing semi-supervised object detection (SSOD) for lesion localization in OCT images. To that end, we develop a taxonomy to provide a unified and structured viewpoint of the current SSOD methods, and consequently identify key modules in these methods. To evaluate the influence of these modules in the new task, we build OCT-SS, a new dataset consisting of over 1k expert-labeled OCT B-scan images and over 13k unlabeled B-scans. Extensive experiments on OCT-SS identify Unbiased Teacher (UnT) as the best current SSOD method for lesion localization. Moreover, we improve over this strong baseline, with mAP increased from 49.34 to 50.86.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Rui Qian; Xin Lai; Xirong Li
3D Object Detection for Autonomous Driving: A Survey Journal Article
In: Pattern Recognition, 2022.
@article{pr22-3d-object-detection,
title = {3D Object Detection for Autonomous Driving: A Survey},
author = {Rui Qian and Xin Lai and Xirong Li},
doi = {10.1016/j.patcog.2022.108796},
year = {2022},
date = {2022-05-16},
urldate = {2022-05-16},
journal = {Pattern Recognition},
abstract = {Autonomous driving is regarded as one of the most promising remedies to shield human beings from severe crashes. To this end, 3D object detection serves as the core basis of the perception stack, especially for the sake of path planning, motion prediction, and collision avoidance etc. Taking a quick glance at the progress we have made, we attribute challenges to visual appearance recovery in the absence of depth information from images, representation learning from partially occluded unstructured point clouds, and semantic alignments over heterogeneous features from cross modalities. Despite existing efforts, 3D object detection for autonomous driving is still in its infancy. Recently, a large body of literature has been produced to address this 3D vision task. Nevertheless, few investigations have looked into collecting and structuring this growing knowledge. We therefore aim to fill this gap in a comprehensive survey, encompassing all the main concerns including sensors, datasets, performance metrics and the recent state-of-the-art detection methods, together with their pros and cons. Furthermore, we provide quantitative comparisons with the state of the art. A case study on fifteen selected representative methods is presented, involving runtime analysis, error analysis, and robustness analysis. Finally, we provide concluding remarks after an in-depth analysis of the surveyed works and identify promising directions for future work.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Weisen Wang; Xirong Li; Zhiyan Xu; Weihong Yu; Jianchun Zhao; Dayong Ding; Youxin Chen
Learning Two-Stream CNN for Multi-Modal Age-related Macular Degeneration Categorization Journal Article
In: IEEE Journal of Biomedical and Health Informatics (J-BHI), 2022.
@article{jbhi22-mmc-amd,
title = {Learning Two-Stream CNN for Multi-Modal Age-related Macular Degeneration Categorization},
author = {Weisen Wang and Xirong Li and Zhiyan Xu and Weihong Yu and Jianchun Zhao and Dayong Ding and Youxin Chen},
url = {pub/jbhi2022-amd.pdf},
doi = {10.1109/JBHI.2022.3171523},
year = {2022},
date = {2022-04-26},
urldate = {2022-04-26},
journal = {IEEE Journal of Biomedical and Health Informatics (J-BHI)},
abstract = {This paper tackles automated categorization of Age-related Macular Degeneration (AMD), a common macular disease among people over 50. Previous research efforts mainly focus on AMD categorization with a single-modal input, let it be a color fundus photograph (CFP) or an OCT B-scan image. By contrast, we consider AMD categorization given a multi-modal input, a direction that is clinically meaningful yet mostly unexplored. Contrary to the prior art that takes a traditional approach of feature extraction plus classifier training that cannot be jointly optimized, we opt for end-to-end multi-modal Convolutional Neural Networks (MM-CNN). Our MM-CNN is instantiated by a two-stream CNN, with spatially-invariant fusion to combine information from the CFP and OCT streams. In order to visually interpret the contribution of the individual modalities to the final prediction, we extend the class activation mapping (CAM) technique to the multi-modal scenario. For effective training of MM-CNN, we develop two data augmentation methods. One is GAN-based CFP/OCT image synthesis, with our novel use of CAMs as conditional input of a high-resolution image-to-image translation GAN. The other method is Loose Pairing, which pairs a CFP image and an OCT image on the basis of their classes instead of eye identities. Experiments on a clinical dataset consisting of 1,094 CFP images and 1,289 OCT images acquired from 1,093 distinct eyes show that the proposed solution obtains better F1 and Accuracy than multiple baselines for multi-modal AMD categorization. Code and data are available at https://github.com/li-xirong/mmc-amd.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
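One of the data augmentation methods described above, Loose Pairing, pairs a CFP image and an OCT image by class rather than by eye identity. The sketch below is a hypothetical illustration of such class-based pairing with made-up sample lists; it is not the authors' code.
import random
from collections import defaultdict

def loose_pairs(cfp_samples, oct_samples, num_pairs=1000, seed=0):
    """Pair CFP and OCT images that share a class label, ignoring eye identity.

    cfp_samples / oct_samples: lists of (image_id, class_label) tuples.
    Returns a list of (cfp_id, oct_id) pairs. Purely illustrative.
    """
    rng = random.Random(seed)
    oct_by_class = defaultdict(list)
    for oct_id, label in oct_samples:
        oct_by_class[label].append(oct_id)
    pairs = []
    for _ in range(num_pairs):
        cfp_id, label = rng.choice(cfp_samples)
        if oct_by_class[label]:  # skip classes with no OCT counterpart
            pairs.append((cfp_id, rng.choice(oct_by_class[label])))
    return pairs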
Jianfeng Dong; Yabing Wang; Xianke Chen; Xiaoye Qu; Xirong Li; Yuan He; Xun Wang
Reading-strategy Inspired Visual Representation Learning for Text-to-Video Retrieval Journal Article
In: IEEE Transactions on Circuits and Systems for Video Technology (TCSVT), 2022.
@article{tcsvt2022-rivrl,
title = {Reading-strategy Inspired Visual Representation Learning for Text-to-Video Retrieval},
author = {Jianfeng Dong and Yabing Wang and Xianke Chen and Xiaoye Qu and Xirong Li and Yuan He and Xun Wang},
doi = {10.1109/TCSVT.2022.3150959},
year = {2022},
date = {2022-02-08},
urldate = {2022-02-08},
journal = {IEEE Transactions on Circuits and Systems for Video Technology (TCSVT)},
abstract = {This paper aims for the task of text-to-video retrieval, where given a query in the form of a natural-language sentence, it is asked to retrieve videos which are semantically relevant to the given query, from a great number of unlabeled videos. The success of this task depends on cross-modal representation learning that projects both videos and sentences into common spaces for semantic similarity computation. In this work, we concentrate on video representation learning, an essential component for text-to-video retrieval. Inspired by the reading strategy of humans, we propose a Reading-strategy Inspired Visual Representation Learning (RIVRL) to represent videos, which consists of two branches: a previewing branch and an intensive-reading branch. The previewing branch is designed to briefly capture the overview information of videos, while the intensive-reading branch is designed to obtain more in-depth information. Moreover, the intensive-reading branch is aware of the video overview captured by the previewing branch. Such holistic information is found to be useful for the intensive-reading branch to extract more fine-grained features. Extensive experiments on three datasets are conducted, where our model RIVRL achieves a new state-of-the-art on TGIF and VATEX. Moreover, on MSR-VTT, our model using two video features shows comparable performance to the state-of-the-art using seven video features and even outperforms models pre-trained on the large-scale HowTo100M dataset.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Rui Qian; Xin Lai; Xirong Li
BADet: Boundary-Aware 3D Object Detection from Point Clouds Journal Article
In: Pattern Recognition, 2022.
@article{pr2022-badet,
title = {BADet: Boundary-Aware 3D Object Detection from Point Clouds},
author = {Rui Qian and Xin Lai and Xirong Li},
doi = {10.1016/j.patcog.2022.108524},
year = {2022},
date = {2022-01-10},
journal = {Pattern Recognition},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2021
Xirong Li; Fangming Zhou; Chaoxi Xu; Jiaqi Ji; Gang Yang
SEA: Sentence Encoder Assembly for Video Retrieval by Textual Queries Journal Article
In: IEEE Transactions on Multimedia (TMM), vol. 23, pp. 4351-4362, 2021.
@article{tmm2021-sea,
title = {SEA: Sentence Encoder Assembly for Video Retrieval by Textual Queries},
author = {Xirong Li and Fangming Zhou and Chaoxi Xu and Jiaqi Ji and Gang Yang},
url = {http://lixirong.net/pub/tmm2021-sea.pdf},
doi = {10.1109/TMM.2020.3042067},
year = {2021},
date = {2021-12-02},
urldate = {2021-12-02},
journal = {IEEE Transactions on Multimedia (TMM)},
volume = {23},
pages = {4351-4362},
abstract = {Retrieving unlabeled videos by textual queries, known as Ad-hoc Video Search (AVS), is a core theme in multimedia data management and retrieval. The success of AVS counts on cross-modal representation learning that encodes both query sentences and videos into common spaces for semantic similarity computation. Inspired by the initial success of a few previous works in combining multiple sentence encoders, this paper takes a step forward by developing a new and general method for effectively exploiting diverse sentence encoders. The novelty of the proposed method, which we term Sentence Encoder Assembly (SEA), is two-fold. First, different from prior art that uses only a single common space, SEA supports text-video matching in multiple encoder-specific common spaces. Such a property prevents the matching from being dominated by a specific encoder that produces an encoding vector much longer than other encoders. Second, in order to explore complementarities among the individual common spaces, we propose multi-space multi-loss learning. As extensive experiments on four benchmarks (MSR-VTT, TRECVID AVS 2016-2019, TGIF and MSVD) show, SEA surpasses the state-of-the-art. In addition, SEA is extremely easy to implement. All this makes SEA an appealing solution for AVS and promising for continuously advancing the task by harvesting new sentence encoders.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
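The SEA abstract above hinges on text-video matching in multiple encoder-specific common spaces. The following sketch shows one plausible way to project per-encoder sentence features and a video feature into separate spaces and sum the per-space cosine similarities; all dimensions and module names are assumptions, and the paper's multi-space multi-loss training is not shown.
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiSpaceMatcher(nn.Module):
    """Text-video matching in multiple encoder-specific common spaces (a sketch).

    Each sentence-encoder output gets its own projection; the video feature is
    projected into every space; per-space cosine similarities are summed.
    """

    def __init__(self, text_dims=(1024, 500, 768), video_dim=2048, space_dim=512):
        super().__init__()
        self.text_proj = nn.ModuleList([nn.Linear(d, space_dim) for d in text_dims])
        self.video_proj = nn.ModuleList([nn.Linear(video_dim, space_dim) for _ in text_dims])

    def forward(self, text_feats, video_feat):
        # text_feats: list of (B, d_i) tensors, one per sentence encoder
        # video_feat: (B, video_dim); returns a (B, B) text-video similarity matrix
        sims = []
        for feats, tp, vp in zip(text_feats, self.text_proj, self.video_proj):
            t = F.normalize(tp(feats), dim=-1)
            v = F.normalize(vp(video_feat), dim=-1)
            sims.append(t @ v.t())
        return torch.stack(sims).sum(dim=0)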
Xirong Li; Yang Zhou; Jie Wang; Hailan Lin; Jianchun Zhao; Dayong Ding; Weihong Yu; Youxin Chen
Multi-Modal Multi-Instance Learning for Retinal Disease Recognition Proceedings Article
In: ACM Multimedia, 2021.
@inproceedings{mm21-mm-mil,
title = {Multi-Modal Multi-Instance Learning for Retinal Disease Recognition},
author = {Xirong Li and Yang Zhou and Jie Wang and Hailan Lin and Jianchun Zhao and Dayong Ding and Weihong Yu and Youxin Chen},
url = {pub/mm2021-mm-mil.pdf},
doi = {10.1145/3474085.3475418},
year = {2021},
date = {2021-10-20},
urldate = {2021-10-20},
booktitle = {ACM Multimedia},
abstract = {This paper attacks an emerging challenge of multi-modal retinal disease recognition. Given a multi-modal case consisting of a color fundus photo (CFP) and an array of OCT B-scan images acquired during an eye examination, we aim to build a deep neural network that recognizes multiple vision-threatening diseases for the given case. As the diagnostic efficacy of CFP and OCT is disease-dependent, the network's ability of being both selective and interpretable is important. Moreover, as both data acquisition and manual labeling are extremely expensive in the medical domain, the network has to be relatively lightweight for learning from a limited set of labeled multi-modal samples. Prior art on retinal disease recognition focuses either on a single disease or on a single modality, leaving multi-modal fusion largely underexplored. We propose in this paper Multi-Modal Multi-Instance Learning (MM-MIL) for selectively fusing CFP and OCT modalities. Its lightweight architecture (as compared to current multi-head attention modules) makes it suited for learning from relatively small-sized datasets. For an effective use of MM-MIL, we propose to generate a pseudo sequence of CFPs by over sampling a given CFP. The benefits of this tactic include well balancing instances across modalities, increasing the resolution of the CFP input, and finding out regions of the CFP most relevant with respect to the final diagnosis. Extensive experiments on a real-world dataset consisting of 1,206 multi-modal cases from 1,193 eyes of 836 subjects demonstrate the viability of the proposed model.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Chengbo Dong; Xinru Chen; Aozhu Chen; Fan Hu; Zihan Wang; Xirong Li
Multi-Level Visual Representation with Semantic-Reinforced Learning for Video Captioning Proceedings Article
In: ACM Multimedia (Grand Challenge Track), 2021.
@inproceedings{mm21-videocap,
title = {Multi-Level Visual Representation with Semantic-Reinforced Learning for Video Captioning},
author = {Chengbo Dong and Xinru Chen and Aozhu Chen and Fan Hu and Zihan Wang and Xirong Li},
year = {2021},
date = {2021-10-20},
booktitle = {ACM Multimedia (Grand Challenge Track)},
abstract = {This paper describes our bronze-medal solution for the video captioning task of the ACMMM2021 Pre-Training for Video Understanding Challenge. We depart from the Bottom-Up-Top-Down model, with technical improvements on both video content encoding and caption decoding. For encoding, we propose to extract multi-level video features that describe holistic scenes and fine-grained key objects, respectively. The scene-level and object-level features are enhanced separately by multi-head self-attention mechanisms before feeding them into the decoding module. Towards generating content-relevant and human-like captions, we train our network end-to-end by semantic-reinforced learning. Finally, in order to select the best caption from captions produced by different models, we perform caption reranking by cross-modal matching between a given video and each candidate caption. Both internal experiments on the MSR-VTT test set and external evaluations by the challenge organizers justify the viability of the proposed solution.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Peng Qi; Juan Cao; Xirong Li; Huan Liu; Qiang Sheng; Xiaoyue Mi; Qin He; Yongbiao Lv; Chenyang Guo; Yingchao Yu
Improving Fake News Detection by Using an Entity-enhanced Framework to Fuse Diverse Multimodal Clues Proceedings Article
In: ACM Multimedia (Industrial Track), 2021.
@inproceedings{mm21-entity-fakenews,
title = {Improving Fake News Detection by Using an Entity-enhanced Framework to Fuse Diverse Multimodal Clues},
author = {Peng Qi and Juan Cao and Xirong Li and Huan Liu and Qiang Sheng and Xiaoyue Mi and Qin He and Yongbiao Lv and Chenyang Guo and Yingchao Yu},
year = {2021},
date = {2021-10-20},
booktitle = {ACM Multimedia (Industrial Track)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Aozhu Chen; Fan Hu; Zihan Wang; Fangming Zhou; Xirong Li
What Matters for Ad-hoc Video Search? A Large-scale Evaluation on TRECVID Proceedings Article
In: The 2nd International Workshop on Video Retrieval Methods and Their Limits (ViRal'21@ICCV 2021), 2021.
@inproceedings{viral21-avs,
title = {What Matters for Ad-hoc Video Search? A Large-scale Evaluation on TRECVID},
author = {Aozhu Chen and Fan Hu and Zihan Wang and Fangming Zhou and Xirong Li},
year = {2021},
date = {2021-10-16},
urldate = {2021-10-16},
booktitle = {The 2nd International Workshop on Video Retrieval Methods and Their Limits (ViRal'21@ICCV 2021)},
abstract = {For quantifying progress in Ad-hoc Video Search (AVS), the annual TRECVID AVS task is an important international evaluation. Solutions submitted by the task participants vary in terms of their choices of cross-modal matching models, visual features and training data. As such, what one may conclude from the evaluation is at a high level that is insufficient to reveal the influence of the individual components. In order to bridge the gap between the current solution-level comparison and the desired component-wise comparison, we propose in this paper a large-scale and systematic evaluation on TRECVID. By selected combinations of state-of-the-art matching models, visual features and (pre-)training data, we construct a set of 25 different solutions and evaluate them on the TRECVID AVS tasks 2016--2020. The presented evaluation helps answer the key question of what matters for AVS. The resultant observations and learned lessons are also instructive for developing novel AVS solutions.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Xinru Chen; Chengbo Dong; Jiaqi Ji; Juan Cao; Xirong Li
Image Manipulation Detection by Multi-View Multi-Scale Supervision Proceedings Article
In: International Conference on Computer Vision (ICCV), 2021.
@inproceedings{iccv21-mvssnet,
title = {Image Manipulation Detection by Multi-View Multi-Scale Supervision},
author = {Xinru Chen and Chengbo Dong and Jiaqi Ji and Juan Cao and Xirong Li},
url = {pub/iccv2021-mvssnet.pdf},
year = {2021},
date = {2021-10-11},
urldate = {2021-10-11},
booktitle = {International Conference on Computer Vision (ICCV)},
abstract = {The key challenge of image manipulation detection is how to learn generalizable features that are sensitive to manipulations in novel data, whilst specific to prevent false alarms on authentic images. Current research emphasizes the sensitivity, with the specificity overlooked. In this paper we address both aspects by multi-view feature learning and multi-scale supervision. By exploiting noise distribution and boundary artifact surrounding tampered regions, the former aims to learn semantic-agnostic and thus more generalizable features. The latter allows us to learn from authentic images, which are nontrivial to take into account by current semantic segmentation network based methods. Our thoughts are realized by a new network which we term MVSS-Net. Extensive experiments on five benchmark sets justify the viability of MVSS-Net for both pixel-level and image-level manipulation detection.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Qiang Sheng; Juan Cao; Xueyao Zhang; Xirong Li; Lei Zhong
Article Reranking by Memory-enhanced Key Sentence Matching for Detecting Previously Fact-checked Claims Proceedings Article
In: The Joint Conference of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (ACL-IJCNLP 2021), 2021.
@inproceedings{acl21-fact-check,
title = {Article Reranking by Memory-enhanced Key Sentence Matching for Detecting Previously Fact-checked Claims},
author = {Qiang Sheng and Juan Cao and Xueyao Zhang and Xirong Li and Lei Zhong},
year = {2021},
date = {2021-08-01},
booktitle = {The Joint Conference of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (ACL-IJCNLP 2021)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jianfeng Dong; Xun Wang; Leimin Zhang; Chaoxi Xu; Gang Yang; Xirong Li
Feature Re-Learning with Data Augmentation for Video Relevance Prediction Journal Article
In: IEEE Transactions on Knowledge and Data Engineering (TKDE), vol. 33, no. 5, pp. 1946-1959, 2021, ISSN: 1041-4347.
@article{tkde21-frda,
title = {Feature Re-Learning with Data Augmentation for Video Relevance Prediction},
author = {Jianfeng Dong and Xun Wang and Leimin Zhang and Chaoxi Xu and Gang Yang and Xirong Li},
url = {pub/tkde2019-video-relevance.pdf},
doi = {10.1109/TKDE.2019.2947442},
issn = {1041-4347},
year = {2021},
date = {2021-05-01},
journal = {IEEE Transactions on Knowledge and Data Engineering (TKDE)},
volume = {33},
number = {5},
pages = {1946-1959},
abstract = {Predicting the relevance between two given videos with respect to their visual content is a key component for content-based video recommendation and retrieval. Thanks to the increasing availability of pre-trained image and video convolutional neural network models, deep visual features are widely used for video content representation. However, as how two videos are relevant is task-dependent, such off-the-shelf features are not always optimal for all tasks. Moreover, due to varied concerns including copyright, privacy and security, one might have access to only pre-computed video features rather than original videos. We propose in this paper feature re-learning for improving video relevance prediction, with no need of revisiting the original video content. In particular, re-learning is realized by projecting a given deep feature into a new space by an affine transformation. We optimize the re-learning process by a novel negative-enhanced triplet ranking loss. In order to generate more training data, we propose a new data augmentation strategy which works directly on frame-level and video-level features. Extensive experiments in the context of the Hulu Content-based Video Relevance Prediction Challenge 2018 justify the effectiveness of the proposed method and its state-of-the-art performance for content-based video relevance prediction.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
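Feature re-learning, as described above, projects a given deep feature into a new space by an affine transformation and is optimized with a negative-enhanced triplet ranking loss. The sketch below pairs a single affine layer with a plain margin-based triplet ranking loss on assumed L2-normalized embeddings; the negative enhancement and the feature-level data augmentation are omitted, and the dimensions are illustrative.
import torch
import torch.nn as nn
import torch.nn.functional as F

class AffineReLearner(nn.Module):
    """Re-learn an off-the-shelf video feature by a single affine transformation."""

    def __init__(self, in_dim=2048, out_dim=1024):
        super().__init__()
        self.affine = nn.Linear(in_dim, out_dim)  # y = Wx + b

    def forward(self, feat):
        return F.normalize(self.affine(feat), dim=-1)

def triplet_ranking_loss(anchor, positive, negative, margin=0.2):
    """Plain margin-based triplet ranking loss on cosine similarity.

    Inputs are assumed L2-normalized embeddings of shape (B, D). The paper's
    negative-enhanced variant treats negatives differently; not shown here.
    """
    sim_pos = (anchor * positive).sum(dim=-1)
    sim_neg = (anchor * negative).sum(dim=-1)
    return F.relu(margin + sim_neg - sim_pos).mean()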
Xueyao Zhang; Juan Cao; Xirong Li; Qiang Sheng; Lei Zhong; Kai Shu
Mining Dual Emotion for Fake News Detection Proceedings Article
In: The Web Conference 2021 (WWW), 2021.
@inproceedings{web2021-fake-news-detection,
title = {Mining Dual Emotion for Fake News Detection},
author = {Xueyao Zhang and Juan Cao and Xirong Li and Qiang Sheng and Lei Zhong and Kai Shu},
url = {http://lixirong.net/pub/www2021_fake_news_detection.pdf},
year = {2021},
date = {2021-04-19},
booktitle = {The Web Conference 2021 (WWW)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jie Wang; Kaibin Tian; Dayong Ding; Gang Yang; Xirong Li
Unsupervised Domain Expansion for Visual Categorization Journal Article
In: ACM Transactions on Multimedia Computing Communications and Applications (TOMM), 2021.
@article{tomm2021-ude,
title = {Unsupervised Domain Expansion for Visual Categorization},
author = {Jie Wang and Kaibin Tian and Dayong Ding and Gang Yang and Xirong Li},
url = {pub/tomm2021-ude.pdf},
doi = {10.1145/3448108},
year = {2021},
date = {2021-04-02},
urldate = {2021-04-02},
journal = {ACM Transactions on Multimedia Computing Communications and Applications (TOMM)},
abstract = {Expanding visual categorization into a novel domain without the need of extra annotation has been a long-term interest for multimedia intelligence. Previously, this challenge has been approached by unsupervised domain adaptation (UDA). Given labeled data from a source domain and unlabeled data from a target domain, UDA seeks for a deep representation that is both discriminative and domain-invariant. While UDA focuses on the target domain, we argue that the performance on both source and target domains matters, as in practice which domain a test example comes from is unknown. In this paper we extend UDA by proposing a new task called unsupervised domain expansion (UDE), which aims to adapt a deep model for the target domain with its unlabeled data, meanwhile maintaining the model's performance on the source domain. We propose Knowledge Distillation Domain Expansion (KDDE) as a general method for the UDE task. Its domain-adaptation module can be instantiated with any existing model. We develop a knowledge distillation based learning mechanism, enabling KDDE to optimize a single objective wherein the source and target domains are equally treated. Extensive experiments on two major benchmarks, i.e., Office-Home and DomainNet, show that KDDE compares favorably against four competitive baselines, i.e., DDC, DANN, DAAN, and CDAN, for both UDA and UDE tasks. Our study also reveals that the current UDA models improve their performance on the target domain at the cost of noticeable performance loss on the source domain.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
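KDDE, as summarized above, uses knowledge distillation so that a single student treats the source and target domains equally. The snippet below is a generic sketch of such an objective, assuming a frozen source-domain teacher and a frozen domain-adapted (target) teacher; the temperature, the 0.5 weighting and the function names are illustrative assumptions rather than the published setup.
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, temperature=2.0):
    """Soft-label distillation via KL divergence between softened distributions."""
    t = temperature
    log_p_student = F.log_softmax(student_logits / t, dim=-1)
    p_teacher = F.softmax(teacher_logits / t, dim=-1)
    return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * (t * t)

def kdde_objective(student, source_x, target_x, source_teacher, target_teacher):
    """Treat source and target domains equally by summing two distillation terms."""
    loss_src = distillation_loss(student(source_x), source_teacher(source_x).detach())
    loss_tgt = distillation_loss(student(target_x), target_teacher(target_x).detach())
    return 0.5 * (loss_src + loss_tgt)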
Bing Li; Huan Chen; Bilei Zhang; Mingzhen Yuan; Xuemin Jin; Bo Lei; Jie Xu; Wei Gu; David Wong; Xixi He; Hao Wang; Dayong Ding; Xirong Li; Weihong Yu; Youxin Chen
Development and evaluation of a deep learning model for the detection of multiple fundus diseases based on color fundus photography Journal Article
In: British Journal of Ophthalmology, 2021.
@article{bjo2021-mdd,
title = {Development and evaluation of a deep learning model for the detection of multiple fundus diseases based on color fundus photography},
author = {Bing Li and Huan Chen and Bilei Zhang and Mingzhen Yuan and Xuemin Jin and Bo Lei and Jie Xu and Wei Gu and David Wong and Xixi He and Hao Wang and Dayong Ding and Xirong Li and Weihong Yu and Youxin Chen},
year = {2021},
date = {2021-02-16},
journal = {British Journal of Ophthalmology},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Jianfeng Dong; Xirong Li; Chaoxi Xu; Xun Yang; Gang Yang; Xun Wang; Meng Wang
Dual Encoding for Video Retrieval by Text Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI), 2021.
@article{tpami21-dual-encoding,
title = {Dual Encoding for Video Retrieval by Text},
author = {Jianfeng Dong and Xirong Li and Chaoxi Xu and Xun Yang and Gang Yang and Xun Wang and Meng Wang},
url = {http://lixirong.net/pub/tpami2021-video-retrieval.pdf},
doi = {10.1109/TPAMI.2021.3059295},
year = {2021},
date = {2021-02-15},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)},
abstract = {This paper attacks the challenging problem of video retrieval by text. In such a retrieval paradigm, an end user searches for unlabeled videos by ad-hoc queries described exclusively in the form of a natural-language sentence, with no visual example provided. Given videos as sequences of frames and queries as sequences of words, an effective sequence-to-sequence cross-modal matching is crucial. To that end, the two modalities need to be first encoded into real-valued vectors and then projected into a common space. In this paper we achieve this by proposing a dual deep encoding network that encodes videos and queries into powerful dense representations of their own. Our novelty is two-fold. First, different from prior art that resorts to a specific single-level encoder, the proposed network performs multi-level encoding that represents the rich content of both modalities in a coarse-to-fine fashion. Second, different from a conventional common space learning algorithm which is either concept based or latent space based, we introduce hybrid space learning which combines the high performance of the latent space and the good interpretability of the concept space. Dual encoding is conceptually simple, practically effective and end-to-end trained with hybrid space learning. Extensive experiments on four challenging video datasets show the viability of the new method.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
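The dual encoding abstract above relies on multi-level, coarse-to-fine encoding of a frame or word sequence. As a hedged sketch of that idea, the module below combines mean pooling, a biGRU and a 1-D convolution over the biGRU states, then concatenates the three levels; the dimensions and pooling choices are assumptions rather than the published configuration.
import torch
import torch.nn as nn

class MultiLevelEncoder(nn.Module):
    """Encode a frame/word feature sequence at three levels and concatenate.

    Level 1: global mean pooling; level 2: biGRU states; level 3: 1-D conv
    over the biGRU states. All dimensions are illustrative.
    """

    def __init__(self, feat_dim=2048, hidden=512, conv_channels=512, kernel=3):
        super().__init__()
        self.gru = nn.GRU(feat_dim, hidden, batch_first=True, bidirectional=True)
        self.conv = nn.Conv1d(2 * hidden, conv_channels, kernel, padding=kernel // 2)

    def forward(self, seq):                  # seq: (B, T, feat_dim)
        level1 = seq.mean(dim=1)             # (B, feat_dim)
        states, _ = self.gru(seq)            # (B, T, 2*hidden)
        level2 = states.mean(dim=1)          # (B, 2*hidden)
        conv_out = torch.relu(self.conv(states.transpose(1, 2)))  # (B, C, T)
        level3 = conv_out.max(dim=2).values  # (B, C)
        return torch.cat([level1, level2, level3], dim=1)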
2020
Xirong Li; Wencui Wan; Yang Zhou; Jianchun Zhao; Qijie Wei; Junbo Rong; Pengyi Zhou; Limin Xu; Lijuan Lang; Yuying Liu; Chengzhi Niu; Dayong Ding; Xuemin Jin
Deep Multiple Instance Learning with Spatial Attention for ROP Case Classification, Instance Selection and Abnormality Localization Proceedings Article
In: 25th International Conference on Pattern Recognition (ICPR2020), 2020, (oral).
@inproceedings{icpr20-rop,
title = {Deep Multiple Instance Learning with Spatial Attention for ROP Case Classification, Instance Selection and Abnormality Localization},
author = {Xirong Li and Wencui Wan and Yang Zhou and Jianchun Zhao and Qijie Wei and Junbo Rong and Pengyi Zhou and Limin Xu and Lijuan Lang and Yuying Liu and Chengzhi Niu and Dayong Ding and Xuemin Jin},
url = {http://lixirong.net/pub/icpr2020-rop.pdf},
year = {2020},
date = {2020-12-31},
booktitle = {25th International Conference on Pattern Recognition (ICPR2020)},
abstract = {This paper tackles automated screening of Retinopathy of Prematurity (ROP), one of the most common causes of visual loss in childhood. Clinically, ROP screening per case requires multiple color fundus images capturing different zones of the premature retina. A desirable model shall not only make a decision at the case level, but also pinpoint which instances and what part of the instances are responsible for the decision. This paper makes the first attempt to accomplish three tasks, i.e., ROP case classification, instance selection and abnormality localization in a unified framework. To that end, we propose a new model that effectively combines instance-attention based deep multiple instance learning (MIL) and spatial attention (SA). The proposed model, which we term MIL-SA, identifies positive instances in light of their contributions to the case-level decision. Meanwhile, abnormal regions in the identified instances are automatically localized by the SA mechanism. Moreover, MIL-SA is learned from case-level binary labels exclusively, and in an end-to-end manner. Experiments on a large clinical dataset of 2,186 cases with 11,053 fundus images show the viability of the proposed model for all the three tasks.},
note = {oral},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
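MIL-SA, as described in the entry above, identifies positive instances via instance attention. The snippet below shows a generic attention-based MIL pooling layer of the kind commonly used for such case-level decisions; it is only an illustration under assumed dimensions, not the exact MIL-SA module with spatial attention.
import torch
import torch.nn as nn

class AttentionMILPooling(nn.Module):
    """Instance-attention pooling for multiple instance learning (a sketch).

    Given per-image (instance) embeddings of one case (bag), compute attention
    weights and a weighted case-level embedding; the weights indicate which
    instances drive the decision. Dimensions are assumptions.
    """

    def __init__(self, feat_dim=512, attn_dim=128):
        super().__init__()
        self.V = nn.Linear(feat_dim, attn_dim)
        self.w = nn.Linear(attn_dim, 1)

    def forward(self, instances):                        # (num_instances, feat_dim)
        scores = self.w(torch.tanh(self.V(instances)))   # (num_instances, 1)
        weights = torch.softmax(scores, dim=0)           # attention per instance
        bag_embedding = (weights * instances).sum(dim=0) # (feat_dim,)
        return bag_embedding, weights.squeeze(-1)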
Qijie Wei; Xirong Li; Weihong Yu; Xiao Zhang; Yongpeng Zhang; Bojie Hu; Bin Mo; Di Gong; Ning Chen; Dayong Ding; Youxin Chen
Learn to Segment Retinal Lesions and Beyond Proceedings Article
In: 25th International Conference on Pattern Recognition (ICPR2020), 2020.
@inproceedings{icpr20-lesion-net,
title = {Learn to Segment Retinal Lesions and Beyond},
author = {Qijie Wei and Xirong Li and Weihong Yu and Xiao Zhang and Yongpeng Zhang and Bojie Hu and Bin Mo and Di Gong and Ning Chen and Dayong Ding and Youxin Chen},
url = {http://lixirong.net/pub/icpr2020-lesion.pdf},
year = {2020},
date = {2020-12-31},
booktitle = {25th International Conference on Pattern Recognition (ICPR2020)},
abstract = {Towards automated retinal screening, this paper makes an endeavor to simultaneously achieve pixel-level retinal lesion segmentation and image-level disease classification. Such a multi-task approach is crucial for accurate and clinically interpretable disease diagnosis. Prior art is insufficient due to three challenges, i.e., lesions lacking objective boundaries, clinical importance of lesions irrelevant to their size, and the lack of one-to-one correspondence between lesion and disease classes. This paper attacks the three challenges in the context of diabetic retinopathy (DR) grading. We propose Lesion-Net, a new variant of fully convolutional networks, with its expansive path re-designed to tackle the first challenge. A dual Dice loss that leverages both semantic segmentation and image classification losses is introduced to resolve the second challenge. Lastly, we build a multi-task network that employs Lesion-Net as a side-attention branch for both DR grading and result interpretation. A set of 12K fundus images is manually segmented by 45 ophthalmologists for 8 DR-related lesions, resulting in 290K manual segments in total. Extensive experiments on this large-scale dataset show that our proposed approach surpasses the prior art for multiple tasks including lesion segmentation, lesion classification and DR grading. },
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Aozhu Chen; Xinyi Huang; Hailan Lin; Xirong Li
Towards Annotation-Free Evaluation of Cross-Lingual Image Captioning Proceedings Article
In: ACM Multimedia Asia (MMAsia), 2020.
@inproceedings{mmasia20-anno-free,
title = {Towards Annotation-Free Evaluation of Cross-Lingual Image Captioning},
author = {Aozhu Chen and Xinyi Huang and Hailan Lin and Xirong Li},
url = {http://lixirong.net/pub/mmasia2020-eval-cap.pdf},
doi = {10.1145/3444685.3446322},
year = {2020},
date = {2020-12-31},
booktitle = {ACM Multimedia Asia (MMAsia)},
abstract = {Cross-lingual image captioning, with its ability to caption an unlabelled image in a target language other than English, is an emerging topic in the multimedia field. A large-scale evaluation of the resultant models typically needs thousands of test images associated with manually written captions in the same language. In order to save the precious human resource from re-writing reference sentences per target language, in this paper we make a brave attempt towards annotation-free evaluation of cross-lingual image captioning. Depending on whether we assume the availability of English references, two scenarios are investigated. For the first scenario with the references available, we propose two metrics, i.e., WMDRel and CLinRel. WMDRel measures the semantic relevance between a model-generated caption and machine translation of an English reference using their Word Mover Distance. By projecting both captions into a deep visual feature space, CLinRel is a visual-oriented cross-lingual relevance measure. As for the second scenario, which has zero reference and is thus more challenging, we propose CMedRel to compute a cross-media relevance between the generated caption and the image content, in the same visual feature space as used by CLinRel. We have conducted a number of experiments to evaluate the effectiveness of the three proposed metrics. The combination of WMDRel, CLinRel and CMedRel has a Spearman’s rank correlation of 0.952 with the sum of BLEU-4, METEOR, ROUGE-L and CIDEr, four standard metrics computed using references in the target language. CMedRel alone has a Spearman’s rank correlation of 0.786 with the standard metrics. The promising results show high potential of the new metrics for evaluation with no need of references in the target language.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
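The evaluation above reports Spearman's rank correlations between the proposed annotation-free metrics and the sum of four standard reference-based metrics. The snippet below simply shows how such a correlation can be computed with SciPy; the score lists are made-up placeholders, not data from the paper.
from scipy.stats import spearmanr

# Hypothetical per-model scores: a proposed annotation-free metric versus the
# sum of reference-based metrics (BLEU-4 + METEOR + ROUGE-L + CIDEr).
proposed_metric = [0.41, 0.37, 0.52, 0.48, 0.30]
reference_sum = [1.52, 1.40, 1.98, 1.87, 1.10]

rho, p_value = spearmanr(proposed_metric, reference_sum)
print(f"Spearman's rank correlation: {rho:.3f} (p={p_value:.3f})")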
Xirong Li; Fangming Zhou; Aozhu Chen
Renmin University of China at TRECVID 2020: Sentence Encoder Assembly for Ad-hoc Video Search Proceedings Article
In: TRECVID 2020 Workshop, 2020.
@inproceedings{tv20-rucmm,
title = {Renmin University of China at TRECVID 2020: Sentence Encoder Assembly for Ad-hoc Video Search},
author = {Xirong Li and Fangming Zhou and Aozhu Chen},
year = {2020},
date = {2020-12-03},
booktitle = {TRECVID 2020 Workshop},
abstract = {In this paper we summarize our TRECVID 2020 video retrieval experiments. We participated in the Ad-hoc Video Search (AVS) task with fully deep learning based solutions. Our solutions contain two deep models and their variants. One is W2VV++ that vectorizes a given textual query by concatenating the output of multiple sentence encoders such as bag-of-words (BoW), word2vec and GRU. The other is a newly developed model, which we term Sentence Encoder Assembly (SEA). The novelty of the SEA model is two-fold. First, different from the prior art that uses only a single common space, SEA supports text-video matching in multiple encoder-specific common spaces. Such a property prevents the matching from being dominated by a specific encoder that produces an encoding vector much longer than other encoders. Second, in order to explore complementarities among the individual common spaces, we propose to train SEA by multi-space multi-loss learning. We exploit MSR-VTT and TGIF as training data. For video representation, we use pre-trained ResNet-152 and ResNeXt-101 to extract frame-level features, and C3D to extract segment-level features. Video-level features are obtained by mean pooling. Using SEA alone obtains a mean infAP of 0.236 for the 2020 task. Having SEA pre-trained on the Google’s Conceptual Captions dataset is helpful, obtaining a higher infAP of 0.251. We again find late average fusion of distinct models (consisting of SEA and W2VV++ trained in varied settings) beneficial, obtaining the best infAP of 0.269 among our four submissions, and ranked at the second place teamwise.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
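The TRECVID 2020 notebook paper above credits late average fusion of distinct models for its best run. A minimal sketch of that step is given below, assuming each model yields a query-by-video similarity matrix and that per-query min-max normalization is applied before averaging (a normalization choice not stated in the abstract).
import numpy as np

def late_average_fusion(score_matrices):
    """Average query-by-video similarity matrices from multiple models.

    Each matrix is min-max normalized per query before averaging so that no
    single model dominates. The normalization choice is an assumption.
    """
    fused = []
    for scores in score_matrices:            # scores: (num_queries, num_videos)
        s = np.asarray(scores, dtype=np.float64)
        s_min = s.min(axis=1, keepdims=True)
        s_max = s.max(axis=1, keepdims=True)
        fused.append((s - s_min) / (s_max - s_min + 1e-12))
    return np.mean(fused, axis=0)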
Jakub Lokoč; Tomáš Souček; Patrik Veselý; František Mejzlík; Jiaqi Ji; Chaoxi Xu; Xirong Li
A W2VV++ Case Study with Automated and Interactive Text-to-Video Retrieval Proceedings Article
In: ACM Multimedia, 2020.
@inproceedings{mm20-video-retrieval,
title = {A W2VV++ Case Study with Automated and Interactive Text-to-Video Retrieval},
author = {Jakub Lokoč and Tomáš Souček and Patrik Veselý and František Mejzlík and Jiaqi Ji and Chaoxi Xu and Xirong Li},
url = {http://lixirong.net/pub/mm2020-lokoc.pdf},
doi = {10.1145/3394171.3414002},
year = {2020},
date = {2020-10-12},
booktitle = {ACM Multimedia},
abstract = {As reported by respected evaluation campaigns focusing both on automated and interactive video search approaches, deep learning started to dominate the video retrieval area. However, the results are still not satisfactory for many types of search tasks focusing on high recall. To report on this challenging problem, we present two orthogonal task-based performance studies centered around the state-of-the-art W2VV++ query representation learning model for video retrieval. First, an ablation study is presented to investigate which components of the model are effective in two types of benchmark tasks focusing on high recall. Second, interactive search scenarios from the Video Browser Showdown are analyzed for two winning prototype systems providing additional querying and visualization components for the incorporated W2VV++ text search model. The analysis of collected logs demonstrates that even with state-of-the-art text search video retrieval models, it is still auspicious to integrate users into the search process for tasks, where high recall is essential.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Zhengxiong Jia; Xirong Li
iCap: Interactive Image Captioning with Predictive Text Proceedings Article
In: ACM International Conference on Multimedia Retrieval (ICMR), 2020.
@inproceedings{icmr20-icap,
title = {iCap: Interactive Image Captioning with Predictive Text},
author = {Zhengxiong Jia and Xirong Li},
url = {pub/icmr2020-icap.pdf},
doi = {10.1145/3372278.3390697},
year = {2020},
date = {2020-06-08},
booktitle = {ACM International Conference on Multimedia Retrieval (ICMR)},
abstract = {In this paper we study a brand new topic of interactive image captioning with human in the loop. Different from automated image captioning where a given test image is the sole input in the inference stage, we have access to both the test image and a sequence of (incomplete) user-input sentences in the interactive scenario. We formulate the problem as Visually Conditioned Sentence Completion (VCSC). For VCSC, we propose ABD-Cap, asynchronous bidirectional decoding for image caption completion. With ABD-Cap as the core module, we build iCap, a web-based interactive image captioning system capable of predicting new text with respect to live input from a user. A number of experiments covering both automated evaluations and real user studies show the viability of our proposals.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Yutong Liu; Jingyuan Yang; Yang Zhou; Weisen Wang; Jianchun Zhao; Weihong Yu; Dingding Zhang; Dayong Ding; Xirong Li; Youxin Chen
Prediction of OCT Images of Short-term Response to Anti-VEGF Treatment for Neovascular Age-related Macular Degeneration using Generative Adversarial Network Journal Article
In: British Journal of Ophthalmology, 2020.
@article{bjo20-ganoct,
title = {Prediction of OCT Images of Short-term Response to Anti-VEGF Treatment for Neovascular Age-related Macular Degeneration using Generative Adversarial Network},
author = {Yutong Liu and Jingyuan Yang and Yang Zhou and Weisen Wang and Jianchun Zhao and Weihong Yu and Dingding Zhang and Dayong Ding and Xirong Li and Youxin Chen},
doi = {10.1136/bjophthalmol-2019-315338},
year = {2020},
date = {2020-02-26},
journal = {British Journal of Ophthalmology},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2019
Xirong Li; Jinde Ye; Chaoxi Xu; Shanjinwen Yun; Leimin Zhang; Xun Wang; Rui Qian; Jianfeng Dong
Renmin University of China and Zhejiang Gongshang University at TRECVID 2019: Learn to Search and Describe Videos Proceedings Article
In: TRECVID 2019 Workshop, 2019.
@inproceedings{trecvid19-rucmm,
title = {Renmin University of China and Zhejiang Gongshang University at TRECVID 2019: Learn to Search and Describe Videos},
author = {Xirong Li and Jinde Ye and Chaoxi Xu and Shanjinwen Yun and Leimin Zhang and Xun Wang and Rui Qian and Jianfeng Dong},
url = {pub/trecvid2019-rucmm.pdf},
year = {2019},
date = {2019-11-12},
booktitle = {TRECVID 2019 Workshop},
abstract = {In this paper we summarize our TRECVID 2019 video retrieval experiments. We participated in two tasks: Ad-hoc Video Search (AVS) and Video-to-Text (VTT). For the AVS task, we develop our solutions based on two deep learning models, i.e. the W2VV++ network and the Dual Encoding Network. For the VTT Matching and Ranking subtask, our entry is also based on the W2VV++ and Dual Encoding Networks. For the VTT Description Generation subtask, we enhance the classical encoder-decoder model with multi-level video encoding and attribute prediction. The 2019 edition of the TRECVID benchmark has been a fruitful participation for our joint-team. Our runs are ranked at the second place for AVS and VTT Matching and Ranking tasks and the third place for the VTT Description Generation subtask in terms of the ciderD criterion.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Xirong Li; Chaoxi Xu; Gang Yang; Zhineng Chen; Jianfeng Dong
W2VV++: Fully Deep Learning for Ad-hoc Video Search Proceedings Article
In: ACM Multimedia, 2019.
@inproceedings{mm2019-w2vvpp,
title = {W2VV++: Fully Deep Learning for Ad-hoc Video Search},
author = {Xirong Li and Chaoxi Xu and Gang Yang and Zhineng Chen and Jianfeng Dong},
url = {pub/mm2019-w2vvpp.pdf},
doi = {10.1145/3343031.3350906},
year = {2019},
date = {2019-10-21},
booktitle = { ACM Multimedia},
abstract = {Ad-hoc video search (AVS) is an important yet challenging problem in multimedia retrieval. Different from previous concept-based methods, we propose an end-to-end deep learning method for query representation learning. The proposed method requires no concept modeling, matching and selection. The backbone of our method is the proposed W2VV++ model, a super version of Word2VisualVec (W2VV) previously developed for visual-to-text matching. W2VV++ is obtained by tweaking W2VV with a better sentence encoding strategy and an improved triplet ranking loss. With these simple changes, W2VV++ brings in a substantial improvement in performance. As our participation in the TRECVID 2018 AVS task and retrospective experiments on the TRECVID 2016 and 2017 data show, our best single model, with an overall inferred average precision (infAP) of 0.157, outperforms the state-of-the-art. The performance can be further boosted by model ensemble using late average fusion, reaching a higher infAP of 0.163. With W2VV++, we establish a new baseline for ad-hoc video search.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
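As a rough illustration of the improved triplet ranking loss mentioned in the abstract above, the following is a minimal Python sketch assuming a margin-based loss that, for each matching text-video pair in a mini-batch, penalizes only the hardest in-batch negative; the margin value and the use of cosine similarity are illustrative assumptions, not the paper's exact configuration.
import numpy as np

def cosine_sim(a, b):
    # pairwise cosine similarity between rows of a and rows of b
    a = a / np.linalg.norm(a, axis=1, keepdims=True)
    b = b / np.linalg.norm(b, axis=1, keepdims=True)
    return a @ b.T

def hardest_negative_triplet_loss(text_emb, video_emb, margin=0.2):
    # diagonal entries are the matching (positive) pairs of the mini-batch
    sim = cosine_sim(text_emb, video_emb)
    pos = np.diag(sim)
    off = ~np.eye(len(sim), dtype=bool)
    hardest_video = np.where(off, sim, -np.inf).max(axis=1)  # per text query
    hardest_text = np.where(off, sim, -np.inf).max(axis=0)   # per video
    loss = np.maximum(0.0, margin + hardest_video - pos) \
         + np.maximum(0.0, margin + hardest_text - pos)
    return float(loss.mean())

# toy usage: 4 matching text/video embedding pairs of dimension 8
rng = np.random.default_rng(0)
print(hardest_negative_triplet_loss(rng.normal(size=(4, 8)), rng.normal(size=(4, 8))))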
Chaoxi Xu; Xiangjia Zhu; Wenwen He; Yi Lu; Xixi He; Zongjiang Shang; Jun Wu; Keke Zhang; Yinglei Zhang; Xianfang Rong; Zhennan Zhao; Lei Cai; Dayong Ding; Xirong Li
Fully Deep Learning for Slit-lamp Photo based Nuclear Cataract Grading Proceedings Article
In: International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI), 2019, (early accept).
@inproceedings{miccai19-cataract,
title = {Fully Deep Learning for Slit-lamp Photo based Nuclear Cataract Grading},
author = {Chaoxi Xu and Xiangjia Zhu and Wenwen He and Yi Lu and Xixi He and Zongjiang Shang and Jun Wu and Keke Zhang and Yinglei Zhang and Xianfang Rong and Zhennan Zhao and Lei Cai and Dayong Ding and Xirong Li},
url = {pub/miccai2019-cataract.pdf},
doi = {10.1007/978-3-030-32251-9_56},
year = {2019},
date = {2019-10-13},
booktitle = {International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI)},
note = {early accept},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Weisen Wang; Zhiyan Xu; Weihong Yu; Jianchun Zhao; Jingyuan Yang; Feng He; Zhikun Yang; Di Chen; Dayong Ding; Youxin Chen; Xirong Li
Two-Stream CNN with Loose Pair Training for Multi-modal AMD Categorization Proceedings Article
In: International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI), 2019, (early accept).
@inproceedings{miccai19-amd,
title = {Two-Stream CNN with Loose Pair Training for Multi-modal AMD Categorization},
author = {Weisen Wang and Zhiyan Xu and Weihong Yu and Jianchun Zhao and Jingyuan Yang and Feng He and Zhikun Yang and Di Chen and Dayong Ding and Youxin Chen and Xirong Li},
url = {https://link.springer.com/chapter/10.1007%2F978-3-030-32239-7_18},
doi = {10.1007/978-3-030-32239-7_18},
year = {2019},
date = {2019-10-13},
booktitle = {International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI)},
note = {early accept},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Zhuoya Yang; Xirong Li; Xixi He; Dayong Ding; Yanting Wang; Fangfang Dai; Xuemin Jin
Joint Localization of Optic Disc and Fovea in Ultra-Widefield Fundus Images Proceedings Article
In: The 10th International Workshop on Machine Learning in Medical Imaging (MLMI), 2019, (in conjunction with MICCAI 2019).
@inproceedings{mlmi2019-uwf,
title = {Joint Localization of Optic Disc and Fovea in Ultra-Widefield Fundus Images},
author = {Zhuoya Yang and Xirong Li and Xixi He and Dayong Ding and Yanting Wang and Fangfang Dai and Xuemin Jin },
url = {http://lixirong.net/pub/mlmi2019-uwf.pdf},
doi = {10.1007/978-3-030-32692-0_52},
year = {2019},
date = {2019-10-13},
booktitle = {The 10th International Workshop on Machine Learning in Medical Imaging (MLMI)},
note = {in conjunction with MICCAI 2019},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Xirong Li; Chaoxi Xu; Xiaoxu Wang; Weiyu Lan; Zhengxiong Jia; Gang Yang; Jieping Xu
COCO-CN for Cross-Lingual Image Tagging, Captioning and Retrieval Journal Article
In: IEEE Transactions on Multimedia, vol. 21, no. 9, pp. 2347-2360, 2019.
@article{tmm2019-cococn,
title = {COCO-CN for Cross-Lingual Image Tagging, Captioning and Retrieval},
author = {Xirong Li and Chaoxi Xu and Xiaoxu Wang and Weiyu Lan and Zhengxiong Jia and Gang Yang and Jieping Xu},
url = {/pub/tmm2019-cococn.pdf},
doi = {10.1109/TMM.2019.2896494},
year = {2019},
date = {2019-09-01},
journal = {IEEE Transactions on Multimedia},
volume = {21},
number = {9},
pages = {2347-2360},
abstract = {This paper contributes to cross-lingual image annotation and retrieval in terms of data and baseline methods. We propose COCO-CN, a novel dataset enriching MS-COCO with manually written Chinese sentences and tags. For more effective annotation acquisition, we develop a recommendation-assisted collective annotation system, automatically providing an annotator with several tags and sentences deemed to be relevant with respect to the pictorial content. Having 20,342 images annotated with 27,218 Chinese sentences and 70,993 tags, COCO-CN is currently the largest Chinese-English dataset that provides a unified and challenging platform for cross-lingual image tagging, captioning and retrieval. We develop conceptually simple yet effective methods per task for learning from cross-lingual resources. Extensive experiments on the three tasks justify the viability of the proposed dataset and methods. Data and code are publicly available at https://github.com/li-xirong/coco-cn.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Jianfeng Dong; Xirong Li; Chaoxi Xu; Shouling Ji; Yuan He; Gang Yang; Xun Wang
Dual Encoding for Zero-Example Video Retrieval Proceedings Article
In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2019.
@inproceedings{cvpr19-zevr,
title = {Dual Encoding for Zero-Example Video Retrieval},
author = {Jianfeng Dong and Xirong Li and Chaoxi Xu and Shouling Ji and Yuan He and Gang Yang and Xun Wang},
url = {/pub/cvpr2019-dense-encoding.pdf},
year = {2019},
date = {2019-06-15},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
abstract = {This paper attacks the challenging problem of zero-example video retrieval. In such a retrieval paradigm, an end user searches for unlabeled videos by ad-hoc queries described in natural language text with no visual example provided. Given videos as sequences of frames and queries as sequences of words, an effective sequence-to-sequence cross-modal matching is required. The majority of existing methods are concept based, extracting relevant concepts from queries and videos and accordingly establishing associations between the two modalities. In contrast, this paper takes a concept-free approach, proposing a dual deep encoding network that encodes videos and queries into powerful dense representations of their own. Dual encoding is conceptually simple, practically effective and end-to-end. As experiments on three benchmarks, i.e. MSR-VTT, TRECVID 2016 and 2017 Ad-hoc Video Search show, the proposed solution establishes a new state-of-the-art for zero-example video retrieval.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
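To make the idea of encoding both videos and queries into dense multi-level representations more concrete, the following PyTorch sketch combines global mean pooling, a bidirectional GRU, and 1-D convolutions over the GRU outputs; the dimensions, kernel sizes, and pooling choices are illustrative assumptions rather than the network configuration used in the paper.
import torch
import torch.nn as nn

class MultiLevelEncoder(nn.Module):
    """Encode a sequence of frame (or word) features at three levels and
    concatenate them: global mean pooling, a bidirectional GRU, and 1-D
    convolutions with max pooling over the GRU outputs."""

    def __init__(self, in_dim=2048, hid=512, kernels=(2, 3, 4), n_filters=256):
        super().__init__()
        self.gru = nn.GRU(in_dim, hid, batch_first=True, bidirectional=True)
        self.convs = nn.ModuleList(
            [nn.Conv1d(2 * hid, n_filters, k, padding=k // 2) for k in kernels])

    def forward(self, x):                       # x: (batch, seq_len, in_dim)
        level1 = x.mean(dim=1)                  # global pooling
        h, _ = self.gru(x)                      # (batch, seq_len, 2*hid)
        level2 = h.mean(dim=1)                  # temporal context
        c = h.transpose(1, 2)                   # (batch, 2*hid, seq_len)
        level3 = torch.cat(
            [torch.relu(conv(c)).max(dim=2).values for conv in self.convs], dim=1)
        return torch.cat([level1, level2, level3], dim=1)

# toy usage: a batch of two "videos", each with 30 frame features of dimension 2048
encoder = MultiLevelEncoder()
print(encoder(torch.randn(2, 30, 2048)).shape)  # torch.Size([2, 3840])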
Xin Lai; Xirong Li; Rui Qian; Dayong Ding; Jun Wu; Jieping Xu
Four Models for Automatic Recognition of Left and Right Eye in Fundus Images Conference
the 25th International Conference on MultiMedia Modeling (MMM), 2019.
@conference{mmm2019-left-right-eye,
title = {Four Models for Automatic Recognition of Left and Right Eye in Fundus Images},
author = {Xin Lai and Xirong Li and Rui Qian and Dayong Ding and Jun Wu and Jieping Xu},
url = {http://lixirong.net/pub/mmm2019-left-right-eyes.pdf},
year = {2019},
date = {2019-01-08},
booktitle = {the 25th International Conference on MultiMedia Modeling (MMM)},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
蓝玮毓; 王晓旭; 杨刚; 李锡荣
标签增强的中文看图造句 (Improving Chinese Image Captioning by Tag Prediction) Journal Article
In: 计算机学报 (Chinese Journal of Computers), vol. 42, no. 1, pp. 136-148, 2019.
@article{cjc2019-imcap,
title = {标签增强的中文看图造句 (Improving Chinese Image Captioning by Tag Prediction)},
author = {蓝玮毓 and 王晓旭 and 杨刚 and 李锡荣},
url = {/pub/cjc2019-lwy.pdf},
doi = {10.11897/SP.J.1016.2019.00136},
year = {2019},
date = {2019-01-01},
journal = {计算机学报 (Chinese Journal of Computers)},
volume = {42},
number = {1},
pages = {136-148},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2018
Qijie Wei; Xirong Li; Hao Wang; Dayong Ding; Weihong Yu; Youxin Chen
Laser Scar Detection in Fundus Images using Convolutional Neural Networks Conference
Asian Conference on Computer Vision (ACCV), 2018.
@conference{accv2018-laser-scar,
title = {Laser Scar Detection in Fundus Images using Convolutional Neural Networks},
author = {Qijie Wei and Xirong Li and Hao Wang and Dayong Ding and Weihong Yu and Youxin Chen},
url = {/pub/accv2018-laser-scar-detection.pdf},
doi = {10.1007/978-3-030-20870-7_12},
year = {2018},
date = {2018-12-02},
booktitle = {Asian Conference on Computer Vision (ACCV)},
pages = {191--206},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Xirong Li; Jianfeng Dong; Chaoxi Xu; Jing Cao; Xun Wang; Gang Yang
Renmin University of China and Zhejiang Gongshang University at TRECVID 2018: Deep Cross-Modal Embeddings for Video-Text Retrieval Proceedings Article
In: TRECVID 2018 Workshop, 2018.
@inproceedings{trecvid2018-rucmm,
title = {Renmin University of China and Zhejiang Gongshang University at TRECVID 2018: Deep Cross-Modal Embeddings for Video-Text Retrieval},
author = {Xirong Li and Jianfeng Dong and Chaoxi Xu and Jing Cao and Xun Wang and Gang Yang},
url = {pub/trecvid2018-rucmm.pdf},
year = {2018},
date = {2018-11-13},
booktitle = {TRECVID 2018 Workshop},
abstract = {In this paper we summarize our TRECVID 2018 video retrieval experiments. We participated in two tasks: Ad-hoc Video Search (AVS) and Video-to-Text (VTT) Matching and Ranking. For the AVS task, we develop our solutions based on W2VV++, a super version of Word2VisualVec (W2VV). For the VTT task, our entry is built on the top of a recently proposed dual dense encoding network, which encodes an input, let it be a video or a natural language sentence, in a dense manner. The 2018 edition of the TRECVID benchmark has been a fruitful participation for our joint-team, resulting in the best overall result for both AVS and VTT tasks. Retrospective experiments show that our ad-hoc video search system, used as is, also outperforms the previous best results of the TRECVID 2016 and 2017 AVS tasks. We have released feature data at https://github.com/li-xirong/avs},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jianfeng Dong; Xirong Li; Chaoxi Xu; Gang Yang; Xun Wang
Feature Re-Learning with Data Augmentation for Content-based Video Recommendation Conference
ACM Multimedia, 2018, (Grand challenge paper).
@conference{mm2018-cbvrp,
title = {Feature Re-Learning with Data Augmentation for Content-based Video Recommendation},
author = {Jianfeng Dong and Xirong Li and Chaoxi Xu and Gang Yang and Xun Wang},
url = {/pub/mm2018-cbvr.pdf},
doi = {10.1145/3240508.3266441},
year = {2018},
date = {2018-10-23},
booktitle = {ACM Multimedia},
abstract = {This paper describes our solution for the Hulu Content-based Video Relevance Prediction Challenge. Noting the deficiency of the original features, we propose feature re-learning to improve video relevance prediction. To generate more training instances for supervised learning, we develop two data augmentation strategies, one for frame-level features and the other for video-level features. In addition, late fusion of multiple models is employed to further boost the performance. Evaluation conducted by the organizers shows that our best run outperforms the Hulu baseline, obtaining relative improvements of 26.2% and 30.2% on the TV-shows track and the Movies track, respectively, in terms of recall@100. The results clearly justify the effectiveness of the proposed solution.},
note = {Grand challenge paper},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Gang Yang; Jinlu Liu; Jieping Xu; Xirong Li
Dissimilarity Representation Learning for Generalized Zero-Shot Recognition Conference
ACM Multimedia, 2018.
@conference{mm2018-gzsl,
title = {Dissimilarity Representation Learning for Generalized Zero-Shot Recognition},
author = {Gang Yang and Jinlu Liu and Jieping Xu and Xirong Li},
year = {2018},
date = {2018-10-22},
booktitle = {ACM Multimedia},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Bin Liang; Hongcheng Li; Miaoqiang Su; Pan Bian; Xirong Li; Wenchang Shi
Deep Text Classification Can be Fooled Conference
IJCAI, 2018.
@conference{ijcai18-textfool,
title = {Deep Text Classification Can be Fooled},
author = {Bin Liang and Hongcheng Li and Miaoqiang Su and Pan Bian and Xirong Li and Wenchang Shi},
url = {/pub/ijcai2018-text-fool.pdf},
year = {2018},
date = {2018-07-13},
booktitle = {IJCAI},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Gang Yang; Jinlu Liu; Xirong Li
Imagination Based Sample Construction for Zero-Shot Learning Conference
SIGIR, 2018.
@conference{sigir18-zsl,
title = {Imagination Based Sample Construction for Zero-Shot Learning},
author = {Gang Yang and Jinlu Liu and Xirong Li},
url = {/pub/sigir2018-zsl.pdf},
year = {2018},
date = {2018-07-08},
booktitle = {SIGIR},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Jianfeng Dong; Xirong Li; Cees G. M. Snoek
Predicting Visual Features from Text for Image and Video Caption Retrieval Journal Article
In: IEEE Transactions on Multimedia (TMM), vol. 20, no. 12, pp. 3377-3388, 2018.
@article{tmm18-w2vv,
title = {Predicting Visual Features from Text for Image and Video Caption Retrieval},
author = {Jianfeng Dong and Xirong Li and Cees G. M. Snoek},
url = {pub/tmm2018-w2vv.pdf},
doi = {10.1109/TMM.2018.2832602},
year = {2018},
date = {2018-05-02},
journal = {IEEE Transactions on Multimedia (TMM)},
volume = {20},
number = {12},
pages = {3377-3388},
abstract = {This paper strives to find amidst a set of sentences the one best describing the content of a given image or video. Different from existing works, which rely on a joint subspace for their image and video caption retrieval, we propose to do so in a visual space exclusively. Apart from this conceptual novelty, we contribute Word2VisualVec, a deep neural network architecture that learns to predict a visual feature representation from textual input. Example captions are encoded into a textual embedding based on multi-scale sentence vectorization and further transferred into a deep visual feature of choice via a simple multi-layer perceptron. We further generalize Word2VisualVec for video caption retrieval, by predicting from text both 3-D convolutional neural network features as well as a visual-audio representation. Experiments on Flickr8k, Flickr30k, the Microsoft Video Description dataset and the very recent NIST TrecVid challenge for video caption retrieval detail Word2VisualVec’s properties, its benefit over textual embeddings, the potential for multimodal query composition and its state-of-the-art results.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
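A minimal PyTorch sketch of the core idea above: predict a visual feature from a (precomputed) sentence vector with a small multi-layer perceptron trained by mean-squared-error regression, so caption retrieval reduces to nearest-neighbor search in the visual space. The toy dimensions and single hidden layer are illustrative assumptions, and the paper's multi-scale sentence vectorization is treated as given.
import torch
import torch.nn as nn

class Word2VisualVecSketch(nn.Module):
    """Map a precomputed sentence vector into a visual feature space with a small MLP."""

    def __init__(self, text_dim=300, vis_dim=128, hidden=256):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(text_dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, vis_dim))

    def forward(self, sentence_vec):
        return self.mlp(sentence_vec)

# one toy training step: regress caption vectors onto their images' CNN features
model = Word2VisualVecSketch()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
captions = torch.randn(8, 300)   # 8 sentence vectors (toy dimensions)
images = torch.randn(8, 128)     # the matching visual features
loss = nn.functional.mse_loss(model(captions), images)
loss.backward()
optimizer.step()
print(float(loss))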
Jianfeng Dong; Xirong Li; Duanqing Xu
Cross-Media Similarity Evaluation for Web Image Retrieval in the Wild Journal Article
In: IEEE Transactions on Multimedia (TMM), vol. 20, no. 9, pp. 2371-2384, 2018.
@article{tmm2018-cross-media,
title = {Cross-Media Similarity Evaluation for Web Image Retrieval in the Wild},
author = {Jianfeng Dong and Xirong Li and Duanqing Xu},
url = {/pub/tmm2018-web-image-retrieval.pdf},
doi = {10.1109/TMM.2018.2796248},
year = {2018},
date = {2018-01-23},
journal = {IEEE Transactions on Multimedia (TMM)},
volume = {20},
number = {9},
pages = {2371-2384},
abstract = {In order to retrieve unlabeled images by textual queries, cross-media similarity computation is a key ingredient. Although novel methods are continuously introduced, little has been done to evaluate these methods together with large-scale query log analysis. Consequently, how far have these methods brought us in answering real-user queries is unclear. Given baseline methods that use relatively simple text/image matching, how much progress have advanced models made is also unclear. This paper takes a pragmatic approach to answering the two questions. Queries are automatically categorized according to the proposed query visualness measure, and later connected to the evaluation of multiple cross-media similarity models on three test sets. Such a connection reveals that the success of the state-of-the-art is mainly attributed to their good performance on visual-oriented queries, which account for only a small part of real-user queries. To quantify the current progress, we propose a simple text2image method, representing a novel query by a set of images selected from large-scale query log. Consequently, computing cross-media similarity between the query and a given image boils down to comparing the visual similarity between the given image and the selected images. Image retrieval experiments on the challenging Clickture dataset show that the proposed text2image is a strong baseline, comparing favorably to recent deep learning alternatives.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
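The text2image baseline described in the abstract above can be sketched as follows: a query is represented by images attached to similar logged queries, and a candidate image is scored by its average visual similarity to that image set. The word-overlap query matching and all names below are illustrative stand-ins, not the paper's actual pipeline.
import numpy as np

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))

def text2image_score(query, query_log, candidate_feat, top_m=50):
    """Score a candidate image for a textual query by its mean visual similarity
    to images attached to the top_m most similar logged queries.
    query_log: list of (logged_query_string, image_feature_vector) pairs."""
    q_words = set(query.lower().split())
    ranked = sorted(query_log,
                    key=lambda item: len(q_words & set(item[0].lower().split())),
                    reverse=True)[:top_m]
    if not ranked:
        return 0.0
    return float(np.mean([cosine(candidate_feat, feat) for _, feat in ranked]))

# toy usage with random 16-d visual features
rng = np.random.default_rng(1)
log = [("red car on road", rng.normal(size=16)),
       ("sports car", rng.normal(size=16)),
       ("cute cat", rng.normal(size=16))]
print(text2image_score("red sports car", log, rng.normal(size=16), top_m=2))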
2017
Cees G. M. Snoek; Xirong Li; Chaoxi Xu; Dennis C. Koelma
University of Amsterdam and Renmin University at TRECVID 2017: Searching Video, Detecting Events and Describing Video Conference
TRECVID Workshop, 2017.
@conference{tv2017-uvaruc,
title = {University of Amsterdam and Renmin University at TRECVID 2017: Searching Video, Detecting Events and Describing Video},
author = {Cees G. M. Snoek and Xirong Li and Chaoxi Xu and Dennis C. Koelma},
url = {/pub/mediamill-TRECVID2017-final.pdf},
year = {2017},
date = {2017-11-15},
booktitle = {TRECVID Workshop},
abstract = {In this paper we summarize our TRECVID 2017 [1] video recognition and retrieval experiments. We participated in three tasks: video search, event detection and video description. For both video search and event detection we explore semantic representations based on VideoStory [8] and an ImageNet Shuffle [16], which thrive well in few-example regimes. For the video description task we experiment with a deep network that predicts a visual representation from a natural language description with Word2VisualVec [5], and use this space for the sentence matching. For generative description we enhance a neural image captioning model with Early Embedding and Late Reranking [4]. The 2017 edition of the TRECVID benchmark has been a fruitful participation for our joint-team, resulting in the best overall result for video search and event detection as well as the runner-up position for video description.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Weiyu Lan; Xirong Li; Jianfeng Dong
Fluency-Guided Cross-Lingual Image Captioning Proceedings Article
In: ACM Multimedia, 2017.
@inproceedings{mm2017-fluency,
title = {Fluency-Guided Cross-Lingual Image Captioning},
author = {Weiyu Lan and Xirong Li and Jianfeng Dong},
url = {/pub/mm2017-weiyu.pdf},
doi = {10.1145/3123266.3123366},
year = {2017},
date = {2017-10-23},
booktitle = {ACM Multimedia},
abstract = {Image captioning has so far been explored mostly in English, as most available datasets are in this language. However, the application of image captioning should not be restricted by language. Only few studies have been conducted for image captioning in a cross-lingual setting. Different from these works that manually build a dataset for a target language, we aim to learn a cross-lingual captioning model fully from machine-translated sentences. To conquer the lack of fluency in the translated sentences, we propose in this paper a fluency-guided learning framework. The framework comprises a module to automatically estimate the fluency of the sentences and another module to utilize the estimated fluency scores to effectively train an image captioning model for the target language. As experiments on two bilingual (English-Chinese) datasets show, our approach improves both fluency and relevance of the generated captions in Chinese, but without using any manually written sentences from the target language.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Qijie Wei; Xiaoxu Wang; Xirong Li
Harvesting Deep Models for Cross-Lingual Image Annotation Conference
The 15th International Workshop on Content-Based Multimedia Indexing (CBMI), 2017.
@conference{cbmi2017-crossling,
title = {Harvesting Deep Models for Cross-Lingual Image Annotation},
author = {Qijie Wei and Xiaoxu Wang and Xirong Li},
url = {pub/cbmi2017_cross-lin.pdf},
doi = {10.1145/3095713.3095751},
year = {2017},
date = {2017-06-19},
booktitle = {The 15th International Workshop on Content-Based Multimedia Indexing (CBMI)},
abstract = {This paper considers cross-lingual image annotation, harvesting deep visual models from one language to annotate images with labels from another language. This task cannot be accomplished by machine translation, as labels can be ambiguous and a translated vocabulary leaves us limited freedom to annotate images with appropriate labels. Given non-overlapping vocabularies between two languages, we formulate cross-lingual image annotation as a zero-shot learning problem. For cross-lingual label matching, we adapt zero-shot by replacing the current monolingual semantic embedding space by a bilingual alternative. In order to reduce both label ambiguity and redundancy we propose a simple yet effective approach called label-enhanced zero-shot learning. Using three state-of-the-art deep visual models, i.e., ResNet-152, GoogleNet-Shuffle and OpenImages, experiments on the test set of Flickr8k-CN demonstrate the viability of the proposed approach for cross-lingual image annotation.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Xirong Li
Tag Relevance Fusion for Social Image Retrieval Journal Article
In: Multimedia Systems, vol. 23, no. 1, pp. 29–40, 2017, ISSN: 1432-1882.
@article{mmsy2014-tagrelfusion,
title = {Tag Relevance Fusion for Social Image Retrieval},
author = {Xirong Li},
url = {/pub/mmsy2014-tagrelfusion.pdf},
doi = {10.1007/s00530-014-0430-9},
issn = {1432-1882},
year = {2017},
date = {2017-02-01},
journal = {Multimedia Systems},
volume = {23},
number = {1},
pages = {29–40},
abstract = {Due to the subjective nature of social tagging, measuring the relevance of social tags with respect to the visual content is crucial for retrieving the increasing amounts of social-networked images. Witnessing the limit of a single measurement of tag relevance, we introduce in this paper tag relevance fusion as an extension to methods for tag relevance estimation. We present a systematic study, covering tag relevance fusion in early and late stages, and in supervised and unsupervised settings. Experiments on a large present-day benchmark set show that tag relevance fusion leads to better image retrieval. Moreover, unsupervised tag relevance fusion is found to be practically as effective as supervised tag relevance fusion, but without the need of any training efforts. This finding suggests the potential of tag relevance fusion for real-world deployment.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
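An unsupervised late-fusion variant of the idea above can be sketched in a few lines: rank-normalize the scores of several tag relevance estimators so their scales become comparable, then average them. The normalization and averaging choices here are illustrative assumptions.
import numpy as np

def rank_normalize(scores):
    # map scores to [0, 1] by rank, so estimators on different scales become comparable
    ranks = np.argsort(np.argsort(scores))
    return ranks / max(len(scores) - 1, 1)

def late_fuse(score_lists):
    # unsupervised late fusion: average the rank-normalized scores of all estimators
    return np.mean([rank_normalize(np.asarray(s, dtype=float)) for s in score_lists], axis=0)

# toy usage: three estimators scoring five images for the tag 'dog'
est_a = [0.9, 0.2, 0.4, 0.8, 0.1]
est_b = [12.0, 3.0, 7.0, 10.0, 1.0]   # a different scale is handled by rank normalization
est_c = [0.5, 0.1, 0.6, 0.7, 0.2]
fused = late_fuse([est_a, est_b, est_c])
print(np.argsort(-fused))             # image indices ranked by fused tag relevance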
2016
Cees G. M. Snoek; Jianfeng Dong; Xirong Li; Xiaoxu Wang; Qijie Wei; Weiyu Lan; Efstratios Gavves; Noureldien Hussein; Dennis C. Koelma; Arnold W. M. Smeulders
University of Amsterdam and Renmin University at TRECVID 2016: Searching Video, Detecting Events and Describing Video Conference
TRECVID Workshop, 2016.
@conference{tv2016-uvaruc,
title = {University of Amsterdam and Renmin University at TRECVID 2016: Searching Video, Detecting Events and Describing Video},
author = {Cees G. M. Snoek and Jianfeng Dong and Xirong Li and Xiaoxu Wang and Qijie Wei and Weiyu Lan and Efstratios Gavves and Noureldien Hussein and Dennis C. Koelma and Arnold W. M. Smeulders},
year = {2016},
date = {2016-11-01},
booktitle = {TRECVID Workshop},
abstract = {In this paper we summarize our TRECVID 2016 video recognition experiments. We participated in three tasks: video search, event detection and video description. Here we describe the tasks on event detection and video description. For event detection we explore semantic representations based on VideoStory and an ImageNet Shuffle for both zero-shot and few-example regimes. For the showcase task on video description we experiment with a deep network that predicts a visual representation from a natural language description, and use this space for the sentence matching. For generative description we enhance a neural image captioning model with Early Embedding and Late Reranking. The 2016 edition of the TRECVID benchmark has been a fruitful participation for our joint-team, resulting in the best overall result for zero- and few-example event detection as well as video description by matching and in generative mode.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Jianfeng Dong; Xirong Li; Weiyu Lan; Yujia Huo; Cees G. M. Snoek
Early Embedding and Late Reranking for Video Captioning Proceedings Article
In: ACM Multimedia, 2016, (Grand Challenge Award).
@inproceedings{mm2016-v2t,
title = {Early Embedding and Late Reranking for Video Captioning},
author = {Jianfeng Dong and Xirong Li and Weiyu Lan and Yujia Huo and Cees G. M. Snoek},
url = {http://lixirong.net/pub/mm2016-video2text.pdf},
doi = {10.1145/2964284.2984064},
year = {2016},
date = {2016-10-15},
urldate = {2016-10-15},
booktitle = {ACM Multimedia},
abstract = {This paper describes our solution for the MSR Video to Language Challenge. We start from the popular ConvNet + LSTM model, which we extend with two novel modules. One is early embedding, which enriches the current low-level input to LSTM by tag embeddings. The other is late reranking, for re-scoring generated sentences in terms of their relevance to a specific video. The modules are inspired by recent works on image captioning, repurposed and redesigned for video. As experiments on the MSR-VTT validation set show, the joint use of these two modules add a clear improvement over a non-trivial ConvNet + LSTM baseline under four performance metrics. The viability of the proposed solution is further confirmed by the blind test by the organizers. Our system is ranked at the 4th place in terms of overall performance, while scoring the best CIDEr-D, which measures the human-likeness of generated captions.},
note = {Grand Challenge Award},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Xirong Li; Yujia Huo; Qin Jin; Jieping Xu
Detecting Violence in Video using Subclasses Proceedings Article
In: ACM Multimedia, 2016.
@inproceedings{mm2016-videoviolence,
title = {Detecting Violence in Video using Subclasses},
author = {Xirong Li and Yujia Huo and Qin Jin and Jieping Xu},
url = {http://lixirong.net/pub/mm2016-vsd.pdf},
doi = {10.1145/2964284.2967289},
year = {2016},
date = {2016-10-01},
booktitle = {ACM Multimedia},
abstract = {This paper attacks the challenging problem of violence detection in videos. Different from existing works focusing on combining multi-modal features, we go one step further by adding and exploiting subclasses visually related to violence. We enrich the MediaEval 2015 violence dataset by manually labeling violence videos with respect to the subclasses. Such fine-grained annotations not only help understand what have impeded previous efforts on learning to fuse the multi-modal features, but also enhance the generalization ability of the learned fusion to novel test data. The new subclass based solution, with AP of 0.303 and P100 of 0.55 on the MediaEval 2015 test set, outperforms the state-of-the-art. Notice that our solution does not require fine-grained annotations on the test set, so it can be directly applied on novel and fully unlabeled videos. Interestingly, our study shows that motion related features (MBH, HOG and HOF), though being essential part in previous systems, are seemingly dispensable.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Xirong Li; Qin Jin
Improving Image Captioning by Concept-based Sentence Reranking Proceedings Article
In: The 17th Pacific-Rim Conference on Multimedia (PCM), pp. 231-240, 2016, (Best Paper Runner-up).
@inproceedings{pcm2016-sent-rerank,
title = {Improving Image Captioning by Concept-based Sentence Reranking},
author = {Xirong Li and Qin Jin},
url = {http://lixirong.net/pub/pcm2016-sent-rerank.pdf},
doi = {10.1007/978-3-319-48896-7_23},
year = {2016},
date = {2016-09-15},
booktitle = {The 17th Pacific-Rim Conference on Multimedia (PCM)},
pages = {231-240},
abstract = {This paper describes our winning entry in the ImageCLEF 2015 image sentence generation task. We improve Google’s CNN-LSTM model by introducing concept-based sentence reranking, a data-driven approach which exploits the large amounts of concept-level annotations on Flickr. Different from previous usage of concept detection that is tailored to specific image captioning models, the proposed approach reranks predicted sentences in terms of their matches with detected concepts, essentially treating the underlying model as a black box. This property makes the approach applicable to a number of existing solutions. We also experiment with fine-tuning the deep language model, which improves the performance further. Scoring METEOR of 0.1875 on the ImageCLEF 2015 test set, our system outperforms the runner-up (METEOR of 0.1687) with a clear margin.},
note = {Best Paper Runner-up},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Masoud Mazloom; Xirong Li; Cees G. M. Snoek
TagBook: A Semantic Video Representation Without Supervision for Event Detection Journal Article
In: IEEE Transactions on Multimedia (TMM), vol. 18, no. 7, pp. 1378-1388, 2016.
@article{tmm2016-tagbook,
title = {TagBook: A Semantic Video Representation Without Supervision for Event Detection},
author = {Masoud Mazloom and Xirong Li and Cees G. M. Snoek},
url = {http://lixirong.net/pub/tmm2016-tagbook.pdf},
year = {2016},
date = {2016-07-01},
journal = {IEEE Transactions on Multimedia (TMM)},
volume = {18},
number = {7},
pages = {1378-1388},
abstract = {We consider the problem of event detection in video for scenarios where only a few, or even zero, examples are available for training. For this challenging setting, the prevailing solutions in the literature rely on a semantic video representation obtained from thousands of pretrained concept detectors. Different from existing work, we propose a new semantic video representation that is based on freely available social tagged videos only, without the need for training any intermediate concept detectors. We introduce a simple algorithm that propagates tags from a video's nearest neighbors, similar in spirit to the ones used for image retrieval, but redesign it for video event detection by including video source set refinement and varying the video tag assignment. We call our approach TagBook and study its construction, descriptiveness, and detection performance on the TRECVID 2013 and 2014 multimedia event detection datasets and the Columbia Consumer Video dataset. Despite its simple nature, the proposed TagBook video representation is remarkably effective for few-example and zero-example event detection, even outperforming very recent state-of-the-art alternatives building on supervised representations.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
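A minimal sketch of the tag propagation described above: tags of the k visually closest socially tagged videos are accumulated, weighted by similarity, into a vector over a tag vocabulary. The weighting and normalization choices are illustrative and omit the paper's source-set refinement.
import numpy as np

def tagbook(test_feat, neighbor_feats, neighbor_tags, vocab, k=3):
    """Propagate tags from the k visually nearest socially tagged videos,
    weighted by cosine similarity, into a vector over the tag vocabulary."""
    f = test_feat / np.linalg.norm(test_feat)
    nf = neighbor_feats / np.linalg.norm(neighbor_feats, axis=1, keepdims=True)
    sims = np.maximum(nf @ f, 0.0)            # keep only non-negative similarities
    rep = np.zeros(len(vocab))
    index = {tag: i for i, tag in enumerate(vocab)}
    for n in np.argsort(-sims)[:k]:
        for tag in neighbor_tags[n]:
            if tag in index:
                rep[index[tag]] += sims[n]
    return rep / (rep.sum() + 1e-12)

# toy usage
rng = np.random.default_rng(2)
vocab = ["birthday", "party", "dog", "parade"]
feats = rng.normal(size=(5, 32))
tags = [{"birthday", "party"}, {"dog"}, {"parade"}, {"party"}, {"dog", "parade"}]
print(tagbook(rng.normal(size=32), feats, tags, vocab, k=2))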
Xirong Li; Weiyu Lan; Jianfeng Dong; Hailong Liu
Adding Chinese Captions to Images Proceedings Article
In: Proceedings of the 2016 ACM on International Conference on Multimedia Retrieval (ICMR), pp. 271–275, 2016.
@inproceedings{icmr2016-chisent,
title = {Adding Chinese Captions to Images},
author = { Xirong Li and Weiyu Lan and Jianfeng Dong and Hailong Liu},
url = {http://lixirong.net/pub/icmr2016_chisent.pdf},
doi = {10.1145/2911996.2912049},
year = {2016},
date = {2016-06-09},
booktitle = {Proceedings of the 2016 ACM on International Conference on Multimedia Retrieval (ICMR)},
pages = {271--275},
abstract = {This paper extends research on automated image captioning in the dimension of language, studying how to generate Chinese sentence descriptions for unlabeled images. To evaluate image captioning in this novel context, we present Flickr8k-CN, a bilingual extension of the popular Flickr8k set. The new multimedia dataset can be used to quantitatively assess the performance of Chinese captioning and English-Chinese machine translation. The possibility of re-using existing English data and models via machine translation is investigated. Our study reveals to some extent that a computer can master two distinct languages, English and Chinese, at a similar level for describing the visual world. Data is publicly available at http://tinyurl.com/flickr8kcn},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Xirong Li; Tiberio Uricchio; Lamberto Ballan; Marco Bertini; Cees G. M. Snoek; Alberto Del Bimbo
Socializing the Semantic Gap: A Comparative Survey on Image Tag Assignment, Refinement, and Retrieval Journal Article
In: ACM Computing Surveys (CSUR), vol. 49, no. 1, pp. 14:1-14:39, 2016.
@article{csur2016-tagsurvey,
title = {Socializing the Semantic Gap: A Comparative Survey on Image Tag Assignment, Refinement, and Retrieval},
author = {Xirong Li and Tiberio Uricchio and Lamberto Ballan and Marco Bertini and Cees G. M. Snoek and Alberto Del Bimbo},
url = {http://lixirong.net/pub/csur2016-tagsurvey.pdf},
doi = {10.1145/2906152},
year = {2016},
date = {2016-06-01},
journal = {ACM Computing Surveys (CSUR)},
volume = {49},
number = {1},
pages = {14:1-14:39},
abstract = {Where previous reviews on content-based image retrieval emphasize what can be seen in an image to bridge the semantic gap, this survey considers what people tag about an image. A comprehensive treatise of three closely linked problems (i.e., image tag assignment, refinement, and tag-based image retrieval) is presented. While existing works vary in terms of their targeted tasks and methodology, they rely on the key functionality of tag relevance, that is, estimating the relevance of a specific tag with respect to the visual content of a given image and its social context. By analyzing what information a specific method exploits to construct its tag relevance function and how such information is exploited, this article introduces a two-dimensional taxonomy to structure the growing literature, understand the ingredients of the main works, clarify their connections and difference, and recognize their merits and limitations. For a head-to-head comparison with the state of the art, a new experimental protocol is presented, with training sets containing 10,000, 100,000, and 1 million images, and an evaluation on three test sets, contributed by various research groups. Eleven representative works are implemented and evaluated. Putting all this together, the survey aims to provide an overview of the past and foster progress for the near future.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2015
Xirong Li; Tiberio Uricchio; Lamberto Ballan; Marco Bertini; Cees G. M. Snoek; Alberto Del Bimbo
Image Tag Assignment, Refinement and Retrieval Proceedings Article
In: Proceedings of the 23rd Annual ACM Conference on Multimedia Conference (ACMMM), pp. 1325–1326, 2015.
@inproceedings{mm2015-tut,
title = {Image Tag Assignment, Refinement and Retrieval},
author = {Xirong Li and Tiberio Uricchio and Lamberto Ballan and Marco Bertini and Cees G. M. Snoek and Alberto Del Bimbo},
url = {/pub/mm2015-tutorial.pdf},
doi = {10.1145/2733373.2807419},
year = {2015},
date = {2015-10-26},
booktitle = {Proceedings of the 23rd Annual ACM Conference on Multimedia Conference (ACMMM)},
pages = {1325--1326},
abstract = {This tutorial focuses on challenges and solutions for content-based image annotation and retrieval in the context of online image sharing and tagging. We present a unified review on three closely linked problems, i.e., tag assignment, tag refinement, and tag-based image retrieval. We introduce a taxonomy to structure the growing literature, understand the ingredients of the main works, clarify their connections and difference, and recognize their merits and limitations. Moreover, we present an open-source testbed, with training sets of varying sizes and three test datasets, to evaluate methods of varied learning complexity. A selected set of eleven representative works have been implemented and evaluated. During the tutorial we provide a practice session for hands on experience with the methods, software and datasets. For repeatable experiments all data and code are online at http://www.micc.unifi.it/tagsurvey},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jianfeng Dong; Xirong Li; Shuai Liao; Jieping Xu; Duanqing Xu; Xiaoyong Du
Image Retrieval by Cross-Media Relevance Fusion Proceedings Article
In: ACM Multimedia, pp. 173–176, 2015.
@inproceedings{mm2015-cmrf,
title = {Image Retrieval by Cross-Media Relevance Fusion},
author = { Jianfeng Dong and Xirong Li and Shuai Liao and Jieping Xu and Duanqing Xu and Xiaoyong Du},
url = {/pub/mm2015-cmrf.pdf},
doi = {10.1145/2733373.2809929},
year = {2015},
date = {2015-10-20},
booktitle = {ACM Multimedia},
pages = {173--176},
abstract = {How to estimate cross-media relevance between a given query and an unlabeled image is a key question in the MSR-Bing Image Retrieval Challenge. We answer the question by proposing cross-media relevance fusion, a conceptually simple framework that exploits the power of individual methods for cross-media relevance estimation. Four base cross-media relevance functions are investigated, and later combined by weights optimized on the development set. With NDCG25 of 0.5200 on the test dataset, the proposed image retrieval system secures the first place in the evaluation.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Qin Jin; Xirong Li; Haibing Cao; Yujia Huo; Shuai Liao; Gang Yang; Jieping Xu
RUCMM at MediaEval 2015 Affective Impact of Movies Task: Fusion of Audio and Visual Cues Proceedings Article
In: Working Notes Proceedings of the MediaEval 2015 Workshop, 2015.
@inproceedings{mediaeval2015-ruc,
title = {RUCMM at MediaEval 2015 Affective Impact of Movies Task: Fusion of Audio and Visual Cues},
author = {Qin Jin and Xirong Li and Haibing Cao and Yujia Huo and Shuai Liao and Gang Yang and Jieping Xu},
url = {http://ceur-ws.org/Vol-1436/Paper26.pdf},
year = {2015},
date = {2015-09-14},
booktitle = {Working Notes Proceedings of the MediaEval 2015 Workshop},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Xirong Li; Qin Jin; Shuai Liao; Junwei Liang; Xixi He; Yujia Huo; Weiyu Lan; Bin Xiao; Yanxiong Lu; Jieping Xu
RUC-Tencent at ImageCLEF 2015: Concept Detection, Localization and Sentence Generation Technical Report
2015.
@techreport{imageclef2015-ruc,
title = {RUC-Tencent at ImageCLEF 2015: Concept Detection, Localization and Sentence Generation},
author = {Xirong Li and Qin Jin and Shuai Liao and Junwei Liang and Xixi He and Yujia Huo and Weiyu Lan and Bin Xiao and Yanxiong Lu and Jieping Xu},
url = {http://lixirong.net/pub/imageclef2015-ruc.pdf},
year = {2015},
date = {2015-09-08},
booktitle = {CLEF (Working Notes)},
abstract = {In this paper we summarize our experiments in the ImageCLEF 2015 Scalable Concept Image Annotation challenge. The RUC-Tencent team participated in all subtasks: concept detection and localization, and image sentence generation. For concept detection, we experiment with automated approaches to gather high-quality training examples from the Web, in particular, visual disambiguation by Hierarchical Semantic Embedding. Per concept, an ensemble of linear SVMs is trained by Negative Bootstrap, with CNN features as image representation. Concept localization is achieved by classifying object proposals generated by Selective Search. For the sentence generation task, we adopt Google’s LSTM-RNN model, train it on the MSCOCO dataset, and fine-tune it on the ImageCLEF 2015 development dataset. We further develop a sentence re-ranking strategy based on the concept detection information from the first task. Overall, our system is ranked the 3rd for concept detection and localization, and is the best for image sentence generation in both clean and noisy tracks.},
keywords = {},
pubstate = {published},
tppubtype = {techreport}
}
Xirong Li; Shuai Liao; Weiyu Lan; Xiaoyong Du; Gang Yang
Zero-shot Image Tagging by Hierarchical Semantic Embedding Proceedings Article
In: SIGIR, 2015.
@inproceedings{sigir2015-hierse,
title = {Zero-shot Image Tagging by Hierarchical Semantic Embedding},
author = {Xirong Li and Shuai Liao and Weiyu Lan and Xiaoyong Du and Gang Yang},
url = {http://lixirong.net/pub/sigir2015-hierse.pdf},
doi = {10.1145/2766462.2767773},
year = {2015},
date = {2015-08-09},
booktitle = {SIGIR},
abstract = {Given the difficulty of acquiring labeled examples for many fine-grained visual classes, there is an increasing interest in zero-shot image tagging, aiming to tag images with novel labels that have no training examples present. Using a semantic space trained by a neural language model, the current state-of-the-art embeds both images and labels into the space, wherein cross-media similarity is computed. However, for labels of relatively low occurrence, its similarity to images and other labels can be unreliable. This paper proposes Hierarchical Semantic Embedding (HierSE), a simple model that exploits the WordNet hierarchy to improve label embedding and consequently image embedding. Moreover, we identify two good tricks, namely training the neural language model using Flickr tags instead of web documents, and using partial match instead of full match for vectorizing a WordNet node. All this lets us outperform the state-of-the-art. On a test set of over 1,500 visual object classes and 1.3 million images, the proposed model beats the current best results (18.3% versus 9.4% in hit@1).},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
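A minimal sketch of hierarchy-aware label embedding, assuming the WordNet ancestor chain of a label is available: the label is embedded as a decayed weighted average of its own word vector and those of its ancestors. The decay scheme and normalization are illustrative assumptions, not the exact HierSE formulation.
import numpy as np

def hierarchical_label_embedding(label, ancestors, word_vecs, decay=0.5):
    """Embed a label as a weight-decayed average of its own word vector and the
    vectors of its (WordNet-style) ancestors; ancestor extraction is assumed
    to be done elsewhere (e.g., with a WordNet toolkit)."""
    vecs, weights = [], []
    for depth, node in enumerate([label] + list(ancestors)):
        if node in word_vecs:                  # skip nodes without an embedding
            vecs.append(word_vecs[node])
            weights.append(decay ** depth)
    if not vecs:
        raise KeyError(f"no embedding found for {label!r} or its ancestors")
    emb = np.average(np.stack(vecs), axis=0, weights=weights)
    return emb / np.linalg.norm(emb)

# toy usage with random 8-d stand-ins for word2vec vectors
rng = np.random.default_rng(3)
word_vecs = {w: rng.normal(size=8) for w in ["husky", "dog", "animal"]}
print(hierarchical_label_embedding("husky", ["dog", "animal"], word_vecs))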
Shuai Liao; Xirong Li; Heng Tao Shen; Yang Yang; Xiaoyong Du
Tag Features for Geo-Aware Image Classification Journal Article
In: IEEE Transactions on Multimedia (TMM), vol. 17, no. 7, pp. 1058-1067, 2015.
@article{tmm2015-geotagfeat,
title = { Tag Features for Geo-Aware Image Classification},
author = {Shuai Liao and Xirong Li and Heng Tao Shen and Yang Yang and Xiaoyong Du},
url = {pub/tmm2015_geotags.pdf},
year = {2015},
date = {2015-07-01},
journal = {IEEE Transactions on Multimedia (TMM)},
volume = {17},
number = {7},
pages = {1058-1067},
abstract = {As geo tags recording where a picture was taken are becoming part of image metadata, studying image classification approaches that can favorably exploit geo tags and the underlying geo context is an emerging topic. This paper contributes to geo-aware image classification by studying how to encode geo information into image representation. Given a geo-tagged image, we propose to extract geo-aware tag features by tag propagation from the geo and visual neighbors of the given image. Depending on what neighbors are used and how they are weighted, we present and compare eight variants of geo-aware tag features. Using millions of Flickr images as source data for tag feature extraction, experiments on a popular benchmark set justify the effectiveness and robustness of the proposed tag features for geo-aware image classification.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Junwei Liang; Qin Jin; Xixi He; Gang Yang; Jieping Xu; Xirong Li
Detecting semantic concepts in consumer videos using audio Proceedings Article
In: International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2279–2283, 2015.
@inproceedings{icassp2015-dsc-audio,
title = {Detecting semantic concepts in consumer videos using audio},
author = {Junwei Liang and Qin Jin and Xixi He and Gang Yang and Jieping Xu and Xirong Li},
url = {http://dx.doi.org/10.1109/ICASSP.2015.7178377},
doi = {10.1109/ICASSP.2015.7178377},
year = {2015},
date = {2015-04-19},
booktitle = {International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages = {2279--2283},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Svetlana Kordumova; Xirong Li; Cees G.M. Snoek
Best Practices for Learning Video Concept Detectors from Social Media Examples Journal Article
In: Multimedia Tools and Applications (MTAP), vol. 74, no. 4, pp. 1291–1315, 2015, ISSN: 1380-7501.
@article{mtap2015-video,
title = {Best Practices for Learning Video Concept Detectors from Social Media Examples},
author = { Svetlana Kordumova and Xirong Li and Cees G.M. Snoek},
url = {http://lixirong.net/pub/mtap2015-video.pdf},
doi = {10.1007/s11042-014-2056-5},
issn = {1380-7501},
year = {2015},
date = {2015-02-01},
journal = {Multimedia Tools and Applications (MTAP)},
volume = {74},
number = {4},
pages = {1291--1315},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2014
Xixi He; Xirong Li; Gang Yang; Jieping Xu; Qin Jin
Adaptive Tag Selection for Image Annotation Proceedings Article
In: Proceedings of the 15th Pacific-Rim Conference on Multimedia (PCM), pp. 11–21, 2014.
@inproceedings{pcm2014-tagsel,
title = {Adaptive Tag Selection for Image Annotation},
author = {Xixi He and Xirong Li and Gang Yang and Jieping Xu and Qin Jin},
url = {/pub/pcm2014-tagsel.pdf},
doi = {10.1007/978-3-319-13168-9_2},
year = {2014},
date = {2014-12-01},
booktitle = {Proceedings of the 15th Pacific-Rim Conference on Multimedia (PCM)},
pages = {11--21},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Xirong Li; Xixi He; Gang Yang; Qin Jin; Jieping Xu
Renmin University of China at ImageCLEF 2014 Scalable Concept Image Annotation Proceedings Article
In: CLEF (Working Notes), pp. 380–385, 2014.
@inproceedings{imageclef2014-ruc,
title = {Renmin University of China at ImageCLEF 2014 Scalable Concept Image Annotation},
author = {Xirong Li and Xixi He and Gang Yang and Qin Jin and Jieping Xu},
url = {http://ceur-ws.org/Vol-1180/CLEF2014wn-Image-LiEt2014.pdf},
year = {2014},
date = {2014-09-15},
booktitle = {CLEF (Working Notes)},
pages = {380--385},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jieping Xu; Xirong Li; Yun Hao; Gang Yang
Source Separation Improves Music Emotion Recognition Proceedings Article
In: ACM International Conference on Multimedia Retrieval (ICMR), pp. 423, 2014.
@inproceedings{icmr2014-music,
title = {Source Separation Improves Music Emotion Recognition},
author = {Jieping Xu and Xirong Li and Yun Hao and Gang Yang},
url = {/pub/icmr2014-music.pdf},
doi = {10.1145/2578726.2578784},
year = {2014},
date = {2014-04-01},
booktitle = {ACM International Conference on Multimedia Retrieval (ICMR)},
pages = {423},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Masoud Mazloom; Xirong Li; Cees G. M. Snoek
Few-Example Video Event Retrieval using Tag Propagation Proceedings Article
In: ACM International Conference on Multimedia Retrieval (ICMR), pp. 459, 2014.
@inproceedings{icmr2014-video,
title = {Few-Example Video Event Retrieval using Tag Propagation},
author = {Masoud Mazloom and Xirong Li and Cees G. M. Snoek},
url = {/pub/icmr2014-video.pdf},
doi = {10.1145/2578726.2578793},
year = {2014},
date = {2014-01-01},
booktitle = {ACM International Conference on Multimedia Retrieval (ICMR)},
pages = {459},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2013
Shuai Liao; Xirong Li; Xiaoyong Du
Cross-Codebook Image Classification Proceedings Article
In: The 14th Pacific-Rim Conference on Multimedia (PCM), pp. 497–504, 2013.
@inproceedings{pcm2013-crosscodebook,
title = {Cross-Codebook Image Classification},
author = {Shuai Liao and Xirong Li and Xiaoyong Du},
url = {/pub/pcm2013-crosscodebook.pdf},
doi = {10.1007/978-3-319-03731-8_46},
year = {2013},
date = {2013-10-25},
booktitle = {The 14th Pacific-Rim Conference on Multimedia (PCM)},
pages = {497--504},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Xirong Li; Cees G. M. Snoek
Classifying tag relevance with relevant positive and negative examples Proceedings Article
In: ACM International Conference on Multimedia (ACMMM), pp. 485–488, 2013.
@inproceedings{mm2013-tagrel,
title = {Classifying tag relevance with relevant positive and negative examples},
author = {Xirong Li and Cees G. M. Snoek},
url = {/pub/mm2013-tagrel.pdf},
doi = {10.1145/2502081.2502129},
year = {2013},
date = {2013-10-21},
booktitle = {ACM International Conference on Multimedia (ACMMM)},
pages = {485--488},
abstract = {Image tag relevance estimation aims to automatically determine what people label about images is factually present in the pictorial content. Different from previous works, which either use only positive examples of a given tag or use positive and random negative examples, we argue the importance of relevant positive and relevant negative examples for tag relevance estimation. We propose a system that selects positive and negative examples, deemed most relevant with respect to the given tag from crowd-annotated images. While applying models for many tags could be cumbersome, our system trains efficient ensembles of Support Vector Machines per tag, enabling fast classification. Experiments on two benchmark sets show that the proposed system compares favorably against five present day methods. Given extracted visual features, for each image our system can process up to 3,787 tags per second. The new system is both effective and efficient for tag relevance estimation.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Svetlana Kordumova; Xirong Li; Cees G. M. Snoek
Evaluating Sources and Strategies for Learning Video Concepts from Social Media Proceedings Article
In: the 11th International Workshop on Content-Based Multimedia Indexing (CBMI), pp. 91–96, 2013.
@inproceedings{cbmi2013-socialmedia,
title = {Evaluating Sources and Strategies for Learning Video Concepts from Social Media},
author = {Svetlana Kordumova and Xirong Li and Cees G. M. Snoek},
url = {http://lixirong.net/pub/cbmi2013-socialmedia.pdf},
doi = {10.1109/CBMI.2013.6576561},
year = {2013},
date = {2013-06-17},
booktitle = {the 11th International Workshop on Content-Based Multimedia Indexing (CBMI)},
pages = {91--96},
abstract = {Learning video concept detectors from social media sources, such as Flickr images and YouTube videos, has the potential to address a wide variety of concept queries for video search. While the potential has been recognized by many, and progress on the topic has been impressive, we argue that two key questions, i.e., What visual tagging source is most suited for selecting positive training examples to learn video concepts? and What strategy should be used for selecting positive examples from tagged sources?, remain open. As an initial attempt to answer the two questions, we conduct an experimental study using a video search engine which is capable of learning concept detectors from social media, be it socially tagged videos or socially tagged images. Within the video search engine we investigate six strategies of positive examples selection. The performance is evaluated on the challenging TRECVID benchmark 2011 with 400 hours of Internet videos. The new experiments lead to novel and nontrivial findings: (1) tagged images are a better source for learning video concepts from the web, (2) selecting tag relevant examples as positives for learning video concepts is always beneficial and it can be done automatically and (3) the best source and strategy compare favorably against several present-day methods.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Xirong Li; Cees G. M. Snoek; Marcel Worring; Dennis Koelma; Arnold W. M. Smeulders
Bootstrapping Visual Categorization With Relevant Negatives Journal Article
In: IEEE Transactions on Multimedia (TMM), vol. 15, no. 4, pp. 933-945, 2013.
@article{tmm2013-relneg,
title = {Bootstrapping Visual Categorization With Relevant Negatives},
author = {Xirong Li and Cees G. M. Snoek and Marcel Worring and Dennis Koelma and Arnold W. M. Smeulders},
url = {/pub/tmm2013-relneg.pdf},
doi = {10.1109/TMM.2013.2238523},
year = {2013},
date = {2013-06-01},
journal = {IEEE Transactions on Multimedia (TMM)},
volume = {15},
number = {4},
pages = {933-945},
abstract = {Learning classifiers for many visual concepts is important for image categorization and retrieval. As a classifier tends to misclassify negative examples which are visually similar to positive ones, inclusion of such misclassified and thus relevant negatives should be stressed during learning. User-tagged images are abundant online, but which images are the relevant negatives remains unclear. Sampling negatives at random is the de facto standard in the literature. In this paper we go beyond random sampling by proposing Negative Bootstrap. Given a visual concept and a few positive examples, the new algorithm iteratively finds relevant negatives. Per iteration we learn from a small proportion of many user-tagged images, yielding an ensemble of meta classifiers. For efficient classification, we introduce Model Compression such that the classification time is independent of the ensemble size. Compared to the state of the art, we obtain relative gains of 14% and 18% on two present-day benchmarks in terms of mean average precision. For concept search in one million images, model compression reduces the search time from over 20 hours to approximately 6 minutes. The effectiveness and efficiency, without the need of manually labeling any negatives, make negative bootstrap appealing for learning better visual concept classifiers.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
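The iterative selection of relevant negatives described above can be sketched with scikit-learn: per iteration, sample candidate negatives, keep those the current ensemble scores highest (the most likely misclassifications), train one more linear classifier, and add it to the ensemble. The sampling sizes, the classifier choice, and the omission of model compression are illustrative simplifications.
import numpy as np
from sklearn.svm import LinearSVC

def negative_bootstrap(positives, candidate_negatives, iterations=5,
                       sample_size=200, n_select=50, seed=0):
    """Iteratively pick 'relevant negatives' (candidates the current ensemble
    scores highest) and train one linear SVM per iteration."""
    rng = np.random.default_rng(seed)
    ensemble = []

    def score(x):
        if not ensemble:                       # first round: random selection
            return rng.normal(size=len(x))
        return np.mean([clf.decision_function(x) for clf in ensemble], axis=0)

    for _ in range(iterations):
        idx = rng.choice(len(candidate_negatives), size=sample_size, replace=False)
        sample = candidate_negatives[idx]
        hard = sample[np.argsort(-score(sample))[:n_select]]   # most violating negatives
        X = np.vstack([positives, hard])
        y = np.concatenate([np.ones(len(positives)), -np.ones(len(hard))])
        ensemble.append(LinearSVC(C=1.0).fit(X, y))
    return ensemble

# toy usage: positives around (2, 2), a large pool of candidate negatives around the origin
rng = np.random.default_rng(4)
positives = rng.normal(loc=2.0, size=(30, 2))
negative_pool = rng.normal(size=(1000, 2))
clfs = negative_bootstrap(positives, negative_pool)
test = np.array([[2.0, 2.0], [-2.0, -2.0]])
print(np.mean([c.decision_function(test) for c in clfs], axis=0))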
Xirong Li; Shuai Liao; Binbin Liu; Gang Yang; Qin Jin; Jieping Xu; Xiaoyong Du
Renmin University of China at ImageCLEF 2013 Scalable Concept Image Annotation Proceedings Article
In: CLEF (Working Notes), 2013.
@inproceedings{imageclef2013-ruc,
title = {Renmin University of China at ImageCLEF 2013 Scalable Concept Image Annotation},
author = {Xirong Li and Shuai Liao and Binbin Liu and Gang Yang and Qin Jin and Jieping Xu and Xiaoyong Du},
url = {http://ceur-ws.org/Vol-1179/CLEF2013wn-ImageCLEF-LiEt2013.pdf},
year = {2013},
date = {2013-01-01},
booktitle = {CLEF (Working Notes)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2012
Xirong Li; Cees G. M. Snoek; Marcel Worring; Arnold W. M. Smeulders
Harvesting Social Images for Bi-Concept Search Journal Article
In: IEEE Transactions on Multimedia (TMM), vol. 14, no. 4, pp. 1091–1104, 2012.
@article{tmm2012-biconcept,
title = {Harvesting Social Images for Bi-Concept Search},
author = {Xirong Li and Cees G. M. Snoek and Marcel Worring and Arnold W. M. Smeulders},
url = {/pub/tmm2012-biconcept.pdf},
doi = {10.1109/TMM.2012.2191943},
year = {2012},
date = {2012-08-01},
journal = {IEEE Transactions on Multimedia (TMM)},
volume = {14},
number = {4},
pages = {1091--1104},
abstract = {Searching for the co-occurrence of two visual concepts in unlabeled images is an important step towards answering complex user queries. Traditional visual search methods use combinations of the confidence scores of individual concept detectors to tackle such queries. In this paper we introduce the notion of bi-concepts, a new concept-based retrieval method that is directly learned from social-tagged images. As the number of potential bi-concepts is gigantic, manually collecting training examples is infeasible. Instead, we propose a multimedia framework to collect de-noised positive as well as informative negative training examples from the social web, to learn bi-concept detectors from these examples, and to apply them in a search engine for retrieving bi-concepts in unlabeled images. We study the behavior of our bi-concept search engine using 1.2 M social-tagged images as a data source. Our experiments indicate that harvesting examples for bi-concepts differs from traditional single-concept methods, yet the examples can be collected with high accuracy using a multi-modal approach. We find that directly learning bi-concepts is better than oracle linear fusion of single-concept detectors, with a relative improvement of 100%. This study reveals the potential of learning high-order semantics from social images, for free, suggesting promising new lines of research.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Xirong Li; Cees G. M. Snoek; Marcel Worring; Arnold W. M. Smeulders
Fusing Concept Detection and Geo Context for Visual Search Proceedings Article
In: ACM International Conference on Multimedia Retrieval (ICMR), pp. 4:1–4:8, 2012, ISBN: 978-1-4503-1329-2.
@inproceedings{icmr2012-geocontext,
title = {Fusing Concept Detection and Geo Context for Visual Search},
author = {Xirong Li and Cees G. M. Snoek and Marcel Worring and Arnold W. M. Smeulders},
url = {/pub/icmr2012-geocontext.pdf},
doi = {10.1145/2324796.2324801},
isbn = {978-1-4503-1329-2},
year = {2012},
date = {2012-06-05},
booktitle = {ACM International Conference on Multimedia Retrieval (ICMR)},
pages = {4:1--4:8},
series = {ICMR '12},
abstract = {Given the proliferation of geo-tagged images, the question of how to exploit geo tags and the underlying geo context for visual search is emerging. Based on the observation that the importance of geo context varies over concepts, we propose a concept-based image search engine which fuses visual concept detection and geo context in a concept-dependent manner. Compared to individual content-based and geo-based concept detectors and their uniform combination, concept-dependent fusion shows improvements. Moreover, since the proposed search engine is trained on social-tagged images alone without the need of human interaction, it is flexible to cope with many concepts. Search experiments on 101 popular visual concepts justify the viability of the proposed solution. In particular, for 79 out of the 101 concepts, the learned weights yield improvements over the uniform weights, with a relative gain of at least 5% in terms of average precision.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
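The fusion described in the abstract above is a concept-dependent weighted combination of a content-based detector score and a geo-context score. A toy sketch of such late fusion follows, with hypothetical per-concept weights; in the paper these weights are learned from social-tagged images rather than set by hand.

def fuse_concept_geo(visual_score, geo_score, weight):
    """Concept-dependent late fusion: 'weight' is tuned per concept, so concepts
    for which geo context matters lean more on geo_score, while concepts for
    which it does not rely mostly on the visual detector."""
    return weight * visual_score + (1.0 - weight) * geo_score

# Hypothetical learned weights for two concepts.
weights = {"beach": 0.55, "dog": 0.95}
score = fuse_concept_geo(visual_score=0.7, geo_score=0.9, weight=weights["beach"])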
Xirong Li
Content-based Visual Search Learned from Social Media PhD Thesis
Informatics Institute, University of Amsterdam, 2012, ISBN: 9789461820822, (ACM SIGMM 2013 Best PhD Thesis Award).
@phdthesis{phd2012,
title = {Content-based Visual Search Learned from Social Media},
author = {Xirong Li},
url = {/pub/phdthesis.pdf},
isbn = {9789461820822},
year = {2012},
date = {2012-03-09},
school = {Informatics Institute, University of Amsterdam},
abstract = {This thesis contributes to social image search, a research field emerging due to the fact that digital images have become social. For effective retrieval and repurposing of images on the social web, we have to determine whether what people spontaneously say about an image is factually present in the pictorial content. Moreover, as the majority of social images are untagged, methods for deriving semantics from the content are required. Social image search is thus of scientific and social importance. We exploited socially tagged images for extracting objective semantics perceived by the community and subjective semantics related to individual users from the pictorial content.},
note = {ACM SIGMM 2013 Best PhD Thesis Award},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
2011
Xirong Li; Efstratios Gavves; Cees G. M. Snoek; Marcel Worring; Arnold W. M. Smeulders
Personalizing automated image annotation using cross-entropy Proceedings Article
In: ACM International Conference on Multimedia (ACMMM), pp. 233–242, 2011.
@inproceedings{mm2011-personalized,
title = {Personalizing automated image annotation using cross-entropy},
author = {Xirong Li and Efstratios Gavves and Cees G. M. Snoek and Marcel Worring and Arnold W. M. Smeulders},
url = {/pub/mm2011-personalized.pdf},
doi = {10.1145/2072298.2072330},
year = {2011},
date = {2011-11-28},
booktitle = {ACM International Conference on Multimedia (ACMMM)},
pages = {233--242},
abstract = {Annotating the increasing amounts of user-contributed images in a personalized manner is in great demand. However, this demand is largely ignored by the mainstream of automated image annotation research. In this paper we aim for personalizing automated image annotation by jointly exploiting personalized tag statistics and content-based image annotation. We propose a cross-entropy based learning algorithm which personalizes a generic annotation model by learning from a user's multimedia tagging history. Using cross-entropy-minimization based Monte Carlo sampling, the proposed algorithm optimizes the personalization process in terms of a performance measurement which can be flexibly chosen. Automatic image annotation experiments with 5,315 realistic users in the social web show that the proposed method compares favorably to a generic image annotation method and a method using personalized tag statistics only. For 4,442 users the performance improves, where for 1,088 users the absolute performance gain is at least 0.05 in terms of average precision. The results show the value of the proposed method.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
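The learning algorithm in the abstract above relies on cross-entropy-minimization based Monte Carlo sampling to optimize a freely chosen performance measure. A generic sketch of that optimization pattern is given below; the evaluate callback (e.g., average precision on a user's tagging history) and all parameter names are hypothetical, not the paper's exact parameterization.

import numpy as np

def cross_entropy_optimize(evaluate, dim, n_iters=20, n_samples=100, elite_frac=0.1):
    """Generic cross-entropy optimization sketch: sample candidate parameter
    vectors, keep the elite fraction under the chosen performance measure,
    and refit the Gaussian sampling distribution to the elites."""
    mu, sigma = np.zeros(dim), np.ones(dim)
    n_elite = max(1, int(elite_frac * n_samples))
    for _ in range(n_iters):
        candidates = np.random.randn(n_samples, dim) * sigma + mu
        scores = np.array([evaluate(c) for c in candidates])
        elites = candidates[np.argsort(-scores)[:n_elite]]
        mu, sigma = elites.mean(axis=0), elites.std(axis=0) + 1e-6
    return mu

# Example: maximize a toy objective standing in for the chosen performance measure.
best_params = cross_entropy_optimize(lambda w: -np.sum((w - 0.3) ** 2), dim=5)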
Xirong Li; Cees G. M. Snoek; Marcel Worring; Arnold W. M. Smeulders
Social negative bootstrapping for visual categorization Proceedings Article
In: ACM International Conference on Multimedia Retrieval (ICMR), pp. 12, 2011.
@inproceedings{icmr2011-negative,
title = {Social negative bootstrapping for visual categorization},
author = {Xirong Li and Cees G. M. Snoek and Marcel Worring and Arnold W. M. Smeulders},
url = {/pub/icmr2011-negative.pdf},
doi = {10.1145/1991996.1992008},
year = {2011},
date = {2011-04-18},
booktitle = {ACM International Conference on Multimedia Retrieval (ICMR)},
pages = {12},
abstract = {To learn classifiers for many visual categories, obtaining labeled training examples in an efficient way is crucial. Since a classifier tends to misclassify negative examples which are visually similar to positive examples, inclusion of such informative negatives should be stressed in the learning process. However, they are unlikely to be hit by random sampling, the de facto standard in literature. In this paper, we go beyond random sampling by introducing a novel social negative bootstrapping approach. Given a visual category and a few positive examples, the proposed approach adaptively and iteratively harvests informative negatives from a large amount of social-tagged images. To label negative examples without human interaction, we design an effective virtual labeling procedure based on simple tag reasoning. Virtual labeling, in combination with adaptive sampling, enables us to select the most misclassified negatives as the informative samples. Learning from the positive set and the informative negative sets results in visual classifiers with higher accuracy. Experiments on two present-day image benchmarks employing 650K virtually labeled negative examples show the viability of the proposed approach. On a popular visual categorization benchmark our precision at 20 increases by 34%, compared to baselines trained on randomly sampled negatives. We achieve more accurate visual categorization without the need of manually labeling any negatives.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2010
Xirong Li; Cees G. M. Snoek; Marcel Worring
Unsupervised multi-feature tag relevance learning for social image retrieval Proceedings Article
In: ACM International Conference on Image and Video Retrieval (CIVR), pp. 10–17, 2010, (best paper award).
@inproceedings{civr2010-multifeature,
title = {Unsupervised multi-feature tag relevance learning for social image retrieval},
author = {Xirong Li and Cees G. M. Snoek and Marcel Worring},
url = {/pub/civr2010-multifeature.pdf},
doi = {10.1145/1816041.1816044},
year = {2010},
date = {2010-07-05},
booktitle = {ACM International Conference on Image and Video Retrieval (CIVR)},
pages = {10--17},
abstract = {Interpreting the relevance of a user-contributed tag with respect to the visual content of an image is an emerging problem in social image retrieval. In the literature this problem is tackled by analyzing the correlation between tags and images represented by specific visual features. Unfortunately, no single feature represents the visual content completely, e.g., global features are suitable for capturing the gist of scenes, while local features are better for depicting objects. To solve the problem of learning tag relevance given multiple features, we introduce in this paper two simple and effective methods: one is based on the classical Borda Count and the other is a method we name UniformTagger. Both methods combine the output of many tag relevance learners driven by diverse features in an unsupervised, rather than supervised, manner. Experiments on 3.5 million social-tagged images and two test sets verify our proposal. Using learned tag relevance as updated tag frequency for social image retrieval, both Borda Count and UniformTagger outperform retrieval without tag relevance learning and retrieval with single-feature tag relevance learning. Moreover, the two unsupervised methods are comparable to a state-of-the-art supervised alternative, but without the need of any training data.},
note = {best paper award},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
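Of the two unsupervised combination methods named in the abstract above, Borda Count is the classical rank-fusion rule. A minimal sketch follows; the image IDs in the usage example are made up.

def borda_count(rankings):
    """Combine several ranked lists (one per visual feature) by classical
    Borda Count: an item ranked r-th in a list of length n receives n - r
    points, and points are summed across lists."""
    scores = {}
    for ranking in rankings:
        n = len(ranking)
        for r, item in enumerate(ranking):
            scores[item] = scores.get(item, 0) + (n - r)
    return sorted(scores, key=scores.get, reverse=True)

# Example: three tag-relevance learners driven by different features each
# rank the same candidate images for a query tag.
fused = borda_count([
    ["img3", "img1", "img2"],
    ["img1", "img3", "img2"],
    ["img1", "img2", "img3"],
])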
2009
Xirong Li; Cees G. M. Snoek; Marcel Worring
Learning Social Tag Relevance by Neighbor Voting Journal Article
In: IEEE Transactions on Multimedia (TMM), vol. 11, no. 7, pp. 1310–1322, 2009, (best paper award).
@article{tmm2009-tagrel,
title = {Learning Social Tag Relevance by Neighbor Voting},
author = {Xirong Li and Cees G. M. Snoek and Marcel Worring},
url = {/pub/tmm2009-tagrel.pdf},
doi = {10.1109/TMM.2009.2030598},
year = {2009},
date = {2009-11-01},
journal = {IEEE Transactions on Multimedia (TMM)},
volume = {11},
number = {7},
pages = {1310--1322},
abstract = {Social image analysis and retrieval is important for helping people organize and access the increasing amount of user-tagged multimedia. Since user tagging is known to be uncontrolled, ambiguous, and overly personalized, a fundamental problem is how to interpret the relevance of a user-contributed tag with respect to the visual content the tag is describing. Intuitively, if different persons label visually similar images using the same tags, these tags are likely to reflect objective aspects of the visual content. Starting from this intuition, we propose in this paper a neighbor voting algorithm which accurately and efficiently learns tag relevance by accumulating votes from visual neighbors. Under a set of well-defined and realistic assumptions, we prove that our algorithm is a good tag relevance measurement for both image ranking and tag ranking. Three experiments on 3.5 million Flickr photos demonstrate the general applicability of our algorithm in both social image retrieval and image tag suggestion. Our tag relevance learning algorithm substantially improves upon baselines for all the experiments. The results suggest that the proposed algorithm is promising for real-world applications.},
note = {best paper award},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
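The abstract above defines tag relevance by votes accumulated from visual neighbors. A minimal sketch of that estimator is given below, assuming a feature matrix features, per-image tag sets tags, and a collection-wide tag prior prior (all hypothetical names); the paper's formulation additionally restricts votes to images from distinct users and uses an efficient neighbor index rather than brute-force search.

import numpy as np

def neighbor_vote_tag_relevance(query_feature, tag, features, tags, prior, k=500):
    """Sketch of the neighbor-voting idea: a tag's relevance to an image is the
    number of the image's k visual neighbors that also carry the tag, minus the
    count expected under the tag's prior frequency, so that generally frequent
    tags are not favoured."""
    # Euclidean distances to all images in the social collection (brute force here).
    dists = np.linalg.norm(features - query_feature, axis=1)
    neighbors = np.argsort(dists)[:k]
    votes = sum(1 for i in neighbors if tag in tags[i])
    return votes - k * prior[tag]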
Xirong Li; Cees G. M. Snoek
Visual categorization with negative examples for free Proceedings Article
In: ACM International Conference on Multimedia (ACMMM), pp. 661–664, 2009.
@inproceedings{mm2009-neg4free,
title = {Visual categorization with negative examples for free},
author = {Xirong Li and Cees G. M. Snoek},
url = {/pub/mm2009-neg4free.pdf},
doi = {10.1145/1631272.1631382},
year = {2009},
date = {2009-10-19},
booktitle = {ACM International Conference on Multimedia (ACMMM)},
pages = {661--664},
abstract = {Automatic visual categorization is critically dependent on labeled examples for supervised learning. As an alternative to traditional expert labeling, social-tagged multimedia is becoming a novel yet subjective and inaccurate source of learning examples. Different from existing work focusing on collecting positive examples, we study in this paper the potential of substituting social tagging for expert labeling for creating negative examples. We present an empirical study using 6.5 million Flickr photos as a source of social tagging. Our experiments on the PASCAL VOC challenge 2008 show that with a relative loss of only 4.3% in terms of mean average precision, expert-labeled negative examples can be completely replaced by social-tagged negative examples for consumer photo categorization.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Xirong Li; Cees G. M. Snoek; Marcel Worring
Annotating images by harnessing worldwide user-tagged photos Proceedings Article
In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), pp. 3717–3720, 2009.
@inproceedings{icassp2009-userphotos,
title = {Annotating images by harnessing worldwide user-tagged photos},
author = {Xirong Li and Cees G. M. Snoek and Marcel Worring},
url = {/pub/icassp2009-userphotos.pdf},
doi = {10.1109/ICASSP.2009.4960434},
year = {2009},
date = {2009-04-19},
booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
pages = {3717--3720},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Dong Wang; Zhikun Wang; Jianmin Li; Bo Zhang; Xirong Li
Query representation by structured concept threads with application to interactive video retrieval Journal Article
In: Journal of Visual Communication and Image Representation (JVCIR), vol. 20, no. 2, pp. 104–116, 2009.
@article{vcir2009-video,
title = {Query representation by structured concept threads with application to interactive video retrieval},
author = {Dong Wang and Zhikun Wang and Jianmin Li and Bo Zhang and Xirong Li},
url = {/pub/vcir2009-videoquery.pdf},
doi = {10.1016/j.jvcir.2008.12.001},
year = {2009},
date = {2009-02-01},
journal = {Journal of Visual Communication and Image Representation (JVCIR)},
volume = {20},
number = {2},
pages = {104--116},
abstract = {In this paper, we provide a new formulation for video queries as a structured combination of concept threads, contributing to the general query-by-concept paradigm. Occupying a low-dimensional region in the concept space, a concept thread defines a ranked list of video documents ordered by their combined concept predictions. This localized representation incorporates the previous concept-based formulation as a special case and extends the restricted AND concept combination logic to a two-level concept inference network. We apply this new formulation to interactive video retrieval and utilize abundant feedback information to mine the latent semantic concept threads for answering complex query semantics. Simulation experiments conducted on two years’ TRECVID data sets with two sets of concept lexicons demonstrate the advantage of the proposed formulation. The proposed query formulation offers some 60% improvement over the simple browsing search baseline in nearly real time. It has clear advantages over c-tf-idf and achieves better results than the state-of-the-art online ordinal reranking approach. Meanwhile, it not only alleviates the user’s workload significantly but is also robust to user mislabeling errors.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2008
Xin-Jing Wang; Lei Zhang; Xirong Li; Wei-Ying Ma
Annotating Images by Mining Image Search Results Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI), vol. 30, no. 11, pp. 1919–1932, 2008.
@article{pami2008-annoseaerch,
title = {Annotating Images by Mining Image Search Results},
author = {Xin-Jing Wang and Lei Zhang and Xirong Li and Wei-Ying Ma},
url = {/pub/pami2008-annosearch},
doi = {10.1109/TPAMI.2008.127},
year = {2008},
date = {2008-11-01},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)},
volume = {30},
number = {11},
pages = {1919--1932},
abstract = {In this paper, we propose a novel model-free image annotation approach that annotates images by mining their search results. It contains three steps: 1) the search process to discover visually and semantically similar search results; 2) the mining process to identify salient terms from textual descriptions of the search results; and 3) the annotation rejection process to filter out noisy terms yielded by step 2). To ensure real-time annotation, two key techniques are leveraged: one is to map the high-dimensional image visual features into hash codes, the other is to implement it as a distributed system, of which the search and mining processes are provided as Web services. As a typical result, the entire process finishes in less than 1 second. Our proposed approach enables annotating with an unlimited vocabulary, and is highly scalable and robust to outliers. Experimental results on both real web images and a benchmark image dataset show the effectiveness and efficiency of the proposed algorithm.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Xirong Li; Cees G. M. Snoek; Marcel Worring
Learning tag relevance by neighbor voting for social image retrieval Proceedings Article
In: ACM International Conference on Multimedia Information Retrieval (MIR), pp. 180–187, 2008.
@inproceedings{mir2008-tagrel,
title = {Learning tag relevance by neighbor voting for social image retrieval},
author = {Xirong Li and Cees G. M. Snoek and Marcel Worring},
url = {/pub/mir2008-tagrel.pdf},
doi = {10.1145/1460096.1460126},
year = {2008},
date = {2008-10-30},
booktitle = {ACM International Conference on Multimedia Information Retrieval (MIR)},
pages = {180--187},
abstract = {Social image retrieval is important for exploiting the increasing amounts of amateur-tagged multimedia such as Flickr images. Since amateur tagging is known to be uncontrolled, ambiguous, and personalized, a fundamental problem is how to reliably interpret the relevance of a tag with respect to the visual content it is describing. Intuitively, if different persons label similar images using the same tags, these tags are likely to reflect objective aspects of the visual content. Starting from this intuition, we propose a novel algorithm that scalably and reliably learns tag relevance by accumulating votes from visually similar neighbors. Further, treated as tag frequency, learned tag relevance is seamlessly embedded into current tag-based social image retrieval paradigms.
Preliminary experiments on one million Flickr images demonstrate the potential of the proposed algorithm. Overall comparisons for both single-word queries and multiple-word queries show substantial improvement over the baseline by learning and using tag relevance. Specifically, compared with the baseline using the original tags, on average, retrieval using improved tags increases mean average precision by 24%, from 0.54 to 0.67. Moreover, simulated experiments indicate that performance can be improved further by scaling up the amount of images used in the proposed neighbor voting algorithm.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2007
Dong Wang; Xirong Li; Jianmin Li; Bo Zhang
The importance of query-concept-mapping for automatic video retrieval Proceedings Article
In: ACM International Conference on Multimedia (ACMMM), pp. 285–288, 2007.
@inproceedings{mm2007-query-concept-mapping,
title = {The importance of query-concept-mapping for automatic video retrieval},
author = {Dong Wang and Xirong Li and Jianmin Li and Bo Zhang},
url = {/pub/mm2007-query-concept-mapping.pdf},
doi = {10.1145/1291233.1291293},
year = {2007},
date = {2007-09-24},
booktitle = {ACM International Conference on Multimedia (ACMMM)},
pages = {285--288},
crossref = {DBLP:conf/mm/2007},
abstract = {A new video retrieval paradigm of query-by-concept has emerged recently. However, it remains unclear how to exploit the detected concepts in retrieval given a multimedia query. In this paper, we point out that it is important to map the query to a few relevant concepts instead of searching with all concepts. In addition, we show that solving this problem through both text and image inputs is effective for search, and it is possible to determine the number of related concepts by a language modeling approach. Experimental evidence is obtained on the automatic search task of TRECVID 2006 using a large lexicon of 311 learned semantic concept detectors.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Xirong Li; Dong Wang; Jianmin Li; Bo Zhang
Video search in concept subspace: a text-like paradigm Proceedings Article
In: ACM International Conference on Image and Video Retrieval (CIVR), pp. 603–610, 2007.
@inproceedings{civr2007-conceptsubspace,
title = {Video search in concept subspace: a text-like paradigm},
author = {Xirong Li and Dong Wang and Jianmin Li and Bo Zhang},
url = {/pub/civr2007-conceptsubspace.pdf},
doi = {10.1145/1282280.1282366},
year = {2007},
date = {2007-07-09},
booktitle = {ACM International Conference on Image and Video Retrieval (CIVR)},
pages = {603--610},
abstract = {Though both quantity and quality of semantic concept detection in video are continuously improving, it still remains unclear how to exploit these detected concepts as semantic indices in video search, given a specific query. In this paper, we tackle this problem and propose a video search framework which operates like searching text documents. Noteworthy for its adoption of the well-founded text search principles, this framework first selects a few related concepts for a given query, by employing a tf-idf like scheme, called c-tf-idf, to measure the informativeness of the concepts to this query. These selected concepts form a concept subspace. Then search can be conducted in this concept subspace, either by a Vector Model or a Language Model. Further, two algorithms, i.e., Linear Summation and Random Walk through Concept-Link, are explored to combine the concept search results and other baseline search results in a reranking scheme. This framework is both effective and efficient. Using a lexicon of 311 concepts from the LSCOM concept ontology, experiments conducted on the TRECVID 2006 search data set show that: when used solely, search within the concept subspace achieves the state-of-the-art concept search result; when used to rerank the baseline results, it can improve over the top 20 automatic search runs in TRECVID 2006 on average by approx. 20%, on the most significant one by approx. 50%, all within 180 milliseconds on a normal PC.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
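The c-tf-idf scheme mentioned in the abstract above scores how informative each concept is for a given query before the top-scoring concepts form the concept subspace. The sketch below illustrates a tf-idf-style scoring of concepts against a textual query; the inputs and the exact weighting are illustrative assumptions, not the paper's definition.

import math

def concept_tf_idf(query_terms, concept_descriptions, n_collection_docs, doc_freq):
    """Illustrative tf-idf-like informativeness score for selecting the few
    concepts most related to a textual query: term frequency is counted in
    each concept's textual description, and idf comes from collection-wide
    document frequencies."""
    scores = {}
    for concept, description in concept_descriptions.items():
        words = description.lower().split()
        score = 0.0
        for term in query_terms:
            tf = words.count(term) / max(len(words), 1)
            idf = math.log(n_collection_docs / (1 + doc_freq.get(term, 0)))
            score += tf * idf
        scores[concept] = score
    # Keep the top-scoring concepts to form the concept subspace.
    return sorted(scores, key=scores.get, reverse=True)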
2006
Xirong Li; Le Chen; Lei Zhang; Fuzong Lin; Wei-Ying Ma
Image annotation by large-scale content-based image retrieval Proceedings Article
In: ACM International Conference on Multimedia (ACMMM), pp. 607–610, 2006.
@inproceedings{mm2006-imageanno,
title = {Image annotation by large-scale content-based image retrieval},
author = {Xirong Li and Le Chen and Lei Zhang and Fuzong Lin and Wei-Ying Ma},
url = {/pub/mm2006-imageanno.pdf},
doi = {10.1145/1180639.1180764},
year = {2006},
date = {2006-10-23},
booktitle = {ACM International Conference on Multimedia (ACMMM)},
pages = {607--610},
abstract = {Image annotation has been an active research topic in recent years due to its potentially large impact on both image understanding and Web image search. In this paper, we address the automatic image annotation problem in a novel search and mining framework. Given an uncaptioned image, first in the search stage, we perform content-based image retrieval (CBIR) facilitated by high-dimensional indexing to find a set of visually similar images from a large-scale image database. The database consists of images crawled from the World Wide Web with rich annotations, e.g. titles and surrounding text. Then in the mining stage, a search result clustering technique is utilized to find the most representative keywords from the annotations of the retrieved image subset. These keywords, after salience ranking, are finally used to annotate the uncaptioned image. Based on search technologies, this framework does not impose an explicit training stage, but efficiently leverages large-scale and well-annotated images, and is potentially capable of dealing with an unlimited vocabulary. Based on 2.4 million real Web images, comprehensive evaluation of image annotation on Corel and U. Washington image databases shows the effectiveness and efficiency of the proposed approach.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2021
Xirong Li; Fangming Zhou; Chaoxi Xu; Jiaqi Ji; Gang Yang
SEA: Sentence Encoder Assembly for Video Retrieval by Textual Queries Journal Article
In: IEEE Transactions on Multimedia (TMM), 2021.
@article{tmm2021-sea,
title = {SEA: Sentence Encoder Assembly for Video Retrieval by Textual Queries},
author = {Xirong Li and Fangming Zhou and Chaoxi Xu and Jiaqi Ji and Gang Yang},
doi = {10.1109/TMM.2020.3042067},
journal = {IEEE Transactions on Multimedia (TMM)},
abstract = {Retrieving unlabeled videos by textual queries, known as Ad-hoc Video Search (AVS), is a core theme in multimedia data management and retrieval. The success of AVS counts on cross-modal representation learning that encodes both query sentences and videos into common spaces for semantic similarity computation. Inspired by the initial success of a few previous works in combining multiple sentence encoders, this paper takes a step forward by developing a new and general method for effectively exploiting diverse sentence encoders. The novelty of the proposed method, which we term Sentence Encoder Assembly (SEA), is two-fold. First, different from prior art that uses only a single common space, SEA supports text-video matching in multiple encoder-specific common spaces. Such a property prevents the matching from being dominated by a specific encoder that produces an encoding vector much longer than other encoders. Second, in order to explore complementarities among the individual common spaces, we propose multi-space multi-loss learning. As extensive experiments on four benchmarks (MSR-VTT, TRECVID AVS 2016-2019, TGIF and MSVD) show, SEA surpasses the state-of-the-art. In addition, SEA is extremely easy to implement. All this makes SEA an appealing solution for AVS and promising for continuously advancing the task by harvesting new sentence encoders.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}