%% This BibTeX bibliography file was created using BibDesk. %% http://bibdesk.sourceforge.net/ %% Created for George Fazekas at 2020-12-27 15:41:21 +0000 %% Saved with string encoding Unicode (UTF-8) @conference{pasini2024audio, author = {Pasini, M. and Nistal, J. and Lattner, S. and Fazekas, G.}, title = {Music2Latent2: Audio Compression with Summary Embeddings and Autoregressive Decoding}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), April 06-11, Hyderabad, India}, year = {2025}, abstract = {Efficiently compressing high-dimensional audio signals into a compact and informative latent space is crucial for various tasks, including generative modeling and music information retrieval (MIR). Existing audio autoencoders, however, often struggle to achieve high compression ratios while preserving audio fidelity and facilitating efficient downstream applications. We introduce Music2Latent2, a novel audio autoencoder that addresses these limitations by leveraging consistency models and a novel approach to representation learning based on unordered latent embeddings, which we call summary embeddings. Unlike conventional methods that encode local audio features into ordered sequences, Music2Latent2 compresses audio signals into sets of summary embeddings, where each embedding can capture distinct global features of the input sample. This enables to achieve higher reconstruction quality at the same compression ratio. To handle arbitrary audio lengths, Music2Latent2 employs an autoregressive consistency model trained on two consecutive audio chunks with causal masking, ensuring coherent reconstruction across segment boundaries. Additionally, we propose a novel two-step decoding procedure that leverages the denoising capabilities of consistency models to further refine the generated audio at no additional cost. Our experiments demonstrate that Music2Latent2 outperforms existing continuous audio autoencoders regarding audio quality and performance on downstream tasks. Music2Latent2 paves the way for new possibilities in audio compression.}, doi = {10.48550/arXiv.2501.17578}, keywords = {Audio, Compression, Music Informatics}, url = {https://arxiv.org/abs/2501.17578}, } @conference{pasini2024continuous, author = {Pasini, M. and Nistal, J. and Lattner, S. and Fazekas, G.}, title = {Continuous Autoregressive Models with Noise Augmentation Avoid Error Accumulation}, booktitle = {38th NeurIPS Conference on Neural Information Processing Systems, AI-Driven Speech, Music, and Sound Generation (Audio Imagination Workshop), December 10-15, Vancouver, Canada}, year = {2024}, abstract = {Autoregressive models are typically applied to sequences of discrete tokens, but recent research indicates that generating sequences of continuous embeddings in an autoregressive manner is also feasible. However, such Continuous Autoregressive Models (CAMs) can suffer from a decline in generation quality over extended sequences due to error accumulation during inference. We introduce a novel method to address this issue by injecting random noise into the input embeddings during training. This procedure makes the model robust against varying error levels at inference. We further reduce error accumulation through an inference procedure that introduces low-level noise. Experiments on musical audio generation show that CAM substantially outperforms existing autoregressive and non-autoregressive approaches while preserving audio quality over extended sequences. 
This work paves the way for generating continuous embeddings in a purely autoregressive setting, opening new possibilities for real-time and interactive generative applications.},
    doi = {10.48550/arXiv.2411.18447},
    keywords = {Audio, Generation, Music, LLM, Autoregressive, Transformer, Noise augmentation, Audio generation, Error accumulation},
    url = {https://arxiv.org/abs/2411.18447},
    Publisher-Url = {https://neurips.cc/virtual/2024/105762},
}

@conference{row2024composers,
    author = {Row, E. and Fazekas, G.},
    booktitle = {38th NeurIPS Conference on Neural Information Processing Systems (Creativity \& Generative AI Workshop), December 10-15, Vancouver, Canada},
    title = {Composers' Evaluations of an AI Music Tool: Insights for Human-Centred Design},
    year = {2024},
    abstract = {We present a study that explores the role of user-centred design in developing Generative AI (GenAI) tools for music composition. Through semi-structured interviews with professional composers, we gathered insights on a novel generative model for creating variations, highlighting concerns around trust, transparency, and ethical design. The findings helped form a feedback loop, guiding improvements to the model that emphasised traceability, transparency and explainability. They also revealed new areas for innovation, including novel features for controllability and research questions on the ethical and practical implementation of GenAI models.},
    doi = {10.48550/arXiv.2412.10968},
    keywords = {Generative AI, Music Composition, Human-Centred Design, User Study},
    url = {https://arxiv.org/abs/2412.10968},
    Publisher-Url = {https://neurips.cc/virtual/2024/98373},
}

@article{yu2024golffilters,
    Abstract = {This paper introduces GlOttal-flow LPC Filter (GOLF), a novel method for singing voice synthesis (SVS) that exploits the physical characteristics of the human voice using differentiable digital signal processing. GOLF employs a glottal model as the harmonic source and LPC filters to simulate the vocal tract, resulting in an interpretable and efficient synthesis approach. We show it is competitive with state-of-the-art singing voice vocoders, requiring fewer synthesis parameters and less memory to train, and runs an order of magnitude faster for inference. Additionally, we demonstrate that GOLF implicitly learns to model the phase components and formants of the human voice, having the potential to control and analyse singing voices in a differentiable manner. Our results highlight the effectiveness of incorporating the physical properties of the voice production mechanism into SVS and underscore the advantages of signal-processing-based approaches, which offer greater interpretability and efficiency in synthesis.},
    Author = {Yu, C-Y. and Fazekas, G.},
    Journal = {Transactions of the International Society for Music Information Retrieval (TISMIR)},
    Keywords = {Singing Voice Synthesis, Glottal Flow, LPC Filters, Music Technology},
    Number = {1},
    Pages = {316--330},
    Publisher-Url = {https://transactions.ismir.net/},
    Title = {GOLF: A Singing Voice Synthesiser with Glottal Flow Wavetables and LPC Filters},
    Url = {https://transactions.ismir.net/articles/10.5334/tismir.210/},
    doi = {10.5334/tismir.210},
    issn = {2514-3298},
    Volume = {7},
    Year = {2024},
}

@conference{tang2025icassp,
    Author = {Tang, J. and Cooper, E. and Wang, X. and Yamagishi, J.
and Fazekas, G.}, Title = {Towards An Integrated Approach for Expressive Piano Performance Synthesis from Music Scores}, Booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), April 06-11, Hyderabad, India}, Year = {2025}, Abstract = {This paper presents an integrated system that transforms symbolic music scores into expressive piano performance audio. By combining a Transformer-based Expressive Performance Rendering (EPR) model with a fine-tuned neural MIDI synthesiser, our approach directly generates expressive audio performances from score inputs. To the best of our knowledge, this is the first system to offer a streamlined method for converting score MIDI files lacking expression control into rich, expressive piano performances. We conducted experiments using subsets of the ATEPP dataset, evaluating the system with both objective metrics and subjective listening tests. Our system not only accurately reconstructs human-like expressiveness, but also captures the acoustic ambience of environments such as concert halls and recording studios. Additionally, the proposed system demonstrates its ability to achieve musical expressiveness while ensuring good audio quality in its outputs.}, doi = {10.48550/arXiv.2501.10222}, Keywords = {Expressive piano performance, Music score synthesis, Neural MIDI synthesis}, Url = {https://arxiv.org/abs/2501.10222} } @conference{guinot2025leave, Author = {Guinot, J and Quinton, E. and Fazekas, G.}, Title = {Leave-One-EquiVariant: Alleviating invariance-related information loss in contrastive music representations}, Booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), April 06-11, Hyderabad, India}, Year = {2025}, Abstract = {Contrastive learning has proven effective in self-supervised musical representation learning, particularly for Music Information Retrieval (MIR) tasks. However, reliance on augmentation chains for contrastive view generation and the resulting learnt invariances pose challenges when different downstream tasks require sensitivity to certain musical attributes. To address this, we propose the Leave One EquiVariant (LOEV) framework, which introduces a flexible, task-adaptive approach compared to previous work by selectively preserving information about specific augmentations, allowing the model to maintain task-relevant equivariances. We demonstrate that LOEV alleviates information loss related to learned invariances, improving performance on augmentation related tasks and retrieval without sacrificing general representation quality. Furthermore, we introduce a variant of LOEV, LOEV++, which builds a disentangled latent space by design in a self-supervised manner, and enables targeted retrieval based on augmentation related attributes.}, doi = {10.48550/arXiv.2412.18955}, Keywords = {Contrastive Learning, Music Representation}, Url = {https://arxiv.org/abs/2412.18955} } @conference{shatri2025lowdataapproach, author = {Shatri, E and Raymond, D and Fazekas, G}, booktitle = {IEEE 6th International Image Processing, Applications and Systems Conference, 9-11 Jan. 2025, Lyon, France.}, title = {Low-Data Classification of Historical Music Manuscripts: A Few-Shot Learning Approach}, year = {2025}, abstract = {In this paper, we explore the intersection of technology and cultural preservation by developing a self-supervised learning framework for the classification of musical symbols in historical manuscripts. 
Optical Music Recognition (OMR) plays a vital role in digitising and preserving musical heritage, but historical documents often lack the labelled data required by traditional methods. We overcome this challenge by training a neural-based feature extractor on unlabelled data, enabling effective classification with minimal samples. Key contributions include optimising crop preprocessing for a self-supervised Convolutional Neural Network and evaluating classification methods, including SVM, multilayer perceptrons, and prototypical networks. Our experiments yield an accuracy of 87.66\\%, showcasing the potential of AI-driven methods to ensure the survival of historical music for future generations through advanced digital archiving techniques.}, doi = {10.48550/arxiv.2411.16408}, keywords = {music recognition, few-shot learning, Machine Learning and Artificial Intelligence}, Url = {https://arxiv.org/pdf/2411.16408}, } @conference{shatri2024synthesisingdcgan, author = {Shatri, E and Palavala, K and Fazekas, G}, booktitle = {IEEE Big Data 2nd Workshop on AI Music Generation (AIMG 2024), 15-18 Dec. Washington D.C., USA}, title = {Synthesising Handwritten Music with GANs: A Comprehensive Evaluation of CycleWGAN, ProGAN, and DCGAN}, year = {2024}, abstract = {The generation of handwritten music sheets is a crucial step toward enhancing Optical Music Recognition (OMR) systems, which rely on large and diverse datasets for optimal performance. However, handwritten music sheets, often found in archives, present challenges for digitisation due to their fragility, varied handwriting styles, and image quality. This paper addresses the data scarcity problem by applying Generative Adversarial Networks (GANs) to synthesise realistic handwritten music sheets. We provide a comprehensive evaluation of three GAN models—DCGAN, ProGAN, and CycleWGAN—comparing their ability to generate diverse and high-quality handwritten music images. The proposed CycleWGAN model, which enhances style transfer and training stability, significantly outperforms DCGAN and ProGAN in both qualitative and quantitative evaluations. CycleWGAN achieves superior performance, with an FID score of 41.87, an IS of 2.29, and a KID of 0.05, making it a promising solution for improving OMR systems.}, doi = {10.48550/arxiv.2411.16405}, keywords = {Image Translation, Generative Adversarial Networks, Sheet Music}, Url = {https://arxiv.org/pdf/2411.16405}, } @conference{shatri2024knowledgesegmentation, author = {Shatri, E and Fazekas, G}, booktitle = {17th International Conference on Knowledge Discovery and Information Retrieval (KDIR), 17-19 Nov. Porto, Portugal.}, title = {Knowledge Discovery in Optical Music Recognition: Enhancing Information Retrieval with Instance Segmentation [Best Student Paper Award]}, year = {2024}, abstract = {Optical Music Recognition (OMR) automates the transcription of musical notation from images into machine-readable formats like MusicXML, MEI, or MIDI, significantly reducing the costs and time of manual transcription. This study explores knowledge discovery in OMR by applying instance segmentation using Mask R-CNN to enhance the detection and delineation of musical symbols in sheet music. Unlike Optical Character Recognition (OCR), OMR must handle the intricate semantics of Common Western Music Notation (CWMN), where symbol meanings depend on shape, position, and context. 
Our approach leverages instance segmentation to manage the density and overlap of musical symbols, facilitating more precise information retrieval from music scores. Evaluations on the DoReMi and MUSCIMA++ datasets demonstrate substantial improvements, with our method achieving a mean Average Precision (mAP) of up to 59.70\\% in dense symbol environments, achieving comparable results to object detection. Furthermore, using traditional computer vision techniques, we add a parallel step for staff detection to infer the pitch for the recognised symbols. This study emphasises the role of pixel-wise segmentation in advancing accurate music symbol recognition, contributing to knowledge discovery in OMR. Our findings indicate that instance segmentation provides more precise representations of musical symbols, particularly in densely populated scores, advancing OMR technology. We make our implementation, pre-processing scripts, trained models, and evaluation results publicly available to support further research and development.}, doi = {10.5220/0012947500003838}, Keywords = {Dense Objects, Instance Segmentation, OMR}, Url = {https://arxiv.org/pdf/2408.15002}, } @conference{vanka2024diffmstccubase, author = {Vanka, S and Hannink, L and Rolland, J-B and Fazekas, G}, booktitle = {International Society for Music Information Retrieval (ISMIR) Conference, 10-15 Nov. 2024, San Francisco, US. (Late-breaking and Demo session)}, title = {Diff-MSTC: A Mixing Style Transfer Prototype for Cubase}, url = {https://arxiv.org/abs/2411.06576}, year = {2024}, abstract = {In our demo, participants are invited to explore the DiffMSTC prototype, which integrates the Diff-MST model into Steinberg’s digital audio workstation (DAW), Cubase. Diff-MST, a deep learning model for mixing style transfer, forecasts mixing console parameters for tracks using a reference song. The system processes up to 20 raw tracks along with a reference song to predict mixing console parameters that can be used to create an initial mix. Users have the option to manually adjust these parameters further for greater control. In contrast to earlier deep learning systems that are limited to research ideas, Diff-MSTC is a first-of-its-kind prototype integrated into a DAW. This integration facilitates mixing decisions on multitracks and lets users input context through a reference song, followed by fine-tuning of audio effects in a traditional manner.}, keywords = {Music Production, Audio Engineering, Multitrack Mixing}, } @conference{guinot2024semiSupCon, author = {Guinot, J and Quinton, E and Fazekas, G}, booktitle = {25th International Society for Music Information Retrieval (ISMIR) Conference, 9-15 Nov. 2024, San Francisco, USA}, title = {Semi-Supervised Contrastive Learning of Musical Representations}, url = {https://arxiv.org/abs/2407.13840}, year = {2024}, abstract = {Despite the success of contrastive learning in Music Information Retrieval, the inherent ambiguity of contrastive self-supervision presents a challenge. Relying solely on augmentation chains and self-supervised positive sampling strategies may lead to a pretraining objective that does not capture key musical information for downstream tasks. We introduce semi-supervised contrastive learning (SemiSupCon), an architecturally simple method for leveraging musically informed supervision signals in the contrastive learning of musical representations. 
Our approach introduces musically-relevant supervision signals into self-supervised contrastive learning by combining supervised and self-supervised contrastive objectives in a simple framework compared to previous work. This framework improves downstream performance and robustness to audio corruptions on a range of downstream MIR tasks with moderate amounts of labeled data. Our approach enables shaping the learned similarity metric through the choice of labeled data which (1) infuses the representations with musical domain knowledge and (2) improves out-of-domain performance with minimal general downstream performance loss. We show strong transfer learning performance on musically related yet not trivially similar tasks - such as pitch and key estimation. Additionally, our approach shows performance improvement on automatic tagging over self-supervised approaches with only 5\% of available labels included in pretraining.}, keywords = {Contrastive Learning, Music Information Retrieval, Representation Learning, Semi-Supervised Learning}, } @conference{weck2024muchomusic, author = {Weck, B and Manco, I and Benetos, E and Quinton, E and Fazekas, G and Bogdanov, D}, booktitle = {25th International Society for Music Information Retrieval (ISMIR) Conference, 10-14 Nov. 2024, San Francisco, CA, USA}, title = {MuChoMusic: Evaluating Music Understanding in Multimodal Audio-Language Models}, url = {https://arxiv.org/abs/2408.01337}, year = {2024}, abstract = {Multimodal models that jointly process audio and language hold great promise in audio understanding and are increasingly being adopted in the music domain. By allowing users to query via text and obtain information about a given audio input, these models have the potential to enable a variety of music understanding tasks via language-based interfaces. However, their evaluation poses considerable challenges, and it remains unclear how to effectively assess their ability to correctly interpret music-related inputs with current methods. Motivated by this, we introduce MuChoMusic, a benchmark for evaluating music understanding in multimodal language models focused on audio. MuChoMusic comprises 1,187 multiple-choice questions, all validated by human annotators, on 644 music tracks sourced from two publicly available music datasets, and covering a wide variety of genres. Questions in the benchmark are crafted to assess knowledge and reasoning abilities across several dimensions that cover fundamental musical concepts and their relation to cultural and functional contexts. Through the holistic analysis afforded by the benchmark, we evaluate five open-source models and identify several pitfalls, including an over-reliance on the language modality, pointing to a need for better multimodal integration. Data and code are open-sourced.}, keywords = {Multimodal Learning, Music Understanding, Audio-Language Models, Music Benchmarks}, } @conference{pasini2024music, author = {Pasini, M. and Lattner, S. and Fazekas, G}, booktitle = {25th International Society for Music Information Retrieval (ISMIR) Conference, 10-14 Nov. 2024, San Francisco, CA, USA}, title = {Music2Latent: Consistency Autoencoders for Latent Audio Compression}, url = {https://arxiv.org/pdf/2408.06500}, year = {2024}, doi = {10.48550/arXiv.2408.06500}, abstract = {Efficient audio representations in a compressed continuous latent space are critical for generative audio modeling and Music Information Retrieval (MIR) tasks. 
However, some existing audio autoencoders have limitations, such as multi-stage training procedures, slow iterative sampling, or low reconstruction quality. We introduce Music2Latent, an audio autoencoder that overcomes these limitations by leveraging consistency models. Music2Latent encodes samples into a compressed continuous latent space in a single end-to-end training process while enabling high-fidelity single-step reconstruction. Key innovations include conditioning the consistency model on upsampled encoder outputs at all levels through cross connections, using frequency-wise self-attention to capture long-range frequency dependencies, and employing frequency-wise learned scaling to handle varying value distributions across frequencies at different noise levels. We demonstrate that Music2Latent outperforms existing continuous audio autoencoders in sound quality and reconstruction accuracy while achieving competitive performance on downstream MIR tasks using its latent representations. To our knowledge, this represents the first successful attempt at training an end-to-end consistency autoencoder model.}, keywords = {Audio autoencoder, latent space, consistency models, Music Information Retrieval (MIR)}, } @conference{vanka2024diffmst, author = {Vanka, S and Steinmetz, C and Rolland, J-B and Reiss, J and Fazekas, G}, booktitle = {25th International Society for Music Information Retrieval (ISMIR) Conference, 10-14 Nov. 2024, San Francisco, USA}, title = {Diff-MST: Differentiable Mixing Style Transfer}, url = {https://arxiv.org/abs/2407.08889}, year = {2024}, abstract = {Mixing style transfer automates the generation of a multitrack mix for a given set of tracks by inferring production attributes from a reference song. However, existing systems for mixing style transfer are limited in that they often operate only on a fixed number of tracks, introduce artifacts, and produce mixes in an end-to-end fashion, without grounding in traditional audio effects, prohibiting interpretability and controllability. To overcome these challenges, we introduce Diff-MST, a framework comprising a differentiable mixing console, a transformer controller, and an audio production style loss function. By inputting raw tracks and a reference song, our model estimates control parameters for audio effects within a differentiable mixing console, producing high-quality mixes and enabling post-hoc adjustments. Moreover, our architecture supports an arbitrary number of input tracks without source labelling, enabling real-world applications. We evaluate our model's performance against robust baselines and showcase the effectiveness of our approach, architectural design, tailored audio production style loss, and innovative training methodology for the given task.}, keywords = {Audio Engineering, Automatic Mixing, DDSP, Music Production}, } @conference{Tirupati2024Crafting, author = {Tirupati, N. and Shatri, E. and Fazekas, G.}, booktitle = {6th International Workshop on Reading Music Systems (WORMS) co-located with ISMIR 2024, Nov 22, Online }, title = {Crafting Handwritten Notations: Towards Sheet Music Generation}, year = {2024}, abstract = {Handwritten musical notation represents a significant part of the world's cultural heritage, yet its complex and unstructured nature presents challenges for digitisation through Optical Music Recognition (OMR). 
While existing OMR systems perform well with printed scores, they struggle with handwritten music due to inconsistencies in writing styles and the quality of scanned images. This paper addresses these challenges by applying Enhanced Super-Resolution Generative Adversarial Networks (ESRGAN) to generate high-quality, synthetic handwritten music sheets. The generated sheets can then be used to improve OMR handwritten datasets with more style variability. Experimental results demonstrate that ESRGAN outperforms conventional models, producing detailed and high-fidelity synthetic music sheets. This research offers a practical approach to improving the preservation and digitisation of handwritten music, benefiting musicologists, educators, and archivists.}, keywords = {Handwritten Musical Notation, OMR, Enhanced Super-Resolution Generative Adversarial Networks, Synthetic Handwriting}, Url = {https://arxiv.org/pdf/2411.15741}, } @conference{Bolt2024MultiSignal, author = {Bolt, J. and Pauwels, J. and Fazekas, G.}, booktitle = {2024 IEEE 5th International Symposium on the Internet of Sounds (IS2), Oct 2, Online}, title = {Multi-Signal Informed Attention for Beat and Downbeat Detection}, year = {2024}, doi = {10.1109/is262782.2024.10704128}, abstract = {Processing multiple signal sources presents a challenge but also an opportunity in music information processing, especially when the sources provide complementary information. The attention mechanism paves the way toward addressing this challenge. In this paper, a novel transformer for beat and down-beat detection is proposed, named Informed Beat Transformer. It is theorised to both improve upon previous beat and downbeat detection models and take advantage of auxiliary information. Two experiments are run to test these two hypotheses. In the first experiment, it is directly compared to the Beat Transformer and found to have an average improvement of 0.005 in the F1 score across 4 datasets. The second experiment compares the Beat Transformer and madmom's beat tracker to the Informed Beat Transformer in a situation where a beat coherent signal stream is available, in this case a drum track. It was found to have a significant improvement to the Beat Transformer with an increase in F1 score of 0.702 and an increase of 0.018 to the madmom beat tracker. These results show the efficacy of the Informed Beat Transformer in both experimental settings.}, keywords = {Beat Detection, Downbeat Detection, Attention Mechanisms, Audio Signals}, Url = {https://doi.org/10.1109/is262782.2024.10704128}, } @conference{Zhang2024Composer, author = {Zhang, J. and Fazekas, G. and Saitis, C.}, booktitle = {IEEE International Workshop on Machine Learning for Signal Processing (MLSP), Sep 22-25, Online}, title = {Composer Style-specific Symbolic Music Generation Using Vector Quantized Discrete Diffusion Models}, year = {2024}, abstract = {Emerging Denoising Diffusion Probabilistic Models (DDPM) have become increasingly utilised because of promising results they have achieved in diverse generative tasks with continuous data, such as image and sound synthesis. Nonetheless, the success of diffusion models has not been fully extended to discrete symbolic music. We propose to combine a vector quantized variational autoencoder (VQ-VAE) and discrete diffusion models for the generation of symbolic music with desired composer styles. The trained VQ-VAE can represent symbolic music as a sequence of indexes that correspond to specific entries in a learned codebook. 
Subsequently, a discrete diffusion model is used to model the VQ-VAE's discrete latent space. The diffusion model is trained to generate intermediate music sequences consisting of codebook indexes, which are then decoded to symbolic music using the VQ-VAE's decoder. The evaluation results demonstrate our model can generate symbolic music with target composer styles that meet the given conditions with a high accuracy of 72.36\%.},
    doi = {10.1109/MLSP58920.2024.10734713},
    keywords = {Deep learning, Diffusion models, Symbolic music generation},
    Url = {https://doi.org/10.48550/arXiv.2310.14044},
}

@conference{yu2024timevarying,
    author = {Yu, C-Y. and Fazekas, G.},
    booktitle = {INTERSPEECH 2024, Sep 1-5, Kos Island, Greece},
    title = {Differentiable Time-Varying Linear Prediction in the Context of End-to-End Analysis-by-Synthesis},
    year = {2024},
    abstract = {Training the linear prediction (LP) operator end-to-end for audio synthesis in modern deep learning frameworks is slow due to its recursive formulation. In addition, frame-wise approximation as an acceleration method cannot generalise well to test time conditions where the LP is computed sample-wise. Efficient differentiable sample-wise LP for end-to-end training is the key to removing this barrier. We generalise the efficient time-invariant LP implementation from the GOLF vocoder to time-varying cases. Combining this with the classic source-filter model, we show that the improved GOLF learns LP coefficients and reconstructs the voice better than its frame-wise counterparts. Moreover, in our listening test, synthesised outputs from GOLF scored higher in quality ratings than the state-of-the-art differentiable WORLD vocoder.},
    doi = {10.21437/Interspeech.2024-1187},
    keywords = {Time-varying linear prediction, End-to-end learning, Analysis-by-synthesis, Speech processing},
    Url = {https://arxiv.org/abs/2406.05128},
}

@conference{Wilson2024Tidal,
    author = {Wilson, E. and Fazekas, G. and Wiggins, G.},
    booktitle = {International Conference on AI and Musical Creativity, AIMC 2024, Sep 9-11, Oxford, UK},
    title = {Tidal-MerzA: Combining Affective Modelling and Autonomous Code Generation through Reinforcement Learning},
    year = {2024},
    abstract = {This paper presents Tidal-MerzA, a novel system designed for collaborative performances between humans and a machine agent in the context of live coding, specifically focusing on the generation of musical patterns. Tidal-MerzA fuses two foundational models: ALCAA (Affective Live Coding Autonomous Agent) and Tidal Fuzz, a computational framework. By integrating affective modelling with computational generation, this system leverages reinforcement learning techniques to dynamically adapt music composition parameters within the TidalCycles framework, ensuring both affective qualities to the patterns and syntactical correctness. The development of Tidal-MerzA introduces two distinct agents: one focusing on the generation of mini-notation strings for musical expression, and another on the alignment of music with targeted affective states through reinforcement learning. This approach enhances the adaptability and creative potential of live coding practices and allows exploration of human-machine creative interactions. Tidal-MerzA advances the field of computational music generation, presenting a novel methodology for incorporating artificial intelligence into artistic practices.},
    doi = {10.48550/arXiv.2409.11887},
    keywords = {live coding, affective modelling, reinforcement learning, music generation, AI},
    Url = {https://aimc2024.pubpub.org/pub/4na79ff3/release/1},
}

@article{Ma2024FoundationSurvey,
    author = {Ma, Y. and Øland, A. and Ragni, A. and Del Sette, B.M. and Saitis, C. and Donahue, C. and Lin, C. and Plachouras, C. and Benetos, E. and Shatri, E. and Morreale, F. and Zhang, G. and Fazekas, G. and Xia, G. and Zhang, H. and Manco, I. and Huang, J. and Guinot, J. and Lin, L. and Marinelli, L. and Lam, M.W.Y. and Sharma, M. and Kong, Q. and Dannenberg, R.B. and Yuan, R. and Wu, S. and Wu, S-L. and Dai, S. and Lei, S. and Kang, S. and Dixon, S. and Chen, W. and Huang, W. and Du, X. and Qu, X. and Tan, X. and Li, Y. and Tian, Z. and Wu, Z. and Wu, Z. and Ma, Z. and Wang, Z.},
    journal = {ACM Computing Surveys (in review) and ArXiv},
    title = {Foundation Models for Music: A Survey},
    year = {2024},
    abstract = {In recent years, foundation models (FMs) such as large language models (LLMs) and latent diffusion models (LDMs) have profoundly impacted diverse sectors, including music. This comprehensive review examines state-of-the-art (SOTA) pre-trained models and foundation models in music, spanning from representation learning, generative learning and multimodal learning. We first contextualise the significance of music in various industries and trace the evolution of AI in music. By delineating the modalities targeted by foundation models, we discover many of the music representations are underexplored in FM development.
Then, emphasis is placed on the lack of versatility of previous methods on diverse music applications, along with the potential of FMs in music understanding, generation and medical application. By comprehensively exploring the details of the model pre-training paradigm, architectural choices, tokenisation, finetuning methodologies and controllability, we emphasise the important topics that should have been well explored, like instruction tuning and in-context learning, scaling law and emergent ability, as well as long-sequence modelling etc. A dedicated section presents insights into music agents, accompanied by a thorough analysis of datasets and evaluations essential for pre-training and downstream tasks. Finally, by underscoring the vital importance of ethical considerations, we advocate that following research on FM for music should focus more on such issues as interpretability, transparency, human responsibility, and copyright issues. The paper offers insights into future challenges and trends on FMs for music, aiming to shape the trajectory of human-AI collaboration in the music realm.}, doi = {10.48550/arxiv.2408.14340}, keywords = {Foundational model, music, AI, survey, IEEE}, Url = {https://arxiv.org/abs/2408.14340}, } @conference{Crocker2024TemporalDataset, author = {Crocker, R. and Fazekas, G.}, booktitle = {Sound and Music Computing (SMC) conference 2024, Jul 6-9, ESMAE, Porto, Portugal}, title = {Temporal Analysis of Emotion Perception in Film Music: Insights from the FME-24 Dataset}, year = {2024}, abstract = {Understanding how composers evoke emotions through music remains limited due to the lack of diversity in existing datasets used for computational analysis of film music. To address this gap, the Film Music Emotion Dataset (FME-24) was created to explore emotion perception in modern and contemporary film music, covering various genres and compositional styles. FME-24 introduces precise temporal control through comprehensive time stamps, enabling nuanced analyses often overlooked in previous studies. Over 185 participants contributed annotations for the dataset. While there is consensus on arousal, there is significant valence variation across samples, possibly due to temporal aspects and participant differences. On average, each participant completed 17 annotations across 10 randomly generated samples, totaling 1580 annotations. These annotations will help uncover parallels between emotional states and musical transitions, enriching our understanding of music’s impact on audience immersion. Initial findings indicate potential for meaningful insights, future work including extracting musical features to compare with valence-arousal values and expanding the sample size with additional participants will enhance the reliability of the results, and uncover patterns between emotion perception and musical elements.}, doi = {10.5281/zenodo.13918961}, keywords = {Emotion perception, film music, temporal analysis, music datasets}, Url = {http://www.semanticaudio.net/files/papers/crocker2024temporal.pdf}, } @conference{Yu2024TimeOfArrivalProgramming, author = {Yu, C.-Y. and Pauwels, J. 
and Fazekas, G.}, booktitle = {Audio Engineering Society 156th Convention, June 15-17, 2024, Madrid, Spain}, title = {Time-of-arrival Estimation and Phase Unwrapping of Head-related Transfer Functions With Integer Linear Programming}, year = {2024}, abstract = {In binaural audio synthesis, aligning head-related impulse responses (HRIRs) in time has been an important pre-processing step, enabling accurate spatial interpolation and efficient data compression. The maximum correlation time delay between spatially nearby HRIRs has previously been used to get accurate and smooth alignment by solving a matrix equation in which the solution has the minimum Euclidean distance to the time delay. However, the Euclidean criterion could lead to an over-smoothing solution in practice. In this paper, we solve the smoothing issue by formulating the task as solving an integer linear programming problem equivalent to minimising an $L^1$-norm. Moreover, we incorporate 1) the cross-correlation of inter-aural HRIRs, and 2) HRIRs with their minimum-phase responses to have more reference measurements for optimisation. We show the proposed method can get more accurate alignments than the Euclidean-based method by comparing the spectral reconstruction loss of time-aligned HRIRs using spherical harmonics representation on seven HRIRs consisting of human and dummy heads. The extra correlation features and the $L^1$-norm are also beneficial in extremely noisy conditions. In addition, this method can be applied to phase unwrapping of head-related transfer functions, where the unwrapped phase could be a compact feature for downstream tasks.}, doi = {10.48550/arXiv.2405.06804}, keywords = {Time-of-arrival estimation, Phase unwrapping, Head-related transfer functions, Integer linear programming, Binaural audio synthesis}, url = {https://arxiv.org/abs/2405.06804}, } @article{vanka2024jaes, Abstract = {Effective music mixing requires technical and creative finesse, but clear communication with the client is crucial. The mixing engineer must grasp the client's expectations, and preferences, and collaborate to achieve the desired sound. The tacit agreement for the desired sound of the mix is often established using guides like reference songs and demo mixes exchanged between the artist and the engineer and sometimes verbalised using semantic terms. This paper presents the findings of a two-phased exploratory study aimed at understanding how professional mixing engineers interact with clients and use their feedback to guide the mixing process. For phase one, semi-structured interviews were conducted with five mixing engineers with the aim of gathering insights about their communication strategies, creative processes, and decision-making criteria. Based on the inferences from these interviews, an online questionnaire was designed and administered to a larger group of 22 mixing engineers during the second phase. The results of this study shed light on the importance of collaboration, empathy, and intention in the mixing process, and can inform the development of smart multi-track mixing systems that better support these practices. By highlighting the significance of these findings, this paper contributes to the growing body of research on the collaborative nature of music production and provides actionable recommendations for the design and implementation of innovative mixing tools.}, Author = {Vanka, S. and Safi, M. and Rolland, J-B. 
and Fazekas, G.},
    Journal = {Journal of the Audio Engineering Society (JAES)},
    Keywords = {Automatic Mixing, Intelligent Music Production, Mixing},
    Number = {1},
    Pages = {5--16},
    Publisher-Url = {https://aes2.org/publications/elibrary-page/?id=22374},
    Title = {The Role of Communication and Reference Songs in the Mixing Process: Insights from Professional Mix Engineers},
    Url = {https://arxiv.org/pdf/2309.03404},
    doi = {10.17743/jaes.2024.0001},
    Volume = {72},
    Year = {2024},
}

@conference{yu2024differentiable,
    author = {Yu, C.-Y. and Mitcheltree, C. and Carson, A. and Bilbao, S. and Reiss, J.D. and Fazekas, G.},
    booktitle = {27th International Conference on Digital Audio Effects (DAFx), Sept 3-7, 2024, Guildford, UK},
    title = {Differentiable All-Pole Filters for Time-Varying Audio Systems},
    year = {2024},
    abstract = {Infinite impulse response filters are an essential building block of many time-varying audio systems, such as audio effects and synthesisers. However, their recursive structure impedes end-to-end training of these systems using automatic differentiation. Although non-recursive filter approximations like frequency sampling and frame-based processing have been proposed and widely used in previous works, they cannot accurately reflect the gradient of the original system. We alleviate this difficulty by re-expressing a time-varying all-pole filter to backpropagate the gradients through itself, so the filter implementation is not bound to the technical limitations of automatic differentiation frameworks. This implementation can be employed within audio systems containing filters with poles for efficient gradient evaluation. We demonstrate its training efficiency and expressive capabilities for modelling real-world dynamic audio systems on a phaser, time-varying subtractive synthesiser, and feed-forward compressor. We make our code and audio samples available and provide the trained audio effect and synth models in a VST plugin.},
    doi = {10.48550/arXiv.2404.07970},
    keywords = {Differentiable Audio Systems, IIR Filters, Time-Varying Audio Processing, Audio Effects, Synthesizers},
    Url = {https://arxiv.org/abs/2404.07970},
}

@conference{manco2023song,
    author = {Manco, I. and Weck, B. and Doh, S. and Won, M. and Zhang, Y. and Bogdanov, D. and Wu, Y. and Chen, K. and Tovstogan, P. and Benetos, E. and Quinton, E. and Fazekas, G. and Nam, J.},
    booktitle = {NeurIPS Machine Learning for Audio Workshop, December 16, 2023, New Orleans, USA},
    title = {The Song Describer Dataset: A Corpus of Audio Captions for Music-and-Language Evaluation},
    year = {2023},
    abstract = {We introduce the Song Describer dataset (SDD), a new crowdsourced corpus of high-quality audio-caption pairs, designed for the evaluation of music-and-language models. The dataset consists of 1.1k human-written natural language descriptions of 706 music recordings, all publicly accessible and released under Creative Commons licenses. To showcase the use of our dataset, we benchmark popular models on three key music-and-language tasks (music captioning, text-to-music generation, and music-language retrieval). Our experiments highlight the importance of cross-dataset evaluation and offer insights into how researchers can use SDD to gain a broader understanding of model performance.},
    doi = {10.48550/arXiv.2311.10057},
    keywords = {Music-Language Models, Audio Captions, Dataset Evaluation, Cross-Modal Learning},
    Url = {https://arxiv.org/abs/2311.10057},
}

@conference{pasini2023selfsup,
    author = {Pasini, M. and Lattner, S. and Fazekas, G.},
    booktitle = {NeurIPS Machine Learning for Audio Workshop, December 16, 2023, New Orleans, USA},
    title = {Self-Supervised Music Source Separation Using Vector-Quantized Source Category Estimates},
    year = {2023},
    abstract = {Music source separation is focused on extracting distinct sonic elements from composite tracks. Historically, many methods have been grounded in supervised learning, necessitating labeled data, which is occasionally constrained in its diversity. More recent methods have delved into N-shot techniques that utilize one or more audio samples to aid in the separation. However, a challenge with some of these methods is the necessity for an audio query during inference, making them less suited for genres with varied timbres and effects. This paper offers a proof-of-concept for a self-supervised music source separation system that eliminates the need for audio queries at inference time. In the training phase, while it adopts a query-based approach, we introduce a modification by substituting the continuous embedding of query audios with Vector Quantized (VQ) representations. Trained end-to-end with up to N classes as determined by the VQ's codebook size, the model seeks to effectively categorise instrument classes. During inference, the input is partitioned into N sources, with some potentially left unutilized based on the mix's instrument makeup. This methodology suggests an alternative avenue for considering source separation across diverse music genres. We provide examples and additional results online.},
    doi = {10.48550/arXiv.2311.13058},
    keywords = {Self-supervision, Vector-quantization, Source-separation, Music Informatics},
    Url = {https://arxiv.org/abs/2311.13058},
}

@conference{yu2023zeroshot,
    author = {Yu, C-Y. and Postolache, E. and Rodolà, E. and Fazekas, G.},
    booktitle = {Sound Demixing Workshop 2023, November 13, Milan, Italy},
    title = {Zero-Shot Duet Singing Voices Separation with Diffusion Models},
    year = {2023},
    abstract = {In recent studies, diffusion models have shown promise as priors for solving audio inverse problems. These models allow us to sample from the posterior distribution of a target signal given an observed signal by manipulating the diffusion process. However, when separating audio sources of the same type, such as duet singing voices, the prior learned by the diffusion process may not be sufficient to maintain the consistency of the source identity in the separated audio. For example, the singer may change from one to another occasionally. Tackling this problem will be useful for separating sources in a choir, or a mixture of multiple instruments with similar timbre, without acquiring large amounts of paired data. In this paper, we examine this problem in the context of duet singing voices separation, and propose a method to enforce the coherency of singer identity by splitting the mixture into overlapping segments and performing posterior sampling in an auto-regressive manner, conditioning on the previous segment.
We evaluate the proposed method on the MedleyVox dataset and show that the proposed method outperforms the naive posterior sampling baseline. Our source code and the pre-trained model are publicly available at https://github.com/iamycy/duet-svs-diffusion.}, doi = {10.48550/arXiv.2311.07345}, keywords = {Diffusion Models, Zero-Shot Learning, Singing Voice Separation}, Url = {http://arxiv.org/abs/2311.07345v2}, } @conference{tang2023Reconstructing, author = {Tang, J. and Wiggins, G. and Fazekas, G.}, booktitle = {Proceedings of the 16th International Symposium on Computer Music Multidisciplinary Research (CMMR 2023), November 13-17, 2023, Tokyo, Japan}, title = {Reconstructing Human Expressiveness in Piano Performances with a Transformer Network}, year = {2023}, abstract = {Capturing intricate and subtle variations in human expressiveness in music performance using computational approaches is challenging. In this paper, we propose a novel approach for reconstructing human expressiveness in piano performance with a multi-layer bi-directional Transformer encoder. To address the needs for large amounts of accurately captured and score-aligned performance data in training neural networks, we use transcribed scores obtained from an existing transcription model to train our model. We integrate pianist identities to control the sampling process and explore the ability of our system to model variations in expressiveness for different pianists. The system is evaluated through statistical analysis of generated expressive performances and a listening test. Overall, the results suggest that our method achieves state-of-the-art in generating human-like piano performances from transcribed scores, while fully and consistently reconstructing human expressiveness poses further challenges.}, doi = {10.48550/arXiv.2306.06040}, keywords = {music generation, expressive music performance, transformer model}, url = {https://arxiv.org/abs/2306.06040} } @conference{row2023jazzvar, author = {Row, E. and Tang, J. and Fazekas, G.}, booktitle = {Proceedings of the 16th International Symposium on Computer Music Multidisciplinary Research (CMMR 2023), November 13-17, 2023, Tokyo, Japan}, title = {JAZZVAR: A Dataset of Variations found within Solo Piano Performances of Jazz Standards for Music Overpainting}, year = {2023}, abstract = {Jazz pianists often uniquely interpret jazz standards. Passages from these interpretations can be viewed as sections of variation. We manually extracted such variations from solo jazz piano performances. The JAZZVAR dataset is a collection of 502 pairs of Variation and Original MIDI segments. Each Variation in the dataset is accompanied by a corresponding Original segment containing the melody and chords from the original jazz standard. Our approach differs from many existing jazz datasets in the music information retrieval (MIR) community, which often focus on improvisation sections within jazz performances. In this paper, we outline the curation process for obtaining and sorting the repertoire, the pipeline for creating the Original and Variation pairs, and our analysis of the dataset. We also introduce a new generative music task, Music Overpainting, and present a baseline Transformer model trained on the JAZZVAR dataset for this task. 
Other potential applications of our dataset include expressive performance analysis and performer identification.}, doi = {10.5281/zenodo.11264676}, keywords = {music generation, dataset, jazz}, url = {https://arxiv.org/abs/2307.09670} } @conference{yu2023singing, author = {Yu, C.-Y. and Fazekas, G.}, booktitle = {24th International Society for Music Information Retrieval Conference (ISMIR 2023), November 5-9, Milan, Italy}, title = {Singing Voice Synthesis Using Differentiable LPC and Glottal-Flow-Inspired Wavetables}, year = {2023}, abstract = {This paper introduces GlOttal-flow LPC Filter (GOLF), a novel method for singing voice synthesis (SVS) that exploits the physical characteristics of the human voice using differentiable digital signal processing. GOLF employs a glottal model as the harmonic source and IIR filters to simulate the vocal tract, resulting in an interpretable and efficient approach. We show it is competitive with state-of-the-art singing voice vocoders, requiring fewer synthesis parameters and less memory to train, and runs an order of magnitude faster for inference. Additionally, we demonstrate that GOLF can model the phase components of the human voice, which has immense potential for rendering and analysing singing voices in a differentiable manner. Our results highlight the effectiveness of incorporating the physical properties of the human voice mechanism into SVS and underscore the advantages of signal-processing-based approaches, which offer greater interpretability and efficiency in synthesis.}, doi = {10.48550/arXiv.2306.17252}, keywords = {Differentiable Digital Signal Processing, Neural Vocoder, Singing Voice Synthesis}, Url = {https://arxiv.org/abs/2306.17252} } @conference{marinelli2023gender, author = {Marinelli, L. and Fazekas, G. and Saitis, C.}, booktitle = {24th International Society for Music Information Retrieval Conference (ISMIR 2023), November 5-9, Milan, Italy}, title = {Gender-Coded Sound: Analysing the Gendering of Music in Toy Commercials via Multi-Task Learning}, year = {2023}, abstract = {Music can convey ideological stances, and gender is just one of them. Evidence from musicology and psychology research shows that gender-loaded messages can be reliably encoded and decoded via musical sounds. However, much of this evidence comes from examining music in isolation, while studies of the gendering of music within multimodal communicative events are sparse. In this paper, we outline a method to automatically analyse how music in TV advertising aimed at children may be deliberately used to reinforce traditional gender roles. Our dataset of 606 commercials included music-focused mid-level perceptual features, multimodal aesthetic emotions, and content analytical items. Despite its limited size, and because of the extreme gender polarisation inherent in toy advertisements, we obtained noteworthy results by leveraging multi-task transfer learning on our densely annotated dataset. The models were trained to categorise commercials based on their intended target audience, specifically distinguishing between masculine, feminine, and mixed audiences. Additionally, to provide explainability for the classification in gender targets, the models were jointly trained to perform regressions on emotion ratings across six scales, and on mid-level musical perceptual attributes across twelve scales. 
Standing in the context of MIR, computational social studies and critical analysis, this study may benefit not only music scholars but also advertisers, policymakers, and broadcasters.}, keywords = {Gender-Coded Sound, Music Information Retrieval, Multi-Task Learning, Toy Commercials, Gender Studies}, Url = {https://qmro.qmul.ac.uk/xmlui/handle/123456789/91180} } @conference{tang2023pianist, author = {Tang, J. and Wiggins, G. and Fazekas, G.}, booktitle = {4th IEEE International Symposium on the Internet of Sounds (ISIoS 2023), October 26-27, Pisa, Italy}, title = {Pianist Identification Using Convolutional Neural Networks}, year = {2023}, abstract = {This paper investigates the use of convolutional neural networks (CNNs) for identifying pianists based on their performance style. By analyzing spectrograms of piano recordings, the proposed method captures unique characteristics of each pianist's playing. The system achieves high accuracy, demonstrating the effectiveness of CNNs in music information retrieval and performer classification tasks. This research paves the way for further applications of machine learning in music performance analysis.}, doi = {10.1109/ieeeconf59510.2023.10335427}, keywords = {Machine Learning, Pianist Identification, Music Information Retrieval}, Url = {https://arxiv.org/abs/2310.00699} } @conference{rice2023general, author = {Rice, M. and Steinmetz, C. J. and Fazekas, G. and Reiss, J. D.}, booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), October 22-25, New Paltz, NY, USA}, title = {General Purpose Audio Effect Removal}, year = {2023}, abstract = {Although the design and application of audio effects is well understood, the inverse problem of removing these effects is significantly more challenging and far less studied. Recently, deep learning has been applied to audio effect removal; however, existing approaches have focused on narrow formulations considering only one effect or source type at a time. In realistic scenarios, multiple effects are applied with varying source content. This motivates a more general task, which we refer to as general purpose audio effect removal. We developed a dataset for this task using five audio effects across four different sources and used it to train and evaluate a set of existing architectures. We found that no single model performed optimally on all effect types and sources. To address this, we introduced RemFX, an approach designed to mirror the compositionality of applied effects. We first trained a set of the best-performing effect-specific removal models and then leveraged an audio effect classification model to dynamically construct a graph of our models at inference. We found our approach to outperform single model baselines, although examples with many effects present remain challenging.}, doi = {10.1109/waspaa58266.2023.10248157}, keywords = {Audio Effect Removal, Signal Processing, Machine Learning, Audio Restoration}, Url = {https://arxiv.org/abs/2308.16177} } @conference{vahidi2023perceptual, author = {Vahidi, C. and Singh, S. and Benetos, E. and Phan, Q. H. and Stowell, D. and Fazekas, G. 
and Lagrange, M.}, booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), October 22-25, New Paltz, NY, USA}, title = {Perceptual Musical Similarity Metric Learning with Graph Neural Networks}, year = {2023}, abstract = {Sound retrieval for assisted music composition depends on evaluating similarity between musical instrument sounds, which is partly influenced by playing techniques. Previous methods utilizing Euclidean nearest neighbours over acoustic features show some limitations in retrieving sounds sharing equivalent timbral properties, but potentially generated using a different instrument, playing technique, pitch, or dynamic. In this paper, we present a metric learning system designed to approximate human similarity judgments between extended musical playing techniques using graph neural networks. Such structure is a natural candidate for solving similarity retrieval tasks, yet has seen little application in modelling perceptual music similarity. We optimize a Graph Convolutional Network (GCN) over acoustic features via a proxy metric learning loss to learn embeddings that reflect perceptual similarities. Specifically, we construct the graph's adjacency matrix from the acoustic data manifold with an example-wise adaptive k-nearest neighbourhood graph: Adaptive Neighbourhood Graph Neural Network (AN-GNN). Our approach achieves 96.4\% retrieval accuracy compared to 38.5\% with a Euclidean metric and 86.0\% with a multilayer perceptron (MLP), while effectively considering retrievals from distinct playing techniques to the query example.}, doi = {10.1109/WASPAA58266.2023.10248151}, keywords = {Auditory Similarity, Content-Based Music Retrieval, Graph Neural Networks, Metric Learning}, Url = {https://ieeexplore.ieee.org/document/10248151} } @conference{zhang2023fast, author = {Zhang, J. and Fazekas, G. and Saitis, C.}, booktitle = {Preprint on arXiv, October 21, 2023}, title = {Fast Diffusion GAN Model for Symbolic Music Generation Controlled by Emotions}, year = {2023}, abstract = {This paper presents a novel approach for symbolic music generation using a Fast Diffusion GAN model. The system integrates emotion control mechanisms to enable the generation of music that aligns with specified emotional contexts. By leveraging advancements in diffusion models and generative adversarial networks, the proposed method achieves both high-quality and computational efficiency, demonstrating its potential in emotion-driven music composition tasks.}, doi = {10.48550/arxiv.2310.14040}, keywords = {Symbolic Music Generation, Emotion Control, Generative Models, Diffusion GAN}, Url = {https://arxiv.org/abs/2310.14040} } @article{vahidi2023jaes, Author = {Vahidi, C. and Han, H. and Wang, C. and Fazekas, G. and Lagrange, M. and Lostanlen, V.}, Title = {Mesostructures: Beyond Spectrogram Loss in Differentiable Time–Frequency Analysis}, Journal = {Journal of the Audio Engineering Society (JAES)}, Volume = {71}, Number = {9}, Pages = {577-585}, % Verify page range online Abstract = {Computer musicians refer to mesostructures as the intermediate levels of articulation between the microstructure of waveshapes and the macrostructure of musical forms. Examples of mesostructures include melody, arpeggios, syncopation, polyphonic grouping, and textural contrast. Despite their central role in musical expression, they have received limited attention in recent applications of deep learning to the analysis and synthesis of musical audio. 
Currently, autoencoders and neural audio synthesizers are only trained and evaluated at the scale of microstructure: i.e., local amplitude variations up to 100 milliseconds or so. In this paper, we formulate and address the problem of mesostructural audio modeling via a composition of a differentiable arpeggiator and time-frequency scattering. We empirically demonstrate that time--frequency scattering serves as a differentiable model of similarity between synthesis parameters that govern mesostructure. By exposing the sensitivity of short-time spectral distances to time alignment, we motivate the need for a time-invariant and multiscale differentiable time--frequency model of similarity at the level of both local spectra and spectrotemporal modulations.}, doi = {10.17743/jaes.2022.0103}, Publisher-Url = {https://www.aes.org/e-lib/browse.cfm?elib=21983}, Url = {https://arxiv.org/abs/2301.10183}, Keywords = {Mesostructures, Time-Frequency Analysis, Deep Learning, Audio Effects, Spectrogram}, Year = {2023}, } @conference{bolt2023supervised, author = {Bolt, J. and Fazekas, G.}, booktitle = {Proc. of the 18th ACM International Audio Mostly Conference, August 30 - September 1, 2023, Edinburgh, United Kingdom}, title = {Supervised Contrastive Learning For Musical Onset Detection}, year = {2023}, abstract = {This paper applies supervised contrastive learning to musical onset detection to alleviate the issue of noisy annotated data for onset datasets. The results are compared against a state-of-the-art, convolutional, cross-entropy model. Both models were trained on two datasets. The first dataset comprised a manually annotated selection of music. This data was then augmented with inaccurate labelling to produce the second dataset. When trained on the original data the supervised contrastive model produced an F1 score of 0.878. This was close to the cross-entropy model score of 0.888. This showed that supervised contrastive loss is applicable to onset detection but does not outperform cross-entropy models in an ideal training case. When trained on the augmented set the contrastive model consistently outperformed the cross-entropy model across increasing percentage inaccuracies, with a difference in F1 score of 0.1 for the most inaccurate data. This demonstrates the robustness of supervised contrastive learning with inaccurate data for onset detection, suggesting that supervised contrastive loss could provide a new onset detection architecture which is invariant to noise in the data or inaccuracies in labelling.}, doi = {10.1145/3616195.3616215}, keywords = {Musical Onset Detection, Contrastive Learning, Audio Signal Processing}, Url = {https://dl.acm.org/doi/10.1145/3616195.3616215}, } @article{hayes2023frontiers, Author = {Hayes, B. and Shier, J. and Fazekas, G. and McPherson, A. and Saitis, C.}, Title = {A Review of Differentiable Digital Signal Processing for Music and Speech Synthesis}, Journal = {Frontiers in Signal Processing, Sec. Audio and Acoustic Signal Processing}, Volume = {3}, Number = {1284100}, Pages = {1--29}, Abstract = {The term “differentiable digital signal processing” describes a family of techniques in which loss function gradients are backpropagated through digital signal processors, facilitating their integration into neural networks. This article surveys the literature on differentiable audio signal processing, focusing on its use in music and speech synthesis.
We catalogue applications to tasks including music performance rendering, sound matching, and voice transformation, discussing the motivations for and implications of the use of this methodology. This is accompanied by an overview of digital signal processing operations that have been implemented differentiably, which is further supported by a web book containing practical advice on differentiable synthesiser programming (https://intro2ddsp.github.io/). Finally, we highlight open challenges, including optimisation pathologies, robustness to real-world conditions, and design trade-offs, and discuss directions for future research.}, doi = {10.3389/frsip.2023.1284100}, Publisher-Url = {https://www.frontiersin.org/journals/signal-processing/articles/10.3389/frsip.2023.1284100/full}, Url = {https://arxiv.org/abs/2308.15422}, Keywords = {Differentiable Digital Signal Processing, Music Synthesis, Speech Synthesis, Generative Models}, Year = {2024}, } @article{wilson2023os, Author = {Wilson, E. and Fazekas, G. and Wiggins, G.}, Title = {On the Integration of Machine Agents into Live Coding}, Journal = {Organised Sound}, Volume = {28}, Number = {2}, Pages = {305--314}, Abstract = {Co-creation strategies for human-machine collaboration have recently been explored in various creative disciplines and more opportunities for human–machine collaborations are materialising. In this article, we outline how to augment musical live coding by considering how human live coders can effectively collaborate with a machine agent imbued with the ability to produce its own patterns of executable code. Using machine agents allows live coders to explore not-yet conceptualised patterns of code and supports them in asking new questions. We argue that to move away from scenarios where machine agents are used in a merely generative way, or only as creative impetus for the human, and towards a more collaborative relationship with the machine agent, consideration is needed for system designers around the aspects of reflection, aesthetics and evaluation. Furthermore, owing to live coding’s close relationship with exposing processes, using agents in such a way can be a useful manner to explore how to make artificial intelligence processes more open and explainable to an audience. Finally, some speculative futures of co-creative and artificially intelligent systems and what opportunities they might afford the live coder are discussed.}, doi = {10.1017/s1355771823000420}, Publisher-Url = {https://www.cambridge.org/core/journals/organised-sound}, Url = {https://doi.org/10.1017/s1355771823000420}, Keywords = {Machine Learning and Artificial Intelligence, Live Coding, Human-Computer Interaction}, Year = {2023}, } @conference{yu2023conditioning, author = {Yu, C.-Y. and Yeh, S.-L. and Fazekas, G. and Tang, H.}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), June 4-10, Rhodes Island, Greece}, title = {Conditioning and Sampling in Variational Diffusion Models for Speech Super-Resolution}, year = {2023}, abstract = {Recently, diffusion models (DMs) have been increasingly used in audio processing tasks, including speech super-resolution (SR), which aims to restore high-frequency content given low-resolution speech utterances. This is commonly achieved by conditioning the network of the noise predictor with the low-resolution audio.
In this paper, we propose a novel sampling algorithm that communicates the information of the low-resolution audio via the reverse sampling process of DMs. The proposed method can be a drop-in replacement for the vanilla sampling process and can significantly improve the performance of the existing works. Moreover, by coupling the proposed sampling method with an unconditional DM, i.e., a DM with no auxiliary inputs to its noise predictor, we can generalize it to a wide range of SR setups. We also attain state-of-the-art results on the VCTK Multi-Speaker benchmark with this novel formulation.}, doi = {10.1109/ICASSP49357.2023.10095103}, keywords = {Speech Super-Resolution, Variational Diffusion Models, Acoustic Signal Processing}, Url = {https://arxiv.org/abs/2210.15793}, } @article{ceriani2023websemantics, Author = {Ceriani, M. and Viola, F. and Rudan, S. and Antoniazzi, F. and Barthet, M. and Fazekas, G.}, Title = {Semantic Integration of Audio Content Providers through the Audio Commons Ontology}, Journal = {Journal of Web Semantics (JWS)}, Volume = {77}, Number = {100787}, Pages = {1-19}, Abstract = {A broad variety of audio content is available online through an increasing number of repositories and platforms. Resources such as music tracks, recorded sounds or instrument samples may be accessed by users for tasks ranging from customised music listening and exploration, to music making and sound design using existing sounds and samples. However, each online repository offers its own API and represents information through its own data model, making it difficult for applications to exploit the plurality of online audio and music content on the web. A crucial step toward integrating audio repositories in a flexible manner is a shared basis for modelling the data therein. This paper describes and extends the Audio Commons Ontology, a common data model designed to integrate existing repositories in the audio media domain. The ontology is designed with the involvement of users through surveys and requirements analyses, and evaluated in-use, by demonstrating how it supports the integration of four relevant repositories with heterogeneous APIs and data models. While this work proves the concept in the audio domain, our proposed methodology may transfer across a broad range of media integration tasks.}, doi = {10.1016/j.websem.2023.100787}, Publisher-Url = {https://www.sciencedirect.com/science/article/pii/S1570826823000554}, Url = {files/papers/ceriani2023jws.pdf}, Keywords = {Audio Commons, Ontology, Semantic Integration, Web Semantics}, Year = {2023}, } @conference{yu2023singingvoice, author = {Yu, C-Y and Fazekas, G.}, booktitle = {24th International Society for Music Information Retrieval Conference (ISMIR), November 6-10, Milan, Italy}, title = {Singing Voice Synthesis Using Differentiable LPC and Glottal-Flow-Inspired Wavetables}, year = {2023}, abstract = {This paper introduces GlOttal-flow LPC Filter (GOLF), a novel method for singing voice synthesis (SVS) that exploits the physical characteristics of the human voice using differentiable digital signal processing. GOLF employs a glottal model as the harmonic source and IIR filters to simulate the vocal tract, resulting in an interpretable and efficient approach. We show it is competitive with state-of-the-art singing voice vocoders, requiring fewer synthesis parameters and less memory to train, and runs an order of magnitude faster for inference. 
Additionally, we demonstrate that GOLF can model the phase components of the human voice, which has immense potential for rendering and analysing singing voices in a differentiable manner. Our results highlight the effectiveness of incorporating the physical properties of the human voice mechanism into SVS and underscore the advantages of signal-processing-based approaches, which offer greater interpretability and efficiency in synthesis.}, doi = {10.5281/zenodo.13916489}, keywords = {Singing Voice Synthesis, Differentiable Digital Signal Processing, LPC, Glottal Flow, Vocoder}, Url = {http://arxiv.org/abs/2306.17252v3}, } @conference{hayes2023sinusoidal, author = {Hayes, B. and Saitis, C. and Fazekas, G.}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), June 4-10, Rhodes Island, Greece}, title = {Sinusoidal Frequency Estimation by Gradient Descent}, year = {2023}, abstract = {Sinusoidal parameter estimation is a fundamental task in applications from spectral analysis to time-series forecasting. Estimating the sinusoidal frequency parameter by gradient descent is, however, often impossible as the error function is non-convex and densely populated with local minima. The growing family of differentiable signal processing methods has therefore been unable to tune the frequency of oscillatory components, preventing their use in a broad range of applications. This work presents a technique for joint sinusoidal frequency and amplitude estimation using the Wirtinger derivatives of a complex exponential surrogate and any first-order gradient-based optimizer, enabling end-to-end training of neural network controllers for unconstrained sinusoidal models.}, doi = {10.1109/icassp49357.2023.10095188}, keywords = {Sinusoidal Frequency Estimation, Gradient Descent, Signal Processing, Frequency Tracking}, Url = {https://arxiv.org/abs/2210.14476}, } @conference{diaz2023rigidbody, author = {Diaz, R. and Hayes, B. and Saitis, C. and Fazekas, G. and Sandler, M.}, booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), June 4-10, 2023, Rhodes Island, Greece}, title = {Rigid-Body Sound Synthesis with Differentiable Modal Resonators}, year = {2023}, abstract = {Physical models of rigid bodies are used for sound synthesis in applications from virtual environments to music production. Traditional methods such as modal synthesis often rely on computationally expensive numerical solvers, while recent deep learning approaches are limited by post-processing of their results. In this work, we present a novel end-to-end framework for training a deep neural network to generate modal resonators for a given 2D shape and material, using a bank of differentiable IIR filters. We demonstrate our method on a dataset of synthetic objects, but train our model using an audio-domain objective, paving the way for physically-informed synthesizers to be learned directly from recordings of real-world objects.}, doi = {10.1109/icassp49357.2023.10095139}, keywords = {Differentiable signal processing, Machine learning, Sound synthesis, Physical modelling}, url = {https://arxiv.org/abs/2210.15306} } @conference{rafee2023hipi, author = {Rafee, S. R. M. and Fazekas, G.
and Wiggins, G.}, booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), June 4-10, 2023, Rhodes Island, Greece}, title = {HIPI: A Hierarchical Performer Identification Model Based on Symbolic Representation of Music}, year = {2023}, abstract = {Automatic Performer Identification from the symbolic representation of music has been a challenging topic in Music Information Retrieval (MIR). In this study, we apply a Recurrent Neural Network (RNN) model to classify the most likely music performers from their interpretative styles. We study different expressive parameters and investigate how to quantify these parameters for the exceptionally challenging task of performer identification. We encode performer-style information using a Hierarchical Attention Network (HAN) architecture, based on the notion that traditional western music has a hierarchical structure (note, beat, measure, phrase level etc.). In addition, we present a large-scale dataset consisting of six virtuoso pianists performing the same set of compositions. The experimental results show that our model outperforms the baseline models with an F1-score of 0.845 and demonstrates the significance of the attention mechanism for understanding different performance styles.}, doi = {10.1109/icassp49357.2023.10094844}, keywords = {Performer Identification, Music Information Retrieval, Recurrent Neural Network, Hierarchical Attention Network, Symbolic Music Representation}, url = {https://ieeexplore.ieee.org/document/10094844/} } @conference{vanka2023adoption, author = {Vanka, S. and Safi, M. and Rolland, J-B. and Fazekas, G.}, booktitle = {AES Europe Convention 154, May 13-15, Helsinki, Finland}, title = {Adoption of AI Technology in the Music Mixing Workflow: An Investigation}, year = {2023}, abstract = {The integration of artificial intelligence (AI) technology in the music industry is driving a significant change in the way music is being composed, produced and mixed. This study investigates the current state of AI in the mixing workflows and its adoption by different user groups. Through semi-structured interviews, a questionnaire-based study, and analyzing web forums, the study confirms three user groups comprising amateurs, pro-ams, and professionals. Our findings show that while AI mixing tools can simplify the process and provide decent results for amateurs, pro-ams seek precise control and customization options, while professionals desire control and customization options in addition to assistive and collaborative technologies. The study provides strategies for designing effective AI mixing tools for different user groups and outlines future directions.}, doi = {10.48550/arXiv.2304.03407}, keywords = {AI in Music, Mixing Workflow, User-Centered Design}, Url = {https://arxiv.org/abs/2304.03407}, Publisher-Url = {https://aes2.org/publications/elibrary-page/?id=22060}, } @conference{lobbers2023sketchsynth, author = {Lobbers, S. and Fazekas, G.}, booktitle = {International Conference on New Interfaces for Musical Expression (NIME), May 31 - June 2, Mexico City, Mexico}, title = {SketchSynth: A Browser-Based Sketching Interface for Sound Control}, year = {2023}, abstract = {SketchSynth is an interface that allows users to create mappings between synthesised sound and a graphical sketch input based on human cross-modal perception. The project is rooted in the authors' research which collected 2692 sound-sketches from 178 participants representing their associations with various sounds. 
The interface extracts sketch features in real-time that were shown to correlate with sound characteristics and can be mapped to synthesis and audio effect parameters via Open Sound Control (OSC). This modular approach allows for an easy integration into an existing workflow and can be tailored to individual preferences. The interface can be accessed online through a web-browser on a computer, laptop, smartphone or tablet and does not require specialised hard- or software. We demonstrate SketchSynth with an iPad for sketch input to control synthesis and audio effect parameters in the Ableton Live digital audio workstation (DAW). A MIDI controller is used to play notes and trigger pre-recorded accompaniment. This work serves as an example of how perceptual research can help create strong, meaningful gesture-to-sound mappings.}, pages = {637--641}, doi = {10.5281/zenodo.11189331}, keywords = {Sound Control, Cross-Modal Perception, Sketch-Based Interfaces, OSC, DAW Integration}, Url = {https://nime.org/proceedings/2023/nime2023_95.pdf}, } @conference{hayes2023responsibility, author = {Hayes, B. and Saitis, C. and Fazekas, G.}, booktitle = {International Conference on Learning Representations (ICLR), April 19}, title = {The Responsibility Problem in Neural Networks with Unordered Targets}, year = {2023}, abstract = {Neural networks designed to handle unordered targets face unique challenges related to responsibility assignment across output neurons. This paper investigates the problem and explores potential solutions, highlighting its implications for neural network interpretability and training stability.}, doi = {10.48550/arxiv.2304.09499}, keywords = {Neural Networks, Interpretability, Unordered Targets, Responsibility Assignment}, Url = {https://arxiv.org/abs/2304.09499}, } @conference{lobbers2023bsketchsynth, author = {Lobbers, S. and Thorpe, L. and Fazekas, G.}, booktitle = {EvoMUSART 2023, April 12-14, Brno, Czech Republic}, title = {SketchSynth: Cross-Modal Control of Sound Synthesis}, year = {2023}, abstract = {This paper introduces SketchSynth, a tool that bridges graphical sketch inputs with sound synthesis through cross-modal mapping. By leveraging perceptual correlations between visual and auditory features, SketchSynth enables intuitive control of synthesis parameters. Applications in creative and educational contexts are discussed, demonstrating the potential of such interfaces.}, doi = {10.1007/978-3-031-29956-8_11}, keywords = {Sound Synthesis, Cross-Modal Interaction, Sketch-Based Control, Perceptual Mapping}, Url = {https://qmro.qmul.ac.uk/xmlui/bitstream/handle/123456789/96745/Lobbers%20SketchSynth%3A%20Cross-Modal%202023%20Accepted.pdf}, } @conference{lobbers2023aimediator, author = {Löbbers, S. and Barthet, M. and Fazekas, G.}, booktitle = {Published on arXiv, March 2, 2023}, title = {AI as Mediator Between Composers, Sound Designers, and Creative Media Producers}, year = {2023}, abstract = {This paper explores the role of AI as a mediator in the creative processes involving composers, sound designers, and media producers. It investigates the potential for AI systems to facilitate collaboration, streamline workflows, and enhance creativity through intelligent mediation and adaptive tools.}, doi = {10.48550/arxiv.2303.01457}, keywords = {AI Mediation, Creative Collaboration, Music Production, Sound Design}, Url = {https://arxiv.org/abs/2303.01457}, } @article{turchet2023ieeeiot, Author = {Turchet, L. and Lagrange, M. and Rottondi, C. and Fazekas, G. and Peters, N. 
and Østergaard, J. and Font, F. and Bäckström, T. and Fischione, C.}, Title = {The Internet of Sounds: Convergent Trends, Insights, and Future Directions}, Journal = {IEEE Internet of Things Journal (IoT)}, Volume = {10}, Number = {13}, Pages = {11264--11292}, Abstract = {This paper discusses the emerging concept of the Internet of Sounds, focusing on the integration of audio technologies with IoT ecosystems. It reviews convergent trends, highlights current insights, and outlines future directions in the field, identifying key research challenges and applications.}, doi = {10.1109/jiot.2023.3253602}, Publisher-Url = {https://ieeexplore.ieee.org/document/10014560}, Url = {https://hal.science/hal-04041020/file/The_Internet_of_Sounds_Convergent_Trends_Insights_and_Future_Directions.pdf}, Keywords = {Internet of Sounds, IoT, Audio Technology, Future Directions}, Year = {2023}, } @article{qian2023fdh, Author = {Qian, K. and Fazekas, G. and Li, S. and Li, Z. and Schuller, B. W.}, Title = {Editorial: Human-centred computer audition: sound, music, and healthcare}, Journal = {Frontiers in Digital Health}, Volume = {5}, Pages = {1340517}, Abstract = {Motivated by the concept of human-centred AI (HAI), we organised the research topic on “Human-Centred Computer Audition: Sound, Music, and Healthcare,” which lasted from April 2021 to January 2023. Finally, 10 articles were accepted and published after a rigorous peer-review process. There are 57 authors involved in this research topic.}, doi = {10.3389/fdgth.2023.1340517}, Publisher-Url = {https://www.frontiersin.org/journals/digital-health}, Url = {https://www.frontiersin.org/articles/10.3389/fdgth.2023.1340517/full}, Keywords = {Human-centred AI, Computer Audition, Sound, Music, Healthcare}, Year = {2023}, } @conference{manco2022ismir, author = {Manco, I. and Benetos, E. and Quinton, E. and Fazekas, G.}, booktitle = {23rd International Society for Music Information Retrieval Conference (ISMIR), December 4-8, Bengaluru, India}, title = {Contrastive audio-language learning for music}, year = {2022}, abstract = {As one of the most intuitive interfaces known to humans, natural language has the potential to mediate many tasks that involve human-computer interaction, especially in application-focused fields like Music Information Retrieval. In this work, we explore cross-modal learning in an attempt to bridge audio and language in the music domain. To this end, we propose MusCALL, a framework for Music Contrastive Audio-Language Learning. Our approach consists of a dual-encoder architecture that learns the alignment between pairs of music audio and descriptive sentences, producing multimodal embeddings that can be used for text-to-audio and audio-to-text retrieval out-of-the-box. Thanks to this property, MusCALL can be transferred to virtually any task that can be cast as text-based retrieval. Our experiments show that our method performs significantly better than the baselines at retrieving audio that matches a textual description and, conversely, text that matches an audio query. We also demonstrate that the multimodal alignment capability of our model can be successfully extended to the zero-shot transfer scenario for genre classification and auto-tagging on two public datasets.}, doi = {10.5281/zenodo.7316744}, keywords = {Audio-Language Learning, Music Retrieval, Contrastive Learning, Zero-Shot Transfer, MusCALL}, url = {https://arxiv.org/abs/2208.12208}, } @conference{zhang2022atepp, author = {Zhang, H. and Tang, J. and Rafee, S. and Dixon, S. 
and Wiggins, G. and Fazekas, G.}, booktitle = {23rd International Society for Music Information Retrieval Conference (ISMIR 2022), December 4-8, Bengaluru, India}, title = {ATEPP: A Dataset of Automatically Transcribed Expressive Piano Performance}, year = {2022}, abstract = {Computational models of expressive piano performance rely on attributes like tempo, timing, dynamics, and pedaling. Despite some promising models for performance assessment and rendering, results are limited by the scale, breadth, and uniformity of existing datasets. In this paper, we present ATEPP, a dataset containing 1000 hours of performances of standard piano repertoire by 49 world-renowned pianists, organized and aligned by compositions and movements for comparative studies. Scores in MusicXML format are also available for around half of the tracks. We first evaluate and verify the use of transcribed MIDI for representing expressive performance with a listening evaluation involving recent transcription models. Then, the process of sourcing and curating the dataset is outlined, including composition entity resolution and a pipeline for audio matching and solo filtering. Finally, we conduct baseline experiments for performer identification and performance rendering on our datasets, demonstrating its potential in generalizing expressive features of individual performing styles.}, doi = {10.5281/zenodo.7676768}, keywords = {Expressive Piano Performance, Transcription, Performer Identification, Music Datasets}, url = {https://archives.ismir.net/ismir2022/paper/000053.pdf}, } @conference{walwadkar2022compldnet, author = {Walwadkar, D. and Shatri, E. and Fazekas, G.}, booktitle = {4th International Workshop on Reading Music Systems (WoRMS), November 18, Online}, title = {CompldNet: Sheet Music Composer Identification using Deep Neural Network}, year = {2022}, abstract = {There have been significant breakthroughs in computer vision research in many subfields, including composer identification from images of sheet music. Previous work in composer identification depends on a specific digital semantic representation of music and various evaluation criteria, making it difficult to quantify their relative merits. We present a novel approach using an end-to-end deep neural network model for music composer identification with images of sheet music as inputs. Hence, this method is not dependent on the conversion of the sheet music to any other intermediate digital semantic format. Additionally, we compare results from classification applied to sheet music and the respective bootleg representation. Identifying the composer can lead to more inferred data, which is helpful in archiving historical pieces digitally. Based on our experimental results, it can be concluded that the composer identification in sheet music images with deep neural models shows promising results. With the proposed model, we achieved 83\% accuracy for composer identification on sheet music images compared to 76\% accuracy when applied to the bootleg representations on our newly collected dataset.}, doi = {10.48550/arXiv.2211.13285}, keywords = {Composer Identification, Deep Neural Network, Sheet Music Analysis}, Url = {http://www.semanticaudio.net/files/papers/walwadkar2022compldnet.pdf}, } @conference{zhao2022transfer, author = {Zhao, Y. and Fazekas, G. 
and Sandler, M.}, booktitle = {30th European Signal Processing Conference (EUSIPCO), August 29 -- September 2, Belgrade, Serbia}, title = {Transfer Learning for Violinist Identification}, year = {2022}, abstract = {Music performer identification is important for music recommendation, music expression analysis and playlist generation. In previous research, audio feature learning methods were commonly used for both singer identification (SID) and instrument player identification (IPID) with good results. In the current deep learning era, SID results are greatly improved using neural networks, however, instrument player identification is rarely investigated in recent works primarily due to the shortage of open-access datasets. To solve this problem, we construct a concerto violin dataset as well as a solo dataset, and present a transfer learning approach for violinist identification from pre-trained music auto-tagging neural networks and singer identification models. We then transfer pre-trained weights and fine-tune the models using violin datasets and finally obtain violinist identification results. We compare our system with a number of state-of-the-art methods and show that our model outperforms them using both of our datasets.}, doi = {10.23919/eusipco55093.2022.9909590}, keywords = {Transfer Learning, Violinist Identification, Music Performance Analysis}, Url = {https://doi.org/10.23919/eusipco55093.2022.9909590}, } @article{hayes2022jaes, Author = {Hayes, B. and Saitis, C. and Fazekas, G.}, Title = {Disembodied Timbres: A Study on Semantically Prompted FM Synthesis}, Journal = {Journal of the Audio Engineering Society (JAES)}, Volume = {70}, Number = {5}, Pages = {373--391}, Abstract = {Disembodied electronic sounds constitute a large part of the modern auditory lexicon, but research into timbre perception has focused mostly on the tones of conventional acoustic musical instruments. It is unclear whether insights from these studies generalize to electronic sounds, nor is it obvious how these relate to the creation of such sounds. This work presents an experiment on the semantic associations of sounds produced by FM synthesis with the aim of identifying whether existing models of timbre semantics are appropriate for such sounds. A novel experimental paradigm, in which experienced sound designers responded to semantic prompts by programming a synthesizer, was applied, and semantic ratings on the sounds they created were provided. Exploratory factor analysis revealed a five-dimensional semantic space. The first two factors mapped well to the concepts of luminance, texture, and mass. The remaining three factors did not have clear parallels, but correlation analysis with acoustic descriptors suggested an acoustical relationship to luminance and texture. The results suggest that further inquiry into the timbres of disembodied electronic sounds, their synthesis, and their semantic associations would be worthwhile and that this could benefit research into auditory perception and cognition and synthesis control and audio engineering.}, doi = {10.17743/jaes.2022.0006}, Publisher-Url = {https://aes2.org/publications/elibrary-page/?id=21740}, Preprint-Url = {https://doi.org/10.31234/osf.io/ksw5j}, Url = {http://www.semanticaudio.net/files/papers/hayes2022jaes.pdf}, Keywords = {Timbre, FM synthesis, Audio Engineering, Semantic Perception}, Year = {2022}, } @conference{lobbers2022seeingsoundsketches, author = {Löbbers, S. 
and Fazekas, G.}, booktitle = {International Computer Music Conference (ICMC), July 3-9, Limerick, Ireland}, title = {Seeing Sounds, Hearing Shapes: A Gamified Study to Evaluate Sound-Sketches}, year = {2022}, abstract = {Sound-shape associations, a subset of cross-modal associations between the auditory and visual domain, have been studied mainly in the context of matching a set of purposefully crafted shapes to sounds. Recent studies have explored how humans represent sound through free-form sketching and how a graphical sketch input could be used for sound production. In this paper, the potential of communicating sound characteristics through these free-form sketches is investigated in a gamified study that was conducted with eighty-two participants at two online exhibition events. The results show that participants managed to recognise sounds at a higher rate than the random baseline would suggest; however, it appeared difficult to visually encode nuanced timbral differences.}, doi = {10.48550/arXiv.2205.08866}, keywords = {sound-shape associations, cross-modal, free-form sketching, sound representation, gamified study}, url = {https://arxiv.org/abs/2205.08866}, } @article{proutskova2022websemantics, author = {Proutskova, P. and Wolff, D. and Fazekas, G. and Frieler, K. and Höger, F. and Velichkina, O. and Solis, G. and Weyde, T. and Pfleiderer, M. and Crayencour, H. C. and Peeters, G. and Dixon, S.}, title = {The Jazz Ontology: A semantic model and large-scale RDF repositories for jazz}, journal = {Journal of Web Semantics}, volume = {72}, number = {100735}, pages = {1--16}, Abstract = {Jazz is a musical tradition that is just over 100 years old; unlike in other Western musical traditions, improvisation plays a central role in jazz. Modelling the domain of jazz poses some ontological challenges due to specificities in musical content and performance practice, such as band lineup fluidity and importance of short melodic patterns for improvisation. This paper presents the Jazz Ontology – a semantic model that addresses these challenges. Additionally, the model also describes workflows for annotating recordings with melody transcriptions and for pattern search. The Jazz Ontology incorporates existing standards and ontologies such as FRBR and the Music Ontology. The ontology has been assessed by examining how well it supports describing and merging existing datasets and whether it facilitates novel discoveries in a music browsing application. The utility of the ontology is also demonstrated in a novel framework for managing jazz related music information. This involves the population of the Jazz Ontology with the metadata from large scale audio and bibliographic corpora (the Jazz Encyclopedia and the Jazz Discography). The resulting RDF datasets were merged and linked to existing Linked Open Data resources. These datasets are publicly available and are driving an online application that is being used by jazz researchers and music lovers for the systematic study of jazz.}, doi = {10.1016/j.websem.2022.100735}, publisher-url = {https://www.journals.elsevier.com/journal-of-web-semantics}, % Publisher's URL url = {https://doi.org/10.1016/j.websem.2022.100735}, % Direct URL to paper keywords = {Jazz, Ontology, RDF, Semantic Web, Music Information Retrieval}, year = {2022}, } @conference{manco2022learningsupervision, author = {Manco, I. and Benetos, E. and Quinton, E.
and Fazekas, G.}, booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), May 22-27, Singapore}, title = {Learning music audio representations via weak language supervision}, year = {2022}, abstract = {Audio representations for music information retrieval are typically learned via supervised learning in a task-specific fashion. Although effective at producing state-of-the-art results, this scheme lacks flexibility with respect to the range of applications a model can have and requires extensively annotated datasets. In this work, we pose the question of whether it may be possible to exploit weakly aligned text as the only supervisory signal to learn general-purpose music audio representations. To address this question, we design a multimodal architecture for music and language pre-training (MuLaP) optimised via a set of proxy tasks. Weak supervision is provided in the form of noisy natural language descriptions conveying the overall musical content of the track. After pre-training, we transfer the audio backbone of the model to a set of music audio classification and regression tasks. We demonstrate the usefulness of our approach by comparing the performance of audio representations produced by the same audio backbone with different training strategies and show that our pre-training method consistently achieves comparable or higher scores on all tasks and datasets considered. Our experiments also confirm that MuLaP effectively leverages audio-caption pairs to learn representations that are competitive with audio-only and cross-modal self-supervised methods in the literature.}, doi = {10.1109/ICASSP43922.2022.9746996}, keywords = {audio and language, audio representations, multimodal learning, music information retrieval}, url = {https://arxiv.org/abs/2112.04214}, } @conference{muradeli2022differentiable, author = {Muradeli, J. and Vahidi, C. and Wang, C. and Han, H. and Lostanlen, V. and Lagrange, M. and Fazekas, G.}, booktitle = {Proc. of the 25th International Conference on Digital Audio Effects (DAFx20in22), Vienna, Austria, September 6-10}, title = {Differentiable Time-Frequency Scattering on GPU [Best paper award]}, year = {2022}, abstract = {Joint time-frequency scattering (JTFS) is a convolutional operator in the time-frequency domain which extracts spectrotemporal modulations at various rates and scales. It offers an idealized model of spectrotemporal receptive fields (STRF) in the primary auditory cortex, and thus may serve as a biologically plausible surrogate for human perceptual judgments at the scale of isolated audio events. Yet, prior implementations of JTFS and STRF have remained outside of the standard toolkit of perceptual similarity measures and evaluation methods for audio generation. We trace this issue down to three limitations: differentiability, speed, and flexibility. In this paper, we present an implementation of time-frequency scattering in Python. Unlike prior implementations, ours accommodates NumPy, PyTorch, and TensorFlow as backends and is thus portable on both CPU and GPU.
We demonstrate the usefulness of JTFS via three applications: unsupervised manifold learning of spectrotemporal modulations, supervised classification of musical instruments, and texture resynthesis of bioacoustic sounds.}, doi = {10.48550/arXiv.2204.08269}, keywords = {time-frequency scattering, GPU acceleration, machine learning, signal processing}, url = {https://qmro.qmul.ac.uk/xmlui/handle/123456789/84235}, } @conference{zhao2022violinistdistributions, author = {Zhao, Y. and Fazekas, G. and Sandler, M.}, booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), May 22-27, Singapore}, title = {Violinist Identification Using Note-Level Timbre Feature Distributions}, year = {2022}, abstract = {Modelling musical performers' individual playing styles based on audio features is important for music education, music expression analysis and music generation. In violin performance, the perception of playing styles is mainly affected by the characteristic musical timbre, which is mostly determined by performers, instruments and recording conditions. To verify if timbre features can describe a performer's style adequately, we examine a violinist identification method based on note-level timbre feature distributions. We first apply it using solo datasets to recognise professional violinists, then use it to identify master players from commercial concerto recordings. The results show that the designed features and method work very well for both datasets. The identification accuracies with the solo dataset using MFCCs and spectral contrast features are 0.94 and 0.91, respectively. Significantly lower but promising results are reported with the concerto dataset. Results suggest that the selected timbre features can model performers' individual playing reasonably objectively, regardless of the instrument they play.}, doi = {10.1109/ICASSP43922.2022.9747606}, keywords = {timbre, performer identification, music information retrieval, violin, machine learning}, url = {https://ieeexplore.ieee.org/document/9747606} } @article{turchet2022websemantics, author = {Turchet, L. and Bouquet, P. and Molinari, A. and Fazekas, G.}, title = {The Smart Musical Instruments Ontology}, journal = {Journal of Web Semantics}, volume = {72}, number = {1}, pages = {1--14}, year = {2022}, abstract = {The Smart Musical Instruments (SMIs) are an emerging category of musical instruments that belongs to the wider class of Musical Things within the Internet of Musical Things paradigm. SMIs encompass sensors, actuators, embedded intelligence, and wireless connectivity to local networks and to the Internet. Interoperability represents a key issue within this domain, where heterogeneous SMIs are envisioned to exchange information between each other and a plethora of Musical Things. This paper proposes an ontology for the representation of the knowledge related to SMIs, with the aim of facilitating interoperability between SMIs as well as with other Musical Things interacting with them. There was no previous comprehensive data model for the SMIs domain; however, the new ontology relates to existing ontologies, including the SOSA Ontology for the representation of sensors and actuators, the Audio Effects Ontology dealing with the description of digital audio effects, and the IoMusT Ontology for the representation of Musical Things and IoMusT ecosystems.
This paper documents the design of the ontology and its evaluation with respect to specific requirements gathered from an extensive literature review, which was based on scenarios involving SMIs stakeholders, such as performers and studio producers.}, doi = {10.1016/j.websem.2021.100687}, publisher-url = {https://doi.org/10.1016/j.websem.2021.100687}, url = {files/papers/turchet2022smart.pdf}, keywords = {smart musical instruments, ontology, semantic web, music technology} } @conference{hayes2022timbrefunvocabulary, author = {Hayes, B. and Saitis, C. and Fazekas, G.}, booktitle = {Proc. of the 24th International Congress on Acoustics, October 24--28, Gyeongju, Korea}, title = {timbre.fun: A gamified interactive system for crowdsourcing a timbre semantic vocabulary}, year = {2022}, abstract = {We present timbre.fun (https://timbre.fun/), a web-based gamified interactive system where users create sounds in response to semantic prompts (e.g., bright, rough) through exploring a two-dimensional control space that maps nonlinearly to the parameters of a simple hybrid wavetable and amplitude-modulation synthesizer. The current version features 25 semantic adjectives mined from a popular synthesis forum. As well as creating sounds, users can explore heatmaps generated from others' responses, and fit a classifier (k-nearest neighbors) in-browser. timbre.fun is based on recent work, including by the authors, which studied timbre semantic associations through prompted synthesis paradigms. The interactive is embedded in a digital exhibition on sensory variation and interaction (https://seeingmusic.app/) which debuted at the 2021 Edinburgh Science Festival, where it was visited by 197 users from 21 countries over 16 days. As it continues running online, a further 596 visitors from 35 countries have engaged. To date 579 sounds have been created and tagged, which will facilitate parallel research in timbre semantics and neural audio synthesis. Future work will include further gamifying the data collection pipeline, including “leveling-up” to unlock new words and synthesizers, and a full open-source release.}, keywords = {timbre, gamification, semantic vocabulary, audio synthesis, crowdsourcing}, url = {https://benhayes.net/assets/pdf/timbre_fun_ica2022.pdf} } @conference{bromham2022measuringcompressor, author = {Bromham, G. and Moffat, D. and Sheng, D. and Fazekas, G.}, booktitle = {153rd Audio Engineering Society Convention, October 18--20, New York, USA}, title = {Measuring Audibility Threshold Levels for Attack and Release in a Dynamic Range Compressor}, year = {2022}, abstract = {Dynamic range compressors are one of the most ubiquitous audio effects used in music production and mixing contexts. The ballistics settings of attack and release time in a dynamic range compressor (DRC) are considered highly important for influencing the perceived response of the compressor, and the manner in which they shape an audio signal. The ability to perceive the effect of compression ballistics is important to better understand how these tools can be best utilised in music production. We present an audibility threshold test, in a web-based format, using an ABX test methodology, where participants were asked to identify which of the presented audio examples they perceived as different to a reference file. DRC attack and release settings were varied and applied to a set of audio examples.
The results demonstrate that larger ranges of change in the ballistics settings are more noticeable than small changes. The test also demonstrated that the perception of changes in ballistics settings varies when compressor threshold settings were changed. Lighter levels of compression with a higher threshold generally yielded more noticeable results, especially where the effects of large changes in ballistics settings were measured. Finally, it was observed that participants with above average or expert levels of experience (4 years or more) were more likely to perceive small differences in attack and release times than those with less developed critical listening skills.}, keywords = {dynamic range compression, attack time, release time, audio engineering, critical listening}, url = {https://secure.aes.org/forum/pubs/conventions/?elib=21958} } @conference{li2022howmusic, author = {Li, Y. and Li, S. and Fazekas, G.}, booktitle = {23rd International Society for Music Information Retrieval Conference (ISMIR), December 4-8, Bengaluru, India}, title = {How Music Features and Musical Data Representations Affect Objective Evaluation of Music Composition: A Review of the CSMT Data Challenge 2020}, year = {2022}, abstract = {Tools and methodologies for distinguishing computer-generated melodies from human-composed melodies have a broad range of applications from detecting copyright infringement through the evaluation of generative music systems to facilitating transparent and explainable AI. This paper reviews a data challenge on distinguishing computer-generated melodies from human-composed melodies held in association with the Conference on Sound and Music Technology (CSMT) in 2020. An investigation of the submitted systems and the results is presented first. Besides the structure of the proposed models, the paper investigates two important factors that were identified as contributors to good model performance: the specific music features and the music representation used. Through an analysis of the submissions, important melody-related music features have been identified. Encoding or representation of the music in the context of neural network models is found to noticeably impact system performance through an experiment where the top-ranked system was re-implemented with different input representations for comparison purposes. Besides demonstrating the feasibility of developing an objective music composition evaluation system, the investigation presented in this paper also reveals some important limitations of current music composition systems, opening opportunities for future work in the community.}, doi = {10.5281/zenodo.7316603}, keywords = {music features, music data representations, music composition, machine learning, CSMT Data Challenge 2020}, Url = {https://archives.ismir.net/ismir2022/paper/000010.pdf} } @conference{oconnor2021zeroshot, author = {O'Connor, B. and Fazekas, G. and Dixon, S.}, booktitle = {Computer Music Multidisciplinary Research, November 13--17, Tokyo, Japan}, title = {Zero-shot Singing Technique Conversion}, year = {2021}, abstract = {In this paper we propose modifications to the neural network framework AutoVC for the task of singing technique conversion. This includes utilising a pretrained singing technique encoder which extracts technique information, upon which a decoder is conditioned during training.
By swapping out a source singer’s technique information for that of the target’s during conversion, the input spectrogram is reconstructed with the target’s technique. We document the beneficial effects of omitting the latent loss, the importance of sequential training, and our process for fine-tuning the bottleneck. We also conducted a listening study where participants rate the specificity of technique-converted voices as well as their naturalness. From this we are able to conclude how effective the technique conversions are and how different conditions affect them, while assessing the model’s ability to reconstruct its input data.}, doi = {10.48550/arXiv.2111.08839}, keywords = {voice synthesis, singing synthesis, style transfer, neural network, singing technique, timbre conversion, conditional autoencoder, sequential training, latent loss}, Url = {https://arxiv.org/abs/2111.08839} } @conference{hayes2021neuralsynthesis, author = {Hayes, B. and Saitis, C. and Fazekas, G.}, booktitle = {22nd International Society for Music Information Retrieval (ISMIR), November 7--12, Online}, title = {Neural Waveshaping Synthesis}, year = {2021}, abstract = {We present the Neural Waveshaping Unit (NEWT): a novel, lightweight, fully causal approach to neural audio synthesis which operates directly in the waveform domain, with an accompanying optimisation (FastNEWT) for efficient CPU inference. The NEWT uses time-distributed multilayer perceptrons with periodic activations to implicitly learn nonlinear transfer functions that encode the characteristics of a target timbre. Once trained, a NEWT can produce complex timbral evolutions by simple affine transformations of its input and output signals. We paired the NEWT with a differentiable noise synthesiser and reverb and found it capable of generating realistic musical instrument performances with only 260k total model parameters, conditioned on F0 and loudness features. We compared our method to state-of-the-art benchmarks with a multi-stimulus listening test and the Fréchet Audio Distance and found it performed competitively across the tested timbral domains. Our method significantly outperformed the benchmarks in terms of generation speed, and achieved real-time performance on a consumer CPU, both with and without FastNEWT, suggesting it is a viable basis for future creative sound design tools.}, doi = {10.48550/arXiv.2107.05050}, keywords = {neural audio synthesis, signal processing, deep learning, machine learning, audio synthesis}, Url = {https://arxiv.org/abs/2107.05050} } @conference{zhao2021violinist, author = {Zhao, Y. and Wang, C. and Fazekas, G. and Benetos, E. and Sandler, M.}, booktitle = {29th European Signal Processing Conference (EUSIPCO), August 23-27, Dublin, Ireland}, title = {Violinist identification based on vibrato features}, year = {2021}, abstract = {Identifying performers from polyphonic music is a challenging task in music information retrieval. As a ubiquitous expressive element in violin music, vibrato contains important information about the performers' interpretation. This paper proposes to use vibrato features for identifying violinists from commercial orchestral recordings. We present and compare two systems, which take the same note-level melodies as input while using different vibrato feature extractors and classification schemes. One system calculates vibrato features according to vibrato definition, models the feature distribution using histograms, and classifies performers based on the distribution similarity. 
The other system uses the adaptive wavelet scattering which contains vibrato information and identifies violinists with a machine learning classifier. We report accuracy improvements of 19.8\% and 17.8\%, respectively, over a random baseline on piece-level evaluation. This suggests that vibrato notes in polyphonic music are useful for master violinist identification.}, keywords = {Violinist identification, Vibrato features, Music information retrieval}, Url = {https://arxiv.org/abs/2108.00000} } @conference{rafee2021performer, author = {Rafee, S. R. M. and Fazekas, G. and Wiggins, G. A.}, booktitle = {International Computer Music Conference (ICMC), July 25-31 (Virtual), Santiago de Chile, Chile}, title = {Performer Identification From Symbolic Representation of Music Using Statistical Models}, year = {2021}, abstract = {Music performers have their own idiosyncratic way of interpreting a musical piece. A group of skilled performers playing the same piece of music would likely inject their unique artistic styles into their performances. The variations of the tempo, timing, dynamics, articulation etc. from the actual notated music are what make the performers unique in their performances. This study presents a dataset consisting of four movements of Schubert's "Sonata in B-flat major, D.960" performed by nine virtuoso pianists individually. We proposed and extracted a set of expressive features that are able to capture the characteristics of an individual performer's style. We then present a performer identification method based on the similarity of feature distribution, given a set of piano performances. The identification is done considering each feature individually as well as a fusion of the features. Results show that the proposed method achieved a precision of 0.903 using fusion features. Moreover, the onset time deviation feature shows a promising result when considered individually.}, doi = {10.48550/arxiv.2108.02576}, keywords = {Performer identification, Symbolic music representation, Statistical models}, Url = {https://arxiv.org/abs/2108.02576} } @conference{lobbers2021sketching, author = {Löbbers, S. and Barthet, M. and Fazekas, G.}, booktitle = {International Computer Music Conference (ICMC), July 25-31, (Virtual), Santiago de Chile, Chile}, title = {Sketching sounds: an exploratory study on sound-shape associations}, year = {2021}, abstract = {Sound synthesiser controls typically correspond to technical parameters of signal processing algorithms rather than intuitive sound descriptors that relate to human perception of sound. This makes it difficult to realise sound ideas in a straightforward way. Cross-modal mappings, for example between gestures and sound, have been suggested as a more intuitive control mechanism. A large body of research shows consistency in human associations between sounds and shapes. However, the use of drawings to drive sound synthesis has not been explored to its full extent. This paper presents an exploratory study that asked participants to sketch visual imagery of sounds with a monochromatic digital drawing interface, with the aim to identify different representational approaches and determine whether timbral sound characteristics can be communicated reliably through visual sketches.
Results imply that the development of a synthesiser exploiting sound-shape associations is feasible, but a larger and more focused dataset is needed in followup studies.}, keywords = {Timbre perception, Cross-modal associations, Sound-shape associations}, Url = {https://arxiv.org/abs/2107.07360} } @conference{shatri2021doremi, author = {Shatri, E. and Fazekas, G.}, booktitle = {3rd International Workshop on Reading Music Systems (WORMS), July 23, Alicante, Spain}, title = {DoReMi: First glance at a universal OMR dataset}, year = {2021}, abstract = {The main challenges of Optical Music Recognition (OMR) come from the nature of written music, its complexity and the difficulty of finding an appropriate data representation. This paper provides a first look at DoReMi, an OMR dataset that addresses these challenges, and a baseline object detection model to assess its utility. Researchers often approach OMR following a set of small stages, given that existing data often do not satisfy broader research. We examine the possibility of changing this tendency by presenting more metadata. Our approach complements existing research; hence DoReMi allows harmonisation with two existing datasets, DeepScores and MUSCIMA++. DoReMi was generated using a music notation software and includes over 6400 printed sheet music images with accompanying metadata useful in OMR research. Our dataset provides OMR metadata, MIDI, MEI, MusicXML and PNG files, each aiding a different stage of OMR. We obtain 64% mean average precision (mAP) in object detection using half of the data. Further work includes re-iterating through the creation process to satisfy custom OMR models. While we do not assume to have solved the main challenges in OMR, this dataset opens a new course of discussions that would ultimately aid that goal.}, doi = {10.48550/arXiv.2107.07786}, keywords = {Optical Music Recognition, Dataset, Object detection}, Url = {https://arxiv.org/abs/2107.07786} } @conference{manco2021muscaps, author = {Manco, I. and Benetos, E. and Quinton, E. and Fazekas, G.}, booktitle = {IEEE International Joint Conference on Neural Networks (IJCNN), July 18-22, Virtual}, title = {MusCaps: generating captions for music audio}, year = {2021}, abstract = {Content-based music information retrieval has seen rapid progress with the adoption of deep learning. Current approaches to high-level music description typically make use of classification models, such as in auto tagging or genre and mood classification. In this work, we propose to address music description via audio captioning, defined as the task of generating a natural language description of music audio content in a human-like manner. To this end, we present the first music audio captioning model, MusCaps, consisting of an encoder-decoder with temporal attention. Our method combines convolutional and recurrent neural network architectures to jointly process audio-text inputs through a multimodal encoder and leverages pre-training on audio data to obtain representations that effectively capture and summarise musical features in the input. Evaluation of the generated captions through automatic metrics shows that our method outperforms a baseline designed for non-music audio captioning. Through an ablation study, we unveil that this performance boost can be mainly attributed to pre-training of the audio encoder, while other design choices – modality fusion, decoding strategy and the use of attention -- contribute only marginally. 
Our model represents a shift away from classification-based music description and combines tasks requiring both auditory and linguistic understanding to bridge the semantic gap in music information retrieval.}, doi = {10.1109/IJCNN52387.2021.9533461}, keywords = {Music audio captioning, Deep learning, Multimodal modeling}, Url = {https://arxiv.org/abs/2104.11984} } @conference{vahidi2021modulation, author = {Vahidi, C. and Fazekas, G. and Saitis, C.}, booktitle = {IEEE International Joint Conference on Neural Networks (IJCNN), July 18-22, Virtual}, title = {A Modulation Front-End for Music Audio Tagging}, year = {2021}, abstract = {Convolutional Neural Networks have been extensively explored in the task of automatic music tagging. The problem can be approached by using either engineered time-frequency features or raw audio as input. Modulation filter bank representations that have been actively researched as a basis for timbre perception have the potential to facilitate the extraction of perceptually salient features. We explore end-to-end learned front-ends for audio representation learning, ModNet and SincModNet, that incorporate a temporal modulation processing block. The structure is effectively analogous to a modulation filter bank, where the FIR filter center frequencies are learned in a data-driven manner. The expectation is that a perceptually motivated filter bank can provide a useful representation for identifying music features. Our experimental results provide a fully visualisable and interpretable front-end temporal modulation decomposition of raw audio. We evaluate the performance of our model against the state-of-the-art of music tagging on the MagnaTagATune dataset. We analyse the impact on performance for particular tags when time-frequency bands are subsampled by the modulation filters at a progressively reduced rate. We demonstrate that modulation filtering provides promising results for music tagging and feature representation, without using extensive musical domain knowledge in the design of this front-end.}, doi = {10.1109/IJCNN52387.2021.9533547}, keywords = {Music audio tagging, Modulation filter bank, Convolutional Neural Networks}, Url = {https://arxiv.org/abs/2105.11836} } @article{singh2021jaes, Author = {Singh, S. and Bromham, G. and Sheng, D. and Fazekas, G.}, Title = {Intelligent Control Method for the Dynamic Range Compressor: A User Study}, Journal = {Journal of the Audio Engineering Society}, Volume = {69}, Number = {7/8}, Pages = {576-585}, Abstract = {Music producers and casual users often seek to replicate dynamic range compression used in a particular recording or production context for their own track. However, not knowing the parameter settings used to produce the audio using the effect may become an impediment, especially for beginners or untrained users who may lack critical listening skills. We address this issue by presenting an automatic compressor plugin relying on a neural network to extract relevant features from a reference signal and estimate compression parameters. The plugin automatically adjusts its parameters to match the input signal with a reference audio recording as closely as possible. Quantitative and qualitative usability evaluation of the plugin was conducted with amateur, pro-amateur and professional music producers. 
The results established acceptance of the core idea behind the proposed control method across these user groups.}, doi = {10.17743/jaes.2021.0028}, Publisher-Url = {https://secure.aes.org/forum/pubs/journal/?elib=21125}, Url = {https://qmro.qmul.ac.uk/xmlui/bitstream/handle/123456789/72391/Singh%20Intelligent%20control%20method%202021%20Accepted.pdf}, Keywords = {Dynamic range compression, Automatic mixing, User study, Intelligent music production}, Year = {2021}, } @article{gabrielli2021applsci, Author = {Gabrielli, L. and Fazekas, G. and Nam, J.}, Title = {Special Issue on Deep Learning for Applications in Acoustics: Modeling, Synthesis, and Listening}, Journal = {Applied Sciences}, Volume = {11}, Number = {2}, Pages = {473}, Abstract = {The recent introduction of Deep Learning has led to a vast array of breakthroughs in many fields of science and engineering. The data-driven approach has gathered the attention of research communities and has often been successful in yielding solutions to very complex classification and regression problems. In the fields of audio analysis, processing and acoustic modelling, Deep Learning has been adopted, initially borrowing its methods from the image processing and computer vision field, and then finding creative and innovative solutions to suit the domain-specific needs of acoustic research. In this process, researchers are facing two big challenges: learning meaningful spatio-temporal representations of audio signals and making sense of the black-box model of neural networks, i.e. extracting knowledge that is useful for scientific advance.}, doi = {10.3390/app11020473}, Publisher-Url = {https://www.mdpi.com/journal/applsci}, Url = {https://www.mdpi.com/2076-3417/11/2/473}, Keywords = {Deep Learning, Acoustics, Audio analysis, Signal processing}, Year = {2021}, } @conference{li2021csmt, Author = {Li, S. and Jing, Y. and Fazekas, G.}, Title = {A Novel Dataset for the Identification of Computer Generated Melodies in the CSMT Challenge}, Booktitle = {Proceedings of the 8th Conference on Sound and Music Technology}, Series = {Lecture Notes in Electrical Engineering}, Volume = {761}, Pages = {177--186}, Year = {2021}, Publisher = {Springer Nature}, Abstract = {In this paper, the dataset used for the data challenge organised by the Conference on Sound and Music Technology (CSMT) is introduced. The CSMT data challenge requires participants to identify whether a given piece of melody is generated by a computer or composed by a human. The dataset is formed of two parts: a development dataset and an evaluation dataset. The development dataset contains only computer-generated melodies, whereas the evaluation dataset contains both computer-generated and human-composed melodies. The aim of the dataset is to examine whether it is possible to distinguish computer-generated melodies by learning the features of generated melodies.}, doi = {10.1007/978-981-16-1649-5_15}, Keywords = {Computer generated melodies, Music dataset, CSMT challenge}, Url = {https://arxiv.org/abs/2012.03646} } @conference{fazekas2021isic, author = {Fazekas, G.}, booktitle = {International Semantic Intelligence Conference, February 25–27, New Delhi, India}, title = {Ontology based Machine Learning in Semantic Audio Applications [Keynote Abstract]}, year = {2021}, abstract = {Semantic Audio aims to associate audio and music content with meaningful labels and descriptions.
It is an emerging technological and research field at the confluence of signal processing, machine learning, including deep learning, and formal knowledge representation. Semantic Audio can facilitate the detection of acoustic events in complex environments, the recognition of beat, tempo, chords or keys in music recordings, or the creation of smart ecosystems and environments, for instance, to enhance audience and performer interaction. Semantic Audio can bring together creators, distributors and consumers in the music value chain in intuitive new ways. Ontologies play a crucial role in enabling complex Semantic Audio applications by providing shared conceptual models that enable combining different data sources and heterogeneous services using Semantic Web technologies. The benefit of using these techniques has been demonstrated in several large projects recently, including Audio Commons, an ecosystem built around Creative Commons audio content. In this talk, I will first outline fundamental principles in Semantic Audio analysis and introduce important concepts in representing audio and music data. Specific demonstrators will be discussed in the areas of smart audio content ecosystems, music recommendation, intelligent audio production and the application of IoT principles in musical interaction. I will discuss how machine learning and the use of ontologies in tandem benefit specific applications, and talk about challenges in fusing audio and semantic technologies as well as the opportunities they call forth.}, keywords = {Machine Learning, Ontology, Semantic Audio}, Url = {https://ceur-ws.org/Vol-2786/Paper1.pdf} } @conference{thompson2021wac, Author = {Thompson, A. and Fazekas, G. and Wiggins, G.}, booktitle = {Proceedings of the Web Audio Conference (WAC), July 5--7, Barcelona, Spain}, title = {A Time-Travel Debugger for Web Audio Applications}, year = {2021}, abstract = {Developing real-time audio applications, particularly those with an element of user interaction, can be a difficult task. When things go wrong, it can be challenging to locate the source of a problem when many parts of the program are connected and interacting with one another in real-time. We present a time-travel debugger for the Flow Web Audio framework that allows developers to record a session interacting with their program, play back that session with the original timing still intact, and step through individual events to inspect the program state at any point in time. In contrast to the browser's native debugging features, audio processing remains active while the time-travel debugger is enabled, allowing developers to listen out for audio bugs or unexpected behaviour. We describe three example use-cases for such a debugger. The first is error reproduction using the debugger's JSON import/export capabilities to ensure the developer can replicate problematic sessions. The second is using the debugger as an exploratory aid instead of a tool for error finding. Finally, we consider opportunities for the debugger's technology to be used by end-users as a means of recording, sharing, and remixing ideas.
We conclude with some options for future development, including expanding the debugger's program state inspector to allow for in situ data updates, visualisation of the current audio graph similar to existing Web Audio inspectors, and possible methods of evaluating the debugger's effectiveness in the scenarios described.}, keywords = {Time-travel debugging, Reverse debugging, Web Audio API, Declarative programming, Exploratory programming, Interactive audio applications}, Url = {http://www.semanticaudio.net/files/papers/thompson2021wac.pdf}, } @article{turchet2021aes, Author = {Turchet, L. and Fazekas, G. and Rottondi, C. and Fischione, C.}, Title = {Guest Editors' Note: Special Issue on the Internet of Sounds}, Journal = {Journal of the Audio Engineering Society}, Volume = {69}, Number = {10}, Pages = {912--913}, Abstract = {Current sound-based practices and systems point to convergent research trends that bring together the field of Sound and Music Computing with that of the Internet of Things (IoT). These endeavors are spurring the emergence of the Internet of Sounds (IoS) research area. The IoS relates to the network of devices capable of sensing, acquiring, processing, actuating, and exchanging data serving the purpose of communicating sound-related information. The IoS can be seen as the union of two paradigms: the Internet of Musical Things and the Internet of Audio Things, which respectively address musical and non-musical domains in networked contexts. The IoS area has increasingly attracted the attention of researchers in both industrial and academic contexts. This motivated researchers to initiate an annual gathering dedicated to the IoS, the 'International Workshop on the Internet of Sounds' (IWIS), and us to organize this special issue, which was welcomed by the Journal of the Audio Engineering Society (JAES). Some of the articles contained in this issue are extensions of the contributions submitted to IWIS.}, Year = {2021}, Keywords = {Internet of Sounds, Guest editorial, Special issue}, Url = {https://www.aes.org/journal/online/JAES_V69/10/JAES_V69_10_PG706.pdf}, } @conference{lobbers2021icmpc, Author = {Löbbers, S. and Fazekas, G.}, Title = {Representation of musical timbre through visual sketching [Poster]}, Booktitle = {16th International Conference on Music Perception and Cognition (ICMPC-ESCOM), July 28-31, Sheffield, UK}, Abstract = {Cross-modal associations can be a helpful means to communicate timbre in a musical context. Research shows that there is a general consensus between humans on how to map shapes to timbre when presented with a set of visual stimuli. However, little research has been conducted on how humans represent timbre through their own drawings.}, Year = {2021}, Keywords = {Musical timbre, Visual sketching, Music perception}, Url = {https://sebastianlobbers.com/static/8aca3ab6a6f65bce975226b4c8944a86/Lobbers530.jpeg} } %%----------------------------------------------------------------------------------------------------------------------- @conference{Oconnor2020mume, Abstract = {Sixty participants provided dissimilarity ratings between various singing techniques. Multidimensional scaling, class averaging and clustering techniques were used to analyse timbral spaces and how they change between different singers, genders and registers.
Clustering analysis showed that ground-truth similarity and silhouette scores were not significantly different between gender or register conditions, while similarity scores were positively correlated with participants' instrumental abilities and task comprehension. Participant feedback showed how a revised study design might mitigate noise in our data, leading to more detailed statistical results. Timbre maps and class distance analysis showed us which singing techniques remained similar to one another across gender and register conditions. This research provides insight into how the timbre space of singing changes under different conditions, highlights the subjectivity of perception between participants, and provides generalised timbre maps for regularisation in machine learning.}, Author = {O'Connor, B. and Dixon, S. and Fazekas, G.}, Booktitle = {1st Joint Conference on AI Music Creativity (AIMC), October 19-23, Stockholm, Sweden}, Date-Added = {2020-12-27 14:07:41 +0000}, Date-Modified = {2020-12-27 14:16:23 +0000}, Doi = {10.5281/zenodo.4285404}, Keywords = {music perception, music informatics, singing voice, neural audio synthesis}, Publisher-Url = {https://boblsturm.github.io/aimusic2020/papers/CSMC__MuMe_2020_paper_38.pdf}, Title = {An Exploratory Study on Perceptual Spaces of the Singing Voice}, Url = {http://www.semanticaudio.net/files/papers/oconnor2020mume.pdf}, Year = {2020}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/oconnor2020mume.pdf}, Bdsk-Url-2 = {https://doi.org/10.5281/zenodo.4285404}} @conference{Proutskova2020ismir, Abstract = {This paper presents exploratory work investigating the suitability of the Music Ontology - the most widely used formal specification of the music domain - for modelling non-Western musical traditions. Four contrasting case studies from a variety of musical cultures are analysed: Dutch folk song research, reconstructive performance of rural Russian traditions, contemporary performance and composition of Persian classical music, and recreational use of a personal world music collection. We propose semantic models describing the respective domains and examine the applications of the Music Ontology for these case studies: which concepts can be successfully reused, where they need adjustments, and which parts of the reality in these case studies are not covered by the Music Ontology. The variety of traditions, contexts and modelling goals covered by our case studies sheds light on the generality of the Music Ontology and on the limits of generalisation ``for all musics'' that could be aspired for on the Semantic Web.}, Author = {Proutskova, P. and Volk, A. and Heidarian, P. and Fazekas, G.}, Booktitle = {Proc.
of the International Society of Music Information Retrieval Conference (ISMIR), 11-16 Oct., Montreal, Canada}, Date-Added = {2020-12-27 14:03:45 +0000}, Date-Modified = {2020-12-27 15:41:21 +0000}, Keywords = {ontology, music information retrieval, semantic audio}, Pages = {923-931}, Publisher-Url = {https://program.ismir2020.net/static/final_papers/323.pdf}, Title = {From Music Ontology Towards Ethno-Music-Ontology}, Url = {http://www.semanticaudio.net/files/papers/proutskova2020ismir.pdf}, Year = {2020}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/proutskova2020ismir.pdf}} @conference{vahidi2020timbre, Abstract = {In this study, we produce a geometrically scaled perceptual timbre space from dissimilarity ratings of subtractive synthesized sounds and correlate the resulting dimensions with a set of acoustic descriptors. We curate a set of 15 sounds, produced by a synthesis model that uses varying source waveforms, frequency modulation (FM) and a lowpass filter with an enveloped cutoff frequency. Pairwise dissimilarity ratings were collected within an online browser-based experiment. We hypothesized that a varied waveform input source and enveloped filter would act as the main vehicles for timbral variation, providing novel acoustic correlates for the perception of synthesized timbres.}, Author = {Vahidi, C. and Fazekas, G. and Saitis, C. and Palladini, A.}, Booktitle = {Proc. of the 2nd International Conference on Timbre (Timbre 2020), 3-4 September, Thessaloniki, Greece}, Date-Added = {2020-12-27 13:27:44 +0000}, Date-Modified = {2020-12-27 13:36:11 +0000}, Keywords = {music perception, neural audio synthesis}, Pages = {30-33}, Publisher-Url = {https://arxiv.org/abs/2009.11706}, Title = {Timbre Space Representation of a Subtractive Synthesizer}, Url = {http://www.semanticaudio.net/files/papers/vahidi2020timbre.pdf}, Year = {2020}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/vahidi2020timbre.pdf}} @article{lefford2020jaes, Abstract = {Intelligent Mixing Systems (IMS) are rapidly becoming integrated into music mixing and production workflows. The intelligences of a human mixer and an IMS can be distinguished by their abilities to comprehend, assess and appreciate context. Humans will factor context into decisions, particularly concerning the use and application of technologies. The utility of an IMS depends on both its affordances and the situation in which it is to be used. The appropriate use for conventional purposes, or its utility for misappropriation, is determined by the context. This study considers how context impacts mixing decisions and the use of technology, focusing on how the mixer's understanding of context can inform the use of IMS, and how the use of IMS can aid in informing a mixer of different contexts.}, Author = {Lefford, MN. and Bromham, G. and Fazekas, G. 
and Moffat, D.}, Date-Added = {2020-12-26 18:44:18 +0000}, Date-Modified = {2020-12-26 18:50:45 +0000}, Journal = {Journal of the Audio Engineering Society (JAES)}, Keywords = {semantic audio, intelligent music production, automatic mixing}, Number = {3}, Pages = {1-29}, Publisher-Url = {https://pearl.plymouth.ac.uk/handle/10026.1/16381}, Title = {Context Aware Intelligent Mixing Systems}, Url = {http://www.semanticaudio.net/files/papers/lefford2020jaes-preprint.pdf}, Volume = {1}, Year = {2020}, Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUGJCVYJHZlcnNpb25YJG9iamVjdHNZJGFyY2hpdmVyVCR0b3ASAAGGoKgHCBMUFRYaIVUkbnVsbNMJCgsMDxJXTlMua2V5c1pOUy5vYmplY3RzViRjbGFzc6INDoACgAOiEBGABIAFgAdccmVsYXRpdmVQYXRoWWFsaWFzRGF0YV8QI3BhcGVycy9sZWZmb3JkMjAyMGphZXMtcHJlcHJpbnQucGRm0hcLGBlXTlMuZGF0YU8RAbIAAAAAAbIAAgAADE1hY2ludG9zaCBIRAAAAAAAAAAAAAAAAAAAAAAAAABCRAAB/////xxsZWZmb3JkMjAyMGphZXMtcHJlcHJpbnQucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/////AAAAAAAAAAAAAAAAAAEAAwAACiBjdQAAAAAAAAAAAAAAAAAGcGFwZXJzAAIATy86VXNlcnM6Z2ZhemVrYXM6RG9jdW1lbnRzOndlYnNpdGUtaGc6ZmlsZXM6cGFwZXJzOmxlZmZvcmQyMDIwamFlcy1wcmVwcmludC5wZGYAAA4AOgAcAGwAZQBmAGYAbwByAGQAMgAwADIAMABqAGEAZQBzAC0AcAByAGUAcAByAGkAbgB0AC4AcABkAGYADwAaAAwATQBhAGMAaQBuAHQAbwBzAGgAIABIAEQAEgBNVXNlcnMvZ2ZhemVrYXMvRG9jdW1lbnRzL3dlYnNpdGUtaGcvZmlsZXMvcGFwZXJzL2xlZmZvcmQyMDIwamFlcy1wcmVwcmludC5wZGYAABMAAS8AABUAAgAP//8AAIAG0hscHR5aJGNsYXNzbmFtZVgkY2xhc3Nlc11OU011dGFibGVEYXRhox0fIFZOU0RhdGFYTlNPYmplY3TSGxwiI1xOU0RpY3Rpb25hcnmiIiBfEA9OU0tleWVkQXJjaGl2ZXLRJidUcm9vdIABAAgAEQAaACMALQAyADcAQABGAE0AVQBgAGcAagBsAG4AcQBzAHUAdwCEAI4AtAC5AMECdwJ5An4CiQKSAqACpAKrArQCuQLGAskC2wLeAuMAAAAAAAACAQAAAAAAAAAoAAAAAAAAAAAAAAAAAAAC5Q==}} @article{turchet2020tiot, Abstract = {Large online music databases under Creative Commons licenses are rarely recorded by well-known artists, therefore conventional metadata-based search is insufficient in their adaptation to instrument players' needs. The emerging class of smart musical instruments (SMIs) can address this challenge. Thanks to direct internet connectivity and embedded processing, SMIs can send requests to repositories and reproduce the response for improvisation, composition or learning purposes. We present a smart guitar prototype that allows retrieving songs from large online music databases using criteria different from conventional music search, which were derived from interviewing thirty guitar players. We investigate three interaction methods coupled with four search criteria (tempo, chords, key and tuning) exploiting intelligent capabilities in the instrument: i) keywords-based retrieval using an embedded touchscreen; ii) cloud-computing where recorded content is transmitted to a server that extracts relevant audio features; iii) edge-computing where the guitar detects audio features and sends the request directly. Overall, the evaluation of these methods with beginner, intermediate and expert players showed a strong appreciation for the direct connectivity of the instrument with an online database and the approach to the search based on the actual musical content rather than conventional textual criteria, such as song title or artist name.}, Author = {Turchet, L. and Pauwels, J. Fischione, C. 
and Fazekas, G.}, Date-Added = {2020-12-26 18:37:35 +0000}, Date-Modified = {2020-12-26 18:41:33 +0000}, Doi = {10.1145/3377881}, Journal = {ACM Transactions on Internet of Things (TIoT)}, Keywords = {IoT, ontology, semantic audio, Semantic Web, IoMusT}, Number = {3}, Pages = {1-29}, Publisher-Url = {https://doi.org/10.1145/3377881}, Title = {Cloud-smart Musical Instrument Interactions: Querying a Large Music Collection with a Smart Guitar}, Url = {http://www.semanticaudio.net/files/papers/turchet2020tiot-preprint.pdf}, Volume = {1}, Year = {2020}, Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUGJCVYJHZlcnNpb25YJG9iamVjdHNZJGFyY2hpdmVyVCR0b3ASAAGGoKgHCBMUFRYaIVUkbnVsbNMJCgsMDxJXTlMua2V5c1pOUy5vYmplY3RzViRjbGFzc6INDoACgAOiEBGABIAFgAdccmVsYXRpdmVQYXRoWWFsaWFzRGF0YV8QKy4uLy4uLy4uLy5UcmFzaC90dXJjaGV0MjAyMGFjbS1wcmVwcmludC5wZGbSFwsYGVdOUy5kYXRhTxEBeAAAAAABeAACAAAMTWFjaW50b3NoIEhEAAAAAAAAAAAAAAAAAAAAAAAAAEJEAAH/////G3R1cmNoZXQyMDIwYWNtLXByZXByaW50LnBkZgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP////8AAAAAAAAAAAAAAAAAAwACAAAKIGN1AAAAAAAAAAAAAAAAAAYuVHJhc2gAAgAzLzpVc2VyczpnZmF6ZWthczouVHJhc2g6dHVyY2hldDIwMjBhY20tcHJlcHJpbnQucGRmAAAOADgAGwB0AHUAcgBjAGgAZQB0ADIAMAAyADAAYQBjAG0ALQBwAHIAZQBwAHIAaQBuAHQALgBwAGQAZgAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASADFVc2Vycy9nZmF6ZWthcy8uVHJhc2gvdHVyY2hldDIwMjBhY20tcHJlcHJpbnQucGRmAAATAAEvAAAVAAIAD///AACABtIbHB0eWiRjbGFzc25hbWVYJGNsYXNzZXNdTlNNdXRhYmxlRGF0YaMdHyBWTlNEYXRhWE5TT2JqZWN00hscIiNcTlNEaWN0aW9uYXJ5oiIgXxAPTlNLZXllZEFyY2hpdmVy0SYnVHJvb3SAAQAIABEAGgAjAC0AMgA3AEAARgBNAFUAYABnAGoAbABuAHEAcwB1AHcAhACOALwAwQDJAkUCRwJMAlcCYAJuAnICeQKCAocClAKXAqkCrAKxAAAAAAAAAgEAAAAAAAAAKAAAAAAAAAAAAAAAAAAAArM=}} @article{turchet2020iotj, Abstract = {The Internet of Audio Things (IoAuT) is an emerging research field positioned at the intersection of the Internet of Things, sound and music computing, artificial intelligence, and human-computer interaction. The IoAuT refers to the networks of computing devices embedded in physical objects (Audio Things) dedicated to the production, reception, analysis, and understanding of audio in distributed environments. Audio Things, such as nodes of wireless acoustic sensor networks, are connected by an infrastructure that enables multidirectional communication, both locally and remotely. In this article, we first review the state of the art of this field, then we present a vision for the IoAuT and its motivations. In the proposed vision, the IoAuT enables the connection of digital and physical domains by means of appropriate information and communication technologies, fostering novel applications and services based on auditory information. The ecosystems associated with the IoAuT include interoperable devices and services that connect humans and machines to support human-human and human-machines interactions. We discuss the challenges and implications of this field, which lead to future research directions on the topics of privacy, security, design of Audio Things, and methods for the analysis and representation of audio-related information.}, Author = {Turchet, L. and Fazekas, G. and Lagrange, M. and Ghadikolaei, H. 
and Fischione, C.}, Date-Added = {2020-12-26 18:29:52 +0000}, Date-Modified = {2020-12-26 18:35:54 +0000}, Doi = {10.1109/JIOT.2020.2997047}, Journal = {IEEE Internet of Things Journal (IoT)}, Keywords = {IoT, ontology, semantic audio, Semantic Web, IoAuT, IoMusT}, Number = {10}, Pages = {10233-10249}, Publisher-Url = {https://ieeexplore.ieee.org/document/9099251}, Title = {The Internet of Audio Things: State of the Art, Vision, and Challenges}, Url = {http://www.semanticaudio.net/files/papers/turchet2020iotj-preprint.pdf}, Volume = {7}, Year = {2020}, Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUGJCVYJHZlcnNpb25YJG9iamVjdHNZJGFyY2hpdmVyVCR0b3ASAAGGoKgHCBMUFRYaIVUkbnVsbNMJCgsMDxJXTlMua2V5c1pOUy5vYmplY3RzViRjbGFzc6INDoACgAOiEBGABIAFgAdccmVsYXRpdmVQYXRoWWFsaWFzRGF0YV8QI3BhcGVycy90dXJjaGV0MjAyMGlvdGotcHJlcHJpbnQucGRm0hcLGBlXTlMuZGF0YU8RAbIAAAAAAbIAAgAADE1hY2ludG9zaCBIRAAAAAAAAAAAAAAAAAAAAAAAAABCRAAB/////xx0dXJjaGV0MjAyMGlvdGotcHJlcHJpbnQucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/////AAAAAAAAAAAAAAAAAAEAAwAACiBjdQAAAAAAAAAAAAAAAAAGcGFwZXJzAAIATy86VXNlcnM6Z2ZhemVrYXM6RG9jdW1lbnRzOndlYnNpdGUtaGc6ZmlsZXM6cGFwZXJzOnR1cmNoZXQyMDIwaW90ai1wcmVwcmludC5wZGYAAA4AOgAcAHQAdQByAGMAaABlAHQAMgAwADIAMABpAG8AdABqAC0AcAByAGUAcAByAGkAbgB0AC4AcABkAGYADwAaAAwATQBhAGMAaQBuAHQAbwBzAGgAIABIAEQAEgBNVXNlcnMvZ2ZhemVrYXMvRG9jdW1lbnRzL3dlYnNpdGUtaGcvZmlsZXMvcGFwZXJzL3R1cmNoZXQyMDIwaW90ai1wcmVwcmludC5wZGYAABMAAS8AABUAAgAP//8AAIAG0hscHR5aJGNsYXNzbmFtZVgkY2xhc3Nlc11OU011dGFibGVEYXRhox0fIFZOU0RhdGFYTlNPYmplY3TSGxwiI1xOU0RpY3Rpb25hcnmiIiBfEA9OU0tleWVkQXJjaGl2ZXLRJidUcm9vdIABAAgAEQAaACMALQAyADcAQABGAE0AVQBgAGcAagBsAG4AcQBzAHUAdwCEAI4AtAC5AMECdwJ5An4CiQKSAqACpAKrArQCuQLGAskC2wLeAuMAAAAAAAACAQAAAAAAAAAoAAAAAAAAAAAAAAAAAAAC5Q==}} @article{williams2020sensors, Abstract = {Music has been shown to be capable of improving runners' performance in treadmill and laboratory-based experiments. This paper evaluates a generative music system, namely HEARTBEATS, designed to create biosignal synchronous music in real-time according to an individual athlete's heartrate or cadence (steps per minute). The tempo, melody, and timbral features of the generated music are modulated according to biosensor input from each runner using a combination of PPG (Photoplethysmography) and GPS (Global Positioning System) from a wearable sensor, synchronized via Bluetooth. We compare the relative performance of athletes listening to music with heartrate and cadence synchronous tempos, across a randomized trial (N= 54) on a trail course with 76 ft of elevation. Participants were instructed to continue until their self-reported perceived effort went beyond an 18 using the Borg rating of perceived exertion. We found that cadence-synchronous music improved performance and decreased perceived effort in male runners. For female runners, cadence synchronous music improved performance but it was heartrate synchronous music which significantly reduced perceived effort and allowed them to run the longest of all groups tested. This work has implications for the future design and implementation of novel portable music systems and in music-assisted coaching.}, Author = {Williams, D. and Fazenda, B. and Williamson, V. 
and Fazekas, G.}, Date-Added = {2020-12-26 18:22:06 +0000}, Date-Modified = {2020-12-26 18:27:40 +0000}, Doi = {10.3390/s20164528}, Journal = {Sensors}, Keywords = {music generation, adaptive music}, Number = {16}, Pages = {4528}, Publisher-Url = {https://www.mdpi.com/1424-8220/20/16/4528/pdf}, Title = {On performance and perceived effort in trail runners using sensor control to generate biosynchronous music}, Url = {http://www.semanticaudio.net/files/papers/williams2020sensors-preprint.pdf}, Volume = {20}, Year = {2020}, Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUGJCVYJHZlcnNpb25YJG9iamVjdHNZJGFyY2hpdmVyVCR0b3ASAAGGoKgHCBMUFRYaIVUkbnVsbNMJCgsMDxJXTlMua2V5c1pOUy5vYmplY3RzViRjbGFzc6INDoACgAOiEBGABIAFgAdccmVsYXRpdmVQYXRoWWFsaWFzRGF0YV8QHnBhcGVycy93aWxsaWFtczIwMjBzZW5zb3JzLnBkZtIXCxgZV05TLmRhdGFPEQGcAAAAAAGcAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAAAAAAAAAAAAAAQkQAAf////8Xd2lsbGlhbXMyMDIwc2Vuc29ycy5wZGYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/////wAAAAAAAAAAAAAAAAABAAMAAAogY3UAAAAAAAAAAAAAAAAABnBhcGVycwACAEovOlVzZXJzOmdmYXpla2FzOkRvY3VtZW50czp3ZWJzaXRlLWhnOmZpbGVzOnBhcGVyczp3aWxsaWFtczIwMjBzZW5zb3JzLnBkZgAOADAAFwB3AGkAbABsAGkAYQBtAHMAMgAwADIAMABzAGUAbgBzAG8AcgBzAC4AcABkAGYADwAaAAwATQBhAGMAaQBuAHQAbwBzAGgAIABIAEQAEgBIVXNlcnMvZ2ZhemVrYXMvRG9jdW1lbnRzL3dlYnNpdGUtaGcvZmlsZXMvcGFwZXJzL3dpbGxpYW1zMjAyMHNlbnNvcnMucGRmABMAAS8AABUAAgAP//8AAIAG0hscHR5aJGNsYXNzbmFtZVgkY2xhc3Nlc11OU011dGFibGVEYXRhox0fIFZOU0RhdGFYTlNPYmplY3TSGxwiI1xOU0RpY3Rpb25hcnmiIiBfEA9OU0tleWVkQXJjaGl2ZXLRJidUcm9vdIABAAgAEQAaACMALQAyADcAQABGAE0AVQBgAGcAagBsAG4AcQBzAHUAdwCEAI4ArwC0ALwCXAJeAmMCbgJ3AoUCiQKQApkCngKrAq4CwALDAsgAAAAAAAACAQAAAAAAAAAoAAAAAAAAAAAAAAAAAAACyg==}} @article{turchet2020jws, Abstract = {The Internet of Musical Things (IoMusT) is an emerging research area consisting of the extension of the Internet of Things paradigm to the music domain. Interoperability represents a central issue within this domain, where heterogeneous objects dedicated to the production and/or reception of musical content (Musical Things) are envisioned to communicate between each other. This paper proposes an ontology for the representation of the knowledge related to IoMusT ecosystems to facilitate interoperability between Musical Things. There was no previous comprehensive data model for the IoMusT domain, however the new ontology relates to existing ontologies, including the SOSA Ontology for the representation of sensors and actuators and the Music Ontology focusing on the production and consumption of music. This paper documents the design of the ontology and its evaluation with respect to specific requirements gathered from an extensive literature review, which was based on scenarios involving IoMusT stakeholders, such as performers and audience members. The IoMusT Ontology can be accessed at: https://w3id.org/iomust#.}, Author = {Turchet, L. and Antoniazzi, F. and Viola, F. and Giunchiglia, F. 
and Fazekas, G.}, Date-Added = {2020-12-26 10:02:31 +0000}, Date-Modified = {2020-12-26 10:08:00 +0000}, Doi = {10.1016/j.websem.2020.100548}, Journal = {Journal of Web Semantics (JWS)}, Keywords = {ontology, semantic audio, Semantic Web}, Number = {100548}, Publisher-Url = {https://doi.org/10.1016/j.websem.2020.100548}, Title = {The Internet of Musical Things Ontology}, Url = {http://www.semanticaudio.net/files/papers/turchet2020jws-preprint.pdf}, Volume = {60}, Year = {2020}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/turchet2020jws-preprint.pdf}, Bdsk-Url-2 = {https://doi.org/10.1016/j.websem.2020.100548}} @conference{thompson2020vlhcc, Abstract = {New domain-specific languages for creating music and audio applications have typically been created in response to some technological challenge. Recent research has begun looking at how these languages impact our creative and aesthetic choices in music-making, but we have little understanding of their effect on our wider programming practice. We present a survey that seeks to uncover what programming practices exist among interactive audio software developers and discover that development is highly multi-practice, with developers adopting both exploratory programming and software engineering practices. A Q methodological study reveals that this multi-practice development is supported by different combinations of language features.}, Author = {Thompson, A. and Fazekas, G. and Wiggins, G.}, Booktitle = {Proc. of the 2020 IEEE Symposium on Visual Languages and Human-Centric Computing (VL/HCC), 10-14 Aug., Dunedin, New Zealand}, Date-Added = {2020-12-27 13:14:21 +0000}, Date-Modified = {2020-12-27 13:27:20 +0000}, Doi = {10.1109/VL/HCC50065.2020.9127261}, Keywords = {audio programming, HCI}, Publisher = {IEEE}, Publisher-Url = {https://ieeexplore.ieee.org/document/9127261}, Title = {Programming Practices Among Interactive Audio Software Developers}, Url = {https://ieeexplore.ieee.org/document/9127261}, Year = {2020}, Bdsk-Url-1 = {https://ieeexplore.ieee.org/document/9127261}, Bdsk-Url-2 = {http://dx.doi.org/10.1109/VL/HCC50065.2020.9127261}} @conference{shatri2020tenor, Abstract = {Optical Music Recognition (OMR) is concerned with transcribing sheet music into a machine-readable format. The transcribed copy should allow musicians to compose, play and edit music by taking a picture of a music sheet. Complete transcription of sheet music would also enable more efficient archival. OMR facilitates examining sheet music statistically or searching for patterns of notations, thus helping use cases in digital musicology too. Recently, there has been a shift in OMR from using conventional computer vision techniques towards a deep learning approach. In this paper, we review relevant works in OMR, including fundamental methods and significant outcomes, and highlight different stages of the OMR pipeline. These stages often lack standard input and output representation and standardised evaluation. Therefore, comparing different approaches and evaluating the impact of different processing methods can become rather complex. This paper provides recommendations for future work, addressing some of the highlighted issues and represents a position in furthering this important field of research.}, Author = {Shatri, E. and Fazekas, G.}, Booktitle = {Proc.
of the 7th International Conference on Technologies for Music Notation and Representation (TENOR), Hamburg, Germany}, Date-Added = {2020-12-27 13:06:35 +0000}, Date-Modified = {2020-12-27 14:28:24 +0000}, Keywords = {optical music recognition, OMR, computer vision}, Publisher-Url = {https://www.tenor-conference.org/proceedings/2020/23_Shatri_tenor20.pdf}, Title = {Optical Music Recognition: State of the Art and Major Challenges}, Url = {http://www.semanticaudio.net/files/papers/shatri2020tenor.pdf}, Year = {2020}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/shatri2020tenor.pdf}} @conference{pauwels2020aaai, Abstract = {In recent years, Markov logic networks (MLNs) have been proposed as a potentially useful paradigm for music signal analysis. Because all hidden Markov models can be reformulated as MLNs, the latter can provide an all-encompassing framework that reuses and extends previous work in the field. However, just because it is theoretically possible to reformulate previous work as MLNs, does not mean that it is advantageous. In this paper, we analyse some proposed examples of MLNs for musical analysis and consider their practical disadvantages when compared to formulating the same musical dependence relationships as (dynamic) Bayesian networks. We argue that a number of practical hurdles such as the lack of support for sequences and for arbitrary continuous probability distributions make MLNs less than ideal for the proposed musical applications, both in terms of easy of formulation and computational requirements due to their required inference algorithms. These conclusions are not specific to music, but apply to other fields as well, especially when sequential data with continuous observations is involved. Finally, we show that the ideas underlying the proposed examples can be expressed perfectly well in the more commonly used framework of (dynamic) Bayesian networks.}, Author = {Pauwels, J. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the Ninth International Workshop on Statistical Relational AI (StarAI 2020) at the 34th AAAI Conference on Artificial Intelligence (AAAI), New York, USA, 7 February}, Date-Added = {2020-12-27 12:58:47 +0000}, Date-Modified = {2020-12-27 13:36:20 +0000}, Keywords = {music information retrieval, semantic audio, markov logic networks, chord and key recognition}, Publisher-Url = {https://arxiv.org/abs/2001.06086}, Title = {A Critical Look at the Applicability of Markov Logic Networks for Music Signal Analysis}, Url = {http://www.semanticaudio.net/files/papers/pauwels2020aaai.pdf}, Year = {2020}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/pauwels2020aaai.pdf}} @conference{zhao2020smc, Abstract = {The same piece of music can be performed in various styles by different performers. Vibrato plays an important role in violin players' emotional expression, and it is an important factor of playing style while execution shows great diversity. Expressive timing is also an important factor to reflect individual play styles. In our study, we construct a novel dataset, which contains 15 concertos performed by 9 master violinists. Four vibrato features and one timing feature are extracted from the data, and we present a method based on the similarity of feature distribution to identify violinists using each feature alone and fusion of features. The result shows that vibrato features are helpful for the identification, but the timing feature performs better, yielding a precision of 0.751. 
In addition, although the accuracy obtained from fused features are lower than using timing alone, discrimination for each performer is improved.}, Author = {Zhao, Y. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 17th Sound and Music Computing Conference, Torino, Italy, 24-26 June}, Date-Added = {2020-12-27 12:49:50 +0000}, Date-Modified = {2020-12-27 12:57:07 +0000}, Doi = {10.5281/zenodo.3898747}, Keywords = {music information retrieval, semantic audio, performer identification}, Pages = {185-192}, Publisher-Url = {https://smc2020torino.it/adminupload/file/SMCCIM_2020_paper_168.pdf}, Title = {Identifying Master Violinists Using Note-level Audio Features}, Url = {http://www.semanticaudio.net/files/papers/zhao2020smc.pdf}, Year = {2020}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/zhao2020smc.pdf}, Bdsk-Url-2 = {http://dx.doi.org/10.5281/zenodo.3898747}} @conference{wilson2020icli, Abstract = {Co-creation strategies for human-machine collaboration have been explored in various creative disciplines. Recent developments in music technology and artificial intelligence have made these creative interactions applicable to the domain of computer music, meaning it is now possible to interface with algorithms as creative partners. The application of computational creativity research is beginning to be incorporated within the practice of live algorithmic music known as live coding. As music is inherently coupled with affective response (often defined as the general psychological state of an individual, including but not limited to emotions and mood), it is crucial for any artificial musical intelligence system to consider how to incorporate emotional meaning into collaborative musical actions. This work looks at bestowing live coding systems with the ability to autonomously create emotionally intelligent musical collaborations and examine new ways of interfacing with musical algorithms.}, Author = {Wilson, E. and Fazekas, G. and Wiggins, G.}, Booktitle = {Proc. of the International Conference on Live Interfaces (ICLI), 9-11, March, Trondheim, Norway}, Date-Added = {2020-12-27 12:43:18 +0000}, Date-Modified = {2020-12-27 12:48:40 +0000}, Doi = {10.5281/zenodo.3932879}, Keywords = {affective computing, HCI, live coding}, Publisher-Url = {https://doi.org/10.5281/zenodo.3932879}, Title = {Collaborative human and machine creative interaction driven through affective response in live coding systems.}, Url = {http://www.semanticaudio.net/files/papers/wilson2020icli.pdf}, Year = {2020}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/wilson2020icli.pdf}, Bdsk-Url-2 = {http://dx.doi.org/10.5281/zenodo.3932879}} @conference{thompson2019am, Abstract = {We present the Flow framework, a front-end framework for interactive Web applications built on the Web Audio API. It encourages a purely declarative approach to application design by providing a number of abstractions for the creation of HTML, audio processing graphs, and event listeners. In doing so we place the burden of tracking and managing state solely on to the framework rather than the developer. We introduce the Model-View-Update architecture and how it applies to audio application design. The MVU architecture is built on the unidirectional flow of data through pure functions, pushing side effects onto the framework's runtime. Flow conceptualises the audio graph as another View into application state, and uses this conceptualisation to enforce strict separation of the audio and visual output of an application. 
Future plans for the framework include a robust plug-in system to add support for third-party audio nodes, a time travelling debugger to replay sequences of actions to the runtime, and a bespoke programming language that better aligns with Flow's functional influences.}, Author = {Thompson, A. and Fazekas, G.}, Booktitle = {14th International Audio Mostly Conference, 18-20 Sept., Nottingham, UK}, Date-Added = {2020-12-26 08:51:51 +0000}, Date-Modified = {2020-12-26 09:06:26 +0000}, Doi = {10.1145/3356590.3356623}, Keywords = {web audio, programming}, Pages = {219-222}, Publisher = {ACM}, Publisher-Url = {https://dl.acm.org/doi/10.1145/3356590.3356623}, Title = {A Model-View-Update Framework for Interactive Web Audio Applications}, Url = {http://www.semanticaudio.net/files/papers/thompson2019am.pdf}, Year = {2019}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/bromham2019am.pdf}, Bdsk-Url-2 = {http://dx.doi.org/10.1145/3356590.3356618}} @conference{xambo2019routledge, Abstract = {With the advent of online audio resources and web technologies, digital tools for sound designers and music producers are changing. The Internet provides access to hundreds of thousands of digital audio files, from human-and nature-related environmental sounds, instrument samples and sound effects, to produced songs ready to use in media production. In relation to the vast amount of creative content available online, an emerging community has forged a culture of sharing. Creative Commons (CC) appears as a legal framework to support such initiative enabling the reuse and remix of creative artefacts. In this chapter, we discuss key concepts and challenges related to the use of CC online audio content (Audio Commons content) for linear media production. We present five use cases connected to the Audio Commons Initiative, illustrating how the gap between audio content creators, digital content providers, sound designers and music producers can be bridged using a web infrastructure and user-friendly tools. The use cases cover various creative production workflows from composition to performance. This chapter discusses novel tools enabling users to ``surf'' the web in search of sounds matching a creative brief, to import and process CC-licensed audio in the DAW, or to play live performances with laptop ensembles making use of responsive web audio technologies.}, Author = {Xambo, A. and Font, F. and Fazekas, G. and Barthet, M.}, Booktitle = {In Michael Filimowicz (ed.) Foundations in Sound Design for Linear Media: An Interdisciplinary Approach}, Date-Added = {2020-12-26 08:42:31 +0000}, Date-Modified = {2020-12-26 09:07:22 +0000}, Keywords = {audio commons, MIR, sound samples, Creative Commons}, Pages = {248-282}, Publisher = {Routledge, London}, Title = {Leveraging online audio commons content for media production}, Url = {http://www.semanticaudio.net/files/papers/xambo2019routledge.pdf}, Year = {2019}, Bdsk-Url-1 = {https://pure.hud.ac.uk/files/13360267/Xambo_et_al_2018_Live_repurposing_of_sounds.pdf}} @conference{safavi2018fruct, Abstract = {Perceptual measurements have typically been recognized as the most reliable measurements in assessing perceived levels of reverberation. In this paper, a combination of blind RT60 estimation method and a binaural, nonlinear auditory model is employed to derive signal-based measures (features) that are then utilized in predicting the perceived level of reverberation. 
Such measures lack the excess of effort necessary for calculating perceptual measures; not to mention the variations in either stimuli or assessors that may cause such measures to be statistically insignificant. As a result, the automatic extraction of objective measurements that can be applied to predict the perceived level of reverberation become of vital significance. Consequently, this work is aimed at discovering measurements such as clarity, reverberance, and RT60 which can automatically be derived directly from audio data. These measurements along with labels from human listening tests are then forwarded to a machine learning system seeking to build a model to estimate the perceived level of reverberation, which is labeled by an expert, autonomously. The data has been labeled by an expert human listener for a unilateral set of files from arbitrary audio source types. By examining the results, it can be observed that the automatically extracted features can aid in estimating the perceptual rates.}, Author = {Safavi, S. and Wang, W. and Plumbley, M. and Choobbasti, AJ. and Fazekas, G.}, Booktitle = {Proc. of the 23rd Conference of Open Innovations Association FRUCT, 13-16 Nov., Bologna, Italy}, Date-Added = {2020-12-25 23:29:26 +0000}, Date-Modified = {2020-12-26 10:26:58 +0000}, Keywords = {Perception, semantic audio, acoustics, deep learning}, Pages = {527-531}, Publisher = {IEEE/ACM}, Publisher-Url = {https://dl.acm.org/doi/10.5555/3299905.3299978}, Title = {Predicting the Perceived Level of Reverberation using Features from Nonlinear Auditory Model}, Url = {http://www.semanticaudio.net/files/papers/safavi2018fruct.pdf}, Year = {2018}, Bdsk-Url-1 = {https://link.springer.com/chapter/10.1007/978-3-319-49157-8_5}, Bdsk-Url-2 = {https://dx.doi.org/10.1007/978-3-319-49157-8_5}} @conference{bromham2019am, Abstract = {It is not uncommon to hear musicians and audio engineers speak of warmth and brightness when describing analog technologies such as vintage mixing consoles, multitrack tape machines, and valve compressors. What is perhaps less common, is hearing this term used in association with retro digital technology. A question exists as to how much the low bit rate and low-grade conversion quality contribute to the overall brightness or warmth of a sound when processed with audio effects simulating early sampling technologies. These two dimensions of timbre are notoriously difficult to define and more importantly, measure. We present a subjective user study of brightness and warmth, where a series of audio examples are processed with different audio effects. 26 participants rated the perceived level of brightness and warmth of various instrumental sequences for 5 different audio effects including bit depth reduction, compression and equalisation. Results show that 8 bit reduction tends to increase brightness and decrease warmth whereas 12 bit reduction tends to do the opposite, although this is very much dependent on the instrument. Interestingly, the most significant brightness changes, due to bit reduction, were obtained for bass sounds. For comparison purposes, instrument phrases were also processed with both an analogue compressor and an equalisation plugin to see if any subjective difference was noticed when simulating sonic characteristics that might be associated with warmth. Greater significance was observed when the sound excerpts were processed with the plugins being used to simulate the effects of bit depth reduction.}, Author = {Bromham, G. and Moffat, D. and Barthet, M. 
and Danielsen, A. and Fazekas, G.}, Booktitle = {14th International Audio Mostly Conference, 18-20 Sept., Nottingham, UK}, Date-Added = {2020-12-25 20:54:48 +0000}, Date-Modified = {2020-12-26 09:06:03 +0000}, Doi = {10.1145/3356590.3356618}, Keywords = {intelligent music production, semantic audio}, Pages = {183-190}, Publisher = {ACM}, Publisher-Url = {https://dl.acm.org/doi/10.1145/3356590.3356618}, Title = {The Impact of Audio Effects Processing on the Perception of Brightness and Warmth}, Url = {http://www.semanticaudio.net/files/papers/bromham2019am.pdf}, Year = {2019}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/bromham2019am.pdf}, Bdsk-Url-2 = {http://dx.doi.org/10.1145/3356590.3356618}} @conference{bromham2019dmrn, Abstract = {It is not uncommon to hear musicians and audio engineers speak of warmth and brightness when describing analog technologies such as vintage mixing consoles, multitrack tape machines, and valve compressors. What is perhaps less common, is hearing this term used in association with retro digital technology. A question exists as to how much the low bit rate and low-grade conversion quality contribute to the overall brightness or warmth of a sound when processed with audio effects simulating early sampling technologies. These two dimensions of timbre are notoriously difficult to define and more importantly, measure. We present a subjective user study of brightness and warmth, where a series of audio examples are processed with different audio effects. }, Author = {Bromham, G. and Moffat, D. and Barthet, M. and Fazekas, G.}, Booktitle = {Digital Music Research Network (DMRN+14) Workshop, Dec. 17., London, UK}, Date-Added = {2020-12-25 20:49:38 +0000}, Date-Modified = {2020-12-26 08:30:54 +0000}, Keywords = {intelligent music production, semantic audio}, Publisher = {QMUL}, Title = {The Retro in Digital: Understanding the Semantics of Audio Effects}, Url = {http://www.semanticaudio.net/files/papers/bromham2019dmrn.pdf}, Year = {2019}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/bromham2019dmrn.pdf}} @conference{sheng2019ijcnn, Abstract = {In this paper, a siamese DNN model is proposed to learn the characteristics of the audio dynamic range compressor (DRC). This facilitates an intelligent control system that uses audio examples to configure the DRC, a widely used non-linear audio signal conditioning technique in the areas of music production, speech communication and broadcasting. Several alternative siamese DNN architectures are proposed to learn feature embeddings that can characterise subtle effects due to dynamic range compression. These models are compared with each other as well as handcrafted features proposed in previous work. The evaluation of the relations between the hyperparameters of DNN and DRC parameters are also provided. The best model is able to produce a universal feature embedding that is capable of predicting multiple DRC parameters simultaneously, which is a significant improvement from our previous research. The feature embedding shows better performance than handcrafted audio features when predicting DRC parameters for both mono-instrument audio loops and polyphonic music pieces.}, Author = {Sheng, D. and Fazekas, G.}, Booktitle = {Proc. of the International Joint Conf. 
on Neural Networks (IJCNN), July 14-19, Budapest, Hungary}, Date-Added = {2019-06-04 11:03:21 +0000}, Date-Modified = {2019-06-04 11:10:08 +0000}, Keywords = {deep learning, audio effects}, Publisher-Url = {https://www.ijcnn.org/assets/2019/ijcnn2019-program22May.pdf}, Title = {A Feature Learning Siamese Model for Intelligent Control of the Dynamic Range Compressor}, Url = {https://arxiv.org/pdf/1905.01022.pdf}, Year = {2019}, Bdsk-Url-1 = {https://arxiv.org/pdf/1905.01022.pdf}} @conference{liang2019ijcnn, Abstract = {Detecting piano pedalling techniques in polyphonic music remains a challenging task in music information retrieval. While other piano-related tasks, such as pitch estimation and onset detection, have seen improvement through applying deep learning methods, little work has been done to develop deep learning models to detect playing techniques. In this paper, we propose a transfer learning approach for the detection of sustain-pedal techniques, which are commonly used by pianists to enrich the sound. In the source task, a convolutional neural network (CNN) is trained for learning spectral and temporal contexts when the sustain pedal is pressed using a large dataset generated by a physical modelling virtual instrument. The CNN is designed and experimented through exploiting the knowledge of piano acoustics and physics. This can achieve an accuracy score of 0.98 in the validation results. In the target task, the knowledge learned from the synthesised data can be transferred to detect the sustain pedal in acoustic piano recordings. A concatenated feature vector using the activations of the trained convolutional layers is extracted from the recordings and classified into frame-wise pedal press or release. We demonstrate the effectiveness of our method in acoustic piano recordings of Chopin's music. From the cross-validation results, the proposed transfer learning method achieves an average F-measure of 0.89 and an overall performance of 0.84 obtained using the micro-averaged F-measure. These results outperform applying the pre-trained CNN model directly or the model with a fine-tuned last layer. }, Author = {Liang, B. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the International Joint Conf. on Neural Networks (IJCNN), July 14-19, Budapest, Hungary}, Date-Added = {2019-06-04 11:10:43 +0000}, Date-Modified = {2019-06-04 11:15:06 +0000}, Keywords = {deep learning, piano pedaling recognition}, Publisher-Url = {https://www.ijcnn.org/assets/2019/ijcnn2019-program22May.pdf}, Title = {Transfer Learning for Piano Sustain-Pedal Detection}, Url = {http://www.semanticaudio.net/files/papers/liang2019ijcnn-preprint.pdf}, Year = {2019}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/liang2019ijcnn-preprint.pdf}} @conference{liang2019icassp, Abstract = {Recent research on piano transcription has focused primarily on note events. Very few studies have investigated pedalling techniques, which form an important aspect of expressive piano music performance. In this paper, we propose a novel method for piano sustain-pedal detection based on Convolutional Neural Networks (CNN). Inspired by different acoustic characteristics at the start (pedal onset) versus during the pedalled segment, two binary classifiers are trained separately to learn both temporal dependencies and timbral features using CNN. Their outputs are fused in order to decide whether a portion in a piano recording is played with the sustain pedal. 
The proposed architecture and our detection system are assessed using a dataset with frame-wise pedal on/off annotations. An average F1 score of 0.74 is obtained for the test set. The method performs better on pieces of Romantic-era composers, who intended to deliver more colours to the piano sound through pedalling techniques.}, Author = {Liang, B. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 44th International Conference on Audio, Speech and Signal Processing (ICASSP), Brighton, UK.}, Date-Added = {2019-06-04 11:08:04 +0000}, Date-Modified = {2020-12-25 21:06:14 +0000}, Doi = {10.1109/ICASSP.2019.8683505}, Keywords = {deep learning, piano pedaling recognition}, Publisher-Url = {https://doi.org/10.1109/ICASSP.2019.8683505}, Title = {Piano Sustain-Pedal Detection Using Convolutional Neural Networks}, Url = {http://www.semanticaudio.net/files/papers/liang2019icassp-preprint.pdf}, Year = {2019}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/liang2019icassp-preprint.pdf}, Bdsk-Url-2 = {http://dx.doi.org/10.1109/ICASSP.2019.8683505}} @article{choi2018ieee, Abstract = {Deep neural networks (DNN) have been successfully applied to music classification including music tagging. However, there are several open questions regarding the training, evaluation, and analysis of DNNs. In this article, we investigate specific aspects of neural networks, the effects of noisy labels, to deepen our understanding of their properties. We analyse and (re-)validate a large music tagging dataset to investigate the reliability of training and evaluation. Using a trained network, we compute label vector similarities which is compared to groundtruth similarity. The results highlight several important aspects of music tagging and neural networks. We show that networks can be effective despite relatively large error rates in groundtruth datasets, while conjecturing that label noise can be the cause of varying tag-wise performance differences. Lastly, the analysis of our trained network provides valuable insight into the relationships between music tags. These results highlight the benefit of using data-driven methods to address automatic music tagging.}, Author = {Choi, K. and Fazekas, G. and Sandler, M. and Cho, K.}, Date-Added = {2018-06-06 23:32:25 +0000}, Date-Modified = {2018-05-06 23:32:25 +0000}, Doi = {10.1109/TETCI.2017.2771298}, Journal = {IEEE Transactions on Emerging Topics in Computational Intelligence (TETCI)}, Keywords = {evaluation, music tagging, deep learning, CNN}, Number = {2}, Pages = {139 - 149}, Title = {The Effects of Noisy Labels on Deep Convolutional Neural Networks for Music Tagging}, Url = {https://arxiv.org/pdf/1706.02361.pdf}, Volume = {2}, Year = {2018}, Bdsk-Url-1 = {https://arxiv.org/pdf/1706.02361.pdf}, Bdsk-Url-2 = {https://dx.doi.org/10.1109/TETCI.2017.2771298}} @article{liang2018jaes, Abstract = {When playing the piano, pedaling is one of the important techniques that lead to expressive performance, comprising not only the onset and offset information that composers often indicate in the score, but also gestures related to the musical interpretation by performers. This research examines pedaling gestures and techniques on the sustain pedal from the perspective of measurement, recognition, and visualization. Pedaling gestures can be captured by a dedicated measurement system where the sensor data is simultaneously recorded alongside the piano sound under normal playing conditions. 
Recognition is comprised of two separate tasks on the sensor data: pedal onset/offset detection and classification by technique. The onset and offset times of each pedaling technique were computed using signal processing algorithms. Based on features extracted from every segment when the pedal is pressed, the task of classifying the segments by pedaling technique was undertaken using machine-learning methods. High accuracy was obtained by cross validation. The recognition results can be represented using novel pedaling notations and visualized in an audio-based score-following application.}, Author = {Liang, B. and Fazekas, G. and Sandler, M.}, Date-Added = {2018-06-06 23:32:25 +0000}, Date-Modified = {2019-02-08 05:41:57 +0000}, Doi = {10.17743/jaes.2018.0035}, Journal = {Journal of the Audio Engineering Society (JAES) Special Issue on Participatory Sound And Music Interaction Using Semantic Audio}, Keywords = {sensor system, piano pedalling, measurement, machine learning, gesture recognition, piano transcription}, Number = {6}, Pages = {448-456}, Title = {Measurement, Recognition and Visualisation of Piano Pedalling Gestures and Techniques}, Url = {http://www.aes.org/e-lib/browse.cfm?elib=19584}, Volume = {66}, Year = {2018}, Bdsk-Url-1 = {http://www.aes.org/e-lib/browse.cfm?elib=19584}, Bdsk-Url-2 = {http://dx.doi.org/10.17743/jaes.2018.0035}} @conference{milo2018dmrn, Abstract = {Significant amounts of user-generated audio content, such as sound effects, musical samples and music pieces, are uploaded to online repositories and made available under open licenses. Nevertheless, the creative industries are not yet using much of this content in media production. A big share of creative commons content remains unreachable primarily because it is not well organised and annotated. In this paper we present the Audio Commons Initiative, which is aimed at promoting the use of open audio content and at developing technologies to support an ecosystem composed of audio content repositories, production tools and users. }, Author = {Milo, A. and Barthet, M. and Fazekas, G.}, Booktitle = {Proc. of the Digital Music Research Network (DMRN+13), 18 Dec., London, UK}, Date-Added = {2019-02-08 06:50:50 +0000}, Date-Modified = {2019-02-08 06:51:53 +0000}, Keywords = {Audio Commons, Creative Commons}, Title = {The Audio Commons Initiative}, Year = {2018}} @conference{sheng2018dmrn, Abstract = {Audio effects influence different perceptual attributes of sound due to linear and non-linear processing. They are typically applied to fulfil technical or aesthetic goals. Although audio effects are essential and widely used in music production, their use requires expert knowledge amateurs and hobbyists don't necessarily have. To reduce time and labour requirements, we designed an intelligent control system for a specific audio effect: dynamic range compressor (DRC). In previous research, we have established efficient feature sets for each individual DRC parameter. In this research, we are aiming to build a DNN model to extract features that are suitable to predict multiple features simultaneously given a sound example.}, Author = {Sheng, D. and Fazekas, G.}, Booktitle = {Proc.
of the Digital Music Research Network (DMRN+13), 18 Dec., London, UK}, Date-Added = {2019-02-08 06:37:16 +0000}, Date-Modified = {2019-02-08 06:54:38 +0000}, Keywords = {deep learning, music production, dynamic range compression}, Title = {Using Triplet Network for the Intelligent Control of Audio Effects}, Year = {2018}, Bdsk-Url-1 = {https://pure.hud.ac.uk/files/13360267/Xambo_et_al_2018_Live_repurposing_of_sounds.pdf}} @conference{viola2018SAAM, Abstract = {Playsound is a simple and intuitive web-based tool for music composition based on sounds from Freesound, an online repository of diverse audio content with Creative Commons licenses. In this paper, we present an approach based on Semantic Web technologies to provide recommendations to Playsound users. A Semantic Web of Things architecture is outlined, showing loosely coupled, independent software agents interoperating by means of a semantic publish/subscribe platform and a set of ontologies to describe agents, audio contents, input/output of audio analytics tools and recommendations. Preliminary tests confirm that the designed architecture adapts well to environments where services can be discovered and seamlessly orchestrated on the fly, resulting in a dynamic workflow.}, Author = {Viola, F. and Stolfi, A. and Milo, A. and Ceriani, M. and Barthet, M. and Fazekas, G.}, Booktitle = {Proc. of the 1st International Workshop on Semantic Applications for Audio and Music (ISWC SAAM), 9. Oct, Monterey, CA, USA}, Date-Added = {2019-02-08 06:23:36 +0000}, Date-Modified = {2019-02-08 06:50:26 +0000}, Doi = {10.1145/3243907.3243908}, Keywords = {Semantic Audio, Semantic Web, live music, live music-making}, Pages = {46-53}, Title = {Playsound.space: enhancing a live music performance tool with semantic recommendations}, Year = {2018}, Bdsk-Url-1 = {https://pure.hud.ac.uk/files/13360267/Xambo_et_al_2018_Live_repurposing_of_sounds.pdf}} @conference{viola2018fruct, Abstract = {Semantic Web technologies are increasingly used in the Internet of Things due to their intrinsic propensity to foster interoperability among heterogenous devices and services. However, some of the IoT application domains have strict requirements in terms of timeliness of the exchanged messages, latency and support for constrained devices. An example of these domains is represented by the emerging area of the Internet of Musical Things. In this paper we propose C Minor, a CoAP-based semantic publish/subscribe broker specifically designed to meet the requirements of Internet of Musical Things applications, but relevant for any IoT scenario. We assess its validity through a practical use case.}, Author = {Viola, F. and Turchet, L. and Antoniazzi, F. and Fazekas, G.}, Booktitle = {Proc. 
of the 23rd IEEE Conference of Open Innovations Association (IEEE FRUCT), 13-16 Nov., Bologna, Italy}, Date-Added = {2019-02-08 06:16:14 +0000}, Date-Modified = {2019-02-08 07:19:32 +0000}, Doi = {10.23919/FRUCT.2018.8588087}, Keywords = {IoT, Semantic Audio, Semantic Web, IoMUT, MIR}, Pages = {405-415}, Title = {C Minor: a Semantic Publish/Subscribe Broker for the Internet of Musical Things}, Url = {https://www.fruct.org/publications/fruct23/files/Vio.pdf}, Year = {2018}, Bdsk-Url-1 = {https://pure.hud.ac.uk/files/13360267/Xambo_et_al_2018_Live_repurposing_of_sounds.pdf}} @conference{turchet2018fruct, Abstract = {The Internet of Musical Things is an emerging research area that relates to the network of Musical Things, which are computing devices embedded in physical objects dedicated to the production and/or reception of musical content. In this paper we propose a semantically-enriched Internet of Musical Things architecture which relies on a semantic audio server and edge computing techniques. Specifically, a SPARQL Event Processing Architecture is employed as an interoperability enabler allowing multiple heterogeneous Musical Things to cooperate, relying on a music-related ontology. We technically validate our architecture by implementing an ecosystem around it, where five Musical Thing prototypes communicate between each other.}, Author = {Turchet, L. and Viola, F. and Fazekas, G. and Barthet, M.}, Booktitle = {Proc. of the 23rd IEEE Conference of Open Innovations Association (IEEE FRUCT), 13-16 Nov., Bologna, Italy}, Date-Added = {2019-02-08 06:08:37 +0000}, Date-Modified = {2019-02-08 07:19:55 +0000}, Doi = {10.23919/FRUCT.2018.8587917}, Keywords = {IoMUT, IoT, Semantic Audio}, Pages = {382-390}, Title = {Towards a Semantic Architecture for the Internet of Musical Things}, Url = {https://www.fruct.org/publications/fruct23/files/Tur2.pdf}, Year = {2018}, Bdsk-Url-1 = {https://pure.hud.ac.uk/files/13360267/Xambo_et_al_2018_Live_repurposing_of_sounds.pdf}} @conference{bromham2018aes, Abstract = {Dynamic range compressors (DRC) are one of the most commonly used audio effect in music production. The timing settings are particularly important for controlling the manner in which they will shape an audio signal. We present a subjective user study of DRC, where a series of different compressor attack and release setting are varied and applied to a set of 30 sec audio tracks. Participants are then asked to rate which ballistic settings are most appropriate for the style of music in their judgment and asked to select one of a series of tag words to describe the style or setting of the song. Results show that the attack parameter influences perceived style more than the release parameter. From the study this is seen more evidently in the case of Jazz and Rock styles than in EDM or Hip-Hop. The area of intelligent music production systems might benefit from this study in the future as it may help to inform appropriateness for certain DRC settings in varying styles. }, Author = {Bromham, G. and Moffat, D. and Barthet, M. and Fazekas, G.}, Booktitle = {Proc. 
of the {145th Convention of the Audio Engineering Society}, 17-20 Oct., New York, USA}, Date-Added = {2019-02-08 07:11:10 +0000}, Date-Modified = {2019-02-08 07:14:37 +0000}, Keywords = {intelligent music production, dynamic range compression}, Title = {The Impact of Compressor Ballistics on the Perceived Style of Music}, Url = {http://www.aes.org/e-lib/browse.cfm?elib=19806}, Year = {2018}, Bdsk-Url-1 = {http://www.aes.org/events/144/papers/?ID=5993}} @conference{xambo2018am, Abstract = {Nowadays, a number of online music databases are available under Creative Commons licenses (e.g. Jamendo, ccMixter). Typically, it is possible to navigate and play their content through search interfaces based on metadata and file-wide tags. However, because this music is largely unknown, additional methods of discovery need to be explored. In this paper, we focus on a use case for music learners. We present a web app prototype that allows novice and expert musicians to discover songs in Jamendo's music collection by specifying a set of chords. Its purpose is to provide a more pleasurable practice experience by suggesting novel songs to play along with, instead of practising isolated chords or with the same song over and over again. To handle less chord-oriented songs and transcription errors that inevitably arise from the automatic chord estimation used to populate the database, query results are ranked according to a computational confidence measure. In order to assess the validity of the confidence ranked system, we conducted a small pilot user study to assess its usefulness. Drawing on those preliminary findings, we identify some design recommendations for future applications of music learning and music search engines focusing on the user experience when interacting with sound.}, Author = {Xambo, A. and Pauwels, J. and Roma, G. and Barthet, M. and Fazekas, G.}, Booktitle = {Proc. of Audio Mostly 2018: Sound in Immersion and Emotion (AM '18), 12-14 Sept., Wrexham, United Kingdom.}, Date-Added = {2019-02-08 05:54:37 +0000}, Date-Modified = {2019-02-08 07:08:30 +0000}, Doi = {10.1145/3243274.3243291}, Keywords = {Audio Commons, Music Education, music information retrieval, MIR}, Local-Url = {http://annaxambo.me/pub/Xambo_et_al_2018_Jam_with_Jamendo.pdf}, Title = {Jam with Jamendo: Querying a Large Music Collection by Chords from a Learner's Perspective}, Url = {https://dl.acm.org/citation.cfm?id=3243291}, Year = {2018}, Bdsk-Url-1 = {https://pure.hud.ac.uk/files/13360267/Xambo_et_al_2018_Live_repurposing_of_sounds.pdf}} @conference{pauwels2018wac, Abstract = {A common problem in music education is finding varied and engaging material that is suitable for practising a specific musical concept or technique. At the same time, a number of large music collections are available under a Creative Commons (CC) licence (e.g. Jamendo, ccMixter), but their potential is largely untapped because of the relative obscurity of their content. In this paper, we present *Jam with Jamendo*, a web application that allows novice and expert learners of musical instruments to query songs by chord content from a large music collection, and practise the chords present in the retrieved songs by playing along. Its goal is twofold: the learners get a larger variety of practice material, while the artists receive increased exposure. We experimented with two visualisation modes. The first is a linear visualisation based on a moving time axis, the second is a circular visualisation inspired by the chromatic circle. 
We conducted a small-scale thinking-aloud user study with seven participants based on a hands-on practice with the web app. Through this pilot study, we obtained a qualitative understanding of the potentials and challenges of each visualisation, which will be used to inform the next design iteration of the web app.}, Author = {Pauwels, J. and Xambo, A. and Roma, G. and Barthet, M. and Fazekas, G.}, Booktitle = {Proc. of the Web Audio Conference (WAC `18), 19-21 Sept., Berlin, Germany.}, Date-Added = {2019-02-08 05:47:13 +0000}, Date-Modified = {2019-02-08 07:20:58 +0000}, Keywords = {Audio Commons, Creative Commons, Music Education, information retrieval, MIR, Jamendo}, Local-Url = {https://webaudioconf.com/papers/exploring-real-time-visualisations-to-support-chord-learning-with-a-large-music-collection.pdf}, Title = {Exploring Real-time Visualisations to Support Chord Learning with a Large Music Collection}, Url = {http://annaxambo.me/pub/Pauwels_et_al_2018_Exploring_real-time_visualisations.pdf}, Year = {2018}} @conference{ceriani2018iswc, Abstract = {Multiple online services host repositories of audio clips of different kinds, ranging from music tracks, albums, playlists, to instrument samples and loops, to a variety of recorded or synthesized sounds. Programmatic access to these resources may be used by client applications for tasks ranging from customized musical listening and exploration, to music/sounds creation from existing sounds and samples, to audio-based user interaction in apps and games. We designed an ontology to facilitate interoperability between repositories and clients in this domain. There was no previous comprehensive data model for our domain, however the new ontology relates to existing ontologies, such as the Functional Requirements for Bibliographic Records for the authoring and publication process of creative works, the Music Ontology for the authoring and publication of music, the EBU Core ontology to describe media files and formats and the Creative Commons Licensing ontology to describe licences. This paper documents the design of the ontology and its evaluation with respect to specific requirements gathered from stakeholders.}, Author = {Ceriani, M. and Fazekas, G.}, Booktitle = {Proc. of the 17th International Semantic Web Conference (ISWC'18), 8-12 Oct., Monterey, CA, USA}, Date-Added = {2019-02-07 23:29:48 +0000}, Date-Modified = {2019-02-08 06:06:07 +0000}, Doi = {10.1007/978-3-030-00668-6_2}, Keywords = {ontology, music metadata, Audio Commons}, Local-Url = {https://link.springer.com/chapter/10.1007%2F978-3-030-00668-6_2}, Pages = {20-35}, Publisher = {Springer, Cham}, Title = {Audio Commons Ontology: A Data Model for an Audio Content Ecosystem}, Url = {https://qmro.qmul.ac.uk/xmlui/handle/123456789/43143}, Volume = {11137}, Year = {2018}} @conference{liang2018eusipco, Abstract = {In this paper, the problem of legato pedalling technique detection in polyphonic piano music is addressed. We propose a novel detection method exploiting the effect of sympathetic resonance which can be enhanced by a legato-pedal onset. To measure the effect, specific piano transcription was performed using the templates of pre-recorded isolated notes, from which partial frequencies were estimated. This promotes the acquisition of residual components associated to the weak co-excitation of damped notes due to the legato pedalling technique.
Features that represent the sympathetic resonance measure were extracted from residuals. We finally used a logistic regression classifier to distinguish the existence of legato-pedal onsets.}, Author = {Liang, B. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the {26th European Signal Processing Conference (EUSIPCO 2018)}, 3-7 Sept, Rome, Italy}, Date-Added = {2018-05-06 23:32:25 +0000}, Date-Modified = {2019-02-08 05:37:30 +0000}, Doi = {10.23919/EUSIPCO.2018.8553341}, Keywords = {Signal Processing, Piano pedalling, Physical model}, Pages = {2484-2488}, Title = {Piano Legato-Pedal Onset Detection Based on a Sympathetic Resonance Measure}, Url = {https://ieeexplore.ieee.org/document/8553341}, Year = {2018}, Bdsk-Url-1 = {https://ieeexplore.ieee.org/document/8553341}, Bdsk-Url-2 = {http://dx.doi.org/10.23919/EUSIPCO.2018.8553341}} @conference{choi2018eusipco, Abstract = {In this paper, we empirically investigate the effect of audio preprocessing on music tagging with deep neural networks. While it is important to choose the best preprocessing strategy from an engineering perspective, it usually has been out of the focus in many academic research. We perform comprehensive experiments involving audio preprocessing using different time-frequency representations, logarithmic magnitude compression, frequency weighting, and scaling. We show that many commonly used input audio preprocessing techniques are redundant except logarithmic magnitude compression.}, Author = {Choi, K. and Fazekas, G. and Sandler, M. and Cho, K.}, Booktitle = {Proc. of the {26th European Signal Processing Conference (EUSIPCO 2018)}, 3-7 Sept, Rome, Italy}, Date-Added = {2018-05-06 23:32:25 +0000}, Date-Modified = {2019-02-08 05:35:33 +0000}, Doi = {10.23919/EUSIPCO.2018.8553106}, Keywords = {Signal Processing, Deep Learning, MIR, Auto-tagging}, Local-Url = {https://arxiv.org/abs/1709.01922}, Pages = {1870-1874}, Title = {A Comparison of Audio Signal Preprocessing Methods for Deep Neural Networks on Music Tagging}, Url = {https://ieeexplore.ieee.org/document/8553106}, Year = {2018}, Bdsk-Url-1 = {https://ieeexplore.ieee.org/document/8553106}, Bdsk-Url-2 = {http://dx.doi.org/10.23919/EUSIPCO.2018.8553106}} @conference{xambo2018nime, Abstract = {The recent increase in the accessibility and size of personal and crowdsourced digital sound collections brought about a valuable resource for music creation. Finding and retrieving relevant sounds in performance leads to challenges that can be approached using music information retrieval (MIR). In this paper, we explore the use of MIR to retrieve and repurpose sounds in musical live coding. We present a live coding system built on SuperCollider enabling the use of audio content from online Creative Commons (CC) sound databases such as Freesound or personal sound databases. The novelty of our approach lies in exploiting high-level MIR methods (e.g., query by pitch or rhythmic cues) using live coding techniques applied to sounds. We demonstrate its potential through the reflection of an illustrative case study and the feedback from four expert users. The users tried the system with either a personal database or a crowdsourced database and reported its potential in facilitating tailorability of the tool to their own creative workflows.}, Author = {Xambo, A. and Roma, G. and Lerch, A. and Barthet, M. and Fazekas, G.}, Booktitle = {Proc. 
of the {New Interfaces for Musical Expression (NIME)}, 3-6 June, Blacksburg, VA, USA.}, Date-Added = {2018-05-07 00:22:07 +0000}, Date-Modified = {2019-02-08 05:45:32 +0000}, Keywords = {live coding, MIR, sound samples, Creative Commons}, Pages = {364-369}, Title = {Live Repurposing of Sounds: MIR Explorations with Personal and Crowd-sourced Databases}, Url = {https://pure.hud.ac.uk/files/13360267/Xambo_et_al_2018_Live_repurposing_of_sounds.pdf}, Year = {2018}, Bdsk-Url-1 = {https://pure.hud.ac.uk/files/13360267/Xambo_et_al_2018_Live_repurposing_of_sounds.pdf}} @conference{sheng2018aes, Abstract = {Casual users of audio effects may lack practical experience or knowledge of their low-level signal processing parameters. An intelligent control tool that allows using sound examples to control effects would strongly benefit these users. In a previous work we proposed a control method for the dynamic range compressor (DRC) using a random forest regression model. It maps audio features extracted from a reference sound to DRC parameter values, such that the processed signal resembles the reference. The key to good performance in this system is the relevance and effectiveness of audio features. This paper focusses on a thorough exposition and assessment of the features, as well as the comparison of different strategies to find the optimal feature set for DRC parameter estimation, using automatic feature selection methods. This enables us to draw conclusions about which features are relevant to core DRC parameters. Our results show that conventional time and frequency domain features well known from the literature are sufficient to estimate the DRC's threshold and ratio parameters, while more specialized features are needed for attack and release time, which induce more subtle changes to the signal. }, Author = {Sheng, D. and Fazekas, G.}, Booktitle = {Proc. of the {144th Convention of the Audio Engineering Society}, 23-26 May, Milan, Italy}, Date-Added = {2018-05-07 00:06:23 +0000}, Date-Modified = {2018-05-07 00:09:42 +0000}, Keywords = {feature selection,. intelligent music production, AES, intelligent audio effects}, Local-Url = {sheng2018aes.pdf}, Title = {Feature Selection for Dynamic Range Compressor Parameter Estimation}, Url = {http://www.aes.org/events/144/papers/?ID=5993}, Year = {2018}, Bdsk-Url-1 = {http://www.aes.org/events/144/papers/?ID=5993}} @conference{sheng2018icassp, Abstract = {This paper proposes a method of controlling the dynamic range compressor using sound examples. Our earlier work showed the effectiveness of random forest regression to map acoustic features to effect control parameters. We extend this work to address the challenging task of extracting relevant features when audio events overlap. We assess different audio decomposition approaches such as onset event detection, NMF, and transient/stationary audio separation using ISTA and compare feature extraction strategies for each case. Numerical and perceptual similarity tests show the utility of audio decomposition as well as specific features in the prediction of dynamic range compressor parameters.}, Author = {Sheng, D. and Fazekas, G.}, Booktitle = {Proc. 
of the {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, April 15-20, Calgary, Canada.}, Date-Added = {2018-05-06 23:33:10 +0000}, Date-Modified = {2019-02-08 06:20:19 +0000}, Doi = {10.1109/ICASSP.2018.8461513}, Keywords = {intelligent music production, ICASSP, intelligent audio effects}, Local-Url = {sheng2018icassp.pdf}, Title = {Feature Design Using Audio Decomposition for Intelligent Control of the Dynamic Range Compressor}, Url = {https://2018.ieeeicassp.org/Papers/ViewPapers.asp?PaperNum=3048}, Year = {2018}, Bdsk-Url-1 = {https://2018.ieeeicassp.org/Papers/ViewPapers.asp?PaperNum=3048}} @conference{marengo2018mw, Abstract = {The digitization of art collections is a great opportunity to engage audiences beyond the context of the museum visit. Interfaces to access collections have been initially tailored for professional search tasks: the new challenge is how to design systems for open, casual, and leisure-based explorations. In a human-centered framework, the users' perspective is a fundamental step to design and improve creative solutions. How can we listen to and understand the potential users, in order to design meaningful experiences? How can we collect insights, and what do these tell us about the users and the systems? We explore the use of inquiry techniques as a method to surface the curiosities people have for paintings. During two iterations, visitors of public events wrote questions they had about selected paintings. 138 Post-its were collected and thematically analyzed. Results highlight that curiosities are contextualized, and that artworks are interpreted mainly as scenes. People are interested in meanings and symbols; they also displayed the use of fantasy and empathy. Additionally, we evaluated the effect of age, previous knowledge of the painting, and frequency of visiting museums on the questions' content through statistical analysis. While no strong finding emerged, we noticed that adults and kids likewise display an active role in the inquiry process, and that a previous knowledge of the painting is connected to more descriptive and atomic curiosities. In the discussion, we suggest design opportunities might lay in the interactive discovery of information, in storytelling-based descriptions, and in emotional connection. Our findings suggest that in leisure-based explorations atomic information might not be satisfying, and that descriptions should be contextualized to the painting. Our presentation will be an opportunity to discuss the value of the method, and to comment on how the insights could be embedded into the design of leisure-based experiences.}, Author = {Marengo, L. and Fazekas, G. and Tombros, A.}, Booktitle = {Proc. International Conference on {Museums and the Web 2018}, April 18-21, Vancouver, Canada.}, Date-Added = {2018-05-01 00:11:04 +0000}, Date-Modified = {2018-05-01 00:16:25 +0000}, Keywords = {visual art, information design, inquiry techniques, user requirements, online collections, interaction design}, Title = {I Wonder... Inquiry Techniques As A Method To Gain Insights Into People's Encounters With Visual Art}, Url = {http://mw18.mwconf.org/paper/i-wonder-inquiry-techniques-as-a-method-to-gain-insights-into-peoples-encounters-with-visual-art}, Year = {2018}, Bdsk-Url-1 = {http://mw18.mwconf.org/paper/i-wonder-inquiry-techniques-as-a-method-to-gain-insights-into-peoples-encounters-with-visual-art}} @book{fazekas2017acm, Author = {Fazekas, G. and Barthet, M. and Stockman, T. 
(editors)}, Date-Added = {2017-12-22 01:44:19 +0000}, Date-Modified = {2019-02-08 06:07:31 +0000}, Isbn = {978-1-4503-5373-1}, Keywords = {Audio Mostly, Participatory Sound and Music Experiences}, Publisher = {Association for Computing Machinery (ACM)}, Title = {Proceedings of the 12th International Audio Mostly Conference on Augmented and Participatory Sound and Music Experiences, London, United Kingdom, August 23 - 26, 2017.}, Url = {https://dl.acm.org/citation.cfm?id=3123514}, Year = {2017}, Bdsk-Url-1 = {https://dl.acm.org/citation.cfm?id=3123514}} @conference{wilmering2017aes, Abstract = {Semantic Audio is an emerging field in the intersection of signal processing, machine learning, knowledge representation, and ontologies unifying techniques involving audio analysis and the Semantic Web. These mechanisms enable the creation of new applications and user experiences for music communities. We present a case study focusing on what Semantic Audio can offer to a particular fan base, that of the Grateful Dead, characterized by a profoundly strong affinity with technology and the internet. We discuss an application that combines information drawn from existing platforms and results from the automatic analysis of audio content to infer higher-level musical information, providing novel user experiences particularly in the context of live music events.}, Author = {Wilmering, T. and Thalmann, F. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 143rd Convention of the Audio Engineering Society (e-Brief), Oct. 18-21, New York, USA}, Date-Added = {2017-12-22 20:15:54 +0000}, Date-Modified = {2017-12-22 20:22:06 +0000}, Keywords = {semantic audio, Semantic Web technologies, live music, live music archive, linked data, grateful dead}, Publisher-Url = {http://www.aes.org/e-lib/browse.cfm?elib=19335}, Title = {Bridging Fan Communities and Facilitating Access to Music Archives through Semantic Audio Applications}, Url = {http://www.semanticaudio.net/files/papers/wilmering2017aes.pdf}, Year = {2017}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/wilmering2017aes.pdf}} @conference{choi2017ismir, Abstract = {This paper won the ``Best paper award'' at ISMIR2017 (https://ismir2017.smcnus.org/awards/) --- In this paper, we present a transfer learning approach for music classification and regression tasks. We propose to use a pre-trained convnet feature, a concatenated feature vector using the activations of feature maps of multiple layers in a trained convolutional network. We show how this convnet feature can serve as general-purpose music representation. In the experiments, a convnet is trained for music tagging and then transferred to other music-related classification and regression tasks. The convnet feature outperforms the baseline MFCC feature in all the considered tasks and several previous approaches that are aggregating MFCCs as well as low- and high-level music features.}, Author = {Choi, K. and Fazekas, G. and Sandler, M. and Cho, K.}, Booktitle = {Proc. 18th International Society for Music Information Retrieval Conference (ISMIR), Oct.
23-27, Suzhou, China}, Date-Added = {2017-12-22 15:07:18 +0000}, Date-Modified = {2017-12-22 15:13:32 +0000}, Keywords = {transfer learning, CNN, DNN, genre classification, music emotion regressions, acoustic event detection}, Local-Url = {https://arxiv.org/abs/1703.09179}, Title = {Transfer learning for music classification and regression tasks [best paper award]}, Url = {https://ismir2017.smcnus.org/wp-content/uploads/2017/10/12_Paper.pdf}, Year = {2017}, Bdsk-Url-1 = {https://ismir2017.smcnus.org/wp-content/uploads/2017/10/12_Paper.pdf}} @conference{sheng2017dafx, Abstract = {Practical experience with audio effects as well as knowledge of their parameters and how they change the sound is crucial when controlling digital audio effects. This often presents barriers for musicians and casual users in the application of effects. These users are more accustomed to describing the desired sound verbally or using examples, rather than understanding and configuring low-level signal processing parameters. This paper addresses this issue by providing a novel control method for audio effects. While a significant body of works focus on the use of semantic descriptors and visual interfaces, little attention has been given to an important modality, the use of sound examples to control effects. We use a set of acoustic features to capture important characteristics of sound examples and evaluate different regression models that map these features to effect control parameters. Focusing on dynamic range compression, results show that our approach provides a promising first step in this direction.}, Author = {Sheng, D. and Fazekas, G.}, Booktitle = {Proc. of the 20th International Conference on Digital Audio Effects (DAFx-17), September 5--9, Edinburgh, UK}, Date-Added = {2017-12-22 20:08:43 +0000}, Date-Modified = {2017-12-22 20:13:07 +0000}, Keywords = {intelligent music production, DAFX, intelligent audio effects}, Local-Url = {sheng2017dafx.pdf}, Title = {Automatic Control Of The Dynamic Range Compressor Using A Regression Model And A Reference Sound}, Url = {http://www.dafx17.eca.ed.ac.uk/papers/DAFx17_paper_44.pdf}, Year = {2017}, Bdsk-Url-1 = {http://www.dafx17.eca.ed.ac.uk/papers/DAFx17_paper_44.pdf}} @book{marengo2017hci, Abstract = {As many cultural institutions are publishing digital heritage material on the web, a new type of user emerged, that casually interacts with the art collection in his/her free time, driven by intrinsic curiosity more than by a professional duty or an informational goal. Can choices in how the interaction with data is structured increase engagement of such users? In our exploratory study, we use the WikiArt project as a case study to analyse how users approach search interfaces for free exploration. Our preliminary results show that, despite the remarkable diversity of artworks available, users rely on familiarity as their main criterion to navigate the website; they stay within known topics and rarely discover new ones. Users show interest in heterogeneous datasets, but their engagement is rarely sustained, while the presence of slightly unrelated artworks in a set can increase curiosity and self-reflection. Finally, we discuss the role of the database's perceived size on users' expectations.}, Author = {Marengo, L. and Fazekas, G. and Tombros, A.}, Booktitle = {Proc.
19th International Conference on Human-Computer Interaction (HCI'17), 9-14 July, Vancouver, Canada}, Date-Added = {2017-12-22 18:06:25 +0000}, Date-Modified = {2017-12-22 18:42:08 +0000}, Doi = {10.1007/978-3-319-58753-0_82}, Keywords = {information retrieval, information seeking, casual interaction, curiosity, engagement}, Pages = {538-590}, Publisher = {Springer, Cham}, Publisher-Url = {https://link.springer.com/chapter/10.1007%2F978-3-319-58753-0_82}, Series = {Communications in Computer and Information Science}, Title = {The Interaction of Casual Users with Digital Collections of Visual Art: {An Exploratory Study of the WikiArt Website}}, Url = {http://www.semanticaudio.net/files/papers/marengo2017hci.pdf}, Volume = {714}, Year = {2017}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/marengo2017hci.pdf}, Bdsk-Url-2 = {https://dx.doi.org/10.1007/978-3-319-58753-0_82}} @conference{liang2017aes, Abstract = {Automatic detection of piano pedaling techniques is challenging as it is comprised of subtle nuances of piano timbres. In this paper we address this problem on single notes using decision-tree-based support vector machines. Features are extracted from harmonics and residuals based on physical acoustics considerations and signal observations. We consider four distinct pedaling techniques on the sustain pedal (anticipatory full, anticipatory half, legato full, and legato half pedaling) and create a new isolated-note dataset consisting of different pitches and velocities for each pedaling technique plus notes played without pedal. Experiment shows the effectiveness of the designed features and the learned classifiers for discriminating pedaling techniques from the cross-validation trials.}, Author = {Liang, B. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 143rd Convention of the Audio Engineering Society, Oct. 18-21, New York, USA}, Date-Added = {2017-12-22 19:04:30 +0000}, Date-Modified = {2017-12-22 19:14:26 +0000}, Keywords = {pedaling recognition from audio, spectral modeling and feature extraction, machine learning}, Publisher-Url = {http://www.aes.org/e-lib/browse.cfm?elib=19209}, Title = {Detection of Piano Pedaling Techniques on the Sustain Pedal}, Url = {http://www.semanticaudio.net/files/papers/liang2017aes-preprint.pdf}, Year = {2017}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/liang2017aes-preprint.pdf}} @conference{sheng2017dmrn, Abstract = {We propose a method for the intelligent control of the dynamic range compressor targeting mono-timbral loops. Initial research using random forest regression has been shown to work in the context of isolated notes. Since audio loops have become important in many production scenarios, this paper addresses this problem by decomposing loops into appropriate inputs for the initial system. We explore three types of audio decomposition approaches, onset event detection, NMF, and audio transient/stationary separation using ISTA, and extract features correspondingly. Results show a convincing trend that using features extracted in the decomposition domain to train the regression model improves the performance both numerically and perceptually. }, Author = {Sheng, D. and Fazekas, G.}, Booktitle = {Proc. Digital Music Research Network Workshop (DMRN+12), Dec.
19, London, UK}, Date-Added = {2017-12-22 20:37:14 +0000}, Date-Modified = {2019-02-07 23:35:05 +0000}, Keywords = {intelligent audio effects, intelligent music production, DAFX, dynamic range compression, feature extraction}, Title = {Feature design for intelligent control of the dynamic range compressor using audio decomposition}, Url = {http://www.semanticaudio.net/files/papers/sheng2017dmrnea.pdf}, Year = {2017}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/sheng2017dmrnea.pdf}, Bdsk-Url-2 = {https://dx.doi.org/10.26494/DMRN.2017.30583}} @conference{liang2017nime, Abstract = {This paper presents the results of a study of piano pedalling techniques on the sustain pedal using a newly designed measurement system named Piano Pedaller. The system is comprised of an optical sensor mounted in the piano pedal bearing block and an embedded platform for recording audio and sensor data. This enables recording the pedalling gesture of real players and the piano sound under normal playing conditions. Using the gesture data collected from the system, the task of classifying these data by pedalling technique was undertaken using a Support Vector Machine (SVM). Results can be visualised in an audio based score following application to show pedalling together with the player's position in the score.}, Author = {Liang, B. and Fazekas, G. and McPherson, A. and Sandler, M.}, Booktitle = {Proc. of the International Conference on New Interfaces for Musical Expression (NIME), May 15-18, Copenhagen, Denmark}, Date-Added = {2017-12-22 18:53:42 +0000}, Date-Modified = {2017-12-22 19:03:05 +0000}, Keywords = {piano gesture recognition, optical sensor, real-time data acquisition, bela, music informatics}, Local-Url = {https://pdfs.semanticscholar.org/fd00/fcfba2f41a3f182d2000ca4c05fb2b01c475.pdf}, Pages = {325-329}, Publisher-Url = {http://homes.create.aau.dk/dano/nime17/}, Title = {Piano Pedaller: A Measurement System for Classification and Visualisation of Piano Pedalling Techniques}, Url = {http://www.nime.org/proceedings/2017/nime2017_paper0062.pdf}, Year = {2017}, Bdsk-Url-1 = {http://www.nime.org/proceedings/2017/nime2017_paper0062.pdf}} @conference{barthet2016chi, Abstract = {We discuss several state-of-the-art systems that propose new paradigms and user workflows for music composition, production, performance, and listening. We focus on a selection of systems that exploit recent advances in semantic and affective computing, music information retrieval (MIR) and semantic web, as well as insights from fields such as mobile computing and information visualisation. These systems offer the potential to provide transformative experiences for users, which is manifested in creativity, engagement, efficiency, discovery and affect.}, Author = {Barthet, M. and Fazekas, G. and Thalmann, F. and Sandler, M. and Wiggins, G.A.}, Booktitle = {Proc. 
ACM Conference on Human Factors in Computing Systems (CHI), May 7--12, San Jose, CA, USA.}, Date-Added = {2017-12-22 18:26:58 +0000}, Date-Modified = {2017-12-22 18:38:33 +0000}, Keywords = {mood-based interaction, intelligent music production, HCI}, Local-Url = {https://qmro.qmul.ac.uk/xmlui/handle/123456789/12502}, Publisher-Url = {http://mcl.open.ac.uk/music-chi/uploads/19/HCIMUSIC_2016_paper_15.pdf}, Title = {Crossroads: Interactive Music Systems Transforming Performance, Production and Listening}, Url = {https://qmro.qmul.ac.uk/xmlui/bitstream/handle/123456789/12502/Barthet%20Crossroads%3A%20Interactive%20Music%20Systems%202016%20Accepted.pdf}, Year = {2016}, Bdsk-Url-1 = {https://qmro.qmul.ac.uk/xmlui/bitstream/handle/123456789/12502/Barthet%20Crossroads%3A%20Interactive%20Music%20Systems%202016%20Accepted.pdf}} @conference{page2017jcdl, Abstract = {Building upon a collection with functionality for discovery and analysis has been described by Lynch as a `layered' approach to digital libraries. Meanwhile, as digital corpora have grown in size, their analysis is necessarily supplemented by automated application of computational methods, which can create layers of information as intricate and complex as those within the content itself. This combination of layers - aggregating homogeneous collections, specialised analyses, and new observations - requires a flexible approach to systems implementation which enables pathways through the layers via common points of understanding, while simultaneously accommodating the emergence of previously unforeseen layers. In this paper we follow a Linked Data approach to build a layered digital library based on content from the Internet Archive Live Music Archive. Starting from the recorded audio and basic information in the Archive, we first deploy a layer of catalogue metadata which allows an initial - if imperfect - consolidation of performer, song, and venue information. A processing layer extracts audio features from the original recordings, workflow provenance, and summary feature metadata. A further analysis layer provides tools for the user to combine audio and feature data, discovered and reconciled using interlinked catalogue and feature metadata from layers below. Finally, we demonstrate the feasibility of the system through an investigation of `key typicality' across performances. This highlights the need to incorporate robustness to inevitable `imperfections' when undertaking scholarship within the digital library, be that from mislabelling, poor quality audio, or intrinsic limitations of computational methods. We do so not with the assumption that a `perfect' version can be reached; but that a key benefit of a layered approach is to allow accurate representations of information to be discovered, combined, and investigated for informed interpretation.}, Author = {Page, K. and Bechhofer, S. and Fazekas, G. and Weigl D. 
and Wilmering, T.}, Booktitle = {ACM/IEEE Joint Conference on Digital Libraries (JCDL), June 19-23, Toronto, Canada}, Date-Added = {2017-12-22 17:57:51 +0000}, Date-Modified = {2017-12-22 21:07:08 +0000}, Doi = {10.1109/JCDL.2017.7991563}, Keywords = {Semantic Audio, Metadata, Feature extraction, Resource description framework, Databases, Ontologies}, Pages = {1-10}, Title = {Realising a Layered Digital Library: Exploration and Analysis of the Live Music Archive through Linked Data}, Url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7991563}, Year = {2017}, Bdsk-Url-1 = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7991563}, Bdsk-Url-2 = {https://dx.doi.org/10.1109/JCDL.2017.7991563}} @book{bechhofer2017iswc, Abstract = {We describe the publication of a linked data set exposing metadata from the Internet Archive Live Music Archive along with detailed feature analysis data of the audio files contained in the archive. The collection is linked to existing musical and geographical resources allowing for the extraction of useful or interesting subsets of data using additional metadata. The collection is published using a `layered' approach, aggregating the original information with links and specialised analyses, and forms a valuable resource for those investigating or developing audio analysis tools and workflows.}, Author = {Bechhofer, S. and Page, K. and Weigl, D. and Fazekas, G. and Wilmering, T.}, Booktitle = {The Semantic Web, Proc. of the 16th International Semantic Web Conference (ISWC), Oct. 21-25, Vienna, Austria}, Date-Added = {2017-12-22 15:39:21 +0000}, Date-Modified = {2017-12-22 15:53:18 +0000}, Doi = {10.1007/978-3-319-68204-4_3}, Keywords = {Linked Data, Semantic Audio, Semantic Web, live music archive}, Local-Url = {https://link.springer.com/chapter/10.1007/978-3-319-68204-4_3}, Pages = {29-37}, Publisher = {Springer, Cham}, Series = {Lecture Notes in Computer Science}, Title = {Linked Data Publication of Live Music Archives and Analyses}, Url = {https://iswc2017.semanticweb.org/wp-content/uploads/papers/MainProceedings/221.pdf}, Volume = {10588}, Year = {2017}, Bdsk-Url-1 = {https://iswc2017.semanticweb.org/wp-content/uploads/papers/MainProceedings/221.pdf}, Bdsk-Url-2 = {https://dx.doi.org/10.1007/978-3-319-68204-4_3}} @conference{choi2017tutorial, Abstract = {Following their success in Computer Vision and other areas, deep learning techniques have recently become widely adopted in Music Information Retrieval (MIR) research. However, the majority of works aim to adopt and assess methods that have been shown to be effective in other domains, while there is still a great need for more original research focusing on music primarily and utilising musical knowledge and insight. The goal of this paper is to boost the interest of beginners by providing a comprehensive tutorial and reducing the barriers to entry into deep learning for MIR. We lay out the basic principles and review prominent works in this hard to navigate field. We then outline the network structures that have been successful in MIR problems and facilitate the selection of building blocks for the problems at hand. Finally, guidelines for new tasks and some advanced topics in deep learning are discussed to stimulate new research in this fascinating field. }, Author = {Choi, K. and Fazekas, G. and Cho, K.
and Sandler, M.}, Booktitle = {Journal Paper - arXiv preprint}, Date-Added = {2017-12-22 15:34:37 +0000}, Date-Modified = {2017-12-22 15:37:50 +0000}, Keywords = {Deep Learning, tutorial, Semantic Audio, Music Information Retrieval}, Local-Url = {https://arxiv.org/abs/1709.04396}, Title = {A Tutorial on Deep Learning for Music Information Retrieval}, Url = {https://arxiv.org/pdf/1709.04396.pdf}, Year = {2017}, Bdsk-Url-1 = {https://arxiv.org/pdf/1709.04396.pdf}} @conference{choi2017icassp, Abstract = {We introduce a convolutional recurrent neural network (CRNN) for music tagging. CRNNs take advantage of convolutional neural networks (CNNs) for local feature extraction and recurrent neural networks for temporal summarisation of the extracted features. We compare CRNN with three CNN structures that have been used for music tagging while controlling the number of parameters with respect to their performance and training time per sample. Overall, we found that CRNNs show a strong performance with respect to the number of parameter and training time, indicating the effectiveness of its hybrid structure in music feature extraction and feature summarisation.}, Author = {Choi, K. and Fazekas, G. and Sandler, M. and Cho, K.}, Booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), March 5-9, New Orleans, USA}, Date-Added = {2017-12-22 15:20:33 +0000}, Date-Modified = {2017-12-22 19:28:47 +0000}, Doi = {10.1109/ICASSP.2017.7952585}, Keywords = {Deep Learning, CRNN, music tagging}, Local-Url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7952585}, Pages = {2392-2396}, Title = {Convolutional Recurrent Neural Networks for Music Classification}, Url = {https://arxiv.org/pdf/1609.04243.pdf}, Year = {2017}, Bdsk-Url-1 = {https://arxiv.org/pdf/1609.04243.pdf}, Bdsk-Url-2 = {https://dx.doi.org/10.1109/ICASSP.2017.7952585}} @conference{pauwels2017ismir, Abstract = {Inspired by previous work on confidence measures for tempo estimation in loops, we explore ways to add confidence measures to other music labelling tasks. We start by reflecting on the reasons why the work on loops was successful and argue that it is an example of the ideal scenario in which it is possible to define a confidence measure independently of the estimation algorithm. This requires additional domain knowledge not used by the estimation algorithm, which is rarely available. Therefore we move our focus to defining confidence measures for hidden Markov models, a technique used in multiple music information retrieval systems and beyond. We propose two measures that are oblivious to the specific labelling task, trading off performance for computational requirements. They are experimentally validated by means of a chord estimation task. Finally, we have a look at alternative uses of confidence measures, besides those applications that require a high precision rather than a high recall, such as most query retrievals.}, Author = {Pauwels, J. and O'hanlon, K. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. 18th International Society for Music Information Retrieval Conference (ISMIR), Oct. 
23-27, Suzhou, China}, Date-Added = {2017-12-22 14:56:50 +0000}, Date-Modified = {2017-12-22 15:05:47 +0000}, Keywords = {music labelling, chord and key recognition, probabilistic models, confidence measure, usability, channel separation from stereo signals, Audio Commons}, Local-Url = {https://qmro.qmul.ac.uk/xmlui/handle/123456789/30483}, Pages = {279 - 279}, Title = {Confidence Measures and Their Applications in Music Labelling Systems Based on Hidden Markov Models}, Url = {https://ismir2017.smcnus.org/wp-content/uploads/2017/10/195_Paper.pdf}, Year = {2017}, Bdsk-Url-1 = {https://ismir2017.smcnus.org/wp-content/uploads/2017/10/195_Paper.pdf}} @conference{liang2017dmrn, Abstract = {Notations of piano pedalling technique in the music score are usually lacking in detail: they provide boundary locations of pedalling techniques, but do not indicate what musical attribute prompts the pedalling change. Understanding this relationship would be useful for musicology and piano pedagogy. We propose to model how musically-motivated features correlate with pedalling transitions. Our aim is to employ this model as prior information for the detection of pedal onsets and offsets from audio recordings.}, Author = {Liang, B. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. Digital Music Research Network Workshop (DMRN+12), Dec. 19, London, UK}, Date-Added = {2017-12-22 20:26:25 +0000}, Date-Modified = {2017-12-22 20:42:03 +0000}, Doi = {10.26494/DMRN.2017.30583}, Keywords = {gesture recognition, piano pedaling, feature extraction, expressive performance, symbolic music analysis}, Title = {Discovering Feature Relevance in Pedalling Analyses of Piano Music}, Url = {http://www.semanticaudio.net/files/papers/liang2017dmrnea.pdf}, Year = {2017}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/liang2017dmrnea.pdf}, Bdsk-Url-2 = {https://dx.doi.org/10.26494/DMRN.2017.30583}} @conference{liang2017am, Abstract = {This paper presents a study of piano pedalling technique recognition on the sustain pedal utilising gesture data that is collected using a novel measurement system. The recognition is comprised of two separate tasks: onset/offset detection and classification. The onset and offset time of each pedalling technique was computed through signal processing algorithms. Based on features extracted from every segment when the pedal is pressed, the task of classifying the segments by pedalling technique was undertaken using machine learning methods. We exploited and compared a Support Vector Machine (SVM) and a hidden Markov model (HMM) for classification. Recognition results can be represented by customised pedalling notations and visualised in a score following system.}, Author = {Liang, B. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the ACM 12th International Audio Mostly Conference on Augmented and Participatory Sound and Music Experiences, Aug.
23-26, London, United Kingdom}, Date-Added = {2017-12-22 18:43:40 +0000}, Date-Modified = {2017-12-22 18:50:24 +0000}, Doi = {10.1145/3123514.3123535}, Keywords = {gesture recognition, expressive performance, signal processing, machine learning, SVM, HMM, piano pedaling}, Local-Url = {http://www.semanticaudio.net/files/papers/liang2017am.pdf}, Publisher-Url = {https://dl.acm.org/citation.cfm?id=3123514.3123535}, Title = {Recognition of Piano Pedalling Techniques Using Gesture Data}, Url = {https://dl.acm.org/citation.cfm?id=3123514.3123535}, Year = {2017}, Bdsk-Url-1 = {https://dl.acm.org/citation.cfm?id=3123514.3123535}, Bdsk-Url-2 = {https://dx.doi.org/10.1145/3123514.3123535}} @conference{allik2016wac, Abstract = {myMoodplay is a web app that allows users to interactively discover music by selecting desired emotions. The application uses the Web Audio API, JavaScript animation for visualisation, linked data formats and affective computing technologies. We explore how artificial intelligence, the Semantic Web and audio synthesis can be combined to provide new personalised online musical experiences. Users can choose degrees of energy and pleasantness to shape the desired musical mood trajectory. Semantic Web technologies have been embedded in the system to query mood coordinates from a triple store using a SPARQL endpoint and to connect to external linked data sources for metadata.}, Author = {Allik, A. and Fazekas, G. and Barthet, M. and Sandler, M.}, Booktitle = {Proc. of the 2nd Web Audio Conference (WAC), April 4--6, Atlanta, Georgia, USA.}, Date-Added = {2017-12-29 19:26:47 +0000}, Date-Modified = {2017-12-29 19:38:36 +0000}, Keywords = {Semantic Audio, mood-based interaction, Ontology-based systems}, Local-Url = {http://www.semanticaudio.net/files/papers/allik2016wac.pdf}, Title = {myMoodplay: An interactive mood-based music discovery app}, Url = {http://hdl.handle.net/1853/54589}, Year = {2016}, Bdsk-Url-1 = {http://hdl.handle.net/1853/54589}} @article{barthet2015am, Abstract = {Moodplay is a system that allows users to collectively control music and lighting effects to express desired emotions. The interaction is based on the Mood Conductor participatory performance system that uses web, data visualisation and affective computing technologies. We explore how artificial intelligence, semantic web and audio synthesis can be combined to provide new personalised and immersive musical experiences. Participants can choose degrees of energy and pleasantness to shape the music played using a web interface. Semantic Web technologies have been embedded in the system to query mood coordinates from a triple store using a SPARQL endpoint and to connect to external linked data sources for metadata. }, Author = {Barthet, M. and Fazekas, G. and Allik, A. and Sandler, M.}, Date-Added = {2017-12-29 19:18:29 +0000}, Date-Modified = {2017-12-29 19:22:13 +0000}, Doi = {10.1145/2814895.2814922}, Isbn = {978-1-4503-3896-7}, Journal = {Proc. of the ACM Audio Mostly International Conference, 7-9 Oct.
Thessaloniki, Greece.}, Keywords = {Semantic Audio, Music and Emotion, Ontology-based systems, Music Performance and Interactive Systems}, Local-Url = {https://qmro.qmul.ac.uk/xmlui/bitstream/handle/123456789/12173/Barthet%20Moodplay%3A%20an%20interactive%20mood-based%20musical%20experience%202015%20Published.pdf}, Title = {Moodplay: an interactive mood-based musical experience}, Url = {https://dl.acm.org/citation.cfm?id=2814922}, Year = {2015}, Bdsk-Url-1 = {http://audiomostly.com/keynote/george-fazekas/}} @book{juric2016mtsr, Abstract = {Creating an ecosystem that will tie together the content, technologies and tools in the field of digital music and audio is possible if all the entities of the ecosystem share the same vocabulary and high quality metadata. Creation of such metadata will allow the creative industries to retrieve and reuse the content of Creative Commons audio in innovative new ways. In this paper we present a highly automated method capable of exploiting already existing API (Application Programming Interface) descriptions about audio content and turning it into a knowledge base that can be used as a building block for ontologies describing audio related entities and services.}, Author = {Juric, D. and Fazekas, G.}, Booktitle = {Proc. Metadata and Semantics Research (MTSR), Nov. 22-25, G{\"o}ttingen, Germany}, Date-Added = {2017-12-21 20:32:50 +0000}, Date-Modified = {2017-12-22 13:03:51 +0000}, Doi = {10.1007/978-3-319-49157-8_5}, Keywords = {Metadata, Audio content, Ontologies, Natural language processing, Knowledge extraction, Audio Commons}, Local-Url = {http://www.semanticaudio.net/files/papers/juric2016mtsr.pdf}, Pages = {55-66}, Publisher = {Springer, Cham}, Publisher-Url = {https://link.springer.com/chapter/10.1007/978-3-319-49157-8_5}, Series = {Communications in Computer and Information Science,}, Title = {Knowledge Extraction from Audio Content Service Providers' API Descriptions}, Url = {http://www.semanticaudio.net/files/papers/juric2016mtsr.pdf}, Volume = {672}, Year = {2016}, Bdsk-Url-1 = {https://link.springer.com/chapter/10.1007/978-3-319-49157-8_5}, Bdsk-Url-2 = {https://dx.doi.org/10.1007/978-3-319-49157-8_5}} @conference{chiliguano2016hybrid, Abstract = {A vast amount of Internet resources is available today, including songs, albums, playlists and podcasts, which a user cannot discover without a tool to filter the items they might consider relevant. Several recommendation techniques have been developed since the Internet explosion to achieve this filtering task. In an attempt to recommend relevant songs to users, we propose a hybrid recommender that considers real-world user information and a high-level representation of audio data. We use a deep learning technique, convolutional deep neural networks, to represent an audio segment as an n-dimensional vector whose dimensions define the probability that the segment belongs to a specific music genre. To capture the listening behavior of a user, we investigate a state-of-the-art technique, estimation of distribution algorithms. The resulting hybrid music recommender produces better predictions than a traditional content-based recommender.}, Author = {Chiliguano, P. 
and Fazekas, G.}, Booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 20-25 March, Shanghai, China}, Date-Added = {2017-12-21 19:19:39 +0000}, Date-Modified = {2017-12-21 19:28:25 +0000}, Doi = {10.1109/ICASSP.2016.7472151}, Issn = {2379-190X}, Keywords = {Estimation of Distribution Algorithms (EDA), Deep Learning, CNN}, Title = {Hybrid music recommender using content-based and social information}, Url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7472151}, Year = {2016}, Bdsk-Url-1 = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7472151}, Bdsk-Url-2 = {https://dx.doi.org/10.1109/ICASSP.2016.7472151}} @conference{font2016aesg, Abstract = {Significant amounts of user-generated audio content, such as sound effects, musical samples and music pieces, are uploaded to online repositories and made available under open licenses. Moreover, a constantly increasing amount of multimedia content, originally released with traditional licenses, is becoming public domain as its license expires. Nevertheless, the creative industries are not yet using much of all this content in their media productions. There is still a lack of familiarity and understanding of the legal context of all this open content, but there are also problems related to its accessibility. A large proportion of this content remains unreachable either because it is not published online or because it is not well organised and annotated. In this paper we present the Audio Commons Initiative, which is aimed at promoting the use of open audio content and at developing technologies with which to support the ecosystem composed of content repositories, production tools and users. These technologies should enable the reuse of this audio material, facilitating its integration in the production workflows used by the creative industries. This is a position paper in which we describe the core ideas behind this initiative and outline the ways in which we plan to address the challenges it poses.}, Author = {Font, F. and Brookes, T. and Fazekas, G. and Guerber, M. and La Burthe, A. and Plans, D. and Plumbley, M. and Wang, W. and Serra, X.}, Booktitle = {Proc. AES 61st International Conference on Audio for Games, Feb 10--12, London, UK}, Date-Added = {2017-12-22 14:36:41 +0000}, Date-Modified = {2017-12-22 14:56:04 +0000}, Keywords = {music informatics, open sound content, Creative Commons, Audio Commons, Game audio, music production, video production}, Local-Url = {http://www.aes.org/e-lib/browse.cfm?elib=18093}, Title = {Audio Commons: Bringing Creative Commons Audio Content to the Creative Industries}, Url = {http://www.semanticaudio.net/files/papers/font2016aes.pdf}, Year = {2016}, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/font2016aes.pdf}} @conference{choi2016umap, Author = {Choi, K. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. 
24th ACM Conference on User Modeling, Adaptation and Personalisation (UMAP 2016), Workshop on Surprise, Opposition, and Obstruction in Adaptive and Personalized Systems (SOAP) June 13--17, Halifax, Canada}, Date-Added = {2017-12-22 14:17:02 +0000}, Date-Modified = {2017-12-22 15:19:16 +0000}, Doi = {10.1145/1235}, Keywords = {playlist generation, semantic audio, music transition modeling}, Local-Url = {https://arxiv.org/pdf/1606.02096.pdf}, Title = {Towards Playlist Generation Algorithms Using RNNs Trained on Within-Track Transitions}, Url = {http://ceur-ws.org/Vol-1618/SOAP_paper4.pdf}, Year = {2016}, Bdsk-Url-1 = {http://ceur-ws.org/Vol-1618/SOAP_paper4.pdf}, Bdsk-Url-2 = {https://dx.doi.org/10.1145/1235}} @conference{choi2016text, Abstract = {In this paper, we introduce new methods and discuss results of text-based LSTM (Long Short-Term Memory) networks for automatic music composition. The proposed network is designed to learn relationships within text documents that represent chord progressions and drum tracks in two case studies. In the experiments, word-RNNs (Recurrent Neural Networks) show good results for both cases, while character-based RNNs (char-RNNs) only succeed in learning chord progressions. The proposed system can be used for fully automatic composition or as a semi-automatic system that helps humans to compose music by controlling a diversity parameter of the model.}, Author = {Choi, K. and Fazekas, G. and Sandler, M.}, Booktitle = {1st Conference on Computer Simulation of Musical Creativity, 17-19 June, University of Huddersfield, UK.}, Doi = {10.48550/arXiv.1604.05358}, Keywords = {LSTM, music generation}, Title = {Text-Based LSTM Networks for Automatic Music Composition}, Url = {https://arxiv.org/pdf/1604.05358}, Year = {2016}} @book{allik2016iswc, Abstract = {Feature extraction algorithms in Music Informatics aim at deriving statistical and semantic information directly from audio signals. These may range from energies in several frequency bands to musical information such as key, chords or rhythm. There is an increasing diversity and complexity of features and algorithms in this domain and applications call for a common structured representation to facilitate interoperability, reproducibility and machine interpretability. We propose a solution relying on Semantic Web technologies that is designed to serve a dual purpose (1) to represent computational workflows of audio features and (2) to provide a common structure for feature data to enable the use of Open Linked Data principles and technologies in Music Informatics. The Audio Feature Ontology is based on the analysis of existing tools and music informatics literature, which was instrumental in guiding the ontology engineering process. The ontology provides a descriptive framework for expressing different conceptualisations of the audio feature extraction domain and enables designing linked data formats for representing feature data. In this paper, we discuss important modelling decisions and introduce a harmonised ontology library consisting of modular interlinked ontologies that describe the different entities and activities involved in music creation, production and publishing.}, Author = {Allik, A. and Fazekas, G. and Sandler, M.}, Booktitle = {The Semantic Web, proc. of the 15th International Semantic Web Conference (ISWC), Oct. 
17--21, Kobe, Japan}, Date-Added = {2017-12-22 13:58:35 +0000}, Date-Modified = {2017-12-22 13:58:35 +0000}, Doi = {10.1007/978-3-319-46547-0_1}, Keywords = {Semantic audio analysis, Music Information Retrieval, Linked open data, Semantic Web technologies}, Local-Url = {http://www.semanticaudio.net/files/papers/allik2016iswc.pdf}, Pages = {3-11}, Publisher = {Springer, Cham}, Series = {Lecture Notes in Computer Science,}, Title = {Ontological Representation of Audio Features}, Url = {http://www-kasm.nii.ac.jp/iswc2016/papers/paper_R59_.pdf}, Volume = {9982}, Year = {2016}, Bdsk-Url-1 = {http://www-kasm.nii.ac.jp/iswc2016/papers/paper_R59_.pdf}, Bdsk-Url-2 = {https://dx.doi.org/10.1007/978-3-319-46547-0_1}} @book{allik2016iswcd, Abstract = {This paper was a nominee for the ``People's Choice Best Demonstration Award'' --- This demo presents MusicWeb, a novel platform for linking music artists within a web-based application for discovering associations between them. MusicWeb provides a browsing experience using connections that are either extra-musical or tangential to music, such as the artists' political affiliation or social influence, or intra-musical, such as the artists' main instrument or most favoured musical key. The platform integrates open linked semantic metadata from various Semantic Web, music recommendation and social media data sources. The connections are further supplemented by thematic analysis of journal articles, blog posts and content-based similarity measures focussing on high level musical categories.}, Author = {Allik, A. and Mora-Mcginity, M. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. 15th International Semantic Web Conference (ISWC), Posters & Demonstrations Track, Oct. 17--21, Kobe, Japan}, Date-Added = {2017-12-22 13:33:35 +0000}, Date-Modified = {2018-07-31 23:27:18 +0000}, Keywords = {Semantic Web, Linked Open Data, music metadata, semantic audio analysis, music information retrieval}, Local-Url = {http://iswc2016.semanticweb.org/pages/program/awards.html}, Series = {CEUR Workshop Proceedings}, Title = {MusicWeb: Music Discovery with Open Linked Semantic Metadata [nominee, best demo award]}, Url = {http://ceur-ws.org/Vol-1690/paper47.pdf}, Volume = {1690}, Year = {2016}, Bdsk-Url-1 = {http://ceur-ws.org/Vol-1690/paper47.pdf}} @conference{carrillo2016wac, Abstract = {We present a web-based cross-platform adaptive music player that combines music information retrieval (MIR) and audio processing technologies with the interaction capabilities offered by GPS-equipped mobile devices. The application plays back a list of music tracks, which are linked to geographic paths in a map. The music player has two main enhanced features that adjust to the location of the user, namely, adaptable length of the songs and automatic transitions between tracks. Music tracks are represented as data packages containing audio and metadata (descriptive and behavioral) that build on the concept of the Digital Music Object (DMO). This representation, in line with next-generation web technologies, allows for flexible production and consumption of novel musical experiences. A content provider assembles a data pack with music, descriptive analysis and action parameters that users can experience and control within the restrictions and templates defined by the provider.}, Author = {Carrillo, A. and Thalmann, F. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. 
Web Audio Conference WAC-2016, April 4--6, Atlanta, USA}, Date-Added = {2017-12-22 20:02:39 +0000}, Date-Modified = {2017-12-22 20:05:50 +0000}, Keywords = {adaptive music, intelligent music player, semantic audio, feature extraction}, Title = {Geolocation Adaptive Music Player}, Url = {https://smartech.gatech.edu/bitstream/handle/1853/54586/WAC2016-47.pdf}, Year = {2016}, Bdsk-Url-1 = {https://smartech.gatech.edu/bitstream/handle/1853/54586/WAC2016-47.pdf}} @conference{thalmann2016wac, Abstract = {The Semantic Music Player is a cross-platform web and mobile app built with Ionic and the Web Audio API that explores new ways of playing back music on mobile devices, particularly indeterministic, context-dependent, and interactive ways. It is based on Dynamic Music Objects, a format that represents musical content and structure in an abstract way and makes it modifiable within definable constraints. For each Dynamic Music Object, the Semantic Music Player generates a custom graphical interface and enables appropriate user interface controls and mobile sensors based on its requirements. When the object is played back, the player takes spontaneous decisions based on the given structural information and the analytical data and reacts to sensor and user interface inputs. In this paper, we introduce the player and its underlying concepts and give some examples of the potentially infinite number of use cases and musical results.}, Author = {Thalmann, F. and Perez Carillo, A. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. Web Audio Conference WAC-2016, April 4--6, Atlanta, USA}, Date-Added = {2017-12-22 19:47:03 +0000}, Date-Modified = {2017-12-22 19:52:36 +0000}, Keywords = {ontologies, mobile music player, mobile applications, mobile audio ontology}, Local-Url = {thalmann2016wac.pdf}, Publisher-Url = {http://hdl.handle.net/1853/54596}, Title = {The Semantic Music Player: A Smart Mobile Player Based on Ontological Structures and Analytical Feature Metadata}, Url = {https://smartech.gatech.edu/bitstream/handle/1853/54596/WAC2016-71.pdf}, Year = {2016}, Bdsk-Url-1 = {https://smartech.gatech.edu/bitstream/handle/1853/54596/WAC2016-71.pdf}} @book{wilmering2016iswc, Abstract = {This paper introduces the Audio Effect Ontology (AUFX-O) building on previous theoretical models describing audio processing units and workflows in the context of music production. We discuss important conceptualisations of different abstraction layers, their necessity to successfully model audio effects, and their application method. We present use cases concerning the use of effects in music production projects and the creation of audio effect metadata facilitating a linked data service exposing information about effect implementations. By doing so, we show how our model facilitates knowledge sharing, reproducibility and analysis of audio production workflows.}, Author = {Wilmering, T. and Fazekas, G. and Sandler, M.}, Booktitle = {The Semantic Web, proc. of the 15th International Semantic Web Conference (ISWC), Oct. 
17--21, Kobe, Japan}, Date-Added = {2017-12-22 13:17:52 +0000}, Date-Modified = {2017-12-22 14:03:51 +0000}, Doi = {10.1007/978-3-319-46547-0_24}, Keywords = {Semantic audio analysis, Music Information Retrieval, Linked open data, Semantic Web technologies}, Local-Url = {http://www.semanticaudio.net/files/papers/wilmerin2016iswc.pdf}, Pages = {229-237}, Publisher = {Springer, Cham}, Series = {Lecture Notes in Computer Science,}, Title = {AUFX-O: Novel Methods for the Representation of Audio Processing Workflows}, Url = {http://www-kasm.nii.ac.jp/iswc2016/papers/paper_R60_.pdf}, Volume = {9982}, Year = {2016}, Bdsk-Url-1 = {http://www-kasm.nii.ac.jp/iswc2016/papers/paper_R60_.pdf}, Bdsk-Url-2 = {https://dx.doi.org/10.1007/978-3-319-46547-0_24}} @conference{thalmann2016icsc, Author = {Thalmann, F. and Carrillo, A. and Fazekas, G. and Wiggins, G. A. and Sandler, M.}, Booktitle = {IEEE International Conference on Semantic Computing (ICSC), Feb. 4-6, Laguna Hills, CA, USA}, Date-Added = {2017-12-22 13:07:02 +0000}, Date-Modified = {2017-12-22 19:55:24 +0000}, Doi = {10.1109/ICSC.2016.61}, Keywords = {music ontologies, artificial intelligence, user interfaces, dynamic music objects, mobile audio ontology, mobile sensor data, music consumption experiences, semantic audio framework, user interface controls, Data mining, Feature extraction}, Pages = {47-54}, Title = {The Mobile Audio Ontology: Experiencing Dynamic Music Objects on Mobile Devices}, Url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7439304}, Year = {2016}, Bdsk-Url-1 = {https://dx.doi.org/10.1109/ICSC.2016.61}} @book{mcginity2016mtsr, Abstract = {This paper presents MusicWeb, a novel platform for music discovery by linking music artists within a web-based application. 
MusicWeb provides a browsing experience using connections that are either extra-musical or tangential to music, such as the artists' political affiliation or social influence, or intra-musical, such as the artists' main instrument or most favoured musical key. The platform integrates open linked semantic metadata from various Semantic Web, music recommendation and social media data sources. Artists are linked by various commonalities such as style, geographical location, instrumentation, record label as well as more obscure categories, for instance, artists who have received the same award, have shared the same fate, or belonged to the same organisation. These connections are further enhanced by thematic analysis of journal articles, blog posts and content-based similarity measures focussing on high level musical categories.}, Author = {Mora-Mcginity, M. and Allik, A. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. Metadata and Semantics Research (MTSR), Nov. 22-25, G{\"o}ttingen, Germany}, Date-Added = {2017-12-22 12:46:08 +0000}, Date-Modified = {2017-12-22 13:02:24 +0000}, Doi = {10.1007/978-3-319-49157-8_25}, Keywords = {Semantic Web, Semantic Audio, Semantic Graph, music recommendation, artist similarity, web application, MusicLynx, MusicWeb, Linked open data, Music metadata, Semantic audio analysis, Music information retrieval}, Pages = {291-296}, Publisher = {Springer, Cham}, Series = {Communications in Computer and Information Science,}, Title = {MusicWeb: Music Discovery with Open Linked Semantic Metadata}, Url = {https://link.springer.com/chapter/10.1007/978-3-319-49157-8_25}, Volume = {672}, Year = {2016}, Bdsk-Url-1 = {https://link.springer.com/chapter/10.1007/978-3-319-49157-8_5}, Bdsk-Url-2 = {https://dx.doi.org/10.1007/978-3-319-49157-8_5}} @conference{satables2016acmmm, Abstract = {In music production, descriptive terminology is used to define perceived sound transformations. By understanding the underlying statistical features associated with these descriptions, we can aid the retrieval of contextually relevant processing parameters using natural language, and create intelligent systems capable of assisting in audio engineering. In this study, we present an analysis of a dataset containing descriptive terms gathered using a series of processing modules, embedded within a Digital Audio Workstation. By applying hierarchical clustering to the audio feature space, we show that similarity in term representations exists within and between transformation classes. Furthermore, the organisation of terms in low-dimensional timbre space can be explained using perceptual concepts such as size and dissonance. We conclude by performing Latent Semantic Indexing to show that similar groupings exist based on term frequency.}, Author = {Stables, R and De Man, B and Enderby, S and Reiss, JD and Fazekas, G and Wilmering, T.}, Booktitle = {Proc. ACM Multimedia, Oct. 
15-19, Amsterdam, Netherlands}, Date-Added = {2017-12-21 20:25:22 +0000}, Date-Modified = {2017-12-21 20:30:59 +0000}, Doi = {10.1145/2964284.2967238}, Isbn = {978-1-4503-3603-1}, Keywords = {semantic control of audio effects, ADFX, adaptive effects, music production, natural language processing, NLP}, Pages = {337-341}, Title = {Semantic description of timbral transformations in music production}, Url = {https://qmro.qmul.ac.uk/xmlui/bitstream/handle/123456789/22150/De%20Man%20Semantic%20description%20of%20timbral%202016%20Accepted.pdf}, Year = {2016}, Bdsk-Url-1 = {https://qmro.qmul.ac.uk/xmlui/bitstream/handle/123456789/22150/De%20Man%20Semantic%20description%20of%20timbral%202016%20Accepted.pdf}, Bdsk-Url-2 = {https://dx.doi.org/10.1145/2964284.2967238}} @conference{choi2016ismir, Abstract = {We present a content-based automatic music tagging algorithm using fully convolutional neural networks (FCNs). We evaluate different architectures consisting of 2D convolutional layers and subsampling layers only. In the experiments, we measure the AUC-ROC scores of the architectures with different complexities and input types using the MagnaTagATune dataset, where a 4-layer architecture shows state-of-the-art performance with mel-spectrogram input. Furthermore, we evaluated the performances of the architectures with varying the number of layers on a larger dataset (Million Song Dataset), and found that deeper models outperformed the 4-layer architecture. The experiments show that mel-spectrogram is an effective time-frequency representation for automatic tagging and that more complex models benefit from more training data.}, Author = {Choi, K. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 17th {International} {Society} for {Music} {Information} {Retrieval} ({ISMIR}-16) conference, {August 7-11}., {New York}, USA}, Date-Added = {2017-12-21 20:13:29 +0000}, Date-Modified = {2017-12-21 20:21:49 +0000}, Keywords = {auto tagging, CNN}, Local-Url = {https://arxiv.org/abs/1606.00298}, Pages = {805-811}, Title = {Automatic Tagging Using Deep Convolutional Neural Networks}, Url = {https://pdfs.semanticscholar.org/b9ba/8c4a00f5ee43e768db2acc8b56f017176f3e.pdf}, Year = {2016}, Bdsk-Url-1 = {https://pdfs.semanticscholar.org/b9ba/8c4a00f5ee43e768db2acc8b56f017176f3e.pdf}} @conference{buccoli2016ismir, Abstract = {The Valence, Arousal and Dominance (VAD) model for emotion representation is widely used in music analysis. The ANEW dataset is composed of more than 2000 emotion related descriptors annotated in the VAD space. However, due to the low number of dimensions of the VAD model, the distribution of terms of the ANEW dataset tends to be compact and cluttered. In this work, we aim at finding a possibly higher-dimensional transformation of the VAD space, where the terms of the ANEW dataset are better organised conceptually and bear more relevance to music tagging. Our approach involves the use of a kernel expansion of the ANEW dataset to exploit a higher number of dimensions, and the application of distance learning techniques to find a distance metric that is consistent with the semantic similarity among terms. In order to train the distance learning algorithms, we collect information on the semantic similarity from human annotation and editorial tags. We evaluate the quality of the method by clustering the terms in the found high-dimensional domain. 
Our approach exhibits promising results with objective and subjective performance metrics, showing that a higher dimensional space could be useful to model semantic similarity among terms of the ANEW dataset.}, Author = {Buccoli, M. and Zanoni, M. and Fazekas, G. and Sarti, A. and Sandler, M.}, Booktitle = {Proc. of the 17th {International} {Society} for {Music} {Information} {Retrieval} ({ISMIR}-16) conference, {August 7-11}., {New York}, USA}, Date-Added = {2017-12-21 20:04:14 +0000}, Date-Modified = {2017-12-21 20:40:59 +0000}, Keywords = {music tagging, mood, kernel methods, evaluation, natural language processing, folksonomy, Arousal, Valence}, Pages = {316-322}, Title = {A Higher-Dimensional Expansion of Affective Norms for English Terms for Music Tagging}, Url = {https://wp.nyu.edu/ismir2016/wp-content/uploads/sites/2294/2016/07/253_Paper.pdf}, Year = {2016}, Bdsk-Url-1 = {https://wp.nyu.edu/ismir2016/wp-content/uploads/sites/2294/2016/07/253_Paper.pdf}} @conference{allik2016ismir, Abstract = {A plurality of audio feature extraction toolsets and feature datasets are used by the MIR community. Their different conceptual organisation of features and output formats however present difficulties in exchanging or comparing data, while very limited means are provided to link features with content and provenance. These issues are hindering research reproducibility and the use of multiple tools in combination. We propose novel Semantic Web ontologies (1) to provide a common structure for feature data formats and (2) to represent computational workflows of audio features facilitating their comparison. The Audio Feature Ontology provides a descriptive framework for expressing different conceptualisations of and designing linked data formats for content-based audio features. To accommodate different views in organising features, the ontology does not impose a strict hierarchical structure, leaving this open to task and tool specific ontologies that derive from a common vocabulary. The ontologies are based on the analysis of existing feature extraction tools and the MIR literature, which was instrumental in guiding the design process. They are harmonised into a library of modular interlinked ontologies that describe the different entities and activities involved in music creation, production and consumption.}, Author = {Allik, A. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 17th {International} {Society} for {Music} {Information} {Retrieval} ({ISMIR}-16) conference, {August 7-11}., {New York}, USA}, Date-Added = {2017-12-21 19:40:25 +0000}, Date-Modified = {2017-12-22 21:11:41 +0000}, Keywords = {ontology, audio analysis, audio features, interoperability, JSON-LD}, Pages = {73-79}, Title = {An Ontology for Audio Features}, Url = {https://wp.nyu.edu/ismir2016/wp-content/uploads/sites/2294/2016/07/077_Paper.pdf}, Year = {2016}, Bdsk-Url-1 = {https://wp.nyu.edu/ismir2016/wp-content/uploads/sites/2294/2016/07/077_Paper.pdf}} @conference{thalmann2016creating, Abstract = {Dynamic music is gaining increasing popularity outside of its initial environment, the videogame industry, and is gradually becoming an autonomous medium. Responsible for this is doubtlessly the prevalence of integrated multisensory platforms such as smartphones as well as the omnipresence of the internet as a provider of content on demand. The music format Dynamic Music Objects builds on these assumptions and on recent advances in music information retrieval and semantic web technologies. 
It is capable of describing a multitude of adaptive, interactive, and immersive musical experiences. This paper introduces the Dymo Designer, a prototypical web app that allows people to create and analyze Dynamic Music Objects in a visual, interactive, and computer-assisted way.}, Author = {Thalmann, F. and Fazekas, G. and Wiggins, G.A. and Sandler, M.}, Booktitle = {Proc. ACM Audio Mostly Conference, Oct. 4-6, Norrk{\"o}ping, Sweden}, Date-Added = {2017-12-21 19:31:03 +0000}, Date-Modified = {2017-12-22 13:13:01 +0000}, Doi = {10.1145/2986416.2986445}, Isbn = {978-1-4503-4822-5}, Keywords = {music ontology, dynamic music objects, semantic audio, intelligent music production, mobile applications}, Local-Url = {https://dl.acm.org/citation.cfm?id=2986445}, Pages = {39-46}, Title = {Creating, Visualizing, and Analyzing Dynamic Music Objects in the Browser with the Dymo Designer}, Url = {https://qmro.qmul.ac.uk/xmlui/bitstream/handle/123456789/16155/Thalmann%20Creating%20Visualizing%20and%20Analyzing%202016%20Submitted.pdf}, Year = {2016}, Bdsk-Url-1 = {https://qmro.qmul.ac.uk/xmlui/bitstream/handle/123456789/16155/Thalmann%20Creating%20Visualizing%20and%20Analyzing%202016%20Submitted.pdf}, Bdsk-Url-2 = {https://dx.doi.org/10.1145/2986416.2986445}} @article{barthet2016jaes, Abstract = {Listeners of audio are increasingly shifting to a participatory culture where technology allows them to modify and control the listening experience. This report describes the developments of a mood-driven music player, Moodplay, which incorporates semantic computing technologies for musical mood using social tags and informative and aesthetic browsing visualizations. The prototype runs with a dataset of over 10,000 songs covering various genres, arousal, and valence levels. Changes in the design of the system were made in response to user evaluations from over 120 participants in 15 different sectors of work or education. The proposed client/server architecture integrates modular components powered by semantic web technologies and audio content feature extraction. This enables recorded music content to be controlled in flexible and nonlinear ways. Dynamic music objects can be used to create mashups on the fly of two or more simultaneous songs to allow selection of multiple moods. The authors also consider nonlinear audio techniques that could transform the player into a creative tool, for instance, by reorganizing, compressing, or expanding temporally prerecorded content.}, Author = {Barthet, M. and Fazekas, G. and Allik, A. and Thalmann, F. and Sandler, M.}, Date-Added = {2017-12-21 11:55:45 +0000}, Date-Modified = {2017-12-21 18:51:04 +0000}, Doi = {10.17743/jaes.2016.0042}, Journal = {Journal of the Audio Engineering Society (JAES)}, Keywords = {mood, personalisation, audio-based mood detection, web application}, Number = {9}, Pages = {673-682}, Publisher-Url = {http://www.aes.org/e-lib/browse.cfm?elib=18376}, Title = {From interactive to adaptive mood-based music listening experiences in social or personal context}, Url = {http://www.semanticaudio.net/files/papers/barthet2016jaes-preprint.pdf}, Volume = {64}, Year = {2016}, Bdsk-Url-1 = {http://www.aes.org/e-lib/browse.cfm?elib=18376}, Bdsk-Url-2 = {https://dx.doi.org/10.17743/jaes.2016.0042}} @article{fazekas2015conv, Abstract = {Science and technology plays in an increasingly vital role in how we experience, how we compose, perform, share and enjoy musical audio. 
The invention of recording in the late 19th century is a profound example that, for the first time in human history, disconnected music performance from listening and gave rise to a new industry as well as new fields of scientific investigation. But musical experience is not just about listening. Human minds make sense of what we hear by categorising and by making associations, cognitive processes which give rise to meaning or influence our mood. Perhaps the next revolution akin to recording is therefore in audio semantics. Technologies that mimic our abilities and enable interaction with audio on human terms are already changing the way we experience it. The emerging field of Semantic Audio lies at the confluence of several key fields, namely, signal processing, machine learning and Semantic Web ontologies that enable knowledge representation and logic-based inference. In my talk, I will put forward that synergies between these fields provide a fruitful, if not necessary, way to account for human interpretation of sound. I will outline music and audio related ontologies and ontology-based systems that have found applications on the Semantic Web, as well as intelligent audio production tools that enable linking musical concepts with signal processing parameters in audio systems. I will outline my recent work demonstrating how web technologies may be used to create interactive performance systems that enable mood-based audience-performer communication and how semantic audio technologies enable us to link social tags and audio features to better understand the relationship between music and emotions. I will hint at how some principles used in my research also contribute to enhancing scientific protocols, ease experimentation and facilitate reproducibility. Finally, I will discuss challenges in fusing audio and semantic technologies and outline some future opportunities they may bring about.}, Author = {Fazekas, G.}, Date-Added = {2015-10-03 12:15:00 +0000}, Date-Modified = {2017-12-28 10:36:48 +0000}, Invited = {keynote talk}, Journal = {Presented at the ACM Audio Mostly International Conference, 7-9 Oct. Thessaloniki, Greece.}, Keywords = {Semantic Audio, Ontology-based systems, Music and Emotion, Music Performance and Interactive Systems, Semantic Audio Production}, Presentation-Url = {files/papers/fazekas2015conv.pdf}, Title = {Convergence of technologies to connect audio with meaning: from Semantic Web ontologies to semantic audio production}, Url = {https://portalparts.acm.org/2820000/2814895/fm/frontmatter.pdf}, Year = {2015}, Bdsk-Url-1 = {https://portalparts.acm.org/2820000/2814895/fm/frontmatter.pdf}} @conference{choi2015understanding, Abstract = {As music streaming services dominate the music industry, the playlist is becoming an increasingly crucial element of music consumption. Consequently, the music recommendation problem is often cast as a playlist generation problem. Better understanding of the playlist is therefore necessary for developing better playlist generation algorithms. In this work, we analyse two playlist datasets to investigate some commonly assumed hypotheses about playlists. Our findings indicate that deeper understanding of playlists is needed to provide better prior information and improve machine learning algorithms in the design of recommendation systems.}, Author = {Choi, K. and Fazekas, G. 
and Sandler, M.}, Booktitle = {International Conference on Machine Learning (ICML), Machine Learning for Music Discovery Workshop, 6-11 July, Lille, France}, Date-Added = {2015-05-24 20:30:05 +0000}, Date-Modified = {2017-12-28 10:36:42 +0000}, Keywords = {playlist generation, recommendation, machine learning}, Title = {Understanding Music Playlists}, Url = {https://sites.google.com/site/ml4md2015/accepted-talks}, Year = {2015}, Bdsk-Url-1 = {https://sites.google.com/site/ml4md2015/accepted-talks}} @conference{wilmering2015towards, Abstract = {There is a growing need for large online media libraries with structured descriptions of the resources, where accurate feature extraction from live music recordings presents additional challenges. In this paper we describe a set of tools that automate the process of feature extraction from large music collections, which we applied to the Live Music Archive. The system produces Linked Data of the analysis workflow and results which is then combined with editorial metadata. We point out problems of high level feature extraction specific to live music recordings.}, Author = {Wilmering, T. and Fazekas, G. and Dixon, S. and Page, K. and Bechhofer, S.}, Booktitle = {International Conference on Machine Learning (ICML), Machine Learning for Music Discovery Workshop, 6-11 July, Lille, France}, Date-Added = {2015-05-24 20:24:59 +0000}, Date-Modified = {2015-05-24 20:38:49 +0000}, Keywords = {live music archive, feature extraction, big data, machine learning}, Title = {Towards High Level Feature Extraction from Large Live Music Recording Archives}, Url = {https://sites.google.com/site/ml4md2015/accepted-talks}, Year = {2015}, Bdsk-Url-1 = {https://sites.google.com/site/ml4md2015/accepted-talks}} @conference{zanoni2015violin, Abstract = {Violin makers and musicians describe the timbral qualities of violins using semantic terms coming from natural language. In this study we use regression techniques of machine intelligence and audio features to model, in a training-based fashion, a set of high-level (semantic) descriptors for the automatic annotation of musical instruments. The most relevant semantic descriptors are collected through interviews with violin makers. These descriptors are then correlated with objective features extracted from a set of violins from the historical and contemporary collections of the Museo del Violino and of the International School of Luthiery, both in Cremona. As sound description can vary throughout a performance, our approach also enables the modelling of time-varying (evolutive) semantic annotations.}, Author = {Zanoni, M. and Setragno, F. and Antonacci, F. and Sarti, A. and Fazekas, G. 
and Sandler, M.}, Booktitle = {Proceedings of the 138th Convention of the Audio Engineering Society (AES), 7-10 May, Warsaw, Poland.}, Date-Added = {2015-05-24 20:09:13 +0000}, Date-Modified = {2020-12-26 09:18:59 +0000}, Keywords = {semantic descriptors, violin, audio analysis, ontology, MIR}, Publisher-Url = {https://www.aes.org/e-lib/online/browse.cfm?elib=17777}, Title = {Training-based Semantic Descriptors modeling for violin quality sound characterization}, Url = {http://www.semanticaudio.net/files/papers/zanoni2015aes-preprint.pdf}, Year = {2015}, Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUGJCVYJHZlcnNpb25YJG9iamVjdHNZJGFyY2hpdmVyVCR0b3ASAAGGoKgHCBMUFRYaIVUkbnVsbNMJCgsMDxJXTlMua2V5c1pOUy5vYmplY3RzViRjbGFzc6INDoACgAOiEBGABIAFgAdccmVsYXRpdmVQYXRoWWFsaWFzRGF0YV8QIXBhcGVycy96YW5vbmkyMDE1YWVzLXByZXByaW50LnBkZtIXCxgZV05TLmRhdGFPEQGqAAAAAAGqAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAAAAAAAAAAAAAAQkQAAf////8aemFub25pMjAxNWFlcy1wcmVwcmludC5wZGYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/////wAAAAAAAAAAAAAAAAABAAMAAAogY3UAAAAAAAAAAAAAAAAABnBhcGVycwACAE0vOlVzZXJzOmdmYXpla2FzOkRvY3VtZW50czp3ZWJzaXRlLWhnOmZpbGVzOnBhcGVyczp6YW5vbmkyMDE1YWVzLXByZXByaW50LnBkZgAADgA2ABoAegBhAG4AbwBuAGkAMgAwADEANQBhAGUAcwAtAHAAcgBlAHAAcgBpAG4AdAAuAHAAZABmAA8AGgAMAE0AYQBjAGkAbgB0AG8AcwBoACAASABEABIAS1VzZXJzL2dmYXpla2FzL0RvY3VtZW50cy93ZWJzaXRlLWhnL2ZpbGVzL3BhcGVycy96YW5vbmkyMDE1YWVzLXByZXByaW50LnBkZgAAEwABLwAAFQACAA///wAAgAbSGxwdHlokY2xhc3NuYW1lWCRjbGFzc2VzXU5TTXV0YWJsZURhdGGjHR8gVk5TRGF0YVhOU09iamVjdNIbHCIjXE5TRGljdGlvbmFyeaIiIF8QD05TS2V5ZWRBcmNoaXZlctEmJ1Ryb290gAEACAARABoAIwAtADIANwBAAEYATQBVAGAAZwBqAGwAbgBxAHMAdQB3AIQAjgCyALcAvwJtAm8CdAJ/AogClgKaAqECqgKvArwCvwLRAtQC2QAAAAAAAAIBAAAAAAAAACgAAAAAAAAAAAAAAAAAAALb}} @conference{tian2015tempogram, Abstract = {This paper presents a new set of audio features to describe music content based on tempo cues. Tempogram, a mid-level representation of tempo information, is constructed to characterize tempo variation and local pulse in the audio signal. We introduce a collection of novel tempogram-based features inspired by musicological hypotheses about the relation between music structure and its rhythmic components prominent at different metrical levels. The strength of these features is demonstrated in music structural segmentation, an important task in Music information retrieval (MIR), using several published popular music datasets. Our evaluation shows improvement over the state of the art using the presented features alone. Results indicate that incorporating tempo information into audio segmentation is a promising new direction. }, Author = {Tian, M. and Fazekas, G. and Black, D. A. A. and Sandler, M.}, Booktitle = {Proc. of the 40th International Conference on Acoustics, Speech and Signal Processing (ICASSP), 19-24 April, Brisbane, Australia}, Date-Added = {2015-05-24 19:44:46 +0000}, Date-Modified = {2017-12-28 10:36:30 +0000}, Keywords = {tempogram, structural segmentation, MIR}, Title = {On the Use of the Tempogram to Describe Audio Content and its Application to Music Structural Segmentation}, Url = {https://www2.securecms.com/ICASSP2015/Papers/ViewPapers.asp?PaperNum=3407}, Year = {2015}, Bdsk-Url-1 = {https://www2.securecms.com/ICASSP2015/Papers/ViewPapers.asp?PaperNum=3407}} @conference{thalmann2015ismir, Author = {Thalmann, F. and Carrillo, A. and Fazekas, G. and Wiggins, G. A. and Sandler, M.}, Booktitle = {Proc. of the 16th {International} {Society} for {Music} {Information} {Retrieval} ({ISMIR}-15) conference, Late-breaking session, Oct. 
26-30, Malaga, Spain}, Date-Added = {2017-12-22 19:18:33 +0000}, Date-Modified = {2017-12-22 19:59:04 +0000}, Keywords = {ontology, mobile applications, mobile audio ontology, web application}, Local-Url = {http://ismir2015.uma.es/LBD/LBD26.pdf}, Publisher-Url = {http://ismir2015.uma.es/LBD/LBD19.pdf}, Title = {Navigating Ontological Structures based on Feature Metadata Using the Semantic Music Player}, Url = {https://qmro.qmul.ac.uk/xmlui/bitstream/handle/123456789/16154/Thalmann%20Navigating%20Ontological%20Structures%202015%20Published.pdf}, Year = {2015}, Bdsk-Url-1 = {https://qmro.qmul.ac.uk/xmlui/bitstream/handle/123456789/16154/Thalmann%20Navigating%20Ontological%20Structures%202015%20Published.pdf}} @conference{liang2015ismir, Author = {Liang, B. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 16th {International} {Society} for {Music} {Information} {Retrieval} ({ISMIR}-15) conference, Late-breaking session, Oct. 26-30, Malaga, Spain}, Date-Added = {2017-12-22 19:57:10 +0000}, Date-Modified = {2017-12-22 19:57:10 +0000}, Keywords = {web application, church organ, culture and preservation, music informatics}, Local-Url = {http://www.semanticaudio.net/files/papers/liang2015ismir.pdf}, Publisher-Url = {http://ismir2015.uma.es/LBD/LBD19.pdf}, Title = {The Organ Web App}, Year = {2015}} @conference{wilmering2015automating, Abstract = {Computational feature extraction provides one means of gathering structured analytic metadata for large media collections. We demonstrate a suite of tools we have developed that automate the process of feature extraction from audio in the Internet Archive. The system constructs an RDF description of the analysis workflow and results which is then reconciled and combined with Linked Data about the recorded performance. This Linked Data and provenance information provides the bridging information necessary to employ analytic output in the generation of structured metadata for the underlying media files, with all data published within the same description framework.}, Author = {Wilmering, T. and Fazekas, G. and Dixon, S. and Bechhofer, S. and Page, K.}, Booktitle = {Third International Workshop on Linked Media (LiME 2015) co-located with the WWW'15 conference, 18-22 May, Florence, Italy.}, Date-Added = {2015-05-24 19:26:51 +0000}, Date-Modified = {2015-05-24 20:41:05 +0000}, Keywords = {linked-data, ontologies, live music archive, LMA, feature extraction}, Presentation-Url = {http://calma.linkedmusic.org/calma_lime_2015.pdf}, Title = {Automating Annotation of Media with Linked Data Workflows}, Url = {http://www.www2015.it/documents/proceedings/companion/p737.pdf}, Year = {2015}, Bdsk-Url-1 = {http://www.www2015.it/documents/proceedings/companion/p737.pdf}} @conference{mauch2015tenor, Abstract = {We present Tony, a software tool for the interactive annotation of melodies from monophonic audio recordings, and evaluate its usability and the accuracy of its note extraction method. The scientific study of acoustic performances of melodies, whether sung or played, requires the accurate transcription of notes and pitches. To achieve the desired transcription accuracy for a particular application, researchers manually correct results obtained by automatic methods. Tony is an interactive tool directly aimed at making this correction task efficient. 
It provides (a) state-of-the art algorithms for pitch and note estimation, (b) visual and auditory feedback for easy error-spotting, (c) an intelligent graphical user interface through which the user can rapidly correct estimation errors, (d) extensive export functions enabling further processing in other applications. We show that Tony's built in automatic note transcription method compares favourably with existing tools. We report how long it takes to annotate recordings on a set of 96 solo vocal recordings and study the effect of piece, the number of edits made and the annotator's increasing mastery of the software. Tony is Open Source software, with source code and compiled binaries for Windows, Mac OS X and Linux available from https://code.soundsoftware.ac.uk/projects/tony/.}, Author = {Mauch, M. and Cannam, C. and Bittner, R. and Fazekas, G. and Salamon, J. and Dai, J. and Bello, J. and Dixon S.}, Booktitle = {Proceedings of the First International Conference on Technologies for Music Notation and Representation}, Date-Added = {2015-05-24 19:18:46 +0000}, Date-Modified = {2017-12-28 10:36:36 +0000}, Keywords = {Tony, melody, note, transcription, open source software}, Title = {Computer-aided Melody Note Transcription Using the Tony Software: Accuracy and Efficiency}, Url = {https://code.soundsoftware.ac.uk/attachments/download/1423/tony-paper_preprint.pdf}, Year = {2015}, Bdsk-Url-1 = {https://code.soundsoftware.ac.uk/attachments/download/1423/tony-paper_preprint.pdf}} @conference{stables2014dmrn, Author = {Stables, R. and Enderby, S. and De Man, B. and Fazekas, G. and Reiss, J. D.}, Booktitle = {{Presented} at the {Digital} {Music} {Research} {Network} {Workshop}, 16. {Dec}., {London}, UK}, Date-Added = {2014-11-26 16:44:10 +0000}, Date-Modified = {2014-11-26 17:11:59 +0000}, Keywords = {Semantic Audio, feature extraction, DAW, HCI}, Title = {The SAFE project: Musical semantics in the DAW}, Url = {http://c4dm.eecs.qmul.ac.uk/dmrn/events/dmrnp9/#programme}, Year = {2014}, Bdsk-Url-1 = {http://c4dm.eecs.qmul.ac.uk/dmrn/events/dmrnp9/#programme}} @conference{DeMan2014the, Abstract = {We introduce the Open Multitrack Testbed, an online repository of multitrack audio, mixes or processed versions thereof, and corresponding mix settings or process parameters such as DAW files. Multitrack audio is a much sought after resource for audio researchers, students, and content producers, and while some online resources exist, few are large and reusable and none allow querying audio fulfilling specific criteria. The test bed we present contains a semantic database of metadata corresponding with the songs and individual tracks, enabling users to retrieve all pop songs featuring an accordion, or all tracks recorded in reverberant spaces. The open character is made possible by requiring the contributions, mainly from educational institutions and individuals, to have a Creative Commons license.}, Author = {De Man, B. and Mora-Mcginity, M. and Fazekas, G. and Reiss, J. 
D.}, Booktitle = {137th Convention of the Audio Engineering Society, 7 Oct., Los Angeles, USA}, Date-Added = {2014-11-25 21:32:31 +0000}, Date-Modified = {2014-11-25 21:39:28 +0000}, Keywords = {Multitrack audio, Semantics, Database, Engineering brief}, Title = {The Open Multitrack Testbed}, Url = {http://www.aes.org/e-lib/browse.cfm?elib=17400}, Year = {2014}, Bdsk-Url-1 = {http://www.aes.org/e-lib/browse.cfm?elib=17400}} @conference{mora2014sem, Abstract = {The paper describes research carried out to provide a tool designed to offer music researchers data and resources linked to other science areas and domains. The tool gathers data from the internet and stores it semantically. Most of this data consists of publications and articles about music related issues, such as artists, styles, music tags and keywords. The data is offered to researchers in a faceted manner, allowing the user to navigate the data through an interface, in the hope of allowing her to discover new resources which might be of value to her research.}, Author = {Mora-McGinity, M. and Ogilvie, G. and Fazekas, G.}, Booktitle = {Workshop on Semantic Technologies for Research in the Humanities and Social Sciences (STRiX), November 24-25, Gothenburg, Sweden}, Date-Added = {2014-11-25 16:44:17 +0000}, Date-Modified = {2014-11-25 21:23:31 +0000}, Keywords = {Semantic metadata, Research, Music, Humanities, Social Sciences, Linked data}, Title = {Semantically Linking Humanities Research Articles and Music Artists}, Url = {https://svn.spraakdata.gu.se/kbc/public/web/workshop/papers/8.pdf}, Year = {2014}, Bdsk-Url-1 = {https://svn.spraakdata.gu.se/kbc/public/web/workshop/papers/8.pdf}} @book{fazekas2014novel, Abstract = {While listeners' emotional response to music is the subject of numerous studies, less attention is paid to the dynamic emotion variations due to the interaction between artists and audiences in live improvised music performances. By opening a direct communication channel from audience members to performers, the Mood Conductor system provides an experimental framework to study this phenomenon. Mood Conductor facilitates interactive performances and thus also has an inherent entertainment value. The framework allows audience members to send emotional directions using their mobile devices in order to ``conduct'' improvised performances. Audience-indicated emotion coordinates in the arousal-valence space are aggregated and clustered to create a video projection. This is used by the musicians as guidance, and provides visual feedback to the audience. Three different systems were developed and tested within our framework so far. These systems were trialled in several public performances with different ensembles. Qualitative and quantitative evaluations demonstrate that musicians and audiences are highly engaged with the systems, and raise new insights enabling future improvements of the framework.}, Author = {Fazekas, G. and Barthet, M. and Sandler, M.}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-11-26 17:36:53 +0000}, Edition = {{S}ound {M}usic and {M}otion}, Editor = {Aramaki, M. and Kronland-Martinet, R. 
and Ystad, S.}, Keywords = {audience-performer interaction, music, emotion, mood, arousal, valence, improvisation, live music}, Publisher = {Springer-Verlag, Heidelberg, Germany.}, Series = {{L}ecture {N}otes In {C}omputer {S}cience (LNCS)}, Title = {Novel {Methods} in {Facilitating} {Audience} and {Performer} {Interaction} using the {Mood} {Conductor} {Framework}}, Url = {http://www.springer.com/computer/database+management+%26+information+retrieval/book/978-3-319-12975-4}, Volume = {8905}, Year = 2014, Bdsk-Url-1 = {http://www.springer.com/computer/database+management+%26+information+retrieval/book/978-3-319-12975-4}} @article{saari2015genreadaptive, Abstract = {This study investigates whether taking genre into account is beneficial for automatic music mood annotation in terms of the core affects valence, arousal, and tension, as well as several other mood scales. Novel techniques employing genre-adaptive semantic computing and audio-based modelling are proposed. A technique called ACTwg employs genre-adaptive semantic computing of mood-related social tags, whereas ACTwg-SLPwg combines semantic computing and audio-based modelling, both in a genre-adaptive manner. The proposed techniques are experimentally evaluated at predicting listener ratings related to a set of 600 popular music tracks spanning multiple genres. The results show that ACTwg outperforms a semantic computing technique that does not exploit genre information, and ACTwg-SLPwg outperforms conventional techniques and other genre-adaptive alternatives. In particular, improvements in the prediction rates are obtained for the valence dimension, which is typically the most challenging core affect dimension for audio-based annotation. The specificity of genre categories is not crucial for the performance of ACTwg-SLPwg. The study also presents analytical insights into inferring a concise tag-based genre representation for genre-adaptive music mood analysis.}, Author = {Saari, P. and Fazekas, G. and Eerola, T. and Barthet, M. and Lartillot, O. and Sandler, M.}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2017-12-21 19:16:58 +0000}, Doi = {10.1109/TAFFC.2015.2462841}, Issn = {1949-3045}, Journal = {{IEEE} Transactions on Affective Computing (TAC)}, Keywords = {semantic mood model, music, emotion, genre}, Number = {2}, Pages = {122-135}, Title = {Genre-adaptive Semantic Computing Enhances Audio-based Music Mood Prediction}, Url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7173419}, Volume = {7}, Year = 2016, Bdsk-Url-1 = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7173419}} @article{kolozali2013automatic, Abstract = {In this paper we present a novel hybrid system that involves a formal method of automatic ontology generation for web-based audio signal processing applications. An ontology is seen as a knowledge management structure that represents domain knowledge in a machine interpretable format. It describes concepts and relationships within a particular domain, in our case, the domain of musical instruments. However, the different tasks of ontology engineering including manual annotation, hierarchical structuring and organization of data can be laborious and challenging. For these reasons, we investigate how the process of creating ontologies can be made less dependent on human supervision by exploring concept analysis techniques in a Semantic Web environment. 
In this study, various musical instruments, from wind to string families, are classified using timbre features extracted from audio. To obtain models of the analysed instrument recordings, we use K-means clustering to determine an optimised codebook of Line Spectral Frequencies (LSFs), or Mel-frequency Cepstral Coefficients (MFCCs). Two classification techniques based on Multi-Layer Perceptron (MLP) neural network and Support Vector Machines (SVM) were tested. Then, Formal Concept Analysis (FCA) is used to automatically build the hierarchical structure of musical instrument ontologies. Finally, the generated ontologies are expressed using the Ontology Web Language (OWL). System performance was evaluated under natural recording conditions using databases of isolated notes and melodic phrases. Analysis of Variance (ANOVA) were conducted with the feature and classifier attributes as independent variables and the musical instrument recognition F-measure as dependent variable. Based on these statistical analyses, a detailed comparison between musical instrument recognition models is made to investi- gate their effects on the automatic ontology generation system. The proposed system is general and also applicable to other research fields that are related to ontologies and the Semantic Web.}, Author = {Kolozali, S. and Barthet, M. and Fazekas, G. and Sandler, M.}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 15:11:32 +0000}, Doi = {10.1109/TASL.2013.2263801}, Journal = {{IEEE} Transactions on Audio, Speech and Language Processing (TASLP)}, Keywords = {ontology, formal concept analysis, neural networks, classification, automatic ontology generation}, Number = {10}, Pages = {2207-2220}, Title = {Automatic Ontology Generation for Musical Instruments Based on Audio Analysis}, Volume = {21}, Year = 2013, Bdsk-Url-1 = {https://dx.doi.org/10.1109/TASL.2013.2263801}} @book{barthet2013lncs, Abstract = {The striking ability of music to elicit emotions assures its prominent status in human culture and every day life. Music is often enjoyed and sought for its ability to induce or convey emotions, which may manifest in anything from a slight variation in mood, to changes in our physical condition and actions. Consequently, research on how we might associate musical pieces with emotions and, more generally, how music brings about an emotional response is attracting ever increasing attention. First, this paper provides a thorough review of studies on the relation of music and emotions from different disciplines. We then propose new insights to enhance automated music emotion recognition models using recent results from psychology, musicology, affective computing, semantic technologies and music information retrieval.}, Author = {Barthet, M. and Fazekas, G. and Sandler, M.}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2018-01-13 11:35:35 +0000}, Doi = {10.1007/978-3-642-41248-6_13}, Edition = {From Sounds to Music and Emotions}, Editor = {Aramaki, M. and Barthet, M. and Kronland-Martinet, R. 
and Ystad, S.}, Isbn = {978-3-642-41247-9}, Keywords = {music, mood, emotion, review, new model, context, emotion recognition}, Local-Url = {http://www.semanticaudio.net/files/papers/barthet2013lncs-preprint.pdf}, Pages = {228--252}, Publisher = {Springer-Verlag, Heidelberg, Germany.}, Series = {Lecture Notes in Computer Science}, Title = {Music Emotion Recognition: From Content- to Context-Based Models}, Url = {http://www.semanticaudio.net/files/papers/barthet2013lncs-preprint.pdf}, Volume = {7900}, Year = 2013, Bdsk-Url-1 = {https://dx.doi.org/10.1007/978-3-642-41248-6_13}} @article{fazekas2010an, Abstract = {The use of cultural information is becoming increasingly important in music information research, especially in music retrieval and recommendation. While this information is widely available on the Web, it is most commonly published using proprietary Web Application Programming Interfaces (APIs). The Linked Data community is aiming at resolving the incompatibilities between these diverse data sources by building a Web of data using Semantic Web technologies. The OMRAS2 project has made several important contributions to this by developing an ontological framework and numerous software tools, as well as publishing music related data on the Semantic Web. These data and tools have found their use even beyond their originally intended scope. In this paper, we first provide a broad overview of the Semantic Web technologies underlying this work. We describe the Music Ontology, an open-ended framework for communicating musical information on the Web, and show how this framework can be extended to describe specific sub-domains such as music similarity, content-based audio features, musicological data and studio production. We describe several data-sets that have been published and data sources that have been adapted using this framework. Finally, we provide application examples ranging from software libraries to end user Web applications.}, Author = {Fazekas, G. and Raimond, Y. and Jacobson, K. and Sandler, M.}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2017-12-28 10:35:31 +0000}, Doi = {10.1080/09298215.2010.536555}, Journal = {{Journal} of {New} {Music} {Research} (JNMR) special issue on {Music} {Informatics} and the {OMRAS}2 {Project}}, Keywords = {Semantic Web, OMRAS2, Semantic Audio, ontology, SAWA}, Number = {4}, Pages = {295--311}, Title = {An overview of {Semantic} {Web} activities in the {OMRAS}2 {Project}}, Url = {files/papers/Fazekas2010jnmr.pdf}, Volume = {39}, Year = 2011, Bdsk-Url-1 = {files/papers/Fazekas2010jnmr.pdf}} @article{tidhar2010tempest, Abstract = {Issues concerning tuning and temperament bear relevance to music research in areas such as historical musicology, performance and recording studies, and music perception. We have recently demonstrated that it is possible to classify keyboard temperament automatically from audio recordings of standard musical works to the extent of accurately distinguishing between six different temperaments often used in harpsichord recordings. The current paper extends this work by combining digital signal processing with semantic computing and demonstrates the use of the temperament classifier in a Semantic Web environment. We present the Temperament Ontology which models the main concepts, relationships, and parameters of musical temperament, and facilitates the description and inference of various characteristics of specific temperaments. We then describe TempEst, a web application for temperament estimation.
TempEst integrates the classifier with ontology-based information processing in order to provide an extensible online service, which reports the class and properties of both known and unknown temperaments. TempEst allows users to upload harpsichord recordings, and provides them with an estimated temperament as well as other inferred characteristics of the instrument's tuning.}, Author = {Tidhar, D. and Fazekas, G. and Mauch, M. and Dixon, S.}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2017-12-28 10:35:51 +0000}, Doi = {10.1080/09298215.2010.520720}, Journal = {{Journal} of {New} {Music} {Research} (JNMR) special issue on {Music} {Informatics} and the {OMRAS}2 {Project}}, Keywords = {temperament, ontology, audio analysis, Semantic Audio, Semantic Web, inference, SAWA}, Number = {4}, Pages = {327--336}, Title = {{TempEst} - harpsichord temperament estimation in a {Semantic} {Web} environment}, Url = {http://www.eecs.qmul.ac.uk/~simond/pub/2010/Tidhar-Fazekas-Mauch-Dixon-JNMR-2010.pdf}, Volume = {39}, Year = 2011, Bdsk-Url-1 = {http://www.eecs.qmul.ac.uk/~simond/pub/2010/Tidhar-Fazekas-Mauch-Dixon-JNMR-2010.pdf}} @conference{fazekas2013describing, Abstract = {Modern environments for creating, editing or managing multimedia content involve increasingly complex tools and components. These tools are typically used in multi-aspect workflows exhibiting creative, procedural and computational properties, while most components deal with the underlying electrical or digital signal-based representation of content. Collecting and sharing information about these workflows on the Semantic Web can be beneficial for content management or educational purposes. In this paper, we describe an ontological model for the representation of workflows in audio production, and show how this model facilitates capturing and sharing information about the production process. We then examine how this model can be used in a larger framework for representing domain knowledge about production and outline why this information is beneficial.}, Author = {Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 14th IEEE International Workshop on Image and Audio Analysis for Multimedia Interactive Services (WIAMIS) 3--5 July, Paris, France}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-10 21:55:22 +0000}, Doi = {10.1109/WIAMIS.2013.6616135}, Invited = {invited paper}, Keywords = {ontology, Semantic Web, audio production, workflow, description logic}, Title = {Describing audio production workflows on the {Semantic} {Web}}, Year = 2013, Bdsk-Url-1 = {https://dx.doi.org/10.1109/WIAMIS.2013.6616135}} @conference{wilmering2013semantic, Abstract = {In this paper we describe how the Audio Effects Ontology, an extension to the Studio Ontology, can be used for the ontological representation of detailed metadata about the use of audio effects in music production projects. The ontologies are using Semantic Web technologies that enable knowledge representation and sharing, either on the Semantic Web or local RDF databases maintained by music production studios. The generated metadata facilitates reproducibility and detailed analysis of music production practices. We discuss how audio effect implementations and transformations are conceptualised in the ontologies, give examples of real-world use cases and present results of a qualitative evaluation.}, Author = {Wilmering, T. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc.
of the 12th {International} {Semantic} {Web} {Conference} ({ISWC}), first {International} {Workshop} on {Semantic} {Music} and {Media} ({SMAM}2013)}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-10 21:55:52 +0000}, Invited = {invited paper}, Keywords = {music production, ontology, metadata, Semantic Web}, Pages = {21--25}, Title = {Semantic Metadata for Music Production Projects}, Url = {http://semanticmedia.org.uk/smam2013/papers/wilmering_smam2013.pdf}, Year = 2013, Bdsk-Url-1 = {http://semanticmedia.org.uk/smam2013/papers/wilmering_smam2013.pdf}} @conference{tian2014design, Abstract = {Note onset detection is one of the most investigated tasks in Music Information Retrieval (MIR) and various detection methods have been proposed in previous research. The primary aim of this paper is to investigate different fusion policies to combine existing onset detectors, thus achieving better results. Existing algorithms are fused using three strategies, first by combining different algorithms, second, by using the linear combination of detection functions, and third, by using a late decision fusion approach. Large scale evaluation was carried out on two published datasets and a new percussion database composed of Chinese traditional instrument samples. An exhaustive search through the parameter space was used enabling a systematic analysis of the impact of each parameter, as well as reporting the most generally applicable parameter settings for the onset detectors and the fusion. We demonstrate improved results attributed to both fusion and the optimised parameter settings.}, Author = {Tian, M. and Fazekas, G. and Black, D. A. A. and Sandler, M.}, Booktitle = {Presented at the 15th {International} {Society} for {Music} {Information} {Retrieval} ({ISMIR}) Conference, {Oct} 27-31, 2014, {Taipei, Taiwan}}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-11-26 17:31:03 +0000}, Keywords = {onset detection, data fusion, large scale evaluation, signal processing, vamp plugins}, Title = {Design and {Evaluation} of {Onset} {Detectors} {Using} {Different} {Fusion} {Policies}}, Url = {http://www.semanticaudio.net/files/papers/tian2014design.pdf}, Year = 2014, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/tian2014design.pdf}} @conference{baume2014selection, Abstract = {Music emotion recognition typically attempts to map audio features from music to a mood representation using machine learning techniques. In addition to having a good dataset, the key to a successful system is choosing the right inputs and outputs. Often, the inputs are based on a set of audio features extracted from a single software library, which may not be the most suitable combination. This paper describes how 47 different types of audio features were evaluated using a five-dimensional support vector regressor, trained and tested on production music, in order to find the combination which produces the best performance. The results show the minimum number of features that yield optimum performance, and which combinations are strongest for mood prediction.}, Author = {Baume, C. and Fazekas, G. and Barthet, M. and Marston, D. and Sandler, M.}, Booktitle = {Proc.
of the {AES} 53rd International Conference on Semantic Audio, Jan, 26-29., London, UK}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-02 16:50:41 +0000}, Keywords = {mood, emotion, feature selection, wrapper method, M4 project}, Title = {Selection of audio features for music emotion recognition using production music}, Year = 2014} @conference{kolozali2014aes, Abstract = {Ontologies have been established for knowledge sharing and are widely used for structuring domains of interests conceptually. With growing amount of data on the internet, manual annotation and development of ontologies becomes critical. We propose a hybrid system to develop ontologies from audio signals automatically, in order to provide assistance to ontology engineers. The method is examined using various musical instruments, from wind to string families, that are classified using timbre features extracted from audio. To obtain models of the analysed instrument recordings, we use K-means clustering and determine an optimised codebook of Line Spectral Frequencies (LSFs) or Mel-frequency Cepstral Coefficients (MFCCs). The system was tested using two classification techniques, Multi-Layer Perceptron (MLP) neural network and Support Vector Machines (SVM). We then apply Formal Concept Analysis (FCA) to derive a lattice of concepts which is transformed into an ontology using the Ontology Web Language (OWL). The system was evaluated using Multivariate Analysis of Variance (MANOVA), with the feature and classifier attributes as independent variables and the lexical and taxonomic evaluation metrics as dependent variables.}, Author = {Kolozali, S. and Fazekas, G. and Barthet, M. and Sandler, M.}, Booktitle = {Proc. of the {AES} 53rd International Conference on Semantic Audio, Jan, 26-29., London, UK}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2017-12-30 02:42:17 +0000}, Keywords = {instrument, ontology, audio analysis, formal concept analysis, statistics, ANOVA, automatic ontology generation, Semantic Web}, Publisher-Url = {http://www.aes.org/e-lib/browse.cfm?elib=17100}, Title = {A framework for automatic ontology generation based on semantic audio analysis}, Url = {http://www.semanticaudio.net/files/papers/kolozali2014aes-preprint.pdf}, Year = 2014, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/kolozali2014aes-preprint.pdf}} @conference{lou2014evaluation, Abstract = {In traditional music performances, audience members have a passive role in the music creation process and can't manifest what they would desire to listen to. We proposed an interactive system, Mood Conductor, to allow for interactions between the audience and performers in improvised performance situations. The system consists of three parts: a smartphone-friendly web application, a server component aggregating and clustering the messages sent from the application, and a visualisation client showing the emotional intentions from the audience. In this system, audience members can express emotional directions via the application. The collected data are processed and then fed back visually to the performers to indicate which emotions to express. A first user survey was conducted to assess the initial system following two public performances involving different ensembles and several issues were uncovered. This paper aims at describing changes made to the web application user interface and the visualisation system following a user-centred design approach. 
A second series of performances and user survey was then conducted validating the benefit of the changes.}, Author = {Lou, T. and Barthet, M. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the {AES} 53rd International Conference on Semantic Audio, Jan, 26-29., London, UK}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-03 13:28:36 +0000}, Keywords = {mood conductor, audience interaction, user study, evaluation, MAT project}, Title = {Evaluation and Improvement of the Mood Conductor Interactive System}, Year = 2014} @conference{fazekas2013the, Abstract = {Recommending music for professional use presents challenges that are substantially different from those faced by systems targeting recreational listeners and other classes of end users. This paper describes a trial system and survey for assessing the utility of content and metadata-based recommendation technology targeting television and radio producers. First, we briefly assess the applicability of existing recommendation technologies. We then describe the trial system and the applied recommendation methodologies used in the context of a music database exceeding one million tracks. Finally we draw conclusions from a small user study conducted with professional programme producers.}, Author = {Fazekas, G. and Barthet, M. and Sandler, M.}, Booktitle = {Proc. of the {IEEE} International Conference on Multimedia and Expo ({ICME}), 15--19, July, San Jose, CA, USA.}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 15:01:56 +0000}, Doi = {10.1109/ICMEW.2013.6618235}, Keywords = {music recommendation, user trial, music similarity, big data, M4 project, BBC}, Title = {The {BBC} {Desktop} {Jukebox} music recommendation system: {A} large-scale trial with professional users}, Year = 2013, Bdsk-Url-1 = {https://dx.doi.org/10.1109/ICMEW.2013.6618235}} @conference{fazekas2013mood, Abstract = {Mood Conductor is a system that allows the audience to interact with stage performers to create directed improvisations. The term "conductor" is used metaphorically. Rather than directing a musical performance by way of visible gestures, spectators act as conductors by communicating emotional intentions to the performers through our web-based smartphone-friendly Mood Conductor app. Performers receive the audience's directions via a visual feedback system operating in real-time. Emotions are represented by coloured blobs in a two-dimensional space (vertical dimension: arousal or excitation; horizontal dimension: valence or pleasantness). The size of the "emotion blobs" indicates the number of spectators that have selected the corresponding emotions at a given time.}, Author = {Fazekas, G. and Barthet, M. and Sandler, M.}, Booktitle = {Proc. of the Humaine Association Conference on Affective Computing and Intelligent Interaction (ACII'13), 2-5 September, Geneva, Switzerland}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 15:01:47 +0000}, Doi = {10.1109/ACII.2013.165}, Keywords = {mood conductor, music, emotion, live music, performance, affective computing}, Title = {Mood Conductor: Emotion-Driven Interactive Music Performance}, Year = 2013, Bdsk-Url-1 = {https://dx.doi.org/10.1109/ACII.2013.165}} @book{allik2013facilitating, Abstract = {There is currently no agreement on common shared representations of audio features in the field of music information retrieval. 
The Audio Feature Ontology has been developed as part of a harmonised library of modular ontologies to solve the problem of interoperability between music related data sources. We demonstrate a software framework which combines this ontology and related Semantic Web technologies with data extraction and analysis software, in order to enhance audio feature extraction workflows.}, Author = {Allik, A. and Fazekas, G. and Dixon, S. and Sandler, M.}, Booktitle = {Post proceedings of 10th Extended Semantic Web Conference (ESWC'13), 26-30 May, Montpellier, France}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-03 14:32:31 +0000}, Doi = {10.1007/978-3-642-41242-4_20}, Editor = {Cimiano, P. and Fern{\'a}ndez, M. and Lopez, V. and Schlobach, S. and V{\"o}lker, J.}, Keywords = {linked-data, shared vocabularies, audio features, Semantic Web, SOVARR, JISC}, Pages = {178-183}, Publisher = {Springer-Verlag, Heidelberg, Germany.}, Series = {Lecture Notes in Computer Science (LNCS)}, Title = {Facilitating Music Information Research with Shared Open Vocabularies}, Volume = {7955}, Year = 2013, Bdsk-Url-1 = {https://dx.doi.org/10.1007/978-3-642-41242-4_20}} @book{allik2013a, Abstract = {The aim of the Shared Open Vocabulary for Audio Research and Retrieval project is to foster greater agreement on the representation of content-based audio features within music research communities. The Audio Feature Ontology has been developed for this purpose as part of a library of modular ontologies in order to increase interoperability, reproducibility and sustainability in music information retrieval workflows. The ontology provides a descriptive framework for expressing different conceptualisations of the audio features domain and allows for publishing content-derived information about audio recordings. }, Author = {Allik, A. and Fazekas, G. and Dixon, S. and Sandler, M.}, Booktitle = {Post proceedings of 10th Extended Semantic Web Conference (ESWC'13), 26-30 May, Montpellier, France}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 15:49:40 +0000}, Doi = {10.1007/978-3-642-41242-4_44}, Editor = {Cimiano, P. and Fern{\'a}ndez, M. and Lopez, V. and Schlobach, S. and V{\"o}lker, J.}, Keywords = {linked-data, shared vocabularies, audio features, Semantic Web, SOVARR, JISC}, Pages = {285--286}, Publisher = {Springer-Verlag, Heidelberg, Germany.}, Series = {Lecture Notes in Computer Science (LNCS)}, Title = {A {Shared} {Vocabulary} for {Audio} {Features}}, Volume = {7955}, Year = 2013, Bdsk-Url-1 = {https://dx.doi.org/10.1007/978-3-642-41242-4_44}} @conference{saari2013semantic, Abstract = {Social media services such as Last.fm provide crowd-sourced mood tags which are a rich but often noisy source of information. In contrast, editorial annotations from production music libraries are meant to be incisive in nature. We compare the efficiency of these two data sources in capturing semantic information on mood expressed by music. First, a semantic computing technique devised for mood-related tags in large datasets is applied to Last.fm and I Like Music (ILM) corpora separately (250,000 tracks each). The resulting semantic estimates are then correlated with listener ratings of arousal, valence and tension. High correlations (Spearman's rho) are found between the track positions in the dimensional mood spaces and listener ratings using both data sources (0.60 < rs < 0.70).
In addition, the use of curated editorial data provides a statistically significant improvement compared to crowd-sourced data for predicting moods perceived in music.}, Author = {Saari, P. and Barthet, M. and Fazekas, G. and Eerola, T. and Sandler, M.}, Booktitle = {Proc. of the IEEE International Conference on Multimedia & Expo ({ICME}2013) International Workshop on Affective Analysis in Multimedia ({AAM}), 15-19 July 2013, San Jose, CA, USA}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 15:40:48 +0000}, Doi = {10.1109/ICMEW.2013.6618436}, Keywords = {production music, mood, emotion, M4 project, evaluation}, Title = {Semantic models of musical mood: {Comparison} between crowd-sourced and curated editorial tags}, Year = 2013, Bdsk-Url-1 = {https://dx.doi.org/10.1109/ICMEW.2013.6618436}} @conference{saari2013using, Abstract = {We propose a novel technique called Semantic Layer Projection (SLP) for predicting moods expressed by music based on audio features. In SLP, the predictive models are formed by a two-stage mapping from audio features to listener ratings of mood via a semantic mood layer. SLP differs from conventional techniques that produce a direct mapping from audio features to mood ratings. In this work, large social tag data from the Last.fm music service was analysed to produce a semantic layer that represents mood-related information in a low number of dimensions. The method is compared to baseline techniques at predicting the expressed Valence and Arousal in 600 popular music tracks. SLP clearly outperformed the baseline techniques at predicting Valence (R^2 = 0.334 vs. 0.245), and produced roughly equivalent performance in predicting Arousal (R^2 = 0.782 vs. 0.770). The difficulty of modelling Valence was highlighted by generally lower performance compared to Arousal. The improved prediction of Valence, and the increasingly abundant sources of social tags related to digital music make SLP a highly promising technique for future developments in modelling mood in music.}, Author = {Saari, P. and Eerola, T. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the Sound and Music Computing Conference (SMC'13), Stockholm, Sweden}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 15:41:00 +0000}, Keywords = {music, mood, emotion, audio analysis, semantic computing, semantic layer projection, M4 project}, Title = {Using {Semantic} {Layer} {Projection} for {Enhancing} {Music} {Mood} {Prediction} {With} {Audio} {Features}}, Url = {files/papers/saari2013using.pdf}, Year = 2013, Bdsk-Url-1 = {files/papers/saari2013using.pdf}} @conference{tian2013towards, Abstract = {This paper examines existing metadata standards for describing music related information in the context of Chinese music tradition. With most research attention focussing on music, research into computational methods and knowledge representation for world music is still in its infancy. Following the introduction of symbolic elements in the Chinese traditional system, a comparison between these elements and the expressiveness of some prevailing metadata models and standards including Semantic Web ontologies is presented.}, Author = {Tian, M. and Fazekas, G. and Black, D. A. A. and Sandler, M.}, Booktitle = {Proc.
of the {DCMI International Conference on Dublin Core and Metadata Applications (DC-2013)}, 2-6, September, Lisbon, Portugal}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-07 08:57:29 +0000}, Keywords = {metadata, standardisation, non-Western music, music representation, review, Dublin Core}, Local-Url = {files/papers/tian2013towards.pdf}, Pages = {71--81}, Presentation-Url = {http://dcevents.dublincore.org/IntConf/dc-2013/paper/view/160/135}, Title = {Towards the Representation of Chinese Traditional Music: A State of the Art Review of Music Metadata Standards}, Url = {http://dcpapers.dublincore.org/pubs/article/download/3672/1895}, Year = 2013, Bdsk-Url-1 = {files/papers/tian2013towards.pdf}} @conference{fazekas2013theA, Abstract = {This paper describes Mood Conductor, an interactive system that allows audience members to communicate emotional directions to performers in order to ``conduct'' improvised performances (e.g. music). Mood Conductor consists of three main technical components: a smartphone-friendly web application used by the audience, a server-side application for aggregating and clustering audience indicated emotion coordinates in the arousal-valence space, and a visualisation client that creates a video projection used by the musicians as guidance. This projection also provides visual feedback for the audience. In this paper, we present the architecture of the system and the constrained real-time clustering algorithm that drives the visualisation. The tuning and testing of the system's parameters was based on three public interactive music performances held in the UK and France with different ensembles. Qualitative and quantitative evaluations demonstrated that both musicians and audience are highly engaged with the system during performances and raised new insights for future improvements.}, Author = {Fazekas, G. and Barthet, M. and Sandler, M.}, Booktitle = {Proc. of the 10th {International} {Symposium} on {Computer} {Music} {Multidisciplinary} {Research} ({CMMR}'13), 15-18 October, Marseille, France.}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 15:10:44 +0000}, Keywords = {interactive systems,audience-performer interaction, live music-making, improvisation, music, emotion, mood, arousal, valence}, Title = {The {Mood} {Conductor} {System}: {Audience} and {Performer} {Interaction} using {Mobile} {Technology} and {Emotion} {Cues}}, Year = 2013} @conference{lou2013evaluation, Abstract = {Only few audience-performer interactive systems for live music-making have been proposed so far. In previous works, we introduced Mood Conductor (MC), a system that allows audience members to guide improvised music performances using emotion cues. The MC system consists of a smartphone-friendly web application, a server component clustering emotion cues, and a visualisation client providing feedback. This study presents an online user survey following two public performances with a vocal quartet and a rock trio. 35 participants took part in the survey (29 audience members and 6 performers). The qualitative feedback helped us to identify several issues in the current web application and the visualisation client. Future versions of the system will aim at representing a single emotion cue reflecting the audience's average vote gradually over time, rather than rapid changes of individual intentions, which have shown to make impractical the interpretation of the data by performers and audience members.}, Author = {Lou, T. and Barthet, M.
and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 10th {International} {Symposium} on {Computer} {Music} {Multidisciplinary} {Research} ({CMMR}'13), 15-18 October, Marseille, France.}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 15:25:05 +0000}, Keywords = {mood conductor, user study, evaluation, music, mood, emotion, interaction, interactive systems}, Title = {Evaluation of the {Mood} {Conductor} {Interactive} {System} {Based} on {Audience} and {Performers}' {Perspectives}}, Year = 2013} @conference{wilmering2013audio, Abstract = {While the classification of audio effects has several applications in music production, the heterogeneity of possible taxonomies, as well as the many viable points of view for organizing effects, present research problems that are not easily solved. Creating extensible Semantic Web ontologies provides a possible solution to this problem. This paper presents the results of a listening test that facilitates the creation of a classification system based on auditory perceptual attributes that are affected by the application of audio effects. The obtained results act as a basis for a classification system to be integrated in a Semantic Web Ontology covering the domain of audio effects in the context of music production.}, Author = {Wilmering, T. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 135th Convention of the Audio Engineering Society, New York, NY, USA.}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 16:21:21 +0000}, Keywords = {audio effects, ontology, classification, perception, listening test, user trial}, Local-Url = {http://www.semanticaudio.net/files/papers/wilmering2013audio.pdf}, Title = {Audio Effect Classification Based on Auditory Perceptual Attributes}, Url = {http://www.aes.org/e-lib/browse.cfm?elib=17057}, Year = 2013, Bdsk-Url-1 = {http://www.aes.org/e-lib/browse.cfm?elib=17057}} @conference{saari2013the, Abstract = {Semantic Layer Projection (SLP) is a method for automatically annotating music tracks according to expressed mood based on audio. We evaluate this method by comparing it to a system that infers the mood of a given track using associated tags only. SLP differs from conventional auto-tagging algorithms in that it maps audio features to a low-dimensional semantic layer congruent with the circumplex model of emotion, rather than training a model for each tag separately. We build the semantic layer using two large-scale data sets -- crowd-sourced tags from Last.fm, and editorial annotations from the I Like Music (ILM) production music corpus -- and use subsets of these corpora to train SLP for mapping audio features to the semantic layer. The performance of the system is assessed in predicting mood ratings on continuous scales in the two data sets mentioned above. The results show that audio is in general more efficient in predicting perceived mood than tags. Furthermore, we analytically demonstrate the benefit of using a combination of semantic tags and audio features in automatic mood annotation.}, Author = {Saari, P. and Eerola, T. and Fazekas, G. and Barthet, M. and Lartillot, O. and Sandler, M.}, Booktitle = {Proc. of the 14th International Society for Music Information Retrieval Conference, ISMIR'13, November 4-8, Curitiba, Brazil}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 19:32:46 +0000}, Editor = {Britto, A. S. Jr. and Gouyon, F.
and Dixon, S.}, Isbn = {978-0-615-90065-0}, Keywords = {music, mood, emotion recognition, audio analysis, semantic layer projection, M4 project}, Title = {The Role of Audio and Tags in Music Mood Prediction: a Study Using Semantic Layer Projection}, Url = {http://www.ppgia.pucpr.br/ismir2013/wp-content/uploads/2013/09/225_Paper.pdf}, Year = 2013, Bdsk-Url-1 = {http://www.ppgia.pucpr.br/ismir2013/wp-content/uploads/2013/09/225_Paper.pdf}} @conference{wilmering2013the, Abstract = {In this paper we present the Audio Effects Ontology for the ontological representation of audio effects in music production workflows. Designed as an extension to the Studio Ontology, its aim is to provide a framework for the detailed description and sharing of information about audio effects, their implementations, and how they are applied in real-world production scenarios. The ontology enables capturing and structuring data about the use of audio effects and thus facilitates reproducibility of audio effect application, as well as the detailed analysis of music production practices. Furthermore, the ontology may inform the creation of metadata standards for adaptive audio effects that map high-level semantic descriptors to control parameter values. The ontology is using Semantic Web technologies that enable knowledge representation and sharing, and is based on modular ontology design methodologies. It is evaluated by examining how it fulfils requirements in a number of production and retrieval use cases.}, Author = {Wilmering, T. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 14th International Society for Music Information Retrieval Conference, ISMIR'13, November 4-8, Curitiba, Brazil}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 19:31:21 +0000}, Editor = {Britto, A. S. Jr. and Gouyon, F. and Dixon, S.}, Keywords = {ontology, audio effects, classification, Semantic Web, Studio Ontology, Music Ontology}, Title = {The Audio Effects Ontology}, Url = {http://ismir2013.ismir.net/wp-content/uploads/2013/09/41_Paper.pdf}, Year = 2013, Bdsk-Url-1 = {http://ismir2013.ismir.net/wp-content/uploads/2013/09/41_Paper.pdf}} @conference{song2013using, Abstract = {A wealth of literature on musical emotion exists, including investigation of the use of tags to classify musical emotions. However, the relationship between musical emotions and human annotated information is still unclear. Likewise, the understanding of the differences between induced emotion (also known as felt emotion) and perceived emotion (also known as expressed emotion) is at an early stage. In previous work, lists of songs labelled with one of the four basic emotion tags ``happy'', ``sad'', ``angry'' and ``relaxed'' were retrieved from Last.FM, and audio excerpts were fetched from 7Digital.com. In this study, we asked listeners to rate musical excerpts with the perceived or induced emotion fitting the excerpt. 80 excerpts (20 for each of the four emotions considered) were rated by 40 participants from various backgrounds and levels of musical expertise. The results show that in majority of the selected songs the tags agreed more closely with the ratings of perceived emotion than induced emotion. In addition, each induced emotion was highly correlated with its corresponding perceived emotion and induced anger can also be very distinct from its perceived ratings. However, the participants' emotional judgements were not related to measured cultural or musical factors.}, Author = {Song, Y. and Dixon, S. and Pearce, M. 
and Fazekas, G.}, Booktitle = {Proc. 3rd International Conference on Music and Emotion (ICME), June 11-15, Jyv{\"a}skyl{\"a}, Finland}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 19:31:37 +0000}, Editor = {Luck, G. and Brabant, O.}, Keywords = {music, emotion, mood, tags, user study, evaluation}, Publisher = {University of Jyv{\"a}skyl{\"a}, Department of Music}, Title = {Using Tags to Select Stimuli in the Study of Music and Emotion}, Url = {https://jyx.jyu.fi/dspace/handle/123456789/41639#}, Year = 2013, Bdsk-Url-1 = {https://jyx.jyu.fi/dspace/handle/123456789/41639#}} @conference{kosta2013a, Abstract = {Several algorithms have been developed in the music information retrieval community for predicting mood in music in order to facilitate organising and accessing large audio collections. Little attention has been paid however to how perceived emotion depends on cultural factors, such as listeners' acculturation or familiarity with musical background or language. In this study, we examine this dependence in the context of Greek music. A large representative database of Greek songs has been created and sampled observing predefined criteria such as the balance between Eastern and Western influenced musical genres. Listeners were then asked to rate songs according to their perceived mood. We collected continuous ratings of arousal and valence for short song excerpts and also asked participants to select a mood tag from a controlled mood vocabulary that best described the music. We analysed the consistency of ratings between Greek and non-Greek listeners and the relationships between the categorical and dimensional representations of emotions. Our results show that there is a greater agreement in listener's judgements with Greek background compared to the group with varying background. These findings suggest valuable implications on the future development of mood prediction systems.}, Author = {Kosta, K. and Song, Y. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 14th International Society for Music Information Retrieval Conference, ISMIR'13, November 4-8, Curitiba, Brazil}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 19:32:29 +0000}, Editor = {Britto, A. S. Jr. and Gouyon, F. and Dixon, S.}, Isbn = {978-0-615-90065-0}, Keywords = {music, mood, emotion, cross cultural study, greek music, listening test, statistics}, Pages = {317-322}, Title = {A Study of Cultural Dependence of Perceived Mood in Greek Music}, Url = {http://www.ppgia.pucpr.br/ismir2013/wp-content/uploads/2013/09/222_Paper.pdf}, Year = 2013, Bdsk-Url-1 = {http://www.ppgia.pucpr.br/ismir2013/wp-content/uploads/2013/09/222_Paper.pdf}} @conference{barthet2013design, Abstract = {In this paper we present and evaluate two semantic music mood models relying on metadata extracted from over 180,000 production music tracks sourced from I Like Music (ILM)'s collection. We performed non-metric multidimensional scaling (MDS) analyses of mood stem dissimilarity matrices (1 to 13 dimensions) and devised five different mood tag summarisation methods to map tracks in the dimensional mood spaces. We then conducted a listening test to assess the ability of the proposed models to match tracks by mood in a recommendation task. The models were compared against a classic audio content-based similarity model relying on Mel Frequency Cepstral Coefficients (MFCCs).
The best performance (60% of correct match, on average) was yielded by coupling the five-dimensional MDS model with the term-frequency weighted tag centroid method to map tracks in the mood space.}, Author = {Barthet, M. and Marston, D. and Baume, C. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 14th International Society for Music Information Retrieval Conference, ISMIR'13, November 4-8, Curitiba, Brazil}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 19:33:06 +0000}, Editor = {Britto, A. S. Jr. and Gouyon, F. and Dixon, S.}, Isbn = {978-0-615-90065-0}, Keywords = {music, mood, emotion, M4 project, emotion recognition}, Pages = {421-426}, Title = {Design and Evaluation of Semantic Mood Models for Music Recommendation Using Editorial Tags}, Url = {http://www.ppgia.pucpr.br/ismir2013/wp-content/uploads/2013/09/14_Paper.pdf}, Year = 2013, Bdsk-Url-1 = {http://www.ppgia.pucpr.br/ismir2013/wp-content/uploads/2013/09/14_Paper.pdf}} @conference{fazekas2012knowledge, Abstract = {In order for audio applications to interoperate, some agreement on how information is structured and encoded has to be in place within developer and user communities. This agreement can take the form of an industry standard or a widely adopted open framework consisting of conceptual data models expressed using formal description languages. There are several viable approaches to conceptualize audio related metadata, and several ways to describe the conceptual models, as well as encode and exchange information. While emerging standards have already been proven invaluable in audio information management, it remains difficult to design or choose the model that is most appropriate for an application. This paper facilitates this process by providing an overview, focusing on differences in conceptual models underlying audio metadata schemata.}, Author = {Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 133rd Convention of the Audio Engineering Society, San Francisco, {CA}, {USA}}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2020-12-27 14:38:23 +0000}, Keywords = {metadata, ontology, evaluation, review}, Local-Url = {http://semanticaudio.net/files/papers/fazekas2012aes133.pdf}, Publisher-Url = {https://secure.aes.org/forum/pubs/conventions/?elib=16507}, Title = {Knowledge Representation Issues in Audio-Related Metadata Model Design}, Url = {http://semanticaudio.net/files/papers/fazekas2012aes133.pdf}, Year = 2012, Bdsk-Url-1 = {https://secure.aes.org/forum/pubs/conventions/?elib=16507}} @misc{fazekas2012semantic, Abstract = {The emerging Semantic Web provides a powerful framework for the expression and reuse of structured data. Recent efforts have brought this framework to bear on the field of Semantic Audio, as well as information management in audio applications. This tutorial will provide an introduction to Semantic Web concepts and how they can be used in the context of music-related studies. We will outline the use of the Resource Description Framework (RDF) and related ontology and query languages. Using practical examples, we will demonstrate the use of the Music and Studio Ontologies, and show how they facilitate interoperability between audio applications and linked data sets on the Web. We will explore how signal processing tools and results can be described as structured data and utilised in audio production. }, Author = {Fazekas, G.
and Wilmering, T.}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2020-12-27 14:43:49 +0000}, Doi = {https://doi.org/10.5281/zenodo.1324499}, Keywords = {Semantic Audio, Semantic Web, tutorial}, Publisher = {Tutorial presented at the 132nd {Convention} of the {Audio} {Engineering} {Society}, 26-29 April, {Budapest}, {Hungary}}, Title = {Semantic Web and Semantic Audio Technologies}, Url = {http://isophonics.net/content/aes132-tutorial}, Year = 2012, Bdsk-Url-1 = {http://isophonics.net/content/aes132-tutorial}} @conference{wilmering2012high, Abstract = {Existing adaptive digital audio effects predominantly use low-level features in order to derive control data. These data do not typically correspond to high-level musicological or semantic information about the content. In order to apply audio transformations selectively on different musical events in a multitrack project, audio engineers and music producers have to resort to manual selection or annotation of the tracks in traditional audio production environments. We propose a new class of audio effects that uses high-level semantic audio features in order to obtain control data for multitrack effects. The metadata is expressed in RDF using several music and audio related Semantic Web ontologies and retrieved using the SPARQL query language.}, Author = {Wilmering, T. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 133rd Convention of the Audio Engineering Society, San Francisco, {CA}, {USA}}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 16:59:57 +0000}, Keywords = {audio effects, adaptive effects, VST, Semantic Audio Processing}, Title = {High level semantic metadata for the control of multitrack adaptive audio effects}, Url = {http://www.aes.org/e-lib/browse.cfm?elib=16508}, Year = 2012, Bdsk-Url-1 = {http://www.aes.org/e-lib/browse.cfm?elib=16508}} @conference{barthet2012social, Abstract = {Advances in content-based multimedia analysis, recommender systems and Web-based social platforms for content and metadata sharing provide opportunities to create novel applications for music education. In this paper we describe a framework for intelligent music tutoring systems, through the combined use of content and context-based approaches. First, we investigate traditional computer-assisted music education applications, and review music information retrieval and Web technologies relevant to social media retrieval and music education. We discuss semantic aspects of these technologies and the use of ontologies as common grounds for structuring heterogeneous information available on the Web and from machine analyses. The importance of multimodality in music education tools is highlighted before we discuss how the reviewed technologies and information resources may be combined in interactive tools for music learning, for instance, a tool for searching the Web for guitar tablatures and YouTube video tutorials.}, Author = {Barthet, M. and Fazekas, G. and Dixon, S. 
and Sandler, M.}, Booktitle = {In Digital Futures 2012: The Third Annual Digital Economy All Hands Conference, 23-25 October, Aberdeen, UK}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 16:33:14 +0000}, Keywords = {guitar tuition, interaction, HCI, social media, information retrieval, hotttabs}, Title = {Social Media Retrieval for Music Education}, Url = {http://www.eecs.qmul.ac.uk/~simond/pub/2012/Barthet-etal-SocialMediaRetrieval.pdf}, Year = 2012, Bdsk-Url-1 = {http://www.eecs.qmul.ac.uk/~simond/pub/2012/Barthet-etal-SocialMediaRetrieval.pdf}} @conference{terrell2012listening, Abstract = {We examine the effect of listening level, i.e. the absolute sound pressure level at which sounds are reproduced, on music similarity, and in particular, on playlist generation. Current methods commonly use similarity metrics based on Mel-frequency cepstral coefficients (MFCCs), which are derived from the objective frequency spectrum of a sound. We follow this approach, but use the level-dependent auditory spectrum, evaluated using the loudness models of Glasberg and Moore, at three listening levels, to produce auditory spectrum cepstral coefficients (ASCCs). The ASCCs were used to generate sets of playlists at each listening level, using a typical method, and these playlists were found to differ greatly. From this we conclude that music recommendation systems could be made more perceptually relevant if listening level information were included. We discuss the findings in relation to other fields within MIR where inclusion of listening level might also be of benefit.}, Author = {Terrell, M. J. and Fazekas, G. and Simpson, A. J. R. and Smith, J. and Dixon, S.}, Booktitle = {Proc. of the 13th {International} {Society} for {Music} {Information} {Retrieval} {Conference} ({ISMIR}'12), 8-12 October, Porto, Portugal}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-08 19:01:29 +0000}, Keywords = {auditory model, music similarity, loudness, ASCC, auditory spectrum cepstral coefficients}, Pages = {487--492}, Title = {Listening level changes music similarity}, Url = {http://ismir2012.ismir.net/event/papers/487_ISMIR_2012.pdf}, Year = 2012, Bdsk-Url-1 = {http://ismir2012.ismir.net/event/papers/487_ISMIR_2012.pdf}} @conference{barthet2012multidisciplinary, Abstract = {The prominent status of music in human culture and every day life is due in large part to its striking ability to elicit emotions, which may manifest from slight variation in mood to changes in our physical condition and actions. In this paper, we first review state of the art studies on music and emotions from different disciplines including psychology, musicology and music information retrieval. Based on these studies, we then propose new insights to enhance automated music emotion recognition models.}, Author = {Barthet, M. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc.
of the 9th {International Symposium on Computer Music Modelling and Retrieval (CMMR'12)} 19-22 June, London, UK}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 16:47:55 +0000}, Keywords = {music, mood, emotion, M4 project}, Title = {Multidisciplinary Perspectives on Music Emotion Recognition: Implications for Content and Context-Based Models}, Url = {http://www.cmmr2012.eecs.qmul.ac.uk/sites/cmmr2012.eecs.qmul.ac.uk/files/pdf/papers/cmmr2012_submission_101.pdf}, Year = 2012, Bdsk-Url-1 = {http://www.cmmr2012.eecs.qmul.ac.uk/sites/cmmr2012.eecs.qmul.ac.uk/files/pdf/papers/cmmr2012_submission_101.pdf}} @conference{fazekas2011a, Abstract = {This paper presents a general framework for using appropriately structured information about audio recordings in music processing, and shows how this framework can be utilised in multitrack music production tools. The information, often referred to as metadata, is commonly represented in a highly domain and application specific format. This prevents interoperability and its ubiquitous use across applications. In this paper, we address this issue. The basis for the formalism we use is provided by Semantic Web ontologies rooted in formal logic. A set of ontologies are used to describe structured representation of information such as tempo, the name of instruments or onset times extracted from audio. This information is linked to audio tracks in music production environments as well as processing blocks such as audio effects. We also present specific case studies, for example, the use of audio effects capable of processing and predicting metadata associated with the processed signals. We show how this increases the accuracy of description, and reduces the computational cost, by omitting repeated application of feature extraction algorithms.}, Author = {Fazekas, G. and Wilmering, T. and Sandler, M. B.}, Booktitle = {Proc. of the {AES} 42nd International Conference on Semantic Audio, 22-24 July, Ilmenau, Germany}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 17:13:30 +0000}, Keywords = {Semantic Audio Processing, intelligent editing, audio production}, Pages = {22--24}, Title = {A knowledge representation framework for context-dependent audio processing}, Url = {http://www.aes.org/e-lib/browse.cfm?elib=15967}, Year = 2011, Bdsk-Url-1 = {http://www.aes.org/e-lib/browse.cfm?elib=15967}} @conference{wilmering2011towards, Abstract = {In this paper we discuss the development of ontological representations of digital audio effects and provide a framework for the description of digital audio effects and audio effect transformations. After a brief account on our current research in the field of high-level semantics for music production using Semantic Web technologies, we detail how an Audio Effects Ontology can be used within the context of intelligent music production tools, as well as for musicological purposes. Furthermore, we discuss problems in the design of such an ontology arising from discipline-specific classifications, such as the need for encoding different taxonomical systems based on, for instance, implementation techniques or perceptual attributes of audio effects. Finally, we show how information about audio effect transformations is represented using Semantic Web technologies, the Resource Description framework (RDF) and retrieved using the SPARQL query language.}, Author = {Wilmering, T. and Fazekas, G. and Sandler, M. B.}, Booktitle = {Proc. 
of the 14th International Conference on Digital Audio Effects ({DAFx}-11)}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 17:26:59 +0000}, Keywords = {audio effects, ontology design, knowledge representation}, Pages = {19--23}, Title = {Towards ontological representations of digital audio effects}, Url = {http://recherche.ircam.fr/pub/dafx11/Papers/64_e.pdf}, Year = 2011, Bdsk-Url-1 = {http://recherche.ircam.fr/pub/dafx11/Papers/64_e.pdf}} @conference{fazekas2011the, Abstract = {This paper introduces the Studio Ontology Framework for describing and sharing detailed information about music production. The primary aim of this ontology is to capture the nuances of record production by providing an explicit, application and situation independent conceptualisation of the studio environment. We may use the ontology to describe real-world recording scenarios involving physical hardware, or (post) production on a personal computer. It builds on Semantic Web technologies and previously published ontologies for knowledge representation and knowledge sharing.}, Author = {Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 12th {International} {Society} for {Music} {Information} {Retrieval} ({ISMIR}'11) conference, 24-28 Oct., Miami, Florida, USA}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 17:13:59 +0000}, Keywords = {ontology, Studio Ontology, audio production, Semantic Audio}, Pages = {24--28}, Title = {The Studio Ontology Framework}, Url = {http://ismir2011.ismir.net/papers/PS3-20.pdf}, Year = 2011, Bdsk-Url-1 = {http://ismir2011.ismir.net/papers/PS3-20.pdf}} @conference{kolozali2011knowledge, Abstract = {This paper presents preliminary work on musical instruments ontology design, and investigates heterogeneity and limitations in existing instrument classification schemes. Numerous research to date aims at representing information about musical instruments. The works we examined are based on the well known Hornbostel and Sach's classification scheme. We developed representations using the Ontology Web Language (OWL), and compared terminological and conceptual heterogeneity using SPARQL queries. We found evidence to support that traditional designs based on taxonomy trees lead to ill-defined knowledge representation, especially in the context of an ontology for the Semantic Web. In order to overcome this issue, it is desirable to have an instrument ontology that exhibits a semantically rich structure.}, Author = {Kolozali, S. and Fazekas, G. and Barthet, M. and Sandler, M.}, Booktitle = {Proc. of the 12th {International} {Society} for {Music} {Information} {Retrieval} ({ISMIR}'11) conference, 24-28 Oct., Miami, Florida, USA}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 19:36:09 +0000}, Keywords = {ontology, instrument taxonomy, ontology design, Semantic Web}, Title = {Knowledge representation issues in musical instrument ontology design}, Year = 2011} @conference{barthet2011music, Abstract = {Music recommendation systems built on top of music information retrieval (MIR) technologies are usually designed to provide new ways to discover and listen to digital music collections. However, they do not typically assist in another important aspect of musical activity, music learning. In this study we present the application Hotttabs, an online music recommendation system dedicated to guitar learning. 
Hotttabs makes use of The Echo Nest music platform to retrieve the latest popular or hot songs based on editorial, social and charts/sales criteria, and YouTube to find relevant guitar video tutorials. The audio tracks of the YouTube videos are processed with an automatic chord extraction algorithm in order to provide a visual feedback of the chord labels synchronised with the video. Guitar tablatures, a form of music notation showing instrument fingerings, are mined from the web and their chord sequences are extracted. The tablatures are then clustered based on the songs' chord sequences complexity so that guitarists can pick up those adapted to their performance skills.}, Author = {Barthet, M. and Anglade, A. and Fazekas, G. and Kolozali, S. and Macrae, R.}, Booktitle = {in {Proc}. of the 2nd {Workshop} on {Music} {Recommendation} and {Discovery} ({WOMRAD}'11) in conjunction with the {ACM} {Recommender} {Systems} conference ({RecSys}'11)}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 17:05:06 +0000}, Keywords = {guitar tuition, hotttabs, social media, recommendation, guitar tabs, interaction}, Title = {Music recommendation for music learning: {Hotttabs} a multimedia guitar tutor}, Url = {http://ceur-ws.org/Vol-793/womrad2011_paper2.pdf}, Year = 2011, Bdsk-Url-1 = {http://ceur-ws.org/Vol-793/womrad2011_paper2.pdf}} @conference{wilmering2010the, Abstract = {The task of onset detection is relevant in various contexts such as music information retrieval and music production, while reverberation has always been an important part of the production process. The effect may be the product of the recording space, or it may be artificially added, and in our context destructive. In this paper, we investigate the effect of reverberation on onset detection tasks. We compare state-of-the-art techniques and show that the algorithms have varying degrees of robustness in the presence of reverberation depending on the content of the analysed audio material. }, Author = {Wilmering, T. and Fazekas, G. and Sandler, M.}, Booktitle = {in {Proceedings} of the 128th {Convention} of the {Audio} {Engineering} {Society}, {London}}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 18:37:18 +0000}, Keywords = {onset detection, perception}, Title = {The effects of reverberation on onset detection tasks}, Year = 2010} @book{kolozali2011towards, Abstract = {In this study, we present a novel hybrid ontology generation system for musical instruments. The music ontology is a Semantic Web ontology that describes music-related information (e.g., release, artist, performance), but does not provide models of musical instruments. Hence, there is a need to develop a separate instrument ontology to deepen how music knowledge is represented on the Semantic Web. Such complementary knowledge on musical instruments can be useful to develop music recognition and recommendation systems based on semantic reasoning. This work is a preliminary step which focuses on automatic instrument taxonomy generation in Ontology Web Language (OWL). The taxonomy of musical instruments given by Hornbostel and Sachs [3] was considered as the basis for our instrument terms and initial hierarchical structure. The hybrid system consists of three main units: i) musical instrument analysis, ii) Formal Concept Analysis, iii) lattice pruning and hierarchical form generation.}, Author = {Kolozali, S. and Barthet, M. and Fazekas, G.
and Sandler, M.}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 19:35:26 +0000}, Doi = {10.1007/978-3-642-23017-2_13}, Edition = {Semantic Multimedia}, Isbn = {978-3-642-23016-5}, Keywords = {automatic ontology generation, instrument taxonomy, ontology design}, Pages = {186--187}, Publisher = {Springer-Verlag Berlin, Heidelberg}, Series = {Lecture Notes in Computer Science}, Title = {Towards the automatic generation of a {Semantic} {Web} ontology for musical instruments}, Volume = {6725}, Year = 2011, Bdsk-Url-1 = {https://dx.doi.org/10.1007/978-3-642-23017-2_13}} @conference{fazekas2009novel, Abstract = {This paper discusses architectural aspects of a software library for unified metadata management in audio processing applications. The data incorporates editorial, production, acoustical and musicological features for a variety of use cases, ranging from adaptive audio effects to alternative metadata-based visualisation. Our system is designed to capture information prescribed by modular ontology schemas. This supports the development of intelligent user interfaces and advanced media workflows in music production environments. In an effort to reach these goals, we argue for the need for modularity and interoperable semantics in representing information. We discuss the advantages of extensible Semantic Web ontologies as opposed to using specialised but disharmonious metadata formats. Concepts and techniques permitting seamless integration with existing audio production software are described in detail.}, Author = {Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 12th {International} {Conference} on {Digital} {Audio} {Effects} ({DAFx}-09), {Como}, Italy}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 18:52:49 +0000}, Keywords = {Semantic Audio Processing, ontology, RDF, C++, library}, Title = {Novel methods in information management for advanced audio workflows}, Url = {http://www.semanticaudio.net/files/papers/fazekas2009novel.pdf}, Year = 2009, Bdsk-Url-1 = {http://dafx09.como.polimi.it/proceedings/papers/paper_93.pdf}} @conference{tidhar2009publishing, Abstract = {We describe the process of collecting, organising and publishing a large set of music similarity features produced by the SoundBite [10] playlist generator tool. These data can be a valuable asset in the development and evaluation of new Music Information Retrieval algorithms. They can also be used in Web-based music search and retrieval applications. For this reason, we make a database of features available on the Semantic Web via a SPARQL endpoint, which can be used in Linked Data services. We provide examples of using the data in a research tool, as well as in a simple web application which responds to audio queries and finds a set of similar tracks in our database.}, Author = {Tidhar, D. and Fazekas, G. and Kolozali, S. and Sandler, M.}, Booktitle = {Proc.
of the 10th {International} {Society} for {Music} {Information} {Retrieval} ({ISMIR}-09) conference, {Oct}., {Kobe}, Japan}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 18:53:44 +0000}, Keywords = {music similarity, recommendation, ontology, Semantic Web, MFCCs}, Title = {Publishing {Music} {Similarity} {Features} on the {Semantic} {Web}}, Url = {http://ismir2009.ismir.net/proceedings/PS3-10.pdf}, Year = 2009, Bdsk-Url-1 = {http://ismir2009.ismir.net/proceedings/PS3-10.pdf}} @conference{fazekas2009ontology, Abstract = {In information management, ontologies are used for defining concepts and relationships of a domain in question. The use of a schema permits structuring, interoperability and automatic interpretation of data, thus allows accessing information by means of complex queries. In this paper, we use ontologies to associate metadata, captured during music production, with explicit semantics. The collected data is used for finding audio clips processed in a particular way, for instance, using engineering procedures or acoustic signal features. As opposed to existing metadata standards, our system builds on the Resource Description Framework, the data model of the Semantic Web. This provides flexible and open-ended knowledge representation. Using this model, we demonstrate a framework for managing information, relevant in music production.}, Author = {Fazekas, G. and Sandler, M.}, Booktitle = {Proc. 126th Convention of the Audio Engineering Society, Munich, Germany}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 18:12:37 +0000}, Keywords = {Semantic Audio Processing, ontology, audio production}, Title = {Ontology based information management in music production}, Url = {http://www.aes.org/e-lib/browse.cfm?elib=14861}, Year = 2009, Bdsk-Url-1 = {http://www.aes.org/e-lib/browse.cfm?elib=14861}} @conference{fazekas2008a, Abstract = {Musical metadata may include references to individuals, equipment, procedures, parameters or audio features extracted from signals. There are countless possibilities for using this data during the production process. An intelligent audio editor, besides internally relying on it, can be both producer and consumer of information about specific aspects of music production. In this paper, we propose a framework for producing and managing meta information about a recording session, a single take or a subsection of a take. As basis for the necessary knowledge representation we use the Music Ontology with domain specific extensions. We provide examples on how metadata can be used creatively, and demonstrate the implementation of an extended metadata editor in a multitrack audio editor application.}, Author = {Fazekas, G. and Raimond, Y. and Sandler, M.}, Booktitle = {Proc. of the 125th Convention of the Audio Engineering Society, San Francisco, USA}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 18:30:04 +0000}, Keywords = {audio production, metadata}, Title = {A framework for producing rich musical metadata in creative music production}, Url = {http://www.aes.org/e-lib/browse.cfm?elib=14695}, Year = 2008, Bdsk-Url-1 = {http://www.aes.org/e-lib/browse.cfm?elib=14695}} @conference{fazekas2007structural, Abstract = {In an intelligent editing environment, the semantic music structure can be used as beneficial assistance during the post production process. 
In this paper we propose a new approach to extract both low- and high-level hierarchical structure from vocal tracks of multi-track master recordings. Contrary to most segmentation methods for polyphonic audio, we utilize extra information available when analyzing a single audio track. A sequence of symbols is derived using a hierarchical decomposition method involving onset detection, pitch tracking and timbre modelling to capture phonetic similarity. Results show that the applied model captures the similarity of short voice segments well.}, Author = {Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 123rd {Convention} of the {Audio} {Engineering} {Society}, {New} {York}, USA}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 18:34:04 +0000}, Keywords = {structural segmentation, Gaussian Mixture Model, timbre similarity, audio editing guide}, Title = {Structural decomposition of recorded vocal performances and its application to intelligent audio editing}, Url = {http://www.aes.org/e-lib/browse.cfm?elib=14307}, Year = 2007, Bdsk-Url-1 = {http://www.aes.org/e-lib/browse.cfm?elib=14307}} @conference{fazekas2007intelligent, Abstract = {In a complex sound editing project, automatic exploration and labelling of the semantic music structure can be highly beneficial as a creative aid. This paper describes the development of new tools that allow the engineer to navigate around the recorded project using a hierarchical music segmentation algorithm. Segmentation of musical audio into intelligible sections such as choruses and verses will be discussed, followed by a short overview of the novel segmentation approach using a timbre-based music representation. Popular sound-editing platforms were investigated to find an optimal way of implementing the necessary features. The integration of music segmentation and the development of a new navigation toolbar in Audacity, an open-source multi-track editor, will be described in more detail.}, Author = {Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 122nd {Convention} of the {Audio} {Engineering} {Society}, {Vienna}, Austria}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 18:33:47 +0000}, Keywords = {structural segmentation, Hidden Markov Model, audio editing guide, intelligent editing, Semantic Audio Processing}, Title = {Intelligent editing of studio recordings with the help of automatic music structure extraction}, Url = {http://www.aes.org/e-lib/browse.cfm?elib=14024}, Year = 2007, Bdsk-Url-1 = {http://www.aes.org/e-lib/browse.cfm?elib=14024}} @conference{font2014extending, Abstract = {Currently proposed tagging ontologies are mostly focused on the definition of a common schema for representing the agents involved in a tagging process. In this paper we introduce an idea for extending tagging ontologies by incorporating domain-specific class definitions and relations. We illustrate our idea with a particular use case where a tag recommendation system is driven by such an ontology. Beyond our use case, we believe that such extended tagging ontologies can bring more meaningful structure into folksonomies and improve browsing and organisation functionalities of online platforms relying on tagging systems.}, Author = {Font, F. and Oramas, S. and Fazekas, G.
and Serra, X.}, Booktitle = {Presented at the 13th {International} {Semantic} {Web} {Conference} ({ISWC}), 19-23 {October}, {Trento}, {Italy}}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-11-26 17:13:06 +0000}, Keywords = {folksonomy, ontology, music tagging, freesound, Semantic Web}, Title = {Extending {Tagging} {Ontologies} with {Domain} {Specific} {Knowledge}}, Year = 2014} @conference{bechhofer2014computational, Abstract = {The Computational Analysis of the Live Music Archive (CALMA) project aims to facilitate investigation and scholarship related to live music through development of a Linked Data service combining metadata captured during deposition of audio to the Internet Archive, with computational analyses over these recordings through feature extraction, clustering, and classification. In this poster and demonstrator we introduce the architecture, tools, and data structures we have developed to create this combined resource, and provide a first release of the dataset including provenance metadata to assist its interrogation and reuse. We also show the early results of questions assessed over the data that (i) aid resolution of uncertain metadata, identification of potential errors, and validation of existing entries, and (ii) provide metrics for broad patterns in performance variation that can be used to select subsets within the data for further longitudinal and musicological study.}, Author = {Bechhofer, S. and Dixon, S. and Fazekas, G. and Wilmering, T. and Page, K.}, Booktitle = {Presented at the 15th {International} {Society} of {Music} {Information} {Retrieval} ({ISMIR}) Conference late-breaking workshop, {Oct} 27-31, 2014, {Taipei, Taiwan}}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-11-26 17:43:37 +0000}, Keywords = {linked-data, live music, LMA, provenance, metadata, Semantic Web, CALMA, semantic media mini-project}, Title = {Computational {Analysis} of the {Live} {Music} {Archive}}, Year = 2014} @conference{moramcginity2014creating, Abstract = {This paper presents the application that we intend to demonstrate. Our project aims at discovering and offering researchers in music and social sciences new information resources by linking music and publishing metadata. The application gathers metadata by accessing various web resources, links the data and stores it in a semantic database. The data is presented in a faceted manner, allowing the user to navigate the data through an interface, thus making it possible for her to discover new and valuable resources.}, Author = {Mora-McGinity, M. and Fazekas, G. and Ogilive, G.}, Booktitle = {{Presented} at the {Digital} {Music} {Research} {Network} {Workshop}, {Dec}., {London}, UK at the 15th {International} {Society} of {Music} {Information} {Retrieval} ({ISMIR}) Conference late-breaking workshop, {Oct} 27-31, 2014, {Taipei, Taiwan}}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-11-26 17:41:44 +0000}, Keywords = {linked-data, ontology, Semantic Web, semantic media mini-project, MUSIC, Academic Charts}, Title = {Creating {Semantic} {Links} between {Research} {Articles} and {Music} {Artists}}, Year = 2014} @conference{stables2014safe, Abstract = {In this paper, we present an overview of the Semantic Audio Feature Extraction (SAFE) Project, a system for the extraction and retrieval of semantic descriptions of musical timbre, deployed within the digital audio workstation. 
By embedding the data capture system into the music production workflow, we are able to maximise the return of semantically annotated music production data, whilst mitigating issues such as musical and environmental bias. Users of the plugins are free to submit semantic descriptions of their own music, whilst utilising the continually growing collaborative dataset of musical descriptors. In order to provide more contextually representative timbral transformations, the dataset is partitioned using metadata captured within the application.}, Author = {Stables, R. and Enderby, S. and De Man, B. and Fazekas, G.}, Booktitle = {{Presented} at the 15th {International} {Society} of {Music} {Information} {Retrieval} ({ISMIR}) Conference late-breaking workshop, {Oct} 27-31, 2014, {Taipei, Taiwan}}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-11-26 17:42:49 +0000}, Keywords = {semantic audio, VST plugins, data collection, ISMIR demo}, Title = {SAFE: {A} {System} for {Extraction} and {Retrieval} of {Semantic} {Audio} {Descriptors}}, Url = {http://www.semanticaudio.net/files/papers/stables2014safe.pdf}, Year = 2014} @incollection{fazekas2014a, Abstract = {Viewers watching TV may wish to use their tablet or smartphone as a 'second screen', first to identify any music playing on the TV, and then to discover more information about it. Thus, the microphone of the 'second screen' device is used to listen to the music playing on the TV, whilst audio fingerprinting technology is used to identify it. Then, a webpage is dynamically generated, providing rich information about the music identified, as well as related music and musical artists based on socio-cultural factors. The latter is achieved by querying web services such as YouTube, The Echo Nest, Last.fm and MusicBrainz. Linking and making sense of (i.e., inferring knowledge from) such a wide range of diverse music-related data acquired across multiple sources and services on the web is achieved using the C4DM Music Ontology. An Android app acting as a 'second screen' is currently available for demonstration purposes.}, Author = {Fazekas, G. and Kudumakis, P.}, Booktitle = {{S}ubmitted in response to the {Digital} {Media} {Project} ({DMP}) {Hybrid} {Media} {Services} call}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-03 13:15:54 +0000}, Keywords = {linked-data, music recommendation, standardisation, second-screen, Semantic Web}, Title = {A second screen music discovery and recommendation service based on social and cultural factors}, Year = 2014} @conference{tian2013the, Author = {Tian, M. and Black, D. A. A. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 3rd {International Workshop on Folk Music Analysis (FMA'13)}, 6-7 June, Amsterdam, Netherlands}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-07 09:07:55 +0000}, Editor = {Kranenburg, P. van and Anagnostopoulou, C. and Volk, A.}, Isbn = {978-90-70389-78-9}, Keywords = {non-Western music, ontology, audio analysis, emotion recognition}, Title = {Content-based Emotion Categorisation Analysis of Chinese Cultural Revolution Songs}, Url = {http://dspace.library.uu.nl/handle/1874/276246}, Year = 2013, Bdsk-Url-1 = {http://dspace.library.uu.nl/handle/1874/276246}} @conference{fazekas2012shared, Abstract = {This paper presents two ongoing projects at the Centre for Digital Music, Queen Mary University of London.
Both projects are investigating the benefits of common data representations when dealing with large collections of media. The Semantic Media project aims at establishing an open interdisciplinary research network with the goal of creating highly innovative media navigation tools, while the Shared Open Vocabulary for Audio Research and Retrieval (SOVARR) project builds on community involvement to improve existing tools and ontologies for MIR research. Common goals include bringing together experts with various research backgrounds and establishing open vocabularies in combination with semantic media technologies as viable tools for sustainable and interoperable workflows. In this paper, we summarise our projects as well as the results of the Shared Open Vocabularies session that took place at ISMIR 2012.}, Author = {Fazekas, G. and Ewert, S. and Allik, A. and Dixon, S. and Sandler, M.}, Booktitle = {Proc. of the 13th {International} {Society} for {Music} {Information} {Retrieval} {Conference} ({ISMIR}'12), late-breaking workshop, 8-12 October, Porto, Portugal}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 18:50:43 +0000}, Keywords = {SOVARR, Semantic Media}, Title = {Shared Open Vocabularies and Semantic Media}, Url = {http://ismir2012.ismir.net/event/papers/LBD9.pdf}, Year = 2012, Bdsk-Url-1 = {http://ismir2012.ismir.net/event/papers/LBD9.pdf}} @conference{kolozali2010the, Author = {Kolozali, S. and Barthet, M. and Fazekas, G. and Tidhar, D. and Sandler, M.}, Booktitle = {presented at the {Digital} {Music} {Research} {Network} {Workshop}, 21 {Dec}., {London}, UK.}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 17:39:09 +0000}, Keywords = {instrument taxonomy, ontology, ontology design}, Title = {The musical instrument ontology}, Year = 2010} @conference{fazekas2010tempest, Author = {Fazekas, G. and Tidhar, D.}, Booktitle = {presented at the {Digital} {Music} {Research} {Network} {Workshop}}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 17:36:14 +0000}, Keywords = {temperament, tuning, Web Service, Semantic Web, Semantic Audio, SAWA}, Title = {TempEst - {Temperament} estimation {Web} service}, Year = 2010} @conference{tidhar2010temperament, Abstract = {Tuning and temperament have been occupying musical and scientific minds for many centuries. Towards the end of the twentieth century, as historical performance practice was gradually becoming an established part of mainstream musical activity, more attention was directed to the study and application of historical unequal temperaments. We have recently presented experimental results demonstrating that it is possible to classify keyboard temperament automatically from recordings of typical harpsichord pieces (Tidhar, Mauch, & Dixon, 2010). Six different commonly-used temperaments have been accurately recognised in a dataset consisting of 48 recordings. In (Tidhar, Fazekas, Mauch, & Dixon, 2010) we present TempEst, an online temperament estimation service based on components developed within the OMRAS2 project. TempEst employs the estimation algorithms developed in (Tidhar, Mauch, & Dixon, 2010), enhanced by a Temperament Ontology (Fazekas & Tidhar, 2009) and an additional inference module. We are currently working on improving and extending the ontology and inference components, and on applying the temperament estimation method to larger collections of commercially available recordings.
In this late-breaking presentation we will briefly provide some background to the temperament estimation project, present the current state of the Temperament Ontology, discuss the nature of temperament estimation as an MIR task, and present some initial results of the analysis of commercially available harpsichord recordings.}, Author = {Tidhar, D. and Fazekas, G. and Mauch, M. and Dixon, S.}, Booktitle = {{Presented} at the 11th {International} {Society} for {Music} {Information} {Retrieval} {Conference} ({ISMIR}'10), {Late}-breaking session}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 18:36:29 +0000}, Keywords = {temperament, tuning, audio analysis}, Title = {Temperament {Estimation} as an {MIR} task}, Url = {http://ismir2010.ismir.net/proceedings/late-breaking-demo-30.pdf}, Year = 2010, Bdsk-Url-1 = {http://ismir2010.ismir.net/proceedings/late-breaking-demo-30.pdf}} @webpage{raimond2010the, Abstract = {The Music Ontology Specification provides the main concepts and properties for describing music (i.e., artists, albums and tracks) on the Semantic Web.}, Author = {Raimond, Y. and G{\"a}ngler, T. and Giasson, F. and Jacobson, K. and Fazekas, G. and Reinhardt, S. and Passant, A.}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-07 08:58:13 +0000}, Keywords = {ontology, Music Ontology, Semantic Web}, Local-Url = {http://musicontology.com}, Publisher = {Published online}, Title = {The music ontology specification}, Url = {http://musicontology.com}, Year = 2010, Bdsk-Url-1 = {http://musicontology.com}} @conference{fazekas2009uncovering, Abstract = {One of the burning issues in collecting and managing audio-related information is the loss of detail in the production chain. During recording and post-production, a number of participants (musicians, engineers and producers) interact with real-world or software-based equipment such as instruments, audio processing hardware and computer programs used in virtual studio environments. This scenario potentially creates a wealth of information which can be used creatively in music production, music education, sound engineer training, music information retrieval or for enriching music-related knowledge on the Semantic Web. For instance, if metadata is collected during the production process, it becomes possible to discover influences in musicianship and audio engineering practices, or to find out how a particular song was produced and what equipment, plug-ins and parameters were used to achieve a certain sound or ambience. However, in order to make use of this data, it needs to be formatted carefully using well-designed schemas. We found that existing metadata formats fall short in one way or another, mainly in expressiveness and extensibility in describing the information detailed above. We address these issues by developing an ontology-based information management solution built on Semantic Web ontologies, such as the Music Ontology and extensions specific to studio production. The system allows the capture of a diverse set of metadata, including audio signal features, and performs automatic data collection in the studio. Its interface can also be used to enter relevant details manually, for example by an archivist annotating a recording, using information from different sources.}, Author = {Fazekas, G.
and Sandler, M.}, Booktitle = {{Presented} at the {Unlocking} {Audio} 2 {Conference}, 16-17 {March}, {London}, UK}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 18:18:44 +0000}, Keywords = {audio production, ontology}, Title = {Uncovering the details of music production using ontologies}, Url = {http://www.bl.uk/reshelp/bldept/soundarch/unlockaudio/papers09/unlockingaudio2.pdf}, Year = 2009, Bdsk-Url-1 = {http://www.bl.uk/reshelp/bldept/soundarch/unlockaudio/papers09/unlockingaudio2.pdf}} @conference{cannam2009a, Author = {Cannam, C. and Fazekas, G. and Noland, K.}, Booktitle = {presented at the Special SIGMUS Symposium, 2 Nov., Tokyo, Japan}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-10 22:18:44 +0000}, Keywords = {sonic visualiser, demo, ISMIR2009}, Title = {A Demonstration of Sonic Visualiser}, Url = {http://www.sigmus.jp/SIG/sig200911listofdemos-e.html}, Year = 2009, Bdsk-Url-1 = {http://www.sigmus.jp/SIG/sig200911listofdemos-e.html}} @techreport{fazekas2009a, Abstract = {We describe the construction of SAWA, a simple Web-based system for automated audio analysis. This system is capable of calculating an easily extended set of musically meaningful features such as beat, tempo, and key estimates from uploaded audio files, returning the results as rich RDF data suitable for interlinking on the Semantic Web. Unlike existing systems, our application is built on open and reusable components and provides an example of quick and straightforward development.}, Author = {Fazekas, G. and Cannam, C. and Sandler, M.}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-06 18:53:29 +0000}, Keywords = {Semantic Audio, Semantic Web}, Publisher = {{Centre} for {Digital} {Music}}, Title = {A {Simple} {Guide} to {Automated} {Music} {Analysis} on the {Semantic} {Web} (white paper)}, Year = 2009} @conference{sandler2008ontology, Author = {Fazekas, G. and Sandler, M.}, Booktitle = {{Presented} at the {Digital} {Music} {Research} {Network} {Workshop}, {Dec}., {London}, UK}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2017-12-21 20:31:23 +0000}, Keywords = {audio production, ontology, arousal}, Title = {Ontology based information management in music production}, Year = 2008} @conference{mauch2014efficient, Abstract = {We present Tony, a free, open-source software tool for computer-aided pitch track and note annotation of melodic audio content. The accurate annotation of fundamental frequencies and notes is essential to the scientific study of intonation in singing and other instruments. Unlike commercial applications for singers and producers or other academic tools for generic music annotation and visualisation, Tony has been designed for the scientific study of monophonic music: a) it implements state-of-the-art algorithms for pitch and note estimation from audio, b) it provides visual and auditory feedback of the extracted pitches for the identification of detection errors, c) it provides an intelligent graphical user interface through which the user can identify and rapidly correct estimation errors, d) it provides functions for exporting pitch track and note track data, enabling further processing in spreadsheets or other applications. Software versions for Windows, OSX and Linux platforms can be downloaded from http://code.soundsoftware.ac.uk/projects/tony.}, Author = {Mauch, M. and Cannam, C.
and Fazekas, G.}, Booktitle = {Society for Education, Music and Psychology Research (SEMPRE'14) conference, April 3-4, London, UK}, Date-Added = {2014-08-06 18:56:56 +0000}, Date-Modified = {2017-12-21 20:02:45 +0000}, Isbn = {978-1905351299}, Keywords = {Tony, pitch-tracking, annotation, software, singing}, Pages = {143--147}, Presentation-Url = {https://code.soundsoftware.ac.uk/attachments/download/1087/SempreTony.pdf}, Title = {Efficient computer-aided pitch track and note estimation for scientific applications}, Url = {http://tinyurl.com/mcutwgd}, Year = 2014, Bdsk-Url-1 = {https://code.soundsoftware.ac.uk/attachments/download/1067/mauch_sempre2014_formattedpreprint.pdf}} @conference{kolozali2010towardsA, Abstract = {In this study we present a novel hybrid system by developing a formal method of automatic ontology generation for web-based audio signal processing applications. An ontology is seen as a knowledge management structure that represents domain knowledge in a machine-interpretable format. It describes concepts and relationships within a particular domain, in our case, the domain of musical instruments. The different tasks of ontology engineering, including manual annotation, hierarchical structuring and organisation of data, can be laborious and challenging. For these reasons, we investigate how the process of creating ontologies can be made less dependent on human supervision by exploring concept analysis techniques in a Semantic Web environment. Only a few methods have been proposed for automatic ontology generation. These are mostly based on statistical methods (e.g., frequency of semantic tags) that generate the taxonomy structure of ontologies as in the studies from Bodner and Songs [1]. The algorithms that have been used for automatic ontology generation are Hierarchical Agglomerative Clustering (HAC), Bi-Section K-Means [2], and Formal Concept Analysis (FCA). Formal Concept Analysis is a well-established technique for identifying groups of elements with common sets of properties. Formal Concept Analysis has been used in many software engineering topics such as the identification of objects in legacy code, or the identification and restructuring of schemas in object-oriented databases [5]. These works are important since ontologies provide the basis for information and database systems [6].}, Author = {Kolozali, S. and Barthet, M. and Fazekas, G. and Sandler, M.}, Booktitle = {Proc. of the 5th {International} {Conference} on {Semantic} and {Digital} {Media} {Technologies} ({SAMT}-10), {Saarbr{\"u}cken}, Germany}, Date-Added = {2014-08-06 17:49:01 +0000}, Date-Modified = {2014-08-06 17:49:52 +0000}, Keywords = {automatic ontology generation, instrument taxonomy, ontology design}, Title = {Towards the automatic generation of a {Semantic} {Web} ontology for musical instruments}, Year = 2010} @conference{fazekas2009reusable, Abstract = {Content-based metadata is becoming increasingly important for managing audio collections in digital library applications. While Music Information Retrieval (MIR) research provides means for extracting metadata from audio recordings, no common practice has emerged for representing analysis results or exchanging algorithms. This paper argues for the need for modularity through interoperable components and data publishing methods in MIR applications. We demonstrate the use of a common API for audio analysis, enhanced with easily extended Semantic Web ontologies for describing results and configuration.
Built on the extensible ontological framework provided by the Music Ontology, our system allows for the representation of diverse information such as musical facts, features or analysis parameters in a uniform, reusable and machine interpretable format. Our demonstration will be using SAWA, a Web-application available for researchers interested in these technologies.}, Author = {Fazekas, G. and Cannam, C. and Sandler, M.}, Booktitle = {Proc. of the 9th IEEE/ACM Joint Conference on Digital Libraries (JCDL'09) Workshop on Integrating Digital Library Content with Computational Tools and Services, 14-19 June, Austin, Texas, USA}, Date-Added = {2014-08-02 10:04:50 +0000}, Date-Modified = {2014-08-02 10:04:50 +0000}, Invited = {invited paper}, Keywords = {Semantic Audio, Semantic Web, ontology, RDF, audio analysis}, Title = {Reusable metadata and software components for automatic audio analysis}, Url = {http://www.semanticaudio.net/files/papers/fazeks2009reusable.pdf}, Year = 2009, Bdsk-Url-1 = {http://www.semanticaudio.net/files/papers/fazeks2009reusable.pdf}}