Publications

2020

  • H. B. Martinez, A. Hines, and M. C. Q. Farias, “UnB-AV: an audio-visual database for multimedia quality research,” IEEE Access, vol. 8, pp. 56641-56649, 2020.
    [Bibtex]
    @ARTICLE{9042343, author={H. B. {Martinez} and A. {Hines} and M. C. Q. {Farias}}, journal={IEEE Access}, title={UnB-AV: An Audio-Visual Database for Multimedia Quality Research}, year={2020}, volume={8}, number={}, pages={56641-56649},}
  • A. A. Barakabitze, A. Ahmad, R. Mijumbi, and A. Hines, “5G network slicing using SDN and NFV: a survey of taxonomy, architectures and future challenges,” Computer Networks, vol. 167, p. 106984, 2020.
    [Bibtex]
    @article{barakabitze20205g,
    title={5G network slicing using SDN and NFV: A survey of taxonomy, architectures and future challenges},
    author={Barakabitze, Alcardo Alex and Ahmad, Arslan and Mijumbi, Rashid and Hines, Andrew},
    journal={Computer Networks},
    volume={167},
    pages={106984},
    year={2020},
    publisher={Elsevier}
    }
  • H. Z. Jahromi, D. T. Delaney, and A. Hines, “Beyond first impressions: estimating quality of experience for interactive web applications,” IEEE Access, vol. 8, pp. 47741-47755, 2020.
    [Bibtex]
    @ARTICLE{9027906, author={H. Z. {Jahromi} and D. T. {Delaney} and A. {Hines}}, journal={IEEE Access}, title={Beyond First Impressions: Estimating Quality of Experience for Interactive Web Applications}, year={2020}, volume={8}, number={}, pages={47741-47755},}
  • [DOI] B. García, F. Gortázar, M. Gallego, and A. Hines, “Assessment of QoE for video and audio in WebRTC applications using full-reference models,” Electronics, vol. 9, iss. 3, p. 462, 2020.
    [Bibtex]
    @article{García_Gortázar_Gallego_Hines_2020, title={Assessment of QoE for Video and Audio in WebRTC Applications Using Full-Reference Models}, volume={9}, url={http://dx.doi.org/10.3390/electronics9030462}, DOI={10.3390/electronics9030462}, abstractNote={WebRTC is a set of standard technologies that allows exchanging video and audio in real time on the Web. As with other media-related applications, the user-perceived audiovisual quality can be estimated using Quality of Experience (QoE) measurements. This paper analyses the behavior of different objective Full-Reference (FR) models for video and audio in WebRTC applications. FR models calculate the video and audio quality by comparing some original media reference with the degraded signal. To compute these models, we have created an open-source benchmark in which different types of reference media inputs are sent browser to browser while simulating different kinds of network conditions in terms of packet loss and jitter. Our benchmark provides recording capabilities of the impairment WebRTC streams. Then, we use different existing FR metrics for video (VMAF, VIFp, SSIM, MS-SSIM, PSNR, PSNR-HVS, and PSNR-HVS-M) and audio (PESQ, ViSQOL, and POLQA) recordings together with their references. Moreover, we use the same recordings to carry out a subjective analysis in which real users rate the video and audio quality using a Mean Opinion Score (MOS). Finally, we calculate the correlations between the objective and subjective results to find the objective models that better correspond with the subjective outcome, which is considered the ground truth QoE. We find that some of the studied objective models, such as VMAF, VIFp, and POLQA, show a strong correlation with the subjective results in packet loss scenarios.}, number={3}, journal={Electronics}, publisher={MDPI AG}, author={García, Boni and Gortázar, Francisco and Gallego, Micael and Hines, Andrew}, year={2020}, month={Mar}, pages={462} }
  • [DOI] H. Z. Jahromi, I. Bartolec, E. Gamboa, A. Hines, and R. Schatz, “You drive me crazy! Interactive QoE assessment for telepresence robot control,” in 2020 Twelfth International Conference on Quality of Multimedia Experience (QoMEX), 2020, pp. 1-6.
    [Bibtex]
    @INPROCEEDINGS{9123117,
    author={H. Z. {Jahromi} and I. {Bartolec} and E. {Gamboa} and A. {Hines} and R. {Schatz}},
    booktitle={2020 Twelfth International Conference on Quality of Multimedia Experience (QoMEX)},
    title={You Drive Me Crazy! Interactive QoE Assessment for Telepresence Robot Control},
    year={2020},
    volume={},
    number={},
    pages={1-6},
    abstract={Telepresence robots (TPRs) are versatile, remotely controlled vehicles that enable physical presence and human-to-human interaction over a distance. Thanks to improving hardware and dropping price points, TPRs enjoy the growing interest in various industries and application domains. Still, a satisfying experience remains key for their acceptance and successful adoption, not only in terms of enabling remote communication with others, but also in terms of managing robot mobility by means of remote navigation. This paper focuses on the latter aspect of remote operation which has been hitherto neglected. We present the results of an extensive subjective study designed to systematically assess remote navigation Quality of Experience (QoE) in the context of using a TPR live over the Internet. Participants were ‘beamed’ into a remote office space and asked to perform characteristic TPR remote operation tasks (driving, turning, parking). Visual and control dimensions of their experience were systematically impaired by altering network characteristics (bandwidth, delay and packet loss rate) in a controlled fashion. Our results show that users can differentiate well between visual and navigation/control aspects of their experience. Furthermore, QoE impairment sensitivity varies with the actual task at hand.},
    keywords={Telepresence Robotics;Remote Navigation;Subjective QoE Assessment;Interactive QoE;Network Impairments},
    doi={10.1109/QoMEX48832.2020.9123117},
    ISSN={2472-7814},
    month={May},}
  • H. Z. Jahromi, I. Bartolec, E. Gamboa, A. Hines, and R. Schatz, “You drive me crazy! Interactive QoE assessment for telepresence robot control,” arXiv preprint arXiv:2003.10914, 2020.
    [Bibtex]
    @article{jahromi2020you,
    title={You Drive Me Crazy! Interactive QoE Assessment for Telepresence Robot Control},
    author={Jahromi, Hamed Z and Bartolec, Ivan and Gamboa, Edwin and Hines, Andrew and Schatz, Raimund},
    journal={arXiv preprint arXiv:2003.10914},
    year={2020}
    }
  • A. Ragano, E. Benetos, and A. Hines, “Audio impairment recognition using a correlation-based feature representation,” arXiv preprint arXiv:2003.09889, 2020.
    [Bibtex]
    @article{ragano2020audio,
    title={Audio Impairment Recognition Using a Correlation-Based Feature Representation},
    author={Ragano, Alessandro and Benetos, Emmanouil and Hines, Andrew},
    journal={arXiv preprint arXiv:2003.09889},
    year={2020}
    }
  • W. A. Jassim, J. Skoglund, M. Chinen, and A. Hines, “Speech quality factors for traditional and neural-based low bit rate vocoders,” arXiv preprint arXiv:2003.11882, 2020.
    [Bibtex]
    @article{jassim2020speech,
    title={Speech Quality Factors for Traditional and Neural-Based Low Bit Rate Vocoders},
    author={Jassim, Wissam A and Skoglund, Jan and Chinen, Michael and Hines, Andrew},
    journal={arXiv preprint arXiv:2003.11882},
    year={2020}
    }
  • H. Martinez, A. Hines, and M. C. Farias, “How deep is your encoder: an analysis of features descriptors for an autoencoder-based audio-visual quality metric,” arXiv preprint arXiv:2003.11100, 2020.
    [Bibtex]
    @article{martinez2020deep,
    title={How deep is your encoder: an analysis of features descriptors for an autoencoder-based audio-visual quality metric},
    author={Martinez, Helard and Hines, Andrew and Farias, Mylene CQ},
    journal={arXiv preprint arXiv:2003.11100},
    year={2020}
    }

2019

  • A. Hines, J. Skoglund, A. Allen, and M. Narbutt, Objective quality metrics for ambisonic spatial audio, US Patent App. 15/973,287, 2019.
    [Bibtex]
    @misc{hines2019objective,
    title={Objective quality metrics for ambisonic spatial audio},
    author={Hines, Andrew and Skoglund, Jan and Allen, Andrew and Narbutt, Miroslaw},
    year={2019},
    month=nov # "~7",
    note={US Patent App. 15/973,287}
    }
  • H. Martinez, M. C. Q. Farias, and A. Hines, “NAViDAd: a no-reference audio-visual quality metric based on a deep autoencoder,” in 2019 27th European Signal Processing Conference (EUSIPCO), 2019, pp. 1–5.
    [Bibtex]
    @inproceedings{martinez2019navidad,
    title={NAViDAd: A No-Reference Audio-Visual Quality Metric Based on a Deep Autoencoder},
    author={Martinez, Helard and Farias, Myl{\`e}ne CQ and Hines, Andrew},
    booktitle={2019 27th European Signal Processing Conference (EUSIPCO)},
    pages={1--5},
    year={2019},
    organization={IEEE}
    }
  • H. Z. Jahromi, D. T. Delaney, B. Rooney, and A. Hines, “Establishing waiting time thresholds in interactive web mapping applications for network QoE management,” in 2019 30th Irish Signals and Systems Conference (ISSC), 2019, pp. 1–7.
    [Bibtex]
    @inproceedings{jahromi2019establishing,
    title={Establishing waiting time thresholds in interactive web mapping applications for network QoE management},
    author={Jahromi, Hamed Z and Delaney, Declan T and Rooney, Brendan and Hines, Andrew},
    booktitle={2019 30th Irish Signals and Systems Conference (ISSC)},
    pages={1--7},
    year={2019},
    organization={IEEE}
    }
  • T. Mo and A. Hines, “Jitter buffer compensation in voice over IP quality estimation,” in 2019 30th Irish Signals and Systems Conference (ISSC), 2019, pp. 1–6.
    [Bibtex]
    @inproceedings{mo2019jitter,
    title={Jitter Buffer Compensation in Voice over IP Quality Estimation},
    author={Mo, Tong and Hines, Andrew},
    booktitle={2019 30th Irish Signals and Systems Conference (ISSC)},
    pages={1--6},
    year={2019},
    organization={IEEE}
    }
  • A. Siddig, P. W. Sun, M. Parker, and A. Hines, “Perception deception: audio-visual mismatch in virtual reality using the McGurk effect,” 2019.
    [Bibtex]
    @inproceedings{siddig2019perception,
    title={Perception Deception: Audio-Visual Mismatch in Virtual Reality Using the McGurk Effect},
    author={Siddig, AbuBakr and Sun, Pheobe Wenyi and Parker, Matthew and Hines, Andrew},
    year={2019}
    }
  • A. Siddig and A. Hines, “A psychologist chatbot developing experience,” in CEUR Workshop Proceedings, 2019.
    [Bibtex]
    @inproceedings{siddig2019psychologist,
    booktitle = {CEUR Workshop Proceedings},
    title={A Psychologist Chatbot Developing Experience},
    author={Siddig, Abubakr and Hines, Andrew},
    year={2019}
    }
  • [DOI] H. Martinez, M. Farias, and A. Hines, “A no-reference autoencoder video quality metric,” in 2019 IEEE International Conference on Image Processing (ICIP), 2019.
    [Bibtex]
    @inproceedings{martinez2019ametric,
    author = {Martinez, HB and Farias, MCQ and Hines, A},
    booktitle = {2019 IEEE International Conference on Image Processing (ICIP)},
    month = {Sep},
    publisher = {IEEE},
    title = {A No-Reference Autoencoder Video Quality Metric},
    year = {2019},
    doi = {10.1109/icip.2019.8803204},
    startyear = {2019},
    startmonth = {Sep},
    startday = {22},
    finishyear = {2019},
    finishmonth = {Sep},
    finishday = {25},
    conference = {2019 IEEE International Conference on Image Processing (ICIP)},
    publicationstatus = {published},
    }
  • [DOI] A. Siddig, H. Jahromi, A. Ragano, and A. Hines, “Fusion confusion: exploring ambisonic spatial localisation for audio-visual immersion using the McGurk effect,” in Proceedings of the 11th ACM Workshop on Immersive Mixed and Virtual Environment Systems, MMVE 2019, 2019, pp. 28–33.
    [Bibtex]
    @inproceedings{siddig2019fusioneffect,
    author = {Siddig, A and Jahromi, HZ and Ragano, A and Hines, A},
    booktitle = {Proceedings of the 11th ACM Workshop on Immersive Mixed and Virtual Environment Systems, MMVE 2019},
    month = {Jun},
    pages = {28--33},
    title = {Fusion confusion: Exploring ambisonic spatial localisation for audio-visual immersion using the McGurk effect},
    year = {2019},
    abstract = {© 2019 Association for Computing Machinery. Virtual Reality (VR) is attracting the attention of application developers for purposes beyond entertainment including serious games, health, education and training. By including 3D audio the overall VR quality of experience (QoE) will be enhanced through greater immersion. Better understanding the perception of spatial audio localisation in audio-visual immersion is needed especially in streaming applications where bandwidth is limited and compression is required. This paper explores the impact of audio-visual fusion on speech due to mismatches in a perceived talker location and the corresponding sound using a phenomenon known as the McGurk effect and binaurally rendered Ambisonic spatial audio. The illusion of the McGurk effect happens when a sound of a syllable paired with a video of a second syllable, gives the perception of a third syllable. For instance the sound of /ba/ dubbed in video of /ga/ will lead to the illusion of hearing /da/. Several studies investigated factors involved in the McGurk effect, but a little has been done to understand the audio spatial effect on this illusion. 3D spatial audio generated with Ambisonics has been shown to provide satisfactory QoE with respect to localisation of sound sources which makes it suitable for VR applications but not for audio visual talker scenarios. In order to test the perception of the McGurk effect at different direction of arrival (DOA) of sound, we rendered Ambisonics signals at the azimuth of 0°, 30°, 60°, and 90° to both the left and right of the video source. The results show that the audio visual fusion significantly affects the perception of the speech. Yet the spatial audio does not significantly impact the illusion. This finding suggests that precise localisation of speech audio might not be as critical for speech intelligibility. It was found that a more significant factor was the intelligibility of speech itself.},
    doi = {10.1145/3304113.3326112},
    isbn = {9781450362993},
    day = {18},
    publicationstatus = {published},
    }
  • [DOI] A. Ragano, E. Benetos, and A. Hines, “Adapting the quality of experience framework for audio archive evaluation,” in 2019 11th International Conference on Quality of Multimedia Experience, QoMEX 2019, 2019.
    [Bibtex]
    @inproceedings{ragano2019adaptingevaluation,
    author = {Ragano, A and Benetos, E and Hines, A},
    booktitle = {2019 11th International Conference on Quality of Multimedia Experience, QoMEX 2019},
    month = {Jun},
    title = {Adapting the quality of experience framework for audio archive evaluation},
    year = {2019},
    abstract = {© 2019 IEEE. Perceived quality of historical audio material that is subjected to digitisation and restoration is typically evaluated by individual judgements or with inappropriate objective quality models. This paper presents a Quality of Experience (QoE) framework for predicting perceived audio quality of sound archives. The approach consists in adapting concepts used in QoE evaluation to digital audio archives. Limitations of current objective quality models employed in audio archives are provided and reasons why a QoE-based framework can overcome these limitations are discussed. This paper shows that applying a QoE framework to audio archives is feasible and it helps to identify the stages, stakeholders and models for a QoE centric approach.},
    doi = {10.1109/QoMEX.2019.8743302},
    isbn = {9781538682128},
    day = {1},
    publicationstatus = {published},
    }
  • M. Montagud, F. De Simone, N. Murray, A. Hines, R. Eg, A. Covaci, C. Keighrey, and J. Gutiérrez, “Foreword,” Proceedings of the 11th ACM Workshop on Immersive Mixed and Virtual Environment Systems, MMVE 2019, 2019.
    [Bibtex]
    @article{montagud2019forewordforeword,
    author = {Montagud, M and De Simone, F and Murray, N and Hines, A and Eg, R and Covaci, A and Keighrey, C and Gutiérrez, J},
    journal = {Proceedings of the 11th ACM Workshop on Immersive Mixed and Virtual Environment Systems, MMVE 2019},
    month = {Jun},
    title = {Foreword},
    year = {2019},
    isbn = {9781450362993},
    day = {18},
    publicationstatus = {published},
    }
  • [DOI] W. Jassim and M. Zilany, “NSQM: a non-intrusive assessment of speech quality using normalized energies of the neurogram,” Computer Speech and Language, vol. 58, pp. 260–279, 2019.
    [Bibtex]
    @article{jassim2019nsqmneurogram,
    author = {Jassim, WA and Zilany, MS},
    journal = {Computer Speech and Language},
    month = {Nov},
    pages = {260--279},
    title = {NSQM: A non-intrusive assessment of speech quality using normalized energies of the neurogram},
    volume = {58},
    year = {2019},
    abstract = {© 2019 This study proposes a new non-intrusive measure of speech quality, the neurogram speech quality measure (NSQM), based on the responses of a biologically-inspired computational model of the auditory system for listeners with normal hearing. The model simulates the responses of an auditory-nerve fiber with a characteristic frequency to a speech signal, and the population response of the model is represented by a neurogram (2D time-frequency representation). The responses of each characteristic frequency in the neurogram were decomposed into sub-bands using 1D discrete Wavelet transform. The normalized energy corresponding to each sub-band was used as an input to a support vector regression model to predict the quality score of the processed speech. The performance of the proposed non-intrusive measure was compared to the results from a range of intrusive and non-intrusive measures using three standard databases: the EXP1 and EXP3 of supplement 23 to the P series (P.Supp23) of ITU-T Recommendations and the NOIZEUS databases. The proposed NSQM achieved an overall better result over most of the existing metrics for the effects of compression codecs, additive and channel noises.},
    doi = {10.1016/j.csl.2019.04.005},
    issn = {0885-2308},
    eissn = {1095-8363},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] S. Abdulhussain, A. Ramli, B. Mahmmod, M. Saripan, S. Al-Haddad, and W. Jassim, “Shot boundary detection based on orthogonal polynomial,” Multimedia Tools and Applications, vol. 78, iss. 14, pp. 20361–20382, 2019.
    [Bibtex]
    @article{abdulhussain2019shotpolynomial,
    author = {Abdulhussain, SH and Ramli, AR and Mahmmod, BM and Saripan, MI and Al-Haddad, SAR and Jassim, WA},
    journal = {Multimedia Tools and Applications},
    month = {Jul},
    number = {14},
    pages = {20361--20382},
    title = {Shot boundary detection based on orthogonal polynomial},
    volume = {78},
    year = {2019},
    abstract = {© 2019, Springer Science+Business Media, LLC, part of Springer Nature. Shot boundary detection (SBD) is a substantial step in video content analysis, indexing, retrieval, and summarization. SBD is the process of automatically partitioning video into its basic units, known as shots, through detecting transitions between shots. The design of SBD algorithms developed from simple feature comparison to rigorous probabilistic and using of complex models. Nevertheless, accelerate the detection of transitions with higher accuracy need to be improved. Extensive research has employed orthogonal polynomial (OP) and their moments in computer vision and signal processing owing to their powerful performance in analyzing signals. A new SBD algorithm based on OP has been proposed in this paper. The Features are derived from orthogonal transform domain (moments) to detect the hard transitions in video sequences. Moments are used because of their ability to represent signal (video frame) without information redundancy. These features are the moments of smoothed and gradients of video frames. The moments are computed using a developed OP which is squared Krawtchouk-Tchebichef polynomial. These moments (smoothed and gradients) are fused to form a feature vector. Finally, the support vector machine is utilized to detect hard transitions. In addition, a comparison between the proposed algorithm and other state-of-the-art algorithms is performed to reinforce the capability of the proposed work. The proposed algorithm is examined using three well-known datasets which are TRECVID2005, TRECVID2006, and TRECVID2007. The outcomes of the comparative analysis show the superior performance of the proposed algorithm against other existing algorithms.},
    doi = {10.1007/s11042-019-7364-3},
    issn = {1380-7501},
    eissn = {1573-7721},
    day = {30},
    publicationstatus = {published},
    }
  • [DOI] S. Abdulhussain, A. Rahman Ramli, B. Mahmmod, M. Iqbal Saripan, S. Al-Haddad, T. Baker, W. Flayyih, and W. Jassim, “A fast feature extraction algorithm for image and video processing,” in Proceedings of the International Joint Conference on Neural Networks, 2019.
    [Bibtex]
    @inproceedings{abdulhussain2019aprocessing,
    author = {Abdulhussain, SH and Rahman Ramli, A and Mahmmod, BM and Iqbal Saripan, M and Al-Haddad, SAR and Baker, T and Flayyih, WN and Jassim, WA},
    booktitle = {Proceedings of the International Joint Conference on Neural Networks},
    month = {Jul},
    title = {A Fast Feature Extraction Algorithm for Image and Video Processing},
    volume = {2019-July},
    year = {2019},
    abstract = {© 2019 IEEE. Medical images and videos are utilized to discover, diagnose and treat diseases. Managing, storing, and retrieving stored images effectively are considered important topics. The rapid growth of multimedia data, including medical images and videos, has caused a swift rise in data transmission volume and repository size. Multimedia data contains useful information; however, it consumes an enormous storage space. Therefore, high processing time for that sheer volume of data will be required. Image and video applications demand for reduction in computational cost (processing time) when extracting features. This paper introduces a novel method to compute transform coefficients (features) from images or video frames. These features are used to represent the local visual content of images and video frames. We compared the proposed method with the traditional approach of feature extraction using a standard image technique. Furthermore, the proposed method is employed for shot boundary detection (SBD) applications to detect transitions in video frames. The standard TRECVID 2005, 2006, and 2007 video datasets are used to evaluate the performance of the SBD applications. The achieved results show that the proposed algorithm significantly reduces the computational cost in comparison to the traditional method.},
    doi = {10.1109/IJCNN.2019.8851750},
    isbn = {9781728119854},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] S. Abdulhussain, A. Ramli, B. Mahmmod, M. Saripan, S. Al-Haddad, and W. Jassim, “A new hybrid form of Krawtchouk and Tchebichef polynomials: design and application,” Journal of Mathematical Imaging and Vision, vol. 61, iss. 4, pp. 555–570, 2019.
    [Bibtex]
    @article{abdulhussain2019aapplication,
    author = {Abdulhussain, SH and Ramli, AR and Mahmmod, BM and Saripan, MI and Al-Haddad, SAR and Jassim, WA},
    journal = {Journal of Mathematical Imaging and Vision},
    month = {May},
    number = {4},
    pages = {555--570},
    title = {A New Hybrid form of Krawtchouk and Tchebichef Polynomials: Design and Application},
    volume = {61},
    year = {2019},
    abstract = {© 2018, Springer Science+Business Media, LLC, part of Springer Nature. In the past decades, orthogonal moments (OMs) have received a significant attention and have widely been applied in various applications. OMs are considered beneficial and effective tools in different digital processing fields. In this paper, a new hybrid set of orthogonal polynomials (OPs) is presented. The new set of OPs is termed as squared Krawtchouk–Tchebichef polynomial (SKTP). SKTP is formed based on two existing hybrid OPs which are originated from Krawtchouk and Tchebichef polynomials. The mathematical design of the proposed OP is presented. The performance of the SKTP is evaluated and compared with the existing hybrid OPs in terms of signal representation, energy compaction (EC) property, and localization property. The achieved results show that SKTP outperforms the existing hybrid OPs. In addition, face recognition system is employed using a well-known database under clean and different noisy environments to evaluate SKTP capabilities. Particularly, SKTP is utilized to transform face images into moment (transform) domain to extract features. The performance of SKTP is compared with existing hybrid OPs. The comparison results confirm that SKTP displays remarkable and stable results for face recognition system.},
    doi = {10.1007/s10851-018-0863-4},
    issn = {0924-9907},
    eissn = {1573-7683},
    day = {15},
    publicationstatus = {published},
    }
  • [DOI] B. Mahmmod, A. Ramli, T. Baker, F. Al-Obeidat, S. Abdulhussain, and W. Jassim, “Speech enhancement algorithm based on super-Gaussian modeling and orthogonal polynomials,” IEEE Access, vol. 7, pp. 103485–103504, 2019.
    [Bibtex]
    @article{mahmmod2019speechpolynomials,
    author = {Mahmmod, BM and Ramli, AR and Baker, T and Al-Obeidat, F and Abdulhussain, SH and Jassim, WA},
    journal = {IEEE Access},
    pages = {103485--103504},
    publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
    title = {Speech Enhancement Algorithm Based on Super-Gaussian Modeling and Orthogonal Polynomials},
    volume = {7},
    year = {2019},
    doi = {10.1109/access.2019.2929864},
    eissn = {2169-3536},
    publicationstatus = {published},
    }
  • [DOI] S. Abdulhussain, A. Ramli, A. Hussain, B. Mahmmod, and W. Jassim, “Orthogonal polynomial embedded image kernel,” in ACM International Conference Proceeding Series, 2019, pp. 215–221.
    [Bibtex]
    @inproceedings{abdulhussain2019orthogonalkernel,
    author = {Abdulhussain, SH and Ramli, AR and Hussain, AJ and Mahmmod, BM and Jassim, WA},
    booktitle = {ACM International Conference Proceeding Series},
    month = {Apr},
    pages = {215--221},
    title = {Orthogonal polynomial embedded image kernel},
    year = {2019},
    abstract = {© 2019 Association for Computing Machinery. Preprocessing operations of images and video frame sequences are beneficial in computer vision algorithms. For example, smoothing frames is used to eliminate noise; while computing frame gradient in x-direction and y-direction is used for frame feature extraction or for finding frame edges. Such operations involve convolving operators (image kernels) with an image precomputing moments will add extra computation cost to computer vision algorithm. In case of video, the computational time accumulatively increased because of the convolution operation for each frame is performed. To overcome this problem, a mathematical model is established for computing preprocessed frame moments via embedding the operator (image kernel) in the orthogonal polynomial (OP) functions. The experimental results show that the computation time for feature extraction using the proposed method is noticeably reduced in the both trends: image size and moment selection order. The average speed up ratio of the proposed method to traditional method is 3x, 5x, 8x, and 40x for moment selection ratio 100\%, 25\%, 10\%, and 5\%, respectively. In addition, the percentage reduction in processing time for small image size is ∼ 99\% and for large image size is ∼ 40\%.},
    doi = {10.1145/3321289.3321310},
    isbn = {9781450366434},
    day = {15},
    publicationstatus = {published},
    }

2018

  • [DOI] A. Cullen, A. Hines, and N. Harte, “Perception and prediction of speaker appeal – a single speaker study,” Computer Speech and Language, vol. 52, pp. 23–40, 2018.
    [Bibtex]
    @article{cullen2018perceptionstudy,
    author = {Cullen, A and Hines, A and Harte, N},
    journal = {Computer Speech and Language},
    month = {Nov},
    pages = {23--40},
    title = {Perception and prediction of speaker appeal – A single speaker study},
    volume = {52},
    year = {2018},
    abstract = {© 2018 Elsevier Ltd In this paper we explore the automatic prediction of speaker appeal from recordings of political speech. The database used contains recordings of a single speaker in a wide range of situations (interview, election rally etc.) which has been annotated for six speaker traits: boring; charismatic; enthusiastic; inspiring; likeable; and persuasive. The aim of this study is to predict these ratings using acoustic features of the speech. We offer three key contributions in this paper. Firstly, we explore the effect of acoustic environment on the perception of speaker ability. We find significant biases in the perception of all six traits, with interview speech being consistently rated as less appealing, and election rally speech as more appealing. In our second contribution, we attempt to exploit this bias by modelling speech from each situation separately, which gives a significant improvement in classification performance. Finally, the database covers 7 years. Thus, our third contribution is an analysis of the variance in both annotations and acoustic features over time to uncover temporal trends in speaker appeal. We find significant trends which show a decline in the speaker's prosodic activity over time, which mirror a decline in the perception of speaker appeal as measured by the database annotations.},
    doi = {10.1016/j.csl.2018.04.004},
    issn = {0885-2308},
    eissn = {1095-8363},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] M. Narbutt, A. Allen, J. Skoglund, M. Chinen, and A. Hines, “AMBIQUAL: a full reference objective quality metric for ambisonic spatial audio,” in 2018 10th International Conference on Quality of Multimedia Experience, QoMEX 2018, 2018.
    [Bibtex]
    @inproceedings{narbutt2018ambiqualaaudio,
    author = {Narbutt, M and Allen, A and Skoglund, J and Chinen, M and Hines, A},
    booktitle = {2018 10th International Conference on Quality of Multimedia Experience, QoMEX 2018},
    month = {Sep},
    title = {AMBIQUAL-A full reference objective quality metric for ambisonic spatial audio},
    year = {2018},
    abstract = {© 2018 IEEE. Streaming spatial audio over networks requires efficient encoding techniques that compress the raw audio content without compromising quality of experience. Streaming service providers such as YouTube need a perceptually relevant objective audio quality metric to monitor users' perceived quality and spatial localization accuracy. In this paper we introduce a full reference objective spatial audio quality metric, AMBIQUAL, which assesses both Listening Quality and Localization Accuracy. In our solution both metrics are derived directly from the B-format Ambisonic audio. The metric extends and adapts the algorithm used in ViSQOLAudio, a full reference objective metric designed for assessing speech and audio quality. In particular, Listening Quality is derived from the omnidirectional channel and Localization Accuracy is derived from a weighted sum of similarity from B-format directional channels. This paper evaluates whether the proposed AMBIQUAL objective spatial audio quality metric can predict two factors: Listening Quality and Localization Accuracy by comparing its predictions with results from MUSHRA subjective listening tests. In particular, we evaluated the Listening Quality and Localization Accuracy of First and Third-Order Ambisonic audio compressed with the OPUS 1.2 codec at various bitrates (i.e. 32, 128 and 256, 512kbps respectively). The sample set for the tests comprised both recorded and synthetic audio clips with a wide range of time-frequency characteristics. To evaluate Localization Accuracy of compressed audio a number of fixed and dynamic (moving vertically and horizontally) source positions were selected for the test samples. Results showed a strong correlation (PCC=0.919; Spearman=0.882 regarding Listening Quality and PCC=0.854; Spearman=0.842 regarding Localization Accuracy) between objective quality scores derived from the B-format Ambisonic audio using AMBIQUAL and subjective scores obtained during listening MUSHRA tests. AMBIQUAL displays very promising quality assessment predictions for spatial audio. Future work will optimise the algorithm to generalise and validate it for any Higher Order Ambisonic formats.},
    doi = {10.1109/QoMEX.2018.8463408},
    isbn = {9781538626054},
    day = {11},
    publicationstatus = {published},
    }
  • [DOI] H. Martinez, M. Farias, and A. Hines, “Perceived quality of audio-visual stimuli containing streaming audio degradations,” in European Signal Processing Conference, 2018, pp. 2529–2533.
    [Bibtex]
    @inproceedings{martinez2018perceiveddegradations,
    author = {Martinez, H and Farias, MCQ and Hines, A},
    booktitle = {European Signal Processing Conference},
    month = {Nov},
    pages = {2529--2533},
    title = {Perceived quality of audio-visual stimuli containing streaming audio degradations},
    volume = {2018-September},
    year = {2018},
    abstract = {© EURASIP 2018. Multimedia services play an important role in modern human communication. Understanding the impact of multi-sensory input (audio and video) on perceived quality is important for optimizing the delivery of these services. This work explores the impact of audio degradations on audio-visual quality. With this goal, we present a new dataset that contains audio-visual sequences with distortions only in the audio component (Im-AV-Exp2). The degradations in this new dataset correspond to commonly encountered streaming degradations, matching those found in the audio-only TCD-VoIP dataset. Using the Immersive Methodology, we perform a subjective experiment with the Im-AV-Exp2 dataset. We analyze the experimental data and compared the quality scores of the Im-AV-Exp2 and TCD-VoIP datasets. Results show that the video component act as a masking factor for certain classes of audio degradations (e.g. echo), showing that there is an interaction of video and audio quality that may depend on content.},
    doi = {10.23919/EUSIPCO.2018.8553541},
    isbn = {9789082797015},
    issn = {2219-5491},
    day = {29},
    publicationstatus = {published},
    }
  • [DOI] H. Jahromi, A. Hines, and D. Delaney, “Towards application-aware networking: ML-based end-to-end application KPI/QoE metrics characterization in SDN,” in International Conference on Ubiquitous and Future Networks, ICUFN, 2018, pp. 126–131.
    [Bibtex]
    @inproceedings{jahromi2018towardssdn,
    author = {Jahromi, HZ and Hines, A and Delaney, DT},
    booktitle = {International Conference on Ubiquitous and Future Networks, ICUFN},
    month = {Aug},
    pages = {126--131},
    title = {Towards Application-Aware Networking: ML-Based End-to-End Application KPI/QoE Metrics Characterization in SDN},
    volume = {2018-July},
    year = {2018},
    abstract = {© 2018 IEEE. Software Defined Networking (SDN) presents a unique networking paradigm that facilitates the development of network innovations. This paper aims to improve application awareness by incorporating Machine Learning (ML) techniques within an open source SDN architecture. The paper explores how end-to-end application Key Performance Indicator (KPI) metrics can be designed and utilized for the purpose of application awareness in networks. The main goal of this research is to characterize application KPI metrics using a suitable ML approach based on available network data. Resource allocation and network orchestration tasks can be automated based on the findings. A key facet of this research is introducing a novel feedback interface to the SDN's Northbound Interface that receives realtime performance feedback from applications. This paper aim to show how could we exploit the applications feedback to determine useful characteristics of an application's traffic. A mapping application with a defined KPI is used for experimentation. Linear multiple regression is used to derive a characteristic relationship between the application KPI and the network metrics.},
    doi = {10.1109/ICUFN.2018.8436625},
    isbn = {9781538646465},
    issn = {2165-8528},
    eissn = {2165-8536},
    day = {14},
    publicationstatus = {published},
    }
  • [DOI] D. Becker and A. Hines, “Micro-benchmarking property preserving encryption: balancing performance, security and functionality,” in 29th Irish Signals and Systems Conference, ISSC 2018, 2018.
    [Bibtex]
    @inproceedings{becker2018microbenchmarkingfunctionality,
    author = {Becker, D and Hines, A},
    booktitle = {29th Irish Signals and Systems Conference, ISSC 2018},
    month = {Dec},
    title = {Micro-Benchmarking Property Preserving Encryption: Balancing Performance, Security and Functionality},
    year = {2018},
    abstract = {© 2018 IEEE. Practical encryption systems with new and more flexible capabilities have been enabled by recent advances in computing hardware performance and Property Preserving Encryption (PPE) schemes. PPE schemes allow limited and preselected operations to be performed on encrypted data allowing system designers to trade-off between performance, security and functionality. This paper uses micro-benchmark to evaluate three interdependent factors of PPE: performance, security and functionality. The findings validate the efficacy of this technique and provide guidance to application designers and technology evaluators seeking to understand these interdependent relationships for PPE database applications. Experiments were performed using the CryptDB research system. Results validate the previous assessments of CryptDB and provide supplemental detail on performance, security and functionality.},
    doi = {10.1109/ISSC.2018.8585377},
    isbn = {9781538660461},
    day = {20},
    publicationstatus = {published},
    }
  • R. Jaiswal and A. Hines, “The sound of silence: how traditional and deep learning based voice activity detection influences speech quality monitoring,” in CEUR Workshop Proceedings, 2018, pp. 174–185.
    [Bibtex]
    @inproceedings{jaiswal2018themonitoring,
    author = {Jaiswal, R and Hines, A},
    booktitle = {CEUR Workshop Proceedings},
    month = {Jan},
    pages = {174--185},
    title = {The sound of silence: How traditional and deep learning based Voice Activity Detection influences speech quality monitoring},
    volume = {2259},
    year = {2018},
    abstract = {© 2018 CEUR Workshop Proceedings. All rights reserved. Real-time speech quality assessment is important for VoIP applications such as Google Hangouts, Microsoft Skype, and Apple Face-Time. Conventionally, subjective listening tests are used to quantify speech quality but are impractical for real-time monitoring scenarios. Objective speech quality assessment metrics can predict human judgement of perceived speech quality. Originally designed for narrow-band telephony applications, ITU-T P.563 is a single-ended or non-intrusive speech quality assessment that predicts speech quality without access to a reference signal. This paper investigates the suitability of P.563 in Voice over Internet Protocol (VoIP) scenarios and specifically the influence of silences on the predicted speech quality. The performance of P.563 was evaluated using TCD-VoIP dataset, containing speech with degradations commonly experienced with VoIP. The predictive capability of P.563 was established by comparing with subjective listening test results. The effect of pre-processing the signal to remove silences using Voice Activity Detection (VAD) was evaluated for five acoustic feature-based VAD algorithms: energy, energy and spectral centroid, Mahalanobis distance, weighted energy, weighted spectral centroid and four Deep learning model-based VAD algorithms: Deep Neural Network, Boosted Deep Neural Network, Long Short-Term Memory and Adaptive context attention model. Analysis shows P.563 prediction accuracy improves for different speech conditions of VoIP when the silences were removed by a VAD. The improvements varied with input content highlighting a potential to switch the VAD used based on the input to create a content aware speech quality monitoring system.},
    issn = {1613-0073},
    day = {1},
    publicationstatus = {published},
    }
  • A. Ragano and A. Hines, “Exploring a perceptually-weighted DNN-based fusion model for speech separation,” in CEUR Workshop Proceedings, 2018, pp. 21–32.
    [Bibtex]
    @inproceedings{ragano2018exploringseparation,
    author = {Ragano, A and Hines, A},
    booktitle = {CEUR Workshop Proceedings},
    month = {Jan},
    pages = {21--32},
    title = {Exploring a perceptually-weighted DNN-based fusion model for speech separation},
    volume = {2259},
    year = {2018},
    abstract = {© 2018 CEUR Workshop Proceedings. All rights reserved. Deep Neural Network (DNN)-based fusion approaches for single-channel speech separation have recently been introduced but the non uniform perceptual weighting of the human auditory system has not been exploited during the DNN training phase. In addition, the perceived quality of the speech signal has not been assessed using a DNN-based fusion model. We propose a new perceptually-weighted DNN-based fusion model which employs a perceptual cost function and assess the perceived quality of several DNN-based fusion models. Objective and subjective evaluations for speech quality are compared. The results show that the perceptually-weighted DNN-based fusion model displays a significant improvement in terms of Source To Interferences Ratio (SIR) compared to a combined mask. However subjective quality assessment listening tests suggests that the proposed DNN-based fusion model does not result in improved perceived speech quality.},
    issn = {1613-0073},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] M. Narbutt, S. O’Leary, A. Allen, J. Skoglund, and A. Hines, “Streaming VR for immersion: quality aspects of compressed spatial audio,” in Proceedings of the 2017 23rd International Conference on Virtual Systems and Multimedia, VSMM 2017, 2018, pp. 1–6.
    [Bibtex]
    @inproceedings{narbutt2018streamingaudio,
    author = {Narbutt, M and O'Leary, S and Allen, A and Skoglund, J and Hines, A},
    booktitle = {Proceedings of the 2017 23rd International Conference on Virtual Systems and Multimedia, VSMM 2017},
    month = {Apr},
    pages = {1--6},
    title = {Streaming VR for immersion: Quality aspects of compressed spatial audio},
    volume = {2018-January},
    year = {2018},
    abstract = {© 2017 IEEE. Delivering a 360-degree soundscape that matches full sphere visuals is an essential aspect of immersive VR. Ambisonics is a full sphere surround sound technique that takes into account the azimuth and elevation of sound sources, portraying source location above and below as well as around the horizontal plane of the listener. In contrast to channel-based methods, ambisonics representation offers the advantage of being independent of a specific loudspeaker set-up. Streaming ambisonics over networks requires efficient encoding techniques that compress the raw audio content without compromising quality of experience (QoE). This work investigates the effect of audio channel compression via the OPUS 1.2 codec on the quality of spatial audio as perceived by listeners. In particular we evaluate the listening quality and localization accuracy of first-order ambisonic audio (FOA) and third-order ambisonic audio (HOA) compressed at various bitrates (i.e. 32, 64, 128 and 128, 256, 512kbps respectively). To assess the impact of OPUS compression on spatial audio a number of subjective listening tests were carried out. The sample set for the tests comprises both recorded and synthetic audio clips with a wide range of time-frequency characteristics. In order to evaluate localization accuracy of compressed audio a number of fixed and dynamic (moving vertically and horizontally) source positions were selected for the test samples. The results show that for compressed spatial audio, perceived quality and localization accuracy are influenced more by compression scheme, bitrate and ambisonic order than by sample content. The insights provided by this work into factors and parameters influencing QoE will guide future development of a objective spatial audio quality metric.},
    doi = {10.1109/VSMM.2017.8346301},
    isbn = {9781538644935},
    day = {24},
    publicationstatus = {published},
    }
  • [DOI] M. Alam, W. Jassim, and M. Zilany, “Radon transform of auditory neurograms: a robust feature set for phoneme classification,” IET Signal Processing, vol. 12, iss. 3, pp. 260–268, 2018.
    [Bibtex]
    @article{alam2018radonclassification,
    author = {Alam, MS and Jassim, WA and Zilany, MSA},
    journal = {IET Signal Processing},
    month = {May},
    number = {3},
    pages = {260--268},
    title = {Radon transform of auditory neurograms: A robust feature set for phoneme classification},
    volume = {12},
    year = {2018},
    abstract = {© The Institution of Engineering and Technology 2017. Classification of speech phonemes is challenging, especially under noisy environments, and hence traditional speech recognition systems do not perform well in the presence of noise. Unlike traditional methods in which features are mostly extracted from the properties of the acoustic signal, this study proposes a new feature for phoneme classification using neural responses from a physiologically based computational model of the auditory periphery. The two-dimensional neurogram was constructed from the simulated responses of auditory-nerve fibres to speech phonemes. Features of neurogram images were extracted using the Discrete Radon Transform, and the dimensionality of features was reduced using an efficient feature selection technique. A standard classifier, Support Vector Machine, was employed to model and test the phoneme classes. Classification performance was evaluated in quiet and under noisy conditions in which test data were corrupted with various environmental distortions such as additive noise, room reverberation, and telephone-channel noise. Performances were also compared with the results from existing methods such as the Mel-frequency cepstral coefficient, Gammatone frequency cepstral coefficient, and frequency-domain linear prediction-based phoneme classification methods. In general, the proposed neural feature exhibited a better classification accuracy in quiet and under noisy conditions compared with the performance of most existing acoustic-signal-based methods.},
    doi = {10.1049/iet-spr.2017.0170},
    issn = {1751-9675},
    eissn = {1751-9683},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] S. Abdulhussain, A. Ramli, M. Saripan, B. Mahmmod, S. Al-Haddad, and W. Jassim, “Methods and challenges in shot boundary detection: a review,” Entropy, vol. 20, iss. 4, 2018.
    [Bibtex]
    @article{abdulhussain2018methodsreview,
    author = {Abdulhussain, SH and Ramli, AR and Saripan, MI and Mahmmod, BM and Al-Haddad, SAR and Jassim, WA},
    journal = {Entropy},
    month = {Apr},
    number = {4},
    title = {Methods and challenges in Shot boundary detection: A review},
    volume = {20},
    year = {2018},
    abstract = {© 2018 by the authors. The recent increase in the number of videos available in cyberspace is due to the availability of multimedia devices, highly developed communication technologies, and low-cost storage devices. These videos are simply stored in databases through text annotation. Content-based video browsing and retrieval are inefficient due to the method used to store videos in databases. Video databases are large in size and contain voluminous information, and these characteristics emphasize the need for automated video structure analyses. Shot boundary detection (SBD) is considered a substantial process of video browsing and retrieval. SBD aims to detect transition and their boundaries between consecutive shots; hence, shots with rich information are used in the content-based video indexing and retrieval. This paper presents a review of an extensive set for SBD approaches and their development. The advantages and disadvantages of each approach are comprehensively explored. The developed algorithms are discussed, and challenges and recommendations are presented.},
    doi = {10.3390/e20040214},
    eissn = {1099-4300},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] S. Abdulhussain, A. Ramli, S. Al-Haddad, B. Mahmmod, and W. Jassim, “Fast recursive computation of Krawtchouk polynomials,” Journal of Mathematical Imaging and Vision, vol. 60, iss. 3, pp. 285–303, 2018.
    [Bibtex]
    @article{abdulhussain2018fastpolynomials,
    author = {Abdulhussain, SH and Ramli, AR and Al-Haddad, SAR and Mahmmod, BM and Jassim, WA},
    journal = {Journal of Mathematical Imaging and Vision},
    month = {Mar},
    number = {3},
    pages = {285--303},
    title = {Fast Recursive Computation of Krawtchouk Polynomials},
    volume = {60},
    year = {2018},
    abstract = {© 2017, Springer Science+Business Media, LLC. Krawtchouk polynomials (KPs) and their moments are used widely in the field of signal processing for their superior discriminatory properties. This study proposes a new fast recursive algorithm to compute Krawtchouk polynomial coefficients (KPCs). This algorithm is based on the symmetry property of KPCs along the primary and secondary diagonals of the polynomial array. The n- x plane of the KP array is partitioned into four triangles, which are symmetrical across the primary and secondary diagonals. The proposed algorithm computes the KPCs for only one triangle (partition), while the coefficients of the other three triangles (partitions) can be computed using the derived symmetry properties of the KP. Therefore, only N / 4 recursion times are required. The proposed algorithm can also be used to compute polynomial coefficients for different values of the parameter p in interval (0, 1). The performance of the proposed algorithm is compared with that in previous literature in terms of image reconstruction error, polynomial size, and computation cost. Moreover, the proposed algorithm is applied in a face recognition system to determine the impact of parameter p on feature extraction ability. Simulation results show that the proposed algorithm has a remarkable advantage over other existing algorithms for a wide range of parameters p and polynomial size N, especially in reducing the computation time and the number of operations utilized.},
    doi = {10.1007/s10851-017-0758-9},
    issn = {0924-9907},
    eissn = {1573-7683},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] B. Mahmmod, A. bin Ramli, S. Abdulhussain, S. Al-Haddad, and W. Jassim, “Signal compression and enhancement using a new orthogonal-polynomial-based discrete transform,” IET Signal Processing, vol. 12, iss. 1, pp. 129–142, 2018.
    [Bibtex]
    @article{mahmmod2018signaltransform,
    author = {Mahmmod, BM and bin Ramli, AR and Abdulhussain, SH and Al-Haddad, SAR and Jassim, WA},
    journal = {IET Signal Processing},
    month = {Feb},
    number = {1},
    pages = {129--142},
    title = {Signal compression and enhancement using a new orthogonal-polynomial-based discrete transform},
    volume = {12},
    year = {2018},
    abstract = {© The Institution of Engineering and Technology. Discrete orthogonal functions are important tools in digital signal processing. These functions received considerable attention in the last few decades. This study proposes a new set of orthogonal functions called discrete Krawtchouk-Tchebichef transform (DKTT). Two traditional orthogonal polynomials, namely, Krawtchouk and Tchebichef, are combined to form DKTT. The theoretical and mathematical frameworks of the proposed transform are provided. DKTT was tested using speech and image signals from a well-known database under clean and noisy environments. DKTT was applied in a speech enhancement algorithm to evaluate the efficient removal of noise from speech signal. The performance of DKTT was compared with that of standard transforms. Different types of distance (similarity index) and objective measures in terms of image quality, speech quality, and speech intelligibility assessments were used for comparison. Experimental tests show that DKTT exhibited remarkable achievements and excellent results in signal compression and speech enhancement. Therefore, DKTT can be considered as a new set of orthogonal functions for futuristic applications of signal processing.},
    doi = {10.1049/iet-spr.2016.0449},
    issn = {1751-9675},
    eissn = {1751-9683},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] W. Jassim and N. Harte, “Voice activity detection using neurograms,” in ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing – Proceedings, 2018, pp. 5524–5528.
    [Bibtex]
    @inproceedings{jassim2018voiceneurograms,
    author = {Jassim, WA and Harte, N},
    booktitle = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
    month = {Sep},
    pages = {5524--5528},
    title = {Voice Activity Detection Using Neurograms},
    volume = {2018-April},
    year = {2018},
    abstract = {© 2018 IEEE. Existing acoustic-signal-based algorithms for Voice Activity Detection (VAD) do not perform well in the presence of noise. In this study, we propose a method to improve VAD accuracy by employing another type of signal representation which is derived from the response of the human Auditory-Nerve (AN) system. The neural responses referred to as a neurogram are simulated using a computational model of the AN system for a range of Characteristic Frequencies (CFs). Features are extracted from neurograms using the Discrete Cosine Transform (DCT), and are then trained using a Multilayer Perceptron (MLP) classifier to predict the VAD intervals. The proposed method was evaluated using the QUT-NOISE-TIMIT corpus, and the NIST scoring algorithm for VAD was employed as an accuracy measure. The proposed neural-response-based method exhibited an overall better VAD accuracy over most of the existing methods.},
    doi = {10.1109/ICASSP.2018.8461952},
    isbn = {9781538646588},
    issn = {1520-6149},
    day = {10},
    publicationstatus = {published},
    }

2017

  • [DOI] C. Sloan, N. Harte, D. Kelly, A. Kokaram, and A. Hines, “Objective assessment of perceptual audio quality using ViSQOLAudio,” IEEE Transactions on Broadcasting, vol. 63, iss. 4, pp. 693–705, 2017.
    [Bibtex]
    @article{sloan2017objectivevisqolaudio,
    author = {Sloan, C and Harte, N and Kelly, D and Kokaram, AC and Hines, A},
    journal = {IEEE Transactions on Broadcasting},
    month = {Dec},
    number = {4},
    pages = {693--705},
    title = {Objective Assessment of Perceptual Audio Quality Using ViSQOLAudio},
    volume = {63},
    year = {2017},
    abstract = {© 1963-2012 IEEE. Digital audio broadcasting services transmit substantial amounts of data that is encoded to minimize bandwidth whilst maximizing user quality of experience. Many large service providers continually alter codecs to improve the encoding process. Performing subjective tests to validate each codec alteration would be impractical, necessitating the use of objective perceptual audio quality models. This paper evaluates the quality scores from ViSQOLAudio, an objective perceptual audio quality model, against the quality scores of PEAQ, POLQA, and PEMO-Q on three datasets containing fullband audio encoded with a variety of codecs and bitrates. The results show that ViSQOLAudio was more accurate than all other models on two of the datasets and performed well on the third, demonstrating the utility of ViSQOLAudio for predicting the perceptual audio quality for encoded music.},
    doi = {10.1109/TBC.2017.2704421},
    issn = {0018-9316},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] A. Hines and J. Kelleher, “A framework for post-stroke quality of life prediction using structured prediction,” in 2017 9th international conference on quality of multimedia experience, qomex 2017, 2017.
    [Bibtex]
    @inproceedings{hines2017aprediction,
    author = {Hines, A and Kelleher, JD},
    booktitle = {2017 9th International Conference on Quality of Multimedia Experience, QoMEX 2017},
    month = {Jun},
    title = {A framework for post-stroke quality of life prediction using structured prediction},
    year = {2017},
    abstract = {© 2017 IEEE. This paper presents a conceptual model that relates Quality of Life to the established Quality of Experience formation process. It uses concepts developed by the Quality of Experience community to propose an adapted framework for developing predictive models for Quality of Life. A mapping of common factors that can be applied to health related quality of life is proposed and practical challenges for modelling and applications are presented and discussed. The process of identifying and categorising factors and features is illustrated using stroke patient treatment as an example use case.},
    doi = {10.1109/QoMEX.2017.7965672},
    isbn = {9781538640241},
    day = {30},
    publicationstatus = {published},
    }
  • [DOI] W. Jassim, R. Paramesran, and N. Harte, “Speech emotion classification using combined neurogram and interspeech 2010 paralinguistic challenge features,” Iet signal processing, vol. 11, iss. 5, p. 587–595, 2017.
    [Bibtex]
    @article{jassim2017speechfeatures,
    author = {Jassim, WA and Paramesran, R and Harte, N},
    journal = {IET Signal Processing},
    month = {Jul},
    number = {5},
    pages = {587--595},
    title = {Speech emotion classification using combined neurogram and INTERSPEECH 2010 paralinguistic challenge features},
    volume = {11},
    year = {2017},
    abstract = {© 2017, The Institution of Engineering and Technology. Recently, increasing attention has been directed to study and identify the emotional content of a spoken utterance. This study introduces a method to improve emotion classification performance under clean and noisy environments by combining two types of features: the proposed neural-responses-based features and the traditional INTERSPEECH 2010 paralinguistic emotion challenge features. The neural-responses-based features are represented by the responses of a computational model of the auditory system for listeners with normal hearing. The model simulates the responses of an auditory-nerve fibre with a characteristic frequency to a speech signal. The simulated responses of the model are represented by the 2D neurogram (time-frequency representation). The neurogram image is sub-divided into non-overlapped blocks and the averaged value of each block is computed. The neurogram features and the traditional emotion features are combined together to form the feature vector for each speech signal. The features are trained using support vector machines to predict the emotion of speech. The performance of the proposed method is evaluated on two well-known databases: the eNTERFACE and Berlin emotional speech data set. The results show that the proposed method performed better when compared with the classification results obtained using neurogram and INTERSPEECH features separately.},
    doi = {10.1049/iet-spr.2016.0336},
    issn = {1751-9675},
    eissn = {1751-9683},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] M. Alam, M. Zilany, W. Jassim, and M. Ahmad, “Phoneme classification using the auditory neurogram,” Ieee access, vol. 5, p. 633–642, 2017.
    [Bibtex]
    @article{alam2017phonemeneurogram,
    author = {Alam, MS and Zilany, MSA and Jassim, WA and Ahmad, MY},
    journal = {IEEE Access},
    month = {Jan},
    pages = {633--642},
    title = {Phoneme classification using the auditory neurogram},
    volume = {5},
    year = {2017},
    abstract = {© 2013 IEEE. In order to mimic the capability of human listeners identifying speech in noisy environments, this paper proposes a phoneme classification technique using simulated neural responses from a physiologically based computational model of the auditory periphery instead of using features directly from the acoustic signal. The 2-D neurograms were constructed from the simulated responses of the auditory-nerve fibers to speech phonemes. The features of the neurograms were extracted using the Radon transform and used to train the classification system using a deep neural network classifier. Classification performance was evaluated in quiet and under noisy conditions for different types of phonemes extracted from the TIMIT database. Based on simulation results, the proposed method outperformed most of the traditional acoustic-property-based phoneme classification methods for both in quiet and under noisy conditions. The proposed method could easily be extended to develop an automatic speech recognition system.},
    doi = {10.1109/ACCESS.2016.2647229},
    eissn = {2169-3536},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] S. Abdulhussain, A. Ramli, S. Al-Haddad, B. Mahmmod, and W. Jassim, “On computational aspects of tchebichef polynomials for higher polynomial order,” Ieee access, vol. 5, p. 2470–2478, 2017.
    [Bibtex]
    @article{abdulhussain2017onorder,
    author = {Abdulhussain, SH and Ramli, AR and Al-Haddad, SAR and Mahmmod, BM and Jassim, WA},
    journal = {IEEE Access},
    month = {Jan},
    pages = {2470--2478},
    title = {On Computational Aspects of Tchebichef Polynomials for Higher Polynomial Order},
    volume = {5},
    year = {2017},
    abstract = {© 2017 IEEE. Tchebichef polynomials (TPs) and their moments are widely used in signal processing due to their remarkable performance in signal analysis, feature extraction, and compression capability. The common problem of the TP is that the coefficients computation is prone to numerical instabilities when the polynomial order becomes large. In this paper, a new algorithm is proposed to compute the TP coefficients (TPCs) for higher polynomial order by combining two existing recurrence algorithms: The three-term recurrence relations in the n-direction and x-direction. First, the TPCs are computed for x,n=0,1,...,(N/2)-1 using the recurrence in the x-direction. Second, the TPCs for x=0,1,...,(N/2)-1 and n=(N/2), (N/2)+1,...,N-1 based on n and x directions are calculated. Finally, the symmetry condition is applied to calculate the rest of the coefficients for x=(N/2),(N/2)+1,...,N-1. In addition to the ability of the proposed algorithm to reduce the numerical propagation errors, it also accelerates the computational speed of the TPCs. The performance of the proposed algorithm was compared to that of existing algorithms for the reconstruction of speech and image signals taken from different databases. The performance of the TPCs computed by the proposed algorithm was also compared with the performance of the discrete cosine transform coefficients for speech compression systems. Different types of speech quality measures were used for evaluation. According to the results of the comparative analysis, the proposed algorithm makes the computation of the TP superior to that of conventional recurrence algorithms when the polynomial order is large.},
    doi = {10.1109/ACCESS.2017.2669218},
    eissn = {2169-3536},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] B. Mahmmod, A. Ramli, S. Abdulhussian, S. Al-Haddad, and W. Jassim, “Low-distortion mmse speech enhancement estimator based on laplacian prior,” Ieee access, vol. 5, p. 9866–9881, 2017.
    [Bibtex]
    @article{mahmmod2017lowdistortionprior,
    author = {Mahmmod, BM and Ramli, AR and Abdulhussian, SH and Al-Haddad, SAR and Jassim, WA},
    journal = {IEEE Access},
    month = {Jan},
    pages = {9866--9881},
    title = {Low-Distortion MMSE Speech Enhancement Estimator Based on Laplacian Prior},
    volume = {5},
    year = {2017},
    abstract = {© 2013 IEEE. The most well-known conventional speech enhancement algorithms introduce unwanted artifact noise and speech distortion to the enhanced signal. Reducing the effects of such issues require more robust linear and non-linear estimators. This paper proposes new optimum linear and non-linear Laplacian distribution-based estimators. The proposed estimators are derived based on a minimum mean squared error (MMSE) sense to minimize the distortion in different conditions of the underlying speech. Thus, artifact noise is reduced without compromising the noise reduction process. The analytical solutions of the Laplacian distribution-based estimators, linear bilateral Laplacian gain estimator (LBLG), and non-linear bilateral Laplacian gain estimator (NBLG), are presented. The proposed estimators are implemented in three steps. First, the observation signal is decorrelated through a real transform domain to obtain its transform coefficients. Second, the proposed estimators are applied to estimate the clean speech signal from the noisy signal in the decorrelated domain. Finally, the inverse of the real transform is applied to obtain the original speech signal in the time domain. Two conditions in these estimators account for interference events between the speech signal and noise coefficients in the decorrelated domain. Moreover, a mathematical aspect of mean square error of LBLG is evaluated, which presents a significant improvement over other methods. Furthermore, a comprehensive description of the whole variations of the LBLG and NBLG gains characteristics is presented. A comparative evaluation is performed with effective quality metrics, segmental signal-to-noise ratio and perceptual evaluation of speech quality, to demonstrate the advantage and effectiveness of the proposed estimators. The performance of the proposed estimators outperformed other methods, which are the traditional MMSE approach, perceptually motivated Bayesian estimator, dual gain Wiener estimator, and dual MMSE estimator in terms of different objective measurements.},
    doi = {10.1109/ACCESS.2017.2699782},
    eissn = {2169-3536},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] S. Abdulhussain, A. Ramli, B. Mahmmod, S. Al-Haddad, and W. Jassim, “Image edge detection operators based on orthogonal polynomials,” International journal of image and data fusion, vol. 8, iss. 3, p. 293–308, 2017.
    [Bibtex]
    @article{abdulhussain2017imagepolynomials,
    author = {Abdulhussain, SH and Ramli, AR and Mahmmod, BM and Al-Haddad, SAR and Jassim, WA},
    journal = {International Journal of Image and Data Fusion},
    month = {Jul},
    number = {3},
    pages = {293--308},
    title = {Image edge detection operators based on orthogonal polynomials},
    volume = {8},
    year = {2017},
    abstract = {© 2017 Informa UK Limited, trading as Taylor \& Francis Group. Orthogonal polynomials (OPs) are beneficial for image processing. OPs are used to reflect an image or a scene to a moment domain, and moments are subsequently used to extract object contours utilised in various applications. In this study, OP-based edge detection operators are introduced to replace traditional convolution-based and block processing methods with direct matrix multiplication. A mathematical model with empirical study results is established to investigate the performance of the proposed detectors compared with that of traditional algorithms, such as Sobel and Canny operators. The proposed operators are then evaluated by using entire images from a well-known data set. Experimental results reveal that the proposed operator achieves a more favourable interpretation, especially for images distorted by motion effects, than traditional methods do.},
    doi = {10.1080/19479832.2017.1326405},
    issn = {1947-9832},
    eissn = {1947-9824},
    day = {3},
    publicationstatus = {published},
    }

2016

  • [DOI] C. Sloan, N. Harte, D. Kelly, A. Kokaram, and A. Hines, “Bitrate classification of twice-encoded audio using objective quality features,” in 2016 8th international conference on quality of multimedia experience, qomex 2016, 2016.
    [Bibtex]
    @inproceedings{sloan2016bitratefeatures,
    author = {Sloan, C and Harte, N and Kelly, D and Kokaram, AC and Hines, A},
    booktitle = {2016 8th International Conference on Quality of Multimedia Experience, QoMEX 2016},
    month = {Jun},
    title = {Bitrate classification of twice-encoded audio using objective quality features},
    year = {2016},
    abstract = {© 2016 IEEE. When a user uploads audio files to a music streaming service, these files are subsequently re-encoded to lower bitrates to target different devices, e.g. low bitrate for mobile. To save time and bandwidth uploading files, some users encode their original files using a lossy codec. The metadata for these files cannot always be trusted as users might have encoded their files more than once. Determining the lowest bitrate of the files allows the streaming service to skip the process of encoding the files to bitrates higher than that of the uploaded files, saving on processing and storage space. This paper presents a model that uses quality predictions from ViSQOLAudio, a full reference objective audio quality metric, as features in combination with a multi-class support vector machine classifier. An experiment on twice-encoded files found that low bitrate codecs could be classified using audio quality features. The experiment also provides insights into the implications of multiple transcodes from a quality perspective.},
    doi = {10.1109/QoMEX.2016.7498956},
    isbn = {9781509003549},
    day = {23},
    publicationstatus = {published},
    }
  • A. Hines, J. Skoglund, A. Kokaram, and N. Harte, “Monitoring voip speech quality for chopped and clipped speech,” Communications – scientific letters of the university of zilina, vol. 18, iss. 1, p. 3–10, 2016.
    [Bibtex]
    @article{hines2016monitoringspeech,
    author = {Hines, A and Skoglund, J and Kokaram, AC and Harte, N},
    journal = {Communications - Scientific Letters of the University of Zilina},
    month = {Jan},
    number = {1},
    pages = {3--10},
    title = {Monitoring voip speech quality for chopped and clipped speech},
    volume = {18},
    year = {2016},
    abstract = {Real-time monitoring of speech quality for VoIP calls is a significant challenge. This paper presents early work on a no-reference objective model for quantifying perceived speech quality in VoIP. The overall approach uses a modular design that will be able to help pinpoint the reason for degradations as well as quantifying their impact on speech quality. The model is being designed to work with narrowband and wideband signals. This initial work is focused on rating amplitude clipped or chopped speech, which are common problems in VoIP. A model sensitive to each of these degradations is presented and then tested with both synthetic and real examples of chopped and clipped speech. The results were compared with predicted MOS outputs from four objective speech quality models: ViSQOL, PESQ, POLQA and P.563. The model output showed consistent relationships between this model's clip and chop detection modules and the quality predictions from the other objective speech quality models. Further work is planned to widen the range of degradation types captured by the model, such as non-stationary background noise and speaker echo. While other components (e.g. a voice activity detector) would be necessary to deploy the model for stand-alone VoIP monitoring, the results show good potential for using the model in a realtime monitoring tool.},
    issn = {1335-4205},
    eissn = {2585-7878},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] C. Lim, R. Paramesran, W. Jassim, Y. Yu, and K. Ngan, “Blind image quality assessment for gaussian blur images using exact zernike moments and gradient magnitude,” Journal of the franklin institute, vol. 353, iss. 17, p. 4715–4733, 2016.
    [Bibtex]
    @article{lim2016blindmagnitude,
    author = {Lim, CL and Paramesran, R and Jassim, WA and Yu, YP and Ngan, KN},
    journal = {Journal of the Franklin Institute},
    month = {Nov},
    number = {17},
    pages = {4715--4733},
    title = {Blind image quality assessment for Gaussian blur images using exact Zernike moments and gradient magnitude},
    volume = {353},
    year = {2016},
    abstract = {© 2016 The Franklin Institute. Features that exhibit human perception on the effect of blurring on digital images are useful in constructing a blur image quality metric. In this paper, we show some of the exact Zernike moments (EZMs) that closely model the human quality scores for images of varying degrees of blurriness can be used to measure these distortions. A theoretical framework is developed to identify these EZMs. Together with the selected EZMs, the gradient magnitude (GM), which measures the contrast information, is used as a weight in the formulation of the proposed blur metric. The design of the proposed metric consists of two stages. In the first stage, the EZM differences and the GM dissimilarities between the edge points of the test image and the same re-blurred image are extracted. Next, the mean of the weighted EZM features are then pooled to produce a quality score using support vector machine regressor (SVR). We compare the performance of the proposed blur metric with other state-of-the-art full-reference (FR) and no-reference (NR) blur metrics on three benchmark databases. The results using Pearson's correlation coefficient (CC) and Spearman's ranked-order correlation coefficient (SROCC) for the LIVE image database are 0.9659 and 0.9625 respectively. Similarly, high correlations with the subjective scores are achieved for the other two databases as well.},
    doi = {10.1016/j.jfranklin.2016.08.012},
    issn = {0016-0032},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] M. Islam, W. Jassim, N. Cheok, and M. Zilany, “A robust speaker identification system using the responses from a model of the auditory periphery,” Plos one, vol. 11, iss. 7, 2016.
    [Bibtex]
    @article{islam2016aperiphery,
    author = {Islam, MA and Jassim, WA and Cheok, NS and Zilany, MSA},
    journal = {PLoS ONE},
    month = {Jul},
    number = {7},
    title = {A robust speaker identification system using the responses from a model of the auditory periphery},
    volume = {11},
    year = {2016},
    abstract = {© 2016 Islam et al. This is an open access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited. Speaker identification under noisy conditions is one of the challenging topics in the field of speech processing applications. Motivated by the fact that the neural responses are robust against noise, this paper proposes a new speaker identification system using 2-D neurograms constructed from the responses of a physiologically-based computational model of the auditory periphery. The responses of auditory-nerve fibers for a wide range of characteristic frequency were simulated to speech signals to construct neurograms. The neurogram coefficients were trained using the well-known Gaussian mixture model-universal background model classification technique to generate an identity model for each speaker. In this study, three text-independent and one text-dependent speaker databases were employed to test the identification performance of the proposed method. Also, the robustness of the proposed method was investigated using speech signals distorted by three types of noise such as the white Gaussian, pink, and street noises with different signal-to-noise ratios. The identification results of the proposed neural-response-based method were compared to the performances of the traditional speaker identification methods using features such as the Mel-frequency cepstral coefficients, Gamma-tone frequency cepstral coefficients and frequency domain linear prediction. Although the classification accuracy achieved by the proposed method was comparable to the performance of those traditional techniques in quiet, the new feature was found to provide lower error rates of classification under noisy environments.},
    doi = {10.1371/journal.pone.0158520},
    eissn = {1932-6203},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] W. Jassim and M. Zilany, “Speech quality assessment using 2d neurogram orthogonal moments,” Speech communication, vol. 80, p. 34–48, 2016.
    [Bibtex]
    @article{jassim2016speechmoments,
    author = {Jassim, WA and Zilany, MSA},
    journal = {Speech Communication},
    month = {Jun},
    pages = {34--48},
    title = {Speech quality assessment using 2D neurogram orthogonal moments},
    volume = {80},
    year = {2016},
    abstract = {© 2016 Elsevier B.V. This study proposes a new objective speech quality measure using the responses of a physiologically-based computational model of auditory nerve (AN). The population response of the model AN fibers to a speech signal is represented by a 2D neurogram, and features of the neurogram are extracted by orthogonal moments. A special type of orthogonal moment, the orthogonal Tchebichef-Krawtchouk moment, is used in this study. The proposed measure is compared to the subjective scores from two standard databases, the NOIZEUS and the supplement 23 to the P series (P.Sup23) of ITU-T Recommendations. The NOIZEUS database is used in the assessment of 11 speech enhancement algorithms whereas the P.Sup23 database is used in the ITU-T 8 kbit/s codec (Recommendation G.729) characterization test. The performance of the proposed speech quality measure is also compared to the results from some traditional objective quality measures. In general, the proposed neural-response-based metric yielded better results than most of the traditional acoustic-property-based quality measures. The proposed metric can be applied to evaluate the performance of various speech-enhancement algorithms and compression systems.},
    doi = {10.1016/j.specom.2016.03.004},
    issn = {0167-6393},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] M. Hossain, W. Jassim, and M. Zilany, “Reference-free assessment of speech intelligibility using bispectrum of an auditory neurogram,” Plos one, vol. 11, iss. 3, 2016.
    [Bibtex]
    @article{hossain2016referencefreeneurogram,
    author = {Hossain, ME and Jassim, WA and Zilany, MSA},
    journal = {PLoS ONE},
    month = {Mar},
    number = {3},
    title = {Reference-free assessment of speech intelligibility using bispectrum of an auditory neurogram},
    volume = {11},
    year = {2016},
    abstract = {© 2016 Hossain et al. This is an open access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited. Sensorineural hearing loss occurs due to damage to the inner and outer hair cells of the peripheral auditory system. Hearing loss can cause decreases in audibility, dynamic range, frequency and temporal resolution of the auditory system, and all of these effects are known to affect speech intelligibility. In this study, a new reference-free speech intelligibility metric is proposed using 2-D neurograms constructed from the output of a computational model of the auditory periphery. The responses of the auditory-nerve fibers with a wide range of characteristic frequencies were simulated to construct neurograms. The features of the neurograms were extracted using third-order statistics referred to as bispectrum. The phase coupling of neurogram bispectrum provides a unique insight for the presence (or deficit) of supra-threshold nonlinearities beyond audibility for listeners with normal hearing (or hearing loss). The speech intelligibility scores predicted by the proposed method were compared to the behavioral scores for listeners with normal hearing and hearing loss both in quiet and under noisy background conditions. The results were also compared to the performance of some existing methods. The predicted results showed a good fit with a small error suggesting that the subjective scores can be estimated reliably using the proposed neural-response-based metric. The proposed metric also had a wide dynamic range, and the predicted scores were well-separated as a function of hearing loss. The proposed metric successfully captures the effects of hearing loss and supra-threshold nonlinearities on speech intelligibility. This metric could be applied to evaluate the performance of various speech-processing algorithms designed for hearing aids and cochlear implants.},
    doi = {10.1371/journal.pone.0150415},
    eissn = {1932-6203},
    day = {1},
    publicationstatus = {published},
    }

2015

  • [DOI] A. Hines, J. Skoglund, A. Kokaram, and N. Harte, “Visqol: an objective speech quality model,” Eurasip journal on audio, speech, and music processing, vol. 2015, iss. 1, 2015.
    [Bibtex]
    @article{hines2015visqolmodel,
    author = {Hines, A and Skoglund, J and Kokaram, AC and Harte, N},
    journal = {Eurasip Journal on Audio, Speech, and Music Processing},
    month = {Dec},
    number = {1},
    title = {ViSQOL: an objective speech quality model},
    volume = {2015},
    year = {2015},
    abstract = {© 2015, Hines et al.; licensee Springer. This paper presents an objective speech quality model, ViSQOL, the Virtual Speech Quality Objective Listener. It is a signal-based, full-reference, intrusive metric that models human speech quality perception using a spectro-temporal measure of similarity between a reference and a test speech signal. The metric has been particularly designed to be robust for quality issues associated with Voice over IP (VoIP) transmission. This paper describes the algorithm and compares the quality predictions with the ITU-T standard metrics PESQ and POLQA for common problems in VoIP: clock drift, associated time warping, and playout delays. The results indicate that ViSQOL and POLQA significantly outperform PESQ, with ViSQOL competing well with POLQA. An extensive benchmarking against PESQ, POLQA, and simpler distance metrics using three speech corpora (NOIZEUS and E4 and the ITU-T P.Sup. 23 database) is also presented. These experiments benchmark the performance for a wide range of quality impairments, including VoIP degradations, a variety of background noise types, speech enhancement methods, and SNR levels. The results and subsequent analysis show that both ViSQOL and POLQA have some performance weaknesses and under-predict perceived quality in certain VoIP conditions. Both have a wider application and robustness to conditions than PESQ or more trivial distance metrics. ViSQOL is shown to offer a useful alternative to POLQA in predicting speech quality in VoIP scenarios.},
    doi = {10.1186/s13636-015-0054-9},
    issn = {1687-4714},
    eissn = {1687-4722},
    day = {26},
    publicationstatus = {published},
    }
  • [DOI] A. Hines, E. Gillen, D. Kelly, J. Skoglund, A. Kokaram, and N. Harte, “Visqolaudio: an objective audio quality metric for low bitrate codecs,” Journal of the acoustical society of america, vol. 137, iss. 6, p. EL449–EL455, 2015.
    [Bibtex]
    @article{hines2015visqolaudiocodecs,
    author = {Hines, A and Gillen, E and Kelly, D and Skoglund, J and Kokaram, A and Harte, N},
    journal = {Journal of the Acoustical Society of America},
    month = {Jun},
    number = {6},
    pages = {EL449--EL455},
    title = {ViSQOLAudio: An objective audio quality metric for low bitrate codecs},
    volume = {137},
    year = {2015},
    abstract = {© 2015 Acoustical Society of America. Streaming services seek to optimise their use of bandwidth across audio and visual channels to maximise the quality of experience for users. This letter evaluates whether objective quality metrics can predict the audio quality for music encoded at low bitrates by comparing objective predictions with results from listener tests. Three objective metrics were benchmarked: PEAQ, POLQA, and VISQOLAudio. The results demonstrate objective metrics designed for speech quality assessment have a strong potential for quality assessment of low bitrate audio codecs.},
    doi = {10.1121/1.4921674},
    issn = {0001-4966},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] N. Harte, E. Gillen, and A. Hines, “Tcd-voip, a research database of degraded speech for assessing quality in voip applications,” in 2015 7th international workshop on quality of multimedia experience, qomex 2015, 2015.
    [Bibtex]
    @inproceedings{harte2015tcdvoipapplications,
    author = {Harte, N and Gillen, E and Hines, A},
    booktitle = {2015 7th International Workshop on Quality of Multimedia Experience, QoMEX 2015},
    month = {Jan},
    title = {TCD-VoIP, a research database of degraded speech for assessing quality in VoIP applications},
    year = {2015},
    abstract = {© 2015 IEEE. There are many types of degradation which can occur in Voice over IP calls. Degradations which occur independently of the codec, hardware, or network in use are the focus of this paper. The development of new quality metrics for modern communication systems depends heavily on the availability of suitable test and development data with subjective quality scores. A new dataset of VoIP degradations (TCD-VoIP) has been created and is presented in this paper. The dataset contains speech samples with a range of common VoIP degradations, and the corresponding set of subjective opinion scores from 24 listeners. The dataset is publicly available.},
    doi = {10.1109/QoMEX.2015.7148100},
    isbn = {9781479989584},
    day = {1},
    publicationstatus = {published},
    }
  • A. Hines, E. Gillen, and N. Harte, “Measuring and monitoring speech quality for voice over ip with polqa, visqol and p.563,” in Proceedings of the annual conference of the international speech communication association, interspeech, 2015, p. 438–442.
    [Bibtex]
    @inproceedings{hines2015measuringp563,
    author = {Hines, A and Gillen, E and Harte, N},
    booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH},
    month = {Jan},
    pages = {438--442},
    title = {Measuring and monitoring speech quality for voice over IP with POLQA, ViSQOL and P.563},
    volume = {2015-January},
    year = {2015},
    abstract = {Copyright © 2015 ISCA. There are many types of degradation which can occur in Voice over IP (VoIP) calls. Of interest in this work are degradations which occur independently of the codec, hardware or network in use. Specifically, their effect on the subjective and objective quality of the speech is examined. Since no dataset suitable for this purpose exists, a new dataset (TCD-VoIP) has been created and has been made publicly available. The dataset contains speech clips suffering from a range of common call quality degradations, as well as a set of subjective opinion scores on the clips from 24 listeners. The performances of three objective quality metrics: POLQA, ViSQOL and P.563, have been evaluated using the dataset. The results show that full reference metrics are capable of accurately predicting a variety of common VoIP degradations. They also highlight the outstanding need for a wideband, single-ended, no-reference metric to monitor accurately speech quality for degradations common in VoIP scenarios.},
    issn = {2308-457X},
    eissn = {1990-9772},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] P. Počta, H. Melvin, and A. Hines, “An analysis of the impact of playout delay adjustments introduced by voip jitter buffers on listening speech quality,” Acta acustica united with acustica, vol. 101, iss. 3, p. 616–631, 2015.
    [Bibtex]
    @article{pota2015anquality,
    author = {Počta, P and Melvin, H and Hines, A},
    journal = {Acta Acustica united with Acustica},
    month = {Jan},
    number = {3},
    pages = {616--631},
    title = {An analysis of the impact of playout delay adjustments introduced by VoIP jitter buffers on listening speech quality},
    volume = {101},
    year = {2015},
    abstract = {© S. Hirzel Verlag • EAA. This paper investigates the impact of frequent and small playout delay adjustments (time-shifting) of 30 ms or less introduced to silence periods by Voice over IP (VoIP) jitter buffer strategies on listening quality perceived by the end user. In particular, the quality impact is assessed using both a subjective method (quality scores obtained from subjective listening test) and an objective method based on perceptual modelling. Two different objective methods are used, PESQ (Perceptual Evaluation of Speech Quality, ITU-T Recommendation P.862) and POLQA (Perceptual Objective Listening Quality Assessment, ITU-T Recommendation P.863). Moreover, the relative accuracy of both objective models is assessed by comparing their predictions with subjective assessments. The results show that the impact of the investigated playout delay adjustments on subjective listening quality scores is negligible. On the other hand, a significant impact is reported for objective listening quality scores predicted by the PESQ model i.e. the PESQ model fails to correctly predict quality scores for this kind of degradation. Finally, the POLQA model is shown to perform significantly better than PESQ. We conclude the paper by identifying further related research that arises from this study.},
    doi = {10.3813/AAA.918857},
    issn = {1610-1928},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] P. Lim, S. Ng, W. Jassim, S. Redmond, M. Zilany, A. Avolio, E. Lim, M. Tan, and N. Lovell, “Improved measurement of blood pressure by extraction of characteristic features from the cuff oscillometric waveform,” Sensors (switzerland), vol. 15, iss. 6, p. 14142–14161, 2015.
    [Bibtex]
    @article{lim2015improvedwaveform,
    author = {Lim, PK and Ng, SC and Jassim, WA and Redmond, SJ and Zilany, M and Avolio, A and Lim, E and Tan, MP and Lovell, NH},
    journal = {Sensors (Switzerland)},
    month = {Jun},
    number = {6},
    pages = {14142--14161},
    title = {Improved measurement of blood pressure by extraction of characteristic features from the cuff oscillometric waveform},
    volume = {15},
    year = {2015},
    abstract = {© 2015 by the authors; licensee MDPI, Basel, Switzerland. We present a novel approach to improve the estimation of systolic (SBP) and diastolic blood pressure (DBP) from oscillometric waveform data using variable characteristic ratios between SBP and DBP with mean arterial pressure (MAP). This was verified in 25 healthy subjects, aged 28 ± 5 years. The multiple linear regression (MLR) and support vector regression (SVR) models were used to examine the relationship between the SBP and the DBP ratio with ten features extracted from the oscillometric waveform envelope (OWE). An automatic algorithm based on relative changes in the cuff pressure and neighbouring oscillometric pulses was proposed to remove outlier points caused by movement artifacts. Substantial reduction in the mean and standard deviation of the blood pressure estimation errors were obtained upon artifact removal. Using the sequential forward floating selection (SFFS) approach, we were able to achieve a significant reduction in the mean and standard deviation of differences between the estimated SBP values and the reference scoring (MLR: mean ± SD = −0.3 ± 5.8 mmHg; SVR: −0.6 ± 5.4 mmHg) with only two features, i.e., Ratio2 and Area3, as compared to the conventional maximum amplitude algorithm (MAA) method (mean ± SD = −1.6 ± 8.6 mmHg). Comparing the performance of both MLR and SVR models, our results showed that the MLR model was able to achieve comparable performance to that of the SVR model despite its simplicity.},
    doi = {10.3390/s150614142},
    issn = {1424-8220},
    day = {16},
    publicationstatus = {published},
    }
  • [DOI] N. Mamun, W. Jassim, and M. Zilany, “Prediction of speech intelligibility using a neurogram orthogonal polynomial measure (nopm),” Ieee/acm transactions on audio speech and language processing, vol. 23, iss. 4, p. 760–773, 2015.
    [Bibtex]
    @article{mamun2015predictionnopm,
    author = {Mamun, N and Jassim, WA and Zilany, MSA},
    journal = {IEEE/ACM Transactions on Audio Speech and Language Processing},
    month = {Apr},
    number = {4},
    pages = {760--773},
    title = {Prediction of speech intelligibility using a neurogram orthogonal polynomial measure (NOPM)},
    volume = {23},
    year = {2015},
    abstract = {© 2015 IEEE. Sensorineural hearing loss (SNHL) is an increasingly prevalent condition, resulting from damage to the inner ear and causing a reduction in speech intelligibility. This paper proposes a new speech intelligibility prediction metric, the neurogram orthogonal polynomial measure (NOPM). This metric applies orthogonal moments to the auditory neurogram to predict speech intelligibility for listeners with and without hearing loss. The model simulates the responses of auditory-nerve fibers to speech signals under quiet and noisy conditions. Neurograms were created using a physiologically based computational model of the auditory periphery. A well-known orthogonal polynomial measure, Krawtchouk moments, was applied to extract features from the auditory neurogram. The predicted intelligibility scores were compared to subjective results, and NOPM showed a good fit with the subjective scores for normal listeners and also for listeners with hearing loss. The proposed metric has a realistic and wider dynamic range than corresponding existing metrics, such as mean structural similarity index measure and neurogram similarity index measure, and the predicted scores are also well-separated as a function of hearing loss. The application of this metric could be extended for assessing hearing-aid and speech-enhancement algorithms.},
    doi = {10.1109/TASLP.2015.2401513},
    issn = {2329-9290},
    day = {1},
    publicationstatus = {published},
    }

2014

  • A. Hines, P. Kendrick, A. Barri, M. Narwaria, and J. Redi, “Robustness and prediction accuracy of machine learning for objective visual quality assessment,” in European signal processing conference, 2014, p. 2130–2134.
    [Bibtex]
    @inproceedings{hines2014robustnessassessment,
    author = {Hines, A and Kendrick, P and Barri, A and Narwaria, M and Redi, JA},
    booktitle = {European Signal Processing Conference},
    month = {Jan},
    pages = {2130--2134},
    title = {Robustness and prediction accuracy of Machine Learning for objective visual quality assessment},
    year = {2014},
    abstract = {© 2014 EURASIP. Machine Learning (ML) is a powerful tool to support the development of objective visual quality assessment metrics, serving as a substitute model for the perceptual mechanisms acting in visual quality appreciation. Nevertheless, the reliability of ML-based techniques within objective quality assessment metrics is often questioned. In this study, the robustness of ML in supporting objective quality assessment is investigated, specifically when the feature set adopted for prediction is suboptimal. A Principal Component Regression based algorithm and a Feed Forward Neural Network are compared when pooling the Structural Similarity Index (SSIM) features perturbed with noise. The neural network adapts better with noise and intrinsically favours features according to their salient content.},
    isbn = {9780992862619},
    issn = {2219-5491},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] A. Hines, J. Skoglund, E. Gillen, A. Kokaram, D. Kelly, and N. Harte, “Perceived audio quality for streaming stereo music,” in Mm 2014 – proceedings of the 2014 acm conference on multimedia, 2014, p. 1173–1176.
    [Bibtex]
    @inproceedings{hines2014perceivedmusic,
    author = {Hines, A and Skoglund, J and Gillen, E and Kokaram, A and Kelly, D and Harte, N},
    booktitle = {MM 2014 - Proceedings of the 2014 ACM Conference on Multimedia},
    month = {Jan},
    pages = {1173--1176},
    title = {Perceived audio quality for streaming stereo music},
    year = {2014},
    abstract = {Users of audio-visual streaming services expect an ever increasing quality of experience. Channel bandwidth remains a bottleneck commonly addressed with lossy compression schemes for both the video and audio streams. Anecdotal evidence suggests a strongly perceived link between bit rate and quality. This paper presents three audio quality listening experiments using the ITU MUSHRA methodology to assess a number of audio codecs typically used by streaming services. They were assessed for a range of bit rates using three presentation modes: consumer and studio quality headphones and loudspeakers. Our results indicate that with consumer quality headphones, listeners were not differentiating between codecs with bit rates greater than 48 kb/s (p>=0.228). For studio quality headphones and loudspeakers aac-lc at 128 kb/s and higher was differentiated over other codecs (p<=0.001). The results provide insights into quality of experience that will guide future development of objective audio quality metrics.},
    doi = {10.1145/2647868.2655025},
    isbn = {9781450330633},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] A. Cullen, A. Hines, and N. Harte, "Building a database of political speech: does culture matter in charisma annotations?," in Avec 2014 - proceedings of the 4th international workshop on audio/visual emotion challenge, workshop of mm 2014, 2014, p. 27–31.
    [Bibtex]
    @inproceedings{cullen2014buildingannotations,
    author = {Cullen, A and Hines, A and Harte, N},
    booktitle = {AVEC 2014 - Proceedings of the 4th International Workshop on Audio/Visual Emotion Challenge, Workshop of MM 2014},
    month = {Jan},
    pages = {27--31},
    title = {Building a database of political speech: does culture matter in charisma annotations?},
    year = {2014},
    abstract = {Copyright © 2014 ACM. For both individual politicians and political parties the internet has become a vital tool for self-promotion and the distribution of ideas. The rise of streaming has enabled political debates and speeches to reach global audiences. In this paper, we explore the nature of charisma in political speech, with a view to automatic detection. To this end, we have collected a new database of political speech from YouTube and other on-line resources. Annotation is performed both by native listeners, and Amazon Mechanical Turk (AMT) workers. Detailed analysis shows that both label sets are equally reliable. The results support the use of crowd-sourced labels for speaker traits such as charisma in political speech, even where cultural subtleties are present. The impact of these different annotations on charisma prediction from political speech is also investigated.},
    doi = {10.1145/2661806.2661808},
    isbn = {9781450331197},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] W. Jassim, R. Paramesran, and M. Zilany, "Enhancing noisy speech signals using orthogonal moments," Iet signal processing, vol. 8, iss. 8, p. 891–905, 2014.
    [Bibtex]
    @article{jassim2014enhancingmoments,
    author = {Jassim, WA and Paramesran, R and Zilany, MSA},
    journal = {IET Signal Processing},
    month = {Jan},
    number = {8},
    pages = {891--905},
    title = {Enhancing noisy speech signals using orthogonal moments},
    volume = {8},
    year = {2014},
    abstract = {© The Institution of Engineering and Technology 2014. This study describes a new approach to enhance noisy speech signals using the discrete Tchebichef transform (DTT) and the discrete Krawtchouk transform (DKT). The DTT and DKT are based on well-known orthogonal moments: the Tchebichef and Krawtchouk moments, respectively. The representations of speech signals using a limited number of moment coefficients and their behaviour in the domain of orthogonal moments are shown. The method involves removing noise from the signal using a minimum-mean-square error in the domain of the DTT or DKT. According to comparisons with traditional methods, the initial experiments yield promising results and show that orthogonal moments are applicable in the field of speech signal enhancement. The application of orthogonal moments could be extended to speech analysis, compression and recognition.},
    doi = {10.1049/iet-spr.2013.0322},
    issn = {1751-9675},
    eissn = {1751-9683},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] N. Razali, W. Jassim, L. Roohisefat, and M. Zilany, "Speaker recognition using neural responses from the model of the auditory system," in 2014 international symposium on intelligent signal processing and communication systems, ispacs 2014, 2014, p. 76–79.
    [Bibtex]
    @inproceedings{razali2014speakersystem,
    author = {Razali, NF and Jassim, WA and Roohisefat, L and Zilany, MSA},
    booktitle = {2014 International Symposium on Intelligent Signal Processing and Communication Systems, ISPACS 2014},
    month = {Jan},
    pages = {76--79},
    title = {Speaker recognition using neural responses from the model of the auditory system},
    year = {2014},
    abstract = {© 2014 IEEE. Speaker recognition is a process of determining a person's identity using features in speech signals. In this study, a new speaker recognition (identification and verification) system is proposed using the responses from a computational model of the auditory system. A neurogram (2D) was constructed from the responses of the model of auditory nerve fibers for a range of characteristic frequencies. The proposed neurogram based speaker recognition system was trained and tested using a Gaussian mixture model classification technique. The performance of the proposed method was evaluated for both clean speech and speech under noisy environment. The result of the proposed method was compared to a traditional speaker recognition technique, referred to as the mel-frequency cepstral coefficient method. The proposed method showed better performance than the traditional approach, especially under noisy conditions. The proposed method could be applied in security and voice recognition systems.},
    doi = {10.1109/ISPACS.2014.7024428},
    isbn = {9781479961207},
    day = {27},
    publicationstatus = {published},
    }
  • [DOI] N. Mamun, W. Jassim, and M. Zilany, "Robust gender classification using neural responses from the model of the auditory system," in 2014 ieee 19th international functional electrical stimulation society annual conference, ifess 2014 - conference proceedings, 2014.
    [Bibtex]
    @inproceedings{mamun2014robustsystem,
    author = {Mamun, N and Jassim, WA and Zilany, MSA},
    booktitle = {2014 IEEE 19th International Functional Electrical Stimulation Society Annual Conference, IFESS 2014 - Conference Proceedings},
    month = {Feb},
    title = {Robust gender classification using neural responses from the model of the auditory system},
    year = {2014},
    abstract = {© 2014 IEEE. Human listeners are capable of extracting several information of the speaker such as personality, emotional state, gender, and age using features present in speech signal. The gender classification of a speaker based on his or her speech signal is crucial in telecommunication. This study proposes a gender classification technique using the neural responses of a physiologically-based computational model of the auditory periphery. Neurograms were created from the responses of the model auditory nerve to speech signals. Orthogonal moments were applied on the neurogram to extract features for classification using Gaussian mixture model. The performance of the proposed method was evaluated for eight different types of noise. The result showed a high accuracy for gender classification for both under quiet and noisy conditions. The proposed method could be used as a pre-processor in speaker verification system.},
    doi = {10.1109/IFESS.2014.7036748},
    isbn = {9781479964833},
    day = {9},
    publicationstatus = {published},
    }
  • [DOI] M. Alam, W. Jassim, and M. Zilany, "Neural response based phoneme classification under noisy condition," in 2014 international symposium on intelligent signal processing and communication systems, ispacs 2014, 2014, p. 175–179.
    [Bibtex]
    @inproceedings{alam2014neuralcondition,
    author = {Alam, MS and Jassim, WA and Zilany, MSA},
    booktitle = {2014 International Symposium on Intelligent Signal Processing and Communication Systems, ISPACS 2014},
    month = {Jan},
    pages = {175--179},
    title = {Neural response based phoneme classification under noisy condition},
    year = {2014},
    abstract = {© 2014 IEEE. Human listeners are capable of recognizing speech in noisy environment, while most of the traditional speech recognition methods do not perform well in the presence of noise. Unlike traditional Mel-frequency cepstral coefficient (MFCC)-based method, this study proposes a phoneme classification technique using the neural responses of a physiologically-based computational model of the auditory periphery. Neurograms were constructed from the responses of the model auditory nerve to speech phonemes. The features of neurograms were used to train the recognition system using a Gaussian Mixture Model (GMM) classification technique. Performance was evaluated for different types of phonemes such as stops, fricatives and vowels from the TIMIT database for both under quiet and noisy conditions. Although performance of the proposed method is comparable with that of MFCC-based classifier in quiet condition, the neural response-based proposed method outperforms the traditional MFCC-based method under noisy conditions even with the use of less number of features in the proposed method. The proposed method could be used in the field of speech recognition such as speech to text application, especially under noisy conditions.},
    doi = {10.1109/ISPACS.2014.7024447},
    isbn = {9781479961207},
    day = {27},
    publicationstatus = {published},
    }
  • [DOI] M. Ekramul, W. Jassim, and M. Zilany, "Effects of noise on the features of bispectrum," in 2014 ieee 19th international functional electrical stimulation society annual conference, ifess 2014 - conference proceedings, 2014.
    [Bibtex]
    @inproceedings{ekramul2014effectsbispectrum,
    author = {Ekramul, MH and Jassim, WA and Zilany, MSA},
    booktitle = {2014 IEEE 19th International Functional Electrical Stimulation Society Annual Conference, IFESS 2014 - Conference Proceedings},
    month = {Feb},
    title = {Effects of noise on the features of bispectrum},
    year = {2014},
    abstract = {© 2014 IEEE. Higher-order Spectral (HOS) techniques can be used to detect deviations from linearity, stationarity or Gaussianity in the signal. Most of the biomedical signals are non-linear, non-stationary, and non-Gaussian in nature. It is more useful to analyze them with HOS compared to the use of second-order statistics (power spectrum). There are some features in the bispectrum that are capable of differentiating between the signal and signal with noise. This study presents a technique of HOS to investigate the effect of noise on the features of third order statistics (bispectrum). The results show that the magnitudes of the bispectrum are consistently changing as a function of the amount of noise. In addition, these features can be extracted from the speech signal to compare with the respective behavioral responses, and thus a new metric to assess speech intelligibility and quality can be developed.},
    doi = {10.1109/IFESS.2014.7036758},
    isbn = {9781479964833},
    day = {9},
    publicationstatus = {published},
    }

2013

  • [DOI] A. Hines, J. Skoglund, A. Kokaram, and N. Harte, "Robustness of speech quality metrics to background noise and network degradations: comparing visqol, pesq and polqa," in Icassp, ieee international conference on acoustics, speech and signal processing - proceedings, 2013, p. 3697–3701.
    [Bibtex]
    @inproceedings{hines2013robustnesspolqa,
    author = {Hines, A and Skoglund, J and Kokaram, A and Harte, N},
    booktitle = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
    month = {Oct},
    pages = {3697--3701},
    title = {Robustness of speech quality metrics to background noise and network degradations: Comparing ViSQOL, PESQ and POLQA},
    year = {2013},
    abstract = {The Virtual Speech Quality Objective Listener (ViSQOL) is a new objective speech quality model. It is a signal based full reference metric that uses a spectro-temporal measure of similarity between a reference and a test speech signal. ViSQOL aims to predict the overall quality of experience for the end listener whether the cause of speech quality degradation is due to ambient noise, or transmission channel degradations. This paper describes the algorithm and tests the model using two speech corpora: NOIZEUS and E4. The NOIZEUS corpus contains speech under a variety of background noise types, speech enhancement methods, and SNR levels. The E4 corpus contains voice over IP degradations including packet loss, jitter and clock drift. The results are compared with the ITU-T objective models for speech quality: PESQ and POLQA. The behaviour of the metrics are also evaluated under simulated time warp conditions. The results show that for both datasets ViSQOL performed comparably with PESQ. POLQA was shown to have lower correlation with subjective scores than the other metrics for the NOIZEUS database. © 2013 IEEE.},
    doi = {10.1109/ICASSP.2013.6638348},
    isbn = {9781479903566},
    issn = {1520-6149},
    day = {18},
    publicationstatus = {published},
    }
  • [DOI] A. Hines, P. Pocta, and H. Melvin, "Detailed comparative analysis of pesq and visqol behaviour in the context of playout delay adjustments introduced by voip jitter buffer algorithms," in 2013 5th international workshop on quality of multimedia experience, qomex 2013 - proceedings, 2013, p. 18–23.
    [Bibtex]
    @inproceedings{hines2013detailedalgorithms,
    author = {Hines, A and Pocta, P and Melvin, H},
    booktitle = {2013 5th International Workshop on Quality of Multimedia Experience, QoMEX 2013 - Proceedings},
    month = {Oct},
    pages = {18--23},
    title = {Detailed comparative analysis of PESQ and VISQOL behaviour in the context of playout delay adjustments introduced by VOIP jitter buffer algorithms},
    year = {2013},
    abstract = {This paper undertakes a detailed comparative analysis of both PESQ and VISQOL model behaviour, when tested against speech samples modified through playout delay adjustments. The adjustments are typical (in extent and magnitude) to those introduced by VoIP jitter buffer algorithms. Furthermore, the analysis examines the impact of adjustment location as well as speaker factors on MOS scores predicted by both models and seeks to determine if both models are able to correctly predict the impact on quality perceived by the end user from earlier subjective tests. The earlier results showed speaker voice preference and potentially wideband experience dominating subjective tests more than playout delay adjustment duration or location. By design, PESQ and VISQOL do not qualify speaker voice difference reducing their correlation with the subjective tests. In addition, it was found that PESQ scores are impacted by playout delay adjustments and thus the impact of playout delay adjustments on a quality perceived by the end user is not well modelled. On the other hand, VISQOL model is better in predicting an impact of playout delay adjustments on a quality perceived by the user but there are still some discrepancies in the predicted scores. The reasons for those discrepancies are particularly analysed and discussed. © 2013 IEEE.},
    doi = {10.1109/QoMEX.2013.6603195},
    day = {17},
    publicationstatus = {published},
    }
  • A. Hines, J. Skoglund, A. Kokaram, and N. Harte, "Monitoring the effects of temporal clipping on voip speech quality," in Proceedings of the annual conference of the international speech communication association, interspeech, 2013, p. 1188–1192.
    [Bibtex]
    @inproceedings{hines2013monitoringquality,
    author = {Hines, A and Skoglund, J and Kokaram, A and Harte, N},
    booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH},
    month = {Jan},
    pages = {1188--1192},
    title = {Monitoring the effects of temporal clipping on VoIP speech quality},
    year = {2013},
    abstract = {This paper presents work on a real-time temporal clipping monitoring tool for VoIP. Temporal clipping can occur as a result of voice activity detection (VAD) or echo cancellation, where comfort noise is used in place of clipped speech segments. The algorithm presented will form part of a no-reference objective model for quantifying perceived speech quality in VoIP. The overall approach uses a modular design that will help pinpoint the reason for degradations in addition to quantifying their impact on speech quality. The new algorithm was tested for VAD over a range of thresholds and varied speech frame sizes. The results are compared to objective Mean Opinion Scores (MOS-LQO) from POLQA. The results show that the proposed algorithm can efficiently predict temporal clipping in speech and correlates well with the full reference quality predictions from POLQA. The model shows good potential for use in a real-time monitoring tool. Copyright © 2013 ISCA.},
    issn = {2308-457X},
    eissn = {1990-9772},
    day = {1},
    publicationstatus = {published},
    }

2012

  • A. Hines and N. Harte, "Improved speech intelligibility with a chimaera hearing aid algorithm," in 13th annual conference of the international speech communication association 2012, interspeech 2012, 2012, p. 1466–1469.
    [Bibtex]
    @inproceedings{hines2012improvedalgorithm,
    author = {Hines, A and Harte, N},
    booktitle = {13th Annual Conference of the International Speech Communication Association 2012, INTERSPEECH 2012},
    month = {Dec},
    pages = {1466--1469},
    title = {Improved speech intelligibility with a chimaera hearing aid algorithm},
    volume = {2},
    year = {2012},
    abstract = {It is recognised that current hearing aid fitting algorithms can corrupt fine timing cues in speech. This paper presents a fitting algorithm that aims to improve speech intelligibility, while preserving the temporal fine structure. The algorithm combines the signal envelope amplification from a standard hearing aid fitting algorithm with the fine timing information available to unaided listeners. The proposed "chimaera aid" is evaluated with computer simulated listener tests to measure its speech intelligibility for 3 sample hearing losses. In addition, the experiment demonstrates the potential application of auditory nerve models in the development of new hearing aid algorithm designs using the previously developed Neurogram Similarity Index Measure (NSIM) to predict speech intelligibility. The results predict that the new aid restores envelope without degrading fine timing information.},
    isbn = {9781622767595},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] A. Hines and N. Harte, "Speech intelligibility prediction using a neurogram similarity index measure," Speech communication, vol. 54, iss. 2, p. 306–320, 2012.
    [Bibtex]
    @article{hines2012speechmeasure,
    author = {Hines, A and Harte, N},
    journal = {Speech Communication},
    month = {Feb},
    number = {2},
    pages = {306--320},
    title = {Speech intelligibility prediction using a Neurogram Similarity Index Measure},
    volume = {54},
    year = {2012},
    abstract = {Discharge patterns produced by fibres from normal and impaired auditory nerves in response to speech and other complex sounds can be discriminated subjectively through visual inspection. Similarly, responses from auditory nerves where speech is presented at diminishing sound levels progressively deteriorate from those at normal listening levels. This paper presents a Neurogram Similarity Index Measure (NSIM) that automates this inspection process, and translates the response pattern differences into a bounded discrimination metric. Performance intensity functions can be used to provide additional information over measurement of speech reception threshold and maximum phoneme recognition by plotting a test subject's recognition probability over a range of sound intensities. A computational model of the auditory periphery was used to replace the human subject and develop a methodology that simulates a real listener test. The newly developed NSIM is used to evaluate the model outputs in response to Consonant-Vowel-Consonant (CVC) word lists and produce phoneme discrimination scores. The simulated results are rigorously compared to those from normal hearing subjects in both quiet and noise conditions. The accuracy of the tests and the minimum number of word lists necessary for repeatable results is established and the results are compared to predictions using the speech intelligibility index (SII). The experiments demonstrate that the proposed simulated performance intensity function (SPIF) produces results with confidence intervals within the human error bounds expected with real listener tests. This work represents an important step in validating the use of auditory nerve models to predict speech intelligibility. © 2011 Elsevier B.V. All rights reserved.},
    doi = {10.1016/j.specom.2011.09.004},
    issn = {0167-6393},
    day = {1},
    publicationstatus = {published},
    }
  • A. Hines, J. Skoglund, A. Kokaram, and N. Harte, "Visqol: the virtual speech quality objective listener," in International workshop on acoustic signal enhancement, iwaenc 2012, 2012.
    [Bibtex]
    @inproceedings{hines2012visqollistener,
    author = {Hines, A and Skoglund, J and Kokaram, A and Harte, N},
    booktitle = {International Workshop on Acoustic Signal Enhancement, IWAENC 2012},
    month = {Jan},
    title = {VISQOL: The virtual speech quality objective listener},
    year = {2012},
    abstract = {A model of human speech quality perception has been developed to provide an objective measure for predicting subjective quality assessments. The Virtual Speech Quality Objective Listener (ViSQOL) model is a signal based full reference metric that uses a spectro-temporal measure of similarity between a reference and a test speech signal. This paper describes the algorithm and compares the results with PESQ for common problems in VoIP: clock drift, associated time warping and jitter. The results indicate that ViSQOL is less prone to underestimation of speech quality in both scenarios than the ITU standard. © 2012, Institute of Electrical and Electronics Engineers Inc. All rights reserved.},
    isbn = {9783800734511},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] W. Jassim, P. Raveendran, and R. Mukundan, "New orthogonal polynomials for speech signal and image processing," Iet signal processing, vol. 6, iss. 8, p. 713–723, 2012.
    [Bibtex]
    @article{jassim2012newprocessing,
    author = {Jassim, WA and Raveendran, P and Mukundan, R},
    journal = {IET Signal Processing},
    month = {Dec},
    number = {8},
    pages = {713--723},
    title = {New orthogonal polynomials for speech signal and image processing},
    volume = {6},
    year = {2012},
    abstract = {This study introduces a new set of orthogonal polynomials and moments and the set's application in signal and image processing. This polynomial is derived from two well-known orthogonal polynomials: the Tchebichef and Krawtchouk polynomials. This study attempts to present the following: (i) the mathematical and theoretical frameworks for the definition of this polynomial, including the modelling of signals with the various analytical properties it contains, as well as recurrence relations and transform equations that need to be addressed; and (ii) the results of empirical tests that compare the representational capabilities of this polynomial with those of the more traditional Tchebichef and Krawtchouk polynomials using speech and image signals from different databases. This study attempts to demonstrate that the proposed polynomials can be applied in the field of signal and image processing because of the promising properties of this polynomial, especially its localisation and energy compaction capabilities. © The Institution of Engineering and Technology 2012.},
    doi = {10.1049/iet-spr.2011.0004},
    issn = {1751-9675},
    eissn = {1751-9683},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] W. Jassim and P. Raveendran, "Face recognition using discrete tchebichef-krawtchouk transform," in Proceedings - 2012 ieee international symposium on multimedia, ism 2012, 2012, p. 120–127.
    [Bibtex]
    @inproceedings{jassim2012facetransform,
    author = {Jassim, WA and Raveendran, P},
    booktitle = {Proceedings - 2012 IEEE International Symposium on Multimedia, ISM 2012},
    month = {Dec},
    pages = {120--127},
    title = {Face recognition using Discrete Tchebichef-Krawtchouk Transform},
    year = {2012},
    abstract = {In this paper, a face recognition system based on the Discrete Tchebichef-Krawtchouk Transform (DTKT) and Support Vector Machines (SVMs) is proposed. The objective of this paper is to present the following: (1) the mathematical and theoretical frameworks for the definition of the DTKT, including the transform equations that need to be addressed; (2) the DTKT features used in the classification of faces; and (3) results of empirical tests that compare the representational capabilities of this transform with other types of discrete transforms such as the Discrete Tchebichef Transform (DTT), Discrete Krawtchouk Transform (DKT), and Discrete Cosine Transform (DCT). The system is tested on a large number of faces collected from the ORL and Yale face databases. Empirical results show that the proposed transform gives very good overall accuracy under clean and noisy conditions. © 2012 IEEE.},
    doi = {10.1109/ISM.2012.31},
    isbn = {9780769548753},
    day = {1},
    publicationstatus = {published},
    }

2011

  • [DOI] A. Hines and N. Harte, "Simulated performance intensity functions," in Proceedings of the annual international conference of the ieee engineering in medicine and biology society, embs, 2011, p. 7139–7142.
    [Bibtex]
    @inproceedings{hines2011simulatedfunctions,
    author = {Hines, A and Harte, N},
    booktitle = {Proceedings of the Annual International Conference of the IEEE Engineering in Medicine and Biology Society, EMBS},
    month = {Dec},
    pages = {7139--7142},
    title = {Simulated performance intensity functions},
    year = {2011},
    abstract = {Measuring speech intelligibility for different hearing aid fitting methods in a simulated environment would allow rapid prototyping and early design assessment. A simulated performance intensity function (SPIF) test methodology has been developed to allow experimentation using an auditory nerve model to predict listeners' phoneme recognition. The test discriminates between normal hearing and progressively degrading levels of sensorineural hearing loss. Auditory nerve discharge patterns, presented as neurograms, can be subjectively ranked by visual inspection. Here, subjective inspection is substituted with an automated ranking using a new image similarity metric that can quantify neurogram degradation in a consistent manner. This work reproduces the test results of a real human listener with moderate hearing loss, in unaided and aided scenarios, using a simulation. The simulated results correlate within comparable error margins to the real listener test performance intensity functions. © 2011 IEEE.},
    doi = {10.1109/IEMBS.2011.6091804},
    isbn = {9781424441211},
    issn = {1557-170X},
    day = {26},
    publicationstatus = {published},
    }
  • "Comparing hearing-aid algorithm performance using simulated performance intensity functions." 2011, p. 347–354.
    [Bibtex]
    @inproceedings{2011comparingfunctions,
    author = {},
    booktitle = {},
    month = {Jan},
    pages = {347--354},
    title = {Comparing hearing-aid algorithm performance using Simulated Performance Intensity Functions},
    year = {2011},
    conference = {3rd International Symposium on Auditory and Audiological Research (ISAAR)},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] "Abstracts," International journal of audiology, vol. 50, iss. 10, p. 717–780, 2011.
    [Bibtex]
    @article{2011abstractsabstracts,
    author = {},
    journal = {International Journal of Audiology},
    month = {Oct},
    number = {10},
    pages = {717--780},
    publisher = {Informa UK Limited},
    title = {Abstracts},
    volume = {50},
    year = {2011},
    doi = {10.3109/14992027.2011.588967},
    issn = {1499-2027},
    eissn = {1708-8186},
    language = {en},
    publicationstatus = {published},
    }

2010

  • A. Hines and N. Harte, "Evaluating sensorineural hearing loss with an auditory nerve model using a mean structural similarity measure," in European signal processing conference, 2010, p. 1924–1928.
    [Bibtex]
    @inproceedings{hines2010evaluatingmeasure,
    author = {Hines, A and Harte, N},
    booktitle = {European Signal Processing Conference},
    month = {Dec},
    pages = {1924--1928},
    title = {Evaluating sensorineural hearing loss with an auditory nerve model using a mean structural similarity measure},
    year = {2010},
    abstract = {Hearing loss research has traditionally been based on perceptual criteria, speech intelligibility and threshold levels. The development of computational models of the auditory-periphery has allowed experimentation via simulation to provide quantitative, repeatable results at a more granular level than would be practical with clinical research on human subjects. This work seeks to create an objective measure to automate this inspection process and rank hearing losses based on auditory-nerve discharge patterns. A systematic way of assessing phonemic degradation using the outputs of an auditory nerve model for a range of sensorineural hearing losses would aid in rapid prototyping development of speech-processing algorithms for digital hearing aids. The effect of sensorineural hearing loss (SNHL) on phonemic structure was evaluated in this study using two types of neurograms: temporal fine structure (TFS) and average discharge rate or temporal envelope. The mean structural similarity index (MSSIM) is an objective measure originally developed to assess perceptual image quality. The measure is adapted here for use in measuring the phonemic degradation in neurograms derived from impaired auditory nerve outputs. A full evaluation of the choice of parameters for the metric is presented using a large amount of natural human speech. The metric's boundedness and the results for TFS neurograms indicate it is a superior metric to standard point to point metrics of relative mean absolute error and relative mean squared error. © EURASIP, 2010.},
    issn = {2219-5491},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] A. Hines and N. Harte, "Speech intelligibility from image processing," Speech communication, vol. 52, iss. 9, p. 736–752, 2010.
    [Bibtex]
    @article{hines2010speechprocessing,
    author = {Hines, A and Harte, N},
    journal = {Speech Communication},
    month = {Sep},
    number = {9},
    pages = {736--752},
    title = {Speech intelligibility from image processing},
    volume = {52},
    year = {2010},
    abstract = {Hearing loss research has traditionally been based on perceptual criteria, speech intelligibility and threshold levels. The development of computational models of the auditory periphery has allowed experimentation via simulation to provide quantitative, repeatable results at a more granular level than would be practical with clinical research on human subjects. The responses of the model used in this study have been previously shown to be consistent with a wide range of physiological data from both normal and impaired ears for stimuli presentation levels spanning the dynamic range of hearing. The model output can be assessed by examination of the spectro-temporal output visualised as neurograms. The effect of sensorineural hearing loss (SNHL) on phonemic structure was evaluated in this study using two types of neurograms: temporal fine structure (TFS) and average discharge rate or temporal envelope. A new systematic way of assessing phonemic degradation is proposed using the outputs of an auditory nerve model for a range of SNHLs. The mean structured similarity index (MSSIM) is an objective measure originally developed to assess perceptual image quality. The measure is adapted here for use in measuring the phonemic degradation in neurograms derived from impaired auditory nerve outputs. A full evaluation of the choice of parameters for the metric is presented using a large amount of natural human speech. The metric's boundedness and the results for TFS neurograms indicate it is a superior metric to standard point to point metrics of relative mean absolute error and relative mean squared error. MSSIM as an indicative score of intelligibility is also promising, with results similar to those of the standard speech intelligibility index metric. © 2010 Elsevier B.V. All rights reserved.},
    doi = {10.1016/j.specom.2010.04.006},
    issn = {0167-6393},
    day = {1},
    publicationstatus = {published},
    }

2009

  • A. Hines and N. Harte, "Error metrics for impaired auditory nerve responses of different phoneme groups," in Proceedings of the annual conference of the international speech communication association, interspeech, 2009, p. 1119–1122.
    [Bibtex]
    @inproceedings{hines2009errorgroups,
    author = {Hines, A and Harte, N},
    booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH},
    month = {Nov},
    pages = {1119--1122},
    title = {Error metrics for impaired auditory nerve responses of different phoneme groups},
    year = {2009},
    abstract = {An auditory nerve model allows faster investigation of new signal processing algorithms for hearing aids. This paper presents a study of the degradation of auditory nerve (AN) responses at a phonetic level for a range of sensorineural hearing losses and flat audiograms. The AN model of Zilany \& Bruce was used to compute responses to a diverse set of phoneme rich sentences from the TIMIT database. The characteristics of both the average discharge rate and spike timing of the responses are discussed. The experiments demonstrate that a mean absolute error metric provides a useful measure of average discharge rates but a more complex measure is required to capture spike timing response errors. Copyright © 2009 ISCA.},
    eissn = {1990-9772},
    day = {26},
    publicationstatus = {published},
    }
  • "Measurement of phonemic degradation in sensorineural hearing loss using a computational model of the auditory periphery." 2009, p. 1–6.
    [Bibtex]
    @inproceedings{2009measurementperiphery,
    author = {},
    booktitle = {},
    month = {Jun},
    pages = {1--6},
    title = {Measurement of phonemic degradation in sensorineural hearing loss using a computational model of the auditory periphery},
    year = {2009},
    conference = {IET Irish Signals and Systems Conference (ISSC 2009)},
    day = {1},
    publicationstatus = {published},
    }
  • [DOI] W. Jassim and R. Paramesran, "Speech signals representation by discrete transforms," in International conference for technical postgraduates 2009, techpos 2009, 2009.
    [Bibtex]
    @inproceedings{jassim2009speechtransforms,
    author = {Jassim, WA and Paramesran, R},
    booktitle = {International Conference for Technical Postgraduates 2009, TECHPOS 2009},
    month = {Dec},
    title = {Speech signals representation by discrete transforms},
    year = {2009},
    abstract = {In this paper, an attempt was made to analyze the speech reconstruction accuracy when using different basis functions as the kernel for a reversible transform. Various transforms, such as the Discrete Cosine Transform (DCT), Discrete Tchebichef Transform (DTT), Ordered Hadamard Transform, and Discrete Haar Transform, are defined and examined. We found that the DCT and DTT provide the greatest energy compaction for noise-free speech sets, while for noisy speech signals the DCT and Haar transforms give the best signal representations in the transform domain, as shown in the simulation results section.},
    doi = {10.1109/TECHPOS.2009.5412082},
    isbn = {9781424452231},
    day = {1},
    publicationstatus = {published},
    }