@incollection{SisLab4374,
  author    = {Nguyen Tuan Thanh Le and Manh Linh Pham},
  title     = {Big Data Analytics and Machine Learning for Industry 4.0: An Overview},
  booktitle = {Industry 4.0 Interoperability, Analytics, Security, and Case Studies},
  editor    = {G. Rajesh and Raajini X. Mercilin and Thi Thu Hien Dang},
  series    = {Big Data for Industry 4.0: Challenges and Applications},
  publisher = {CRC Press - Taylor \& Francis Group, LLC},
  address   = {Boca Raton, FL, USA},
  month     = {January},
  year      = {2021},
  pages     = {1--11},
  keywords  = {Big Data Analytics, Industry 4.0, Machine Learning, Deep Learning},
  url       = {https://eprints.uet.vnu.edu.vn/eprints/id/eprint/4374/},
  abstract  = {The concept of ``big data'' was first mentioned by Roger Mougalas in 2005. Volume refers to the size and/or scale of datasets; to date, there is no universal threshold above which data volume counts as big data, owing to the temporal evolution and diversity of datasets. Velocity indicates the speed at which data are processed and falls into three categories: streaming processing, real-time processing, or batch processing. Value refers to the usefulness of data for decision making. Veracity denotes the quality and trustworthiness of datasets. Parallelization improves computation time by dividing large problems into smaller instances, distributing these tasks across multiple threads, and executing them simultaneously. Feature selection is useful for preparing large-scale datasets. Sampling is a data-reduction method that helps derive patterns in big datasets by generating, manipulating, and analyzing subsets of the original data.}
}