// https://github.com/pytorch/pytorch/blob/4c55dc50355d5e923642c59ad2a23d6ad54711e7/aten/src/ATen/mps/MPSAllocator.h
// #L244
// m_scalar_pool(m_device, UsageFlags::SMALL | UsageFlags::SHARED | UsageFlags::SCALAR),
m_scalar_pool(m_device, UsageFlags::SMALL | UsageFlags::PRIVATE | UsageFlags::SCALAR),
// https://github.com/pytorch/pytorch/blob/4c55dc50355d5e923642c59ad2a23d6ad54711e7/aten/src/ATen/mps/MPSDevice.mm
// #L96
// if ([device isLowPower]) { // exclude Intel GPUs
if (true) { // include Intel GPUs
_mtl_device = [device retain];
break;
}
// https://github.com/pytorch/pytorch/blob/4c55dc50355d5e923642c59ad2a23d6ad54711e7/aten/src/ATen/native/mps/OperationUtils.mm
// #L391
// Scalar pools are only supported on devices with unified memory; the condition below is hard-coded to false so the scalar-pool fast path is never taken on discrete-memory (Intel) GPUs.
if (false) {
scalar.buffer = getIMPSAllocator()->allocScalarBufferWithValue(&scalar.value, scalar.size);
result = [[[MPSGraphTensorData alloc] initWithMTLBuffer:scalar.getMTLBuffer()
shape:@[ @1 ]
dataType:getMPSScalarType(scalar.type)] autorelease];
# Abort early if a system-wide protoc is installed: it conflicts with the
# protobuf bundled in the PyTorch source tree and causes version-mismatch
# failures during the build.
if command -v protoc &> /dev/null
then
    echo "protoc command exists, please remove it first"
    # printf interprets \t; plain bash echo would print the backslash literally
    printf '\tbrew uninstall protobuf; pip uninstall protobuf\n'
    # exit non-zero so callers / CI can detect the aborted build
    exit 1
fi
# Build PyTorch from source as an x86_64 macOS wheel with the MPS backend.
# Everything below is ONE command continued across lines -- do not insert
# anything between the backslashes.
#   USE_NATIVE_ARCH=1 .............. tune codegen for the build machine's CPU
#   USE_DISTRIBUTED=0 .............. skip torch.distributed (not needed locally)
#   USE_MKLDNN/FBGEMM/NNPACK/QNNPACK/XNNPACK=OFF
#                                    disable optional CPU backends to cut build time
#   USE_CUDA=0 ..................... no NVIDIA GPU on a Mac
#   BUILD_TEST / BUILD_CAFFE2* = 0 . skip C++ tests and legacy Caffe2
#   CPLUS_INCLUDE_PATH ............. numpy headers for the C++ extensions
#   CMAKE_*_COMPILER_LAUNCHER ...... ccache object-file caching across rebuilds
#   MACOSX_DEPLOYMENT_TARGET=14.0 .. must match the wheel platform tag (-p) below
#   USE_MPS=1 ...................... enable the Metal Performance Shaders backend
# NOTE(review): CMAKE_PREFIX_PATH falls back to the conda root when
# CONDA_PREFIX is unset -- assumes conda is on PATH; confirm before running.
USE_NATIVE_ARCH=1 \
USE_DISTRIBUTED=0 \
USE_MKLDNN=OFF \
USE_FBGEMM=OFF \
USE_NNPACK=OFF \
USE_QNNPACK=OFF \
USE_XNNPACK=OFF \
USE_CUDA=0 \
BUILD_TEST=0 \
BUILD_CAFFE2=0 \
BUILD_CAFFE2_OPS=0 \
CPLUS_INCLUDE_PATH=$(python -c "import numpy as np; print(np.get_include())") \
CMAKE_C_COMPILER_LAUNCHER=ccache \
CMAKE_CXX_COMPILER_LAUNCHER=ccache \
CMAKE_CUDA_COMPILER_LAUNCHER=ccache \
CCACHE_COMPILERTYPE=clang \
CC="clang" \
CXX="clang++" \
MAX_JOBS=8 \
MACOSX_DEPLOYMENT_TARGET=14.0 \
USE_MPS=1 \
CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} \
python setup.py bdist_wheel -p macosx_14_0_x86_64 --cmake
# Installing Python on a Mac
# Builds Python 3.11 from source so PyTorch with MPS can be used on an
# older Intel Mac.
mkdir -p ~/ws
git clone https://github.com/python/cpython ~/ws/cpython
cd ~/ws/cpython
git checkout 3.11
# Configure against Homebrew's gdbm and openssl@3.0. This is ONE command
# continued across lines -- do not insert anything between the backslashes.
# The ac_cv_working_openssl_* variables pre-seed configure's OpenSSL probes
# so the ssl and hashlib modules are built.
GDBM_CFLAGS="-I$(brew --prefix gdbm)/include" \
GDBM_LIBS="-L$(brew --prefix gdbm)/lib -lgdbm" \
ac_cv_working_openssl_ssl=yes \
ac_cv_working_openssl_hashlib=yes \
CPPFLAGS="-I$(brew --prefix openssl@3.0)/include" \
LDFLAGS="-L$(brew --prefix openssl@3.0)/lib" \
PKG_CONFIG_PATH="$(brew --prefix openssl@3.0)/lib/pkgconfig" \
./configure \
--with-openssl="$(brew --prefix openssl@3.0)" \
--enable-framework \
--enable-optimizations \
--with-lto
# Persist the framework header path for later native-extension builds.
echo "export CPLUS_INCLUDE_PATH=/Library/Frameworks/Python.framework/Versions/3.11/Headers" >> ~/.setting
make -s -j8
# NOTE(review): with --enable-framework, CPython's Mac/README documents
# "make frameworkinstall"; plain "make install" is assumed equivalent here -- confirm.
sudo make install
# Point the unversioned python/pip names at the freshly installed 3.11.
sudo ln -s -f /usr/local/bin/python3.11 /usr/local/bin/python
sudo ln -s -f /usr/local/bin/python3.11 /usr/local/bin/python3
sudo ln -s -f /usr/local/bin/pip3.11 /usr/local/bin/pip
sudo ln -s -f /usr/local/bin/pip3.11 /usr/local/bin/pip3
#import <Metal/Metal.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
#import <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
// Thin wrapper around MPSGraph that builds and runs a single-precision
// matrix multiplication (C = A x B) over raw Metal buffers.
@interface MatMul : NSObject {
MPSGraph *_graph; // graph that owns the placeholder and matmul ops
MPSGraphTensor *_aTensor; // placeholder for the left operand A
MPSGraphTensor *_bTensor; // placeholder for the right operand B
MPSGraphTensor *_resultTensor; // output of the matmul node
}
// Creates the wrapper and its MPSGraph. NOTE(review): the device argument
// is not stored or used by the implementation -- MPSGraph chooses a device
// at execution time.
- (instancetype)initWithDevice:(id<MTLDevice>)device;
// Runs C = A x B (float32); the product is read back into C's contents.
// Shapes are row-major: aShape=[M,K], bShape=[K,N], cShape=[M,N].
- (void)matmul:(id<MTLBuffer>)A B:(id<MTLBuffer>)B C:(id<MTLBuffer>)C aShape:(MPSShape *)aShape bShape:(MPSShape *)bShape cShape:(MPSShape *)cShape;
@end
@implementation MatMul

// Initializer: only sets up the graph. The device argument is accepted for
// API symmetry but intentionally unused -- MPSGraph binds a device when run.
- (instancetype)initWithDevice:(id<MTLDevice>)device
{
    self = [super init];
    if (self != nil) {
        _graph = [[MPSGraph alloc] init];
    }
    return self;
}

// Wraps the buffers as tensor data, adds placeholders and a matmul node,
// runs the graph, and copies the product back into C's backing storage.
- (void)matmul:(id<MTLBuffer>)A B:(id<MTLBuffer>)B C:(id<MTLBuffer>)C aShape:(MPSShape *)aShape bShape:(MPSShape *)bShape cShape:(MPSShape *)cShape
{
    @autoreleasepool {
        const MPSDataType elementType = MPSDataTypeFloat32;

        // Expose the raw MTLBuffers to the graph as tensor data.
        MPSGraphTensorData *dataForA = [[MPSGraphTensorData alloc] initWithMTLBuffer:A shape:aShape dataType:elementType];
        MPSGraphTensorData *dataForB = [[MPSGraphTensorData alloc] initWithMTLBuffer:B shape:bShape dataType:elementType];
        // Created but never fed to the graph; kept for parity with the
        // original implementation.
        MPSGraphTensorData *dataForC = [[MPSGraphTensorData alloc] initWithMTLBuffer:C shape:cShape dataType:elementType];
        (void)dataForC;

        // NOTE(review): placeholders are rebuilt on every call, so repeated
        // calls keep growing the same graph -- matches the original behavior.
        _aTensor = [_graph placeholderWithShape:aShape dataType:elementType name:nil];
        _bTensor = [_graph placeholderWithShape:bShape dataType:elementType name:nil];
        _resultTensor = [_graph matrixMultiplicationWithPrimaryTensor:_aTensor
                                                      secondaryTensor:_bTensor
                                                                 name:nil];

        MPSGraphTensorDataDictionary *feeds = @{_aTensor : dataForA, _bTensor : dataForB};
        MPSGraphTensorDataDictionary *outputs = [_graph runWithFeeds:feeds
                                                       targetTensors:@[ _resultTensor ]
                                                    targetOperations:@[ [_resultTensor operation] ]];

        // Read the computed product straight into C's contents.
        MPSGraphTensorData *product = outputs[_resultTensor];
        [product.mpsndarray readBytes:C.contents strideBytes:nil];
    }
}
@end
// Demo driver: fills A (1x128) and B (128x1) with ones, computes C = A x B
// on the GPU via MatMul, then prints A, B, and C (expected C == 128).
int main(int argc, const char * argv[]) {
    @autoreleasepool {
        id<MTLDevice> device = MTLCreateSystemDefaultDevice();
        if (device == nil) {
            // Fail fast on machines with no Metal device instead of crashing
            // later inside newBufferWithBytes:.
            fprintf(stderr, "error: no Metal device available\n");
            return 1;
        }
        // shape: [M,K] x [K,N] -> [M,N]
        MPSShape *aShape = @[@1, @128];
        MPSShape *bShape = @[@128, @1];
        MPSShape *cShape = @[@1, @1];
        size_t aSize = (size_t)[aShape[0] intValue] * (size_t)[aShape[1] intValue];
        size_t bSize = (size_t)[bShape[0] intValue] * (size_t)[bShape[1] intValue];
        size_t cSize = (size_t)[cShape[0] intValue] * (size_t)[cShape[1] intValue];
        // calloc (not malloc): C is copied into a Metal buffer below before
        // the kernel writes it, so it must not hold uninitialized memory.
        float *A = (float *)calloc(aSize, sizeof(float));
        float *B = (float *)calloc(bSize, sizeof(float));
        float *C = (float *)calloc(cSize, sizeof(float));
        if (A == NULL || B == NULL || C == NULL) {
            fprintf(stderr, "error: host allocation failed\n");
            free(A);
            free(B);
            free(C);
            return 1;
        }
        for (size_t i = 0; i < aSize; i++) {
            A[i] = 1;
        }
        for (size_t i = 0; i < bSize; i++) {
            B[i] = 1;
        }
        // Shared storage so the CPU can read results without an explicit blit.
        id<MTLBuffer> A_buffer = [device newBufferWithBytes:A length:sizeof(float) * aSize options:MTLResourceStorageModeShared];
        id<MTLBuffer> B_buffer = [device newBufferWithBytes:B length:sizeof(float) * bSize options:MTLResourceStorageModeShared];
        id<MTLBuffer> C_buffer = [device newBufferWithBytes:C length:sizeof(float) * cSize options:MTLResourceStorageModeShared];
        MatMul *matmul = [[MatMul alloc] initWithDevice:device];
        [matmul matmul:A_buffer B:B_buffer C:C_buffer aShape:aShape bShape:bShape cShape:cShape];
        // Copy GPU-visible contents back to the host arrays for printing.
        memcpy(A, A_buffer.contents, sizeof(float) * aSize);
        memcpy(B, B_buffer.contents, sizeof(float) * bSize);
        memcpy(C, C_buffer.contents, sizeof(float) * cSize);
        for (size_t i = 0; i < aSize; i++) {
            printf("%f ", A[i]);
        }
        printf("\n");
        for (size_t i = 0; i < bSize; i++) {
            printf("%f ", B[i]);
        }
        printf("\n");
        for (size_t i = 0; i < cSize; i++) {
            printf("%f ", C[i]);
        }
        printf("\n");
        free(A);
        free(B);
        free(C);
    }
    return 0;
}
#import <Metal/Metal.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
#import <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
// Objective-C wrapper for MPSGraph matrix multiplication operation
#import <MetalPerformanceShadersGraph/MPSGraph.h>
// Category adding a matmul variant built from conv2d primitives --
// presumably a workaround for devices where the native matmul kernel is
// unsuitable (cf. the Intel-GPU patches above); confirm against callers.
@interface MPSGraph (MatrixMultiplication)
// Multiplies two rank-2 ([M,K]x[K,N]) or rank-3 batched ([B,M,K]x[B,K,N])
// tensors. Returns nil on rank or inner-dimension mismatch (logged via NSLog).
- (MPSGraphTensor *)matrixMultiplicationWithPrimaryTensor2:(MPSGraphTensor *)selfTensor secondaryTensor:(MPSGraphTensor *)otherTensor name:(NSString * _Nullable)name;
@end
@implementation MPSGraph (MatrixMultiplication)
// Rank-3 (batched) matmul, emulated one batch slice at a time: split both
// operands along axis 0 into per-batch slices, run the rank-2 conv2d-based
// matmul on each pair, then concatenate the products back along axis 0.
// Returns nil (logged) on any shape mismatch.
- (MPSGraphTensor *)_matrixMultiplicationWithPrimaryTensor_impl_3_3:(MPSGraphTensor *)selfTensor secondaryTensor:(MPSGraphTensor *)otherTensor name:(NSString * _Nullable)name {
@autoreleasepool {
// replace matmul with group conv2d
// check length of selfTensor.shape == 3
if (selfTensor.shape.count != 3) {
NSLog(@"(matrixMultiplicationWithPrimaryTensor2) selfTensor.shape.count (== %lu) != 3", selfTensor.shape.count);
return NULL;
}
// check length of otherTensor.shape == 3
if (otherTensor.shape.count != 3) {
NSLog(@"(matrixMultiplicationWithPrimaryTensor2) otherTensor.shape.count (== %lu) != 3", otherTensor.shape.count);
return NULL;
}
// assert selfTensor.shape[2] == otherTensor.shape[1]
// (inner dimensions must agree: [B,M,K] x [B,K,N])
if (![selfTensor.shape[2] isEqualToNumber:otherTensor.shape[1]]) {
NSLog(@"(matrixMultiplicationWithPrimaryTensor2) selfTensor.shape[2] (== %@) != otherTensor.shape[1] (== %@)", selfTensor.shape[2], otherTensor.shape[1]);
return NULL;
}
NSNumber *numBatch = selfTensor.shape[0];
NSNumber *selfTensorM = selfTensor.shape[1];
NSNumber *selfTensorK = selfTensor.shape[2];
NSNumber *otherTensorK = otherTensor.shape[1];
NSNumber *otherTensorN = otherTensor.shape[2];
NSInteger axis = 0; // batch axis
// repeat _matrixMultiplicationWithPrimaryTensor_impl_2_2 numBatch times
// Splitting into numBatch pieces along axis 0 yields slices of shape
// [1,M,K] / [1,K,N]; each is reshaped to rank 2 before the 2x2 impl.
// NOTE(review): assumes both tensors share the same batch size -- the
// shapes[0] of the two operands are never compared here; confirm callers.
NSMutableArray<MPSGraphTensor *> *resultTensorArray = [NSMutableArray array];
NSArray<MPSGraphTensor *> *splitSelfTensorArray = [self splitTensor:selfTensor numSplits:numBatch.intValue axis:axis name:NULL];
NSArray<MPSGraphTensor *> *splitOtherTensorArray = [self splitTensor:otherTensor numSplits:numBatch.intValue axis:axis name:NULL];
for (int i = 0; i < [numBatch intValue]; i++) {
MPSGraphTensor *_a = [self reshapeTensor:splitSelfTensorArray[i] withShape:@[selfTensorM, selfTensorK] name:NULL];
MPSGraphTensor *_b = [self reshapeTensor:splitOtherTensorArray[i] withShape:@[otherTensorK, otherTensorN] name:NULL];
MPSGraphTensor *_resultTensor = [self _matrixMultiplicationWithPrimaryTensor_impl_2_2:_a secondaryTensor:_b name:NULL];
// Re-add the leading batch dimension so the slices can be concatenated.
_resultTensor = [self reshapeTensor:_resultTensor withShape:@[@1, selfTensorM, otherTensorN] name:NULL];
[resultTensorArray addObject:_resultTensor];
}
// concat result tensors
MPSGraphTensor *resultTensor = [self concatTensors:resultTensorArray dimension:axis name:name];
return resultTensor;
}
}
// Rank-2 matmul expressed as a 1x1 convolution:
// A [M,K] is viewed as an NHWC image of M pixels with K channels, and
// B [K,N] as an HWIO 1x1 kernel mapping K input to N output channels.
// The conv output [M,1,1,N] is reshaped back to [M,N].
// Returns nil (logged) on any shape mismatch.
- (MPSGraphTensor *)_matrixMultiplicationWithPrimaryTensor_impl_2_2:(MPSGraphTensor *)selfTensor secondaryTensor:(MPSGraphTensor *)otherTensor name:(NSString * _Nullable)name {
@autoreleasepool {
// replace matmul with conv2d
// check length of selfTensor.shape == 2
if (selfTensor.shape.count != 2) {
NSLog(@"(matrixMultiplicationWithPrimaryTensor2) selfTensor.shape.count (== %lu) != 2", selfTensor.shape.count);
return NULL;
}
// assert length of otherTensor.shape == 2
if (otherTensor.shape.count != 2) {
NSLog(@"(matrixMultiplicationWithPrimaryTensor2) otherTensor.shape.count (== %lu) != 2", otherTensor.shape.count);
return NULL;
}
// assert selfTensor.shape[1] == otherTensor.shape[0]
// (inner dimensions must agree: [M,K] x [K,N])
if (![selfTensor.shape[1] isEqualToNumber:otherTensor.shape[0]]) {
NSLog(@"(matrixMultiplicationWithPrimaryTensor2) selfTensor.shape[1] (== %@) != otherTensor.shape[0] (== %@)", selfTensor.shape[1], otherTensor.shape[0]);
return NULL;
}
NSNumber *selfTensorM = selfTensor.shape[0];
NSNumber *selfTensorK = selfTensor.shape[1];
NSNumber *otherTensorK = otherTensor.shape[0];
NSNumber *otherTensorN = otherTensor.shape[1];
// 1. create MPSGraphConvolution2DOpDescriptor
// 1x1 stride/dilation, single group, VALID padding: the conv reduces to a
// per-pixel dense layer, i.e. exactly a matrix multiplication.
MPSGraphConvolution2DOpDescriptor *_mpsGraphConv2dOpDescriptor = [MPSGraphConvolution2DOpDescriptor descriptorWithStrideInX:1
strideInY:1
dilationRateInX:1
dilationRateInY:1
groups:1
paddingStyle:MPSGraphPaddingStyleTF_VALID
dataLayout:MPSGraphTensorNamedDataLayoutNHWC
weightsLayout:MPSGraphTensorNamedDataLayoutHWIO];
// 2. reshape tensors
// A -> NHWC [M,1,1,K]; B -> HWIO [1,1,K,N]
MPSGraphTensor *selfTensorReshaped = [self reshapeTensor:selfTensor withShape:@[selfTensorM, @1, @1, selfTensorK] name:NULL];
MPSGraphTensor *otherTensorReshaped = [self reshapeTensor:otherTensor withShape:@[@1, @1, otherTensorK, otherTensorN] name:NULL];
// 3. create MPSGraphConvolution2DOp
MPSGraphTensor *resultTensor = [self convolution2DWithSourceTensor:selfTensorReshaped
weightsTensor:otherTensorReshaped
descriptor:_mpsGraphConv2dOpDescriptor
name:NULL];
// 4. reshape result tensor
resultTensor = [self reshapeTensor:resultTensor withShape:@[selfTensorM, otherTensorN] name:name];
return resultTensor;
}
}
// Public entry point: dispatches on tensor rank (2x2 or 3x3 only).
// Mixed-rank or higher-rank operands are rejected with a log and nil.
- (MPSGraphTensor *)matrixMultiplicationWithPrimaryTensor2:(MPSGraphTensor *)selfTensor secondaryTensor:(MPSGraphTensor *)otherTensor name:(NSString * _Nullable)name {
@autoreleasepool {
if (selfTensor.shape.count == 2 && otherTensor.shape.count == 2) {
return [self _matrixMultiplicationWithPrimaryTensor_impl_2_2:selfTensor secondaryTensor:otherTensor name:name];
} else if (selfTensor.shape.count == 3 && otherTensor.shape.count == 3) {
return [self _matrixMultiplicationWithPrimaryTensor_impl_3_3:selfTensor secondaryTensor:otherTensor name:name];
} else {
NSLog(@"(matrixMultiplicationWithPrimaryTensor2) selfTensor.shape.count (== %lu) != 2 or 3", selfTensor.shape.count);
return NULL;
}
}
}
@end